def __init__(self, data: list, endpoint: str, base_iri: str, info_dir: str,
             supplier_prefix: str, resp_agent: str, ra_index: dict, br_index: dict,
             re_index_csv: dict, ar_index_csv: dict, vi_index: dict,
             preexisting_entities: set):
    self.url = base_iri
    self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False)
    self.resp_agent = resp_agent
    self.finder = ResourceFinder(ts_url=endpoint, base_iri=base_iri)
    self.ra_id_schemas = {'crossref', 'orcid', 'viaf', 'wikidata'}
    self.br_id_schemas = {'doi', 'issn', 'isbn', 'pmid', 'pmcid', 'url', 'wikidata', 'wikipedia'}
    self.schemas = self.ra_id_schemas.union(self.br_id_schemas)
    self.ra_index = self.indexer_id(ra_index)
    self.br_index = self.indexer_id(br_index)
    self.re_index = self.index_re(re_index_csv)
    self.ar_index = self.index_ar(ar_index_csv)
    self.vi_index = vi_index
    self.preexisting_entities = preexisting_entities
    self.preexisting_graphs = dict()
    self.data = data
def test_extract_ids(self):
    resp_agent = 'http://w3c.org/oc/meta/pa/999'
    g_set = GraphSet('http://w3c.org/oc/meta/')
    br = g_set.add_br(resp_agent)
    isbn = g_set.add_id(resp_agent)
    isbn.create_isbn('978-88-515-2159-2')
    orcid = g_set.add_id(resp_agent)
    orcid.create_orcid('0000-0002-1825-0097')
    wikidata = g_set.add_id(resp_agent)
    wikidata.create_wikidata('Q9')
    br.has_identifier(isbn)
    br.has_identifier(orcid)
    br.has_identifier(wikidata)
    result = extract_ids(br)
    self.assertIsNotNone(result)
    self.assertDictEqual(result, {
        'isbn13': '978-88-515-2159-2',
        'isbn10': '88-515-2159-X',  # this is automatically inferred
        'orcid': '0000-0002-1825-0097',
        'wikidata': 'Q9'
    })
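# The 'isbn10' value asserted in the test above is not present in the input data: it is
# derived from the ISBN-13. A minimal sketch of how such an inference can be computed
# (a hypothetical helper, not necessarily the implementation used by extract_ids; hyphenation
# is omitted here): drop the '978' prefix and the ISBN-13 check digit, then recompute the
# ISBN-10 check digit modulo 11.
def isbn13_to_isbn10(isbn13: str) -> str:
    digits = [c for c in isbn13 if c.isdigit()]
    assert digits[:3] == ['9', '7', '8'], 'only the 978 prefix can be mapped to ISBN-10'
    body = digits[3:12]  # the nine digits between the prefix and the ISBN-13 check digit
    total = sum((10 - i) * int(d) for i, d in enumerate(body))
    check = (11 - total % 11) % 11
    return ''.join(body) + ('X' if check == 10 else str(check))

# isbn13_to_isbn10('978-88-515-2159-2') -> '885152159X', i.e. '88-515-2159-X' once hyphenated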
def process_chunk(filename: str) -> None:
    """
    This function wraps the functionality of the external library 'oc_graphenricher'.
    It imports an OCDM-compliant RDF chunk file, tries to enrich it with external
    identifiers and then deduplicates its entities.

    :param filename: a string representing the filename (without the path) of the chunk file to be processed
    """
    filepath: str = os.path.join(rdf_input_dir, filename)
    filename_without_extension: str = os.path.splitext(filename)[0]

    g: Graph = Graph()
    g = g.parse(filepath, format='nt11')
    reader: Reader = Reader()
    g_set: GraphSet = GraphSet(base_iri=base_iri, info_dir=info_dir,
                               supplier_prefix=supplier_prefix, wanted_label=False)
    reader.import_entities_from_graph(g_set, g, enable_validation=False, resp_agent=resp_agent)

    # Enrichment
    enriched_filepath: str = rdf_output_dir + os.sep + 'enriched' + os.sep + \
        filename_without_extension + '.nt'
    enriched_prov: str = rdf_output_dir + os.sep + 'enriched' + os.sep + 'prov' + os.sep + \
        filename_without_extension + '.nq'

    # Output folders are created if not already existing
    if not os.path.exists(os.path.dirname(enriched_filepath)):
        os.makedirs(os.path.dirname(enriched_filepath))
    if not os.path.exists(os.path.dirname(enriched_prov)):
        os.makedirs(os.path.dirname(enriched_prov))

    enricher: GraphEnricher = GraphEnricher(g_set,
                                            graph_filename=enriched_filepath,
                                            provenance_filename=enriched_prov,
                                            info_dir=info_dir,
                                            debug=False,
                                            serialize_in_the_middle=False)
    enricher.enrich()

    # Deduplication
    deduplicated_filepath: str = rdf_output_dir + os.sep + 'deduplicated' + os.sep + \
        filename_without_extension + '.nt'
    deduplicated_prov: str = rdf_output_dir + os.sep + 'deduplicated' + os.sep + 'prov' + os.sep + \
        filename_without_extension + '.nq'

    # Output folders are created if not already existing
    if not os.path.exists(os.path.dirname(deduplicated_filepath)):
        os.makedirs(os.path.dirname(deduplicated_filepath))
    if not os.path.exists(os.path.dirname(deduplicated_prov)):
        os.makedirs(os.path.dirname(deduplicated_prov))

    matcher = InstanceMatching(g_set,
                               graph_filename=deduplicated_filepath,
                               provenance_filename=deduplicated_prov,
                               info_dir=info_dir,
                               debug=False)
    matcher.match()
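# A minimal, hypothetical driver for the 'process_chunk' function above (not part of the
# original script): it simply walks the chunk files found in 'rdf_input_dir' and processes
# them one at a time. Sequential processing is assumed here, since the counters kept under
# 'info_dir' are not necessarily safe under concurrent updates.
if __name__ == '__main__':
    for chunk_filename in sorted(os.listdir(rdf_input_dir)):
        if chunk_filename.endswith('.nt'):
            process_chunk(chunk_filename)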
def __init__(self, data, base_iri, info_dir, supplier_prefix, ra_index, br_index,
             re_index_csv, ar_index_csv, vi_index):
    self.url = base_iri
    self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False)
    self.ra_index = self.indexer_id(ra_index)
    self.br_index = self.indexer_id(br_index)
    self.re_index = self.index_re(re_index_csv)
    self.ar_index = self.index_ar(ar_index_csv)
    self.vi_index = vi_index
    self.data = data
def process_chunk(chunk_filepath: str, citations_mapping: Dict[URIRef, str]):
    """
    This function handles all the steps which are needed to fully process a single chunk
    of the citations input dataset.
      - Firstly, the RDF graph serialized inside the chunk file is imported in the form
        of an oc_ocdm's GraphSet.
      - Secondly, a loop over each CI entity is performed: the citing and cited OCDM IRIs
        are extracted and then mapped to the Wikidata IDs contained inside the given
        mapping dictionary (when possible). A TSV statement is created for each citation
        to be uploaded.
      - Lastly, the collected list of statements is appended to the output file.

    :param chunk_filepath: A string representing the filesystem path to the chunk to be imported
    :param citations_mapping: A dictionary mapping OCDM IRIs into the corresponding Wikidata IDs
    """
    # PROCESS INITIALIZATION
    statements: List[str] = []

    # DATA IMPORT PHASE
    graph_chunk: Graph = Graph().parse(location=chunk_filepath, format='nt11')
    g_set: GraphSet = GraphSet(base_iri, wanted_label=False)
    Reader.import_entities_from_graph(g_set, graph_chunk, resp_agent, enable_validation=False)

    # TSV STATEMENTS GENERATION
    for ci in g_set.get_ci():
        citing_uri: Optional[URIRef] = ci.get_citing_entity().res
        cited_uri: Optional[URIRef] = ci.get_cited_entity().res
        if citing_uri in citations_mapping and cited_uri in citations_mapping:
            citing_qid: str = citations_mapping[citing_uri]
            cited_qid: str = citations_mapping[cited_uri]
            statements.append('\t'.join([citing_qid, "P2860", cited_qid, "S248", "Q328"]))

    # TSV STATEMENTS EXPORT
    if len(statements) > 0:
        store_batch(statements)
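# 'store_batch' is called above but not shown. A minimal sketch, assuming the batch is
# appended as QuickStatements-like rows to a single shared TSV file; 'tsv_output_filepath'
# is a hypothetical name, not taken from the original configuration:
def store_batch(statements: List[str]) -> None:
    # One tab-separated row per citation: citing QID, P2860 ('cites work'), cited QID,
    # plus the source reference (S248 -> Q328, the English Wikipedia).
    with open(tsv_output_filepath, 'a', encoding='utf-8') as out_file:
        out_file.write('\n'.join(statements) + '\n')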
def process(cur_citations_file: str, conversion_dict: Dict[str, str]) -> None:
    """
    This function takes care of generating an OCDM compliant RDF file containing the Citation
    entities that describe the relations between citing Wikipedia pages and cited bibliographic
    resources. Additionally, a CSV file compliant with other OpenCitations tools is produced:
    since it is not strictly needed for the 'Wikipedia Citations in Wikidata' workflow, it can
    be safely ignored.

    Please note: the bool flag 'rdf_output_in_chunks' from conf/conf_citations.py MUST be set
    to True, otherwise the following scripts of the workflow (Enricher and Pusher) won't be
    able to import the intermediate RDF files produced by this script.

    :param cur_citations_file: The filename (without the path) of the CSV file to be converted
    :param conversion_dict: The dictionary that maps 'tmp' identifiers onto their respective 'meta' identifiers
    """
    filepath: str = os.path.join(citations_csv_dir, cur_citations_file)
    df: pd.DataFrame = pd.read_csv(filepath, usecols=['citing', 'cited'], low_memory=False)

    # The 'tmp-to-meta' mapping is applied to each column of the DataFrame
    tmp_to_meta_mapping(df['citing'], conversion_dict)
    tmp_to_meta_mapping(df['cited'], conversion_dict)

    # Rows containing None values are dropped: we cannot generate valid Citation entities for them
    df = df.dropna(axis=0, how='any', subset=['citing', 'cited'])
    df = df.reset_index(drop=True)

    # The DataFrame is enriched with additional columns that are needed for achieving full
    # compliance with other OpenCitations tools. This is not strictly needed for our workflow:
    df['id'] = None
    df['oci'] = None
    df['creation'] = None  # not applicable (the citation comes from a Wikipedia page)
    df['timespan'] = None  # not applicable (the citation comes from a Wikipedia page)
    df['journal_sc'] = 'no'
    df['author_sc'] = 'no'

    # A temporary GraphSet is used to instantiate BibliographicResource entities that are needed
    # for the creation of Citation entities but that won't be kept in the output RDF file:
    temp_gs: GraphSet = GraphSet(base_iri)
    # The actual GraphSet that will contain the Citation entities to be stored in the output RDF file:
    ci_gs: GraphSet = GraphSet(base_iri, info_dir=info_dir,
                               supplier_prefix=supplier_prefix, wanted_label=False)

    # Here the DataFrame columns are converted into NumPy arrays
    # so that we can iterate much faster over their elements:
    citing_col = df['citing'].to_numpy(copy=False)
    cited_col = df['cited'].to_numpy(copy=False)
    id_col = df['id'].to_numpy(copy=False)
    oci_col = df['oci'].to_numpy(copy=False)

    for i, (citing_meta_id, cited_meta_id) in enumerate(zip(citing_col, cited_col)):
        citing_res: URIRef = URIRef(base_iri + citing_meta_id)
        cited_res: URIRef = URIRef(base_iri + cited_meta_id)

        # A query is performed to discover whether the current citation has already been processed:
        query_string: str = f'''
            PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
            PREFIX cito: <http://purl.org/spar/cito/>
            PREFIX datacite: <http://purl.org/spar/datacite/>
            PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
            SELECT ?ci_res ?oci
            FROM <https://w3id.org/oc/meta/ci/>
            WHERE {{
                ?ci_res rdf:type cito:Citation ;
                        cito:hasCitingEntity <{citing_res}> ;
                        cito:hasCitedEntity <{cited_res}> .
                OPTIONAL {{
                    ?ci_res datacite:hasIdentifier ?id .
                    ?id datacite:usesIdentifierScheme datacite:oci ;
                        literal:hasLiteralValue ?oci .
                }}
            }}
            LIMIT 1
        '''
        tp: SPARQLWrapper = SPARQLWrapper(triplestore_url)
        tp.setTimeout(query_timeout)
        tp.setMethod('GET')
        tp.setQuery(query_string)
        tp.setReturnFormat(JSON)
        results = tp.queryAndConvert()
        bindings = results["results"]["bindings"]

        # 'LIMIT 1' in the query string should guarantee a maximum of 1 returned binding
        if len(bindings) >= 1:
            # This citation is already stored in the triplestore!
            row: Dict = bindings[0]

            # Update the output dataframe
            ci_res: URIRef = URIRef(row["ci_res"]["value"])
            id_col[i] = str(ci_res)[len(base_iri):]
            if "oci" in row:
                oci_col[i] = row["oci"]["value"]
        else:
            # This citation is currently missing from the triplestore!
            # Create BR entities in "append mode" by providing 'res' without 'preexisting_graph'
            citing_br: BibliographicResource = temp_gs.add_br(resp_agent, res=citing_res,
                                                              preexisting_graph=None)
            cited_br: BibliographicResource = temp_gs.add_br(resp_agent, res=cited_res,
                                                             preexisting_graph=None)

            # Create the OCI identifier
            oci_str: str = str(citing_res)[len(base_iri + 'br/'):] + '-' + \
                str(cited_res)[len(base_iri + 'br/'):]
            oci: Identifier = ci_gs.add_id(resp_agent)
            oci.create_oci(oci_str)

            # Create the citation
            ci: Citation = ci_gs.add_ci(resp_agent)
            ci.has_identifier(oci)
            ci.has_citing_entity(citing_br)
            ci.has_cited_entity(cited_br)

            # Update the output dataframe
            id_col[i] = str(ci.res)[len(base_iri):]
            oci_col[i] = oci_str

    # Store the dataframe as a CSV file that's compliant with OpenCitations tools:
    output_filepath: str = os.path.join(converter_citations_csv_output_dir, cur_citations_file)
    df.to_csv(output_filepath, index=False, chunksize=100000,
              columns=['id', 'oci', 'citing', 'cited', 'creation', 'timespan',
                       'journal_sc', 'author_sc'])

    # Store new citations in an RDF file (together with the related provenance).
    # They should also be uploaded to the triplestore so as to update the current state
    # of execution: this way, they won't be created again since they will already be
    # present inside the triplestore.
    ci_ps: ProvSet = ProvSet(ci_gs, base_iri)
    ci_ps.generate_provenance()

    ci_storer: Storer = Storer(ci_gs, dir_split=dir_split_number, n_file_item=items_per_file,
                               default_dir=default_dir, output_format='nt11')
    ci_prov_storer: Storer = Storer(ci_ps, dir_split=dir_split_number, n_file_item=items_per_file,
                                    default_dir=default_dir, output_format='nquads')

    if rdf_output_in_chunks:
        # The RDF files are stored WITHOUT following the folder structure adopted by
        # OpenCitations: all the newly created citations are kept in a single file.
        # In the following steps of the workflow, every script assumes that data was produced
        # in chunks: this is therefore the modality that must be chosen, which is why
        # 'rdf_output_in_chunks' must be set to True.
        filename_without_csv: str = cur_citations_file[:-4]

        # Data
        f: str = os.path.join(converter_citations_rdf_output_dir, filename_without_csv + ".nt")
        if not os.path.exists(os.path.dirname(f)):
            os.makedirs(os.path.dirname(f))
        ci_storer.store_graphs_in_file(f, context_path)
        ci_storer.upload_all(triplestore_url, converter_citations_rdf_output_dir, batch_size=100)

        # Provenance
        prov_dir: str = os.path.join(converter_citations_rdf_output_dir, 'prov')
        f_prov: str = os.path.join(prov_dir, filename_without_csv + '.nq')
        if not os.path.exists(os.path.dirname(f_prov)):
            os.makedirs(os.path.dirname(f_prov))
        ci_prov_storer.store_graphs_in_file(f_prov, context_path)
    else:
        # The RDF files are stored following the folder structure adopted by OpenCitations.
        # Newly created citations could be split into different files based on various conditions.
        # In the following steps of the workflow, every script assumes that data was produced
        # in chunks: this means that this modality should never be chosen and that
        # 'rdf_output_in_chunks' must be set to True.
        ci_storer.upload_and_store(converter_citations_rdf_output_dir, triplestore_url, base_iri,
                                   context_path, batch_size=100)
        ci_prov_storer.store_all(converter_citations_rdf_output_dir, base_iri, context_path)
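# A minimal, hypothetical driver for the 'process' function above (not part of the original
# script): it loops over the citation CSV files and converts each of them. The way the
# 'tmp-to-meta' conversion dictionary is obtained is assumed here ('load_conversion_dict'
# is a hypothetical loader, e.g. reading the mapping produced by a previous workflow step).
if __name__ == '__main__':
    conversion_dict: Dict[str, str] = load_conversion_dict()
    for csv_filename in sorted(os.listdir(citations_csv_dir)):
        if csv_filename.endswith('.csv'):
            process(csv_filename, conversion_dict)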