def __init__(self, data: list, endpoint: str, base_iri: str, info_dir: str,
             supplier_prefix: str, resp_agent: str, ra_index: dict,
             br_index: dict, re_index_csv: dict, ar_index_csv: dict,
             vi_index: dict, preexisting_entities: set):
    """Initialize the creator state for one batch of tabular records.

    :param data: list of row dicts (one per bibliographic resource) to be processed
    :param endpoint: SPARQL endpoint URL used by the ResourceFinder to fetch preexisting graphs
    :param base_iri: base IRI under which all entity URIs are minted
    :param info_dir: directory holding oc_ocdm counter files
    :param supplier_prefix: supplier prefix for newly minted identifiers
    :param resp_agent: URI of the responsible agent recorded in provenance
    :param ra_index: CSV-derived index mapping external RA ids to meta ids
    :param br_index: CSV-derived index mapping external BR ids to meta ids
    :param re_index_csv: CSV-derived index mapping BR meta ids to RE meta ids
    :param ar_index_csv: CSV-derived index mapping BR meta ids to AR role ids
    :param vi_index: nested index of venue -> volume/issue meta ids
    :param preexisting_entities: set of short entity ids (e.g. 'br/0601') already in the triplestore
    """
    self.url = base_iri
    self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False)
    self.resp_agent = resp_agent
    # Finder used to retrieve already-persisted graphs for preexisting entities
    self.finder = ResourceFinder(ts_url=endpoint, base_iri=base_iri)
    # Identifier schemes accepted for responsible agents vs bibliographic resources
    self.ra_id_schemas = {'crossref', 'orcid', 'viaf', 'wikidata'}
    self.br_id_schemas = {
        'doi', 'issn', 'isbn', 'pmid', 'pmcid', 'url', 'wikidata', 'wikipedia'
    }
    # NOTE: self.schemas must be assigned BEFORE the indexer_id calls below,
    # since indexer_id iterates over self.schemas.
    self.schemas = self.ra_id_schemas.union(self.br_id_schemas)
    self.ra_index = self.indexer_id(ra_index)
    self.br_index = self.indexer_id(br_index)
    self.re_index = self.index_re(re_index_csv)
    self.ar_index = self.index_ar(ar_index_csv)
    self.vi_index = vi_index
    self.preexisting_entities = preexisting_entities
    # Cache of graphs fetched from the triplestore, filled lazily by the finder
    self.preexisting_graphs = dict()
    self.data = data
def __init__(self, data: list, base_iri: str, info_dir: str, supplier_prefix: str,
             ra_index: dict, br_index: dict, re_index_csv: dict,
             ar_index_csv: dict, vi_index: dict):
    """Initialize the creator state for one batch of tabular records.

    Earlier variant without triplestore lookup support: no endpoint,
    responsible agent or preexisting-entity bookkeeping.

    :param data: list of row dicts (one per bibliographic resource) to be processed
    :param base_iri: base IRI under which all entity URIs are minted
    :param info_dir: directory holding oc_ocdm counter files
    :param supplier_prefix: supplier prefix for newly minted identifiers
    :param ra_index: CSV-derived index mapping external RA ids to meta ids
    :param br_index: CSV-derived index mapping external BR ids to meta ids
    :param re_index_csv: CSV-derived index mapping BR meta ids to RE meta ids
    :param ar_index_csv: CSV-derived index mapping BR meta ids to AR role ids
    :param vi_index: nested index of venue -> volume/issue meta ids
    """
    self.url = base_iri
    self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False)
    self.ra_index = self.indexer_id(ra_index)
    self.br_index = self.indexer_id(br_index)
    self.re_index = self.index_re(re_index_csv)
    self.ar_index = self.index_ar(ar_index_csv)
    self.vi_index = vi_index
    self.data = data
def process_chunk(filename: str) -> None:
    """
    This function wraps the functionality of the external library 'oc_graphenricher'.
    It imports an OCDM compliant RDF chunk file, it tries to enrich it with external
    identifiers and then deduplicates its entities.

    Relies on module-level configuration: rdf_input_dir, rdf_output_dir,
    base_iri, info_dir, supplier_prefix and resp_agent.

    :param filename: a string representing the filename (without the path) of the chunk file to be processed
    """
    filepath: str = os.path.join(rdf_input_dir, filename)
    filename_without_extension: str = os.path.splitext(filename)[0]

    # Import the N-Triples chunk into an oc_ocdm GraphSet
    g: Graph = Graph()
    g = g.parse(filepath, format='nt11')
    reader: Reader = Reader()
    g_set: GraphSet = GraphSet(base_iri=base_iri, info_dir=info_dir,
                               supplier_prefix=supplier_prefix, wanted_label=False)
    reader.import_entities_from_graph(g_set, g, enable_validation=False,
                                      resp_agent=resp_agent)

    # Enrichment
    enriched_dir: str = os.path.join(rdf_output_dir, 'enriched')
    enriched_filepath: str = os.path.join(enriched_dir,
                                          filename_without_extension + '.nt')
    enriched_prov: str = os.path.join(enriched_dir, 'prov',
                                      filename_without_extension + '.nq')
    # exist_ok=True avoids the check-then-create race of an explicit
    # os.path.exists() guard (safe if several workers share the output dirs)
    os.makedirs(os.path.dirname(enriched_filepath), exist_ok=True)
    os.makedirs(os.path.dirname(enriched_prov), exist_ok=True)
    enricher: GraphEnricher = GraphEnricher(g_set,
                                            graph_filename=enriched_filepath,
                                            provenance_filename=enriched_prov,
                                            info_dir=info_dir,
                                            debug=False,
                                            serialize_in_the_middle=False)
    enricher.enrich()

    # Deduplication
    deduplicated_dir: str = os.path.join(rdf_output_dir, 'deduplicated')
    deduplicated_filepath: str = os.path.join(deduplicated_dir,
                                              filename_without_extension + '.nt')
    deduplicated_prov: str = os.path.join(deduplicated_dir, 'prov',
                                          filename_without_extension + '.nq')
    os.makedirs(os.path.dirname(deduplicated_filepath), exist_ok=True)
    os.makedirs(os.path.dirname(deduplicated_prov), exist_ok=True)
    matcher = InstanceMatching(g_set,
                               graph_filename=deduplicated_filepath,
                               provenance_filename=deduplicated_prov,
                               info_dir=info_dir,
                               debug=False)
    matcher.match()
def test_extract_ids(self):
    """extract_ids must collect all identifiers attached to a BR,
    inferring the ISBN-10 form from the ISBN-13 one."""
    agent = 'http://w3c.org/oc/meta/pa/999'
    g_set = GraphSet('http://w3c.org/oc/meta/')
    br = g_set.add_br(agent)
    id_entities = []
    for scheme, literal in (('isbn', '978-88-515-2159-2'),
                            ('orcid', '0000-0002-1825-0097'),
                            ('wikidata', 'Q9')):
        id_entity = g_set.add_id(agent)
        getattr(id_entity, f'create_{scheme}')(literal)
        id_entities.append(id_entity)
    for id_entity in id_entities:
        br.has_identifier(id_entity)
    result = extract_ids(br)
    self.assertIsNotNone(result)
    expected = {
        'isbn13': '978-88-515-2159-2',
        'isbn10': '88-515-2159-X',  # this is automatically inferred
        'orcid': '0000-0002-1825-0097',
        'wikidata': 'Q9',
    }
    self.assertDictEqual(result, expected)
def process_chunk(chunk_filepath: str, citations_mapping: Dict[URIRef, str]):
    """
    Fully processes one chunk of the citations input dataset.

    The RDF graph serialized inside the chunk file is first imported as an
    oc_ocdm GraphSet. Then, for every CI entity, the citing and cited OCDM
    IRIs are looked up in the given mapping: when both are present, a TSV
    statement for the corresponding citation upload is generated. Finally,
    any collected statements are appended to the output file.

    :param chunk_filepath: A string representing the filesystem path to the chunk to be imported
    :param citations_mapping: A dictionary mapping OCDM IRIs into the corresponding Wikidata IDs
    """
    # DATA IMPORT PHASE
    chunk_graph: Graph = Graph().parse(location=chunk_filepath, format='nt11')
    g_set: GraphSet = GraphSet(base_iri, wanted_label=False)
    Reader.import_entities_from_graph(g_set, chunk_graph, resp_agent,
                                      enable_validation=False)

    # TSV STATEMENTS GENERATION
    statements: List[str] = []
    for citation in g_set.get_ci():
        citing: Optional[URIRef] = citation.get_citing_entity().res
        cited: Optional[URIRef] = citation.get_cited_entity().res
        # Skip citations whose endpoints cannot both be mapped to Wikidata IDs
        if citing not in citations_mapping or cited not in citations_mapping:
            continue
        statements.append('\t'.join(
            [citations_mapping[citing], "P2860", citations_mapping[cited],
             "S248", "Q328"]))

    # TSV STATEMENTS EXPORT
    if statements:
        store_batch(statements)
class Creator(object):
    """Builds an oc_ocdm GraphSet from tabular bibliographic records.

    Each row of ``data`` describes one bibliographic resource (ids, title,
    authors, date, venue/volume/issue, pages, type, publisher, editor).
    External identifiers are resolved to OpenCitations Meta ids through the
    CSV-derived indexes supplied at construction time; entities already in
    the triplestore are re-imported through the ResourceFinder so updates
    are applied on top of their preexisting graphs.
    """

    def __init__(self, data: list, endpoint: str, base_iri: str, info_dir: str,
                 supplier_prefix: str, resp_agent: str, ra_index: dict,
                 br_index: dict, re_index_csv: dict, ar_index_csv: dict,
                 vi_index: dict, preexisting_entities: set):
        """Set up graph set, finder, identifier-scheme sets and all indexes.

        :param data: list of row dicts to be processed by creator()
        :param endpoint: SPARQL endpoint URL for preexisting-graph lookups
        :param base_iri: base IRI under which entity URIs are minted
        :param info_dir: directory holding oc_ocdm counter files
        :param supplier_prefix: supplier prefix for newly minted identifiers
        :param resp_agent: URI of the responsible agent for provenance
        :param ra_index: index mapping external RA ids to meta ids
        :param br_index: index mapping external BR ids to meta ids
        :param re_index_csv: index mapping BR meta ids to RE meta ids
        :param ar_index_csv: index mapping BR meta ids to AR role ids
        :param vi_index: nested venue -> volume/issue meta-id index
        :param preexisting_entities: short ids (e.g. 'br/0601') already stored
        """
        self.url = base_iri
        self.setgraph = GraphSet(self.url, info_dir, supplier_prefix,
                                 wanted_label=False)
        self.resp_agent = resp_agent
        self.finder = ResourceFinder(ts_url=endpoint, base_iri=base_iri)
        self.ra_id_schemas = {'crossref', 'orcid', 'viaf', 'wikidata'}
        self.br_id_schemas = {
            'doi', 'issn', 'isbn', 'pmid', 'pmcid', 'url', 'wikidata',
            'wikipedia'
        }
        # Must be computed before the indexer_id calls, which iterate over it
        self.schemas = self.ra_id_schemas.union(self.br_id_schemas)
        self.ra_index = self.indexer_id(ra_index)
        self.br_index = self.indexer_id(br_index)
        self.re_index = self.index_re(re_index_csv)
        self.ar_index = self.index_ar(ar_index_csv)
        self.vi_index = vi_index
        self.preexisting_entities = preexisting_entities
        self.preexisting_graphs = dict()
        self.data = data

    def creator(self, source=None):
        """Process every row of self.data and return the populated GraphSet.

        :param source: optional primary-source URI recorded on every entity
        """
        self.src = source
        for row in self.data:
            self.row_meta = ''
            ids = row['id']
            title = row['title']
            authors = row['author']
            pub_date = row['pub_date']
            venue = row['venue']
            vol = row['volume']
            issue = row['issue']
            page = row['page']
            self.type = row['type']
            publisher = row['publisher']
            editor = row['editor']
            self.venue_graph = None
            self.vol_graph = None
            self.issue_graph = None
            self.id_action(ids)
            self.title_action(title)
            self.author_action(authors)
            self.pub_date_action(pub_date)
            self.vvi_action(venue, vol, issue)
            self.page_action(page)
            self.type_action(self.type)
            if publisher:
                self.publisher_action(publisher)
            if editor:
                self.editor_action(editor)
        return self.setgraph

    @staticmethod
    def index_re(id_index):
        """Map each BR meta id to its resource-embodiment (RE) meta id."""
        index = dict()
        for row in id_index:
            index[row['br']] = row['re']
        return index

    @staticmethod
    def index_ar(id_index):
        """Map each BR meta id to its author/editor/publisher role-id dicts."""
        index = dict()
        for row in id_index:
            index[row['meta']] = dict()
            index[row['meta']]['author'] = Creator.__ar_worker(row['author'])
            index[row['meta']]['editor'] = Creator.__ar_worker(row['editor'])
            index[row['meta']]['publisher'] = Creator.__ar_worker(
                row['publisher'])
        return index

    @staticmethod
    def __ar_worker(s: str) -> dict:
        """Parse 'AR, RA; AR, RA; ...' into a {RA meta id: AR id} dict."""
        if s:
            ar_dict = dict()
            couples = s.split('; ')
            for c in couples:
                cou = c.split(', ')
                ar_dict[cou[1]] = cou[0]
            return ar_dict
        else:
            return dict()

    def indexer_id(self, csv_index):
        """Build a {scheme: {bare identifier: meta id}} index from CSV rows."""
        index = dict()
        for schema in self.schemas:
            index[schema] = dict()
        for row in csv_index:
            for schema in self.schemas:
                if row['id'].startswith(schema):
                    identifier = row['id'].replace(f'{schema}:', '')
                    index[schema][identifier] = row['meta']
        return index

    def id_action(self, ids):
        """Create (or re-import) the BR for this row and attach all its ids."""
        idslist = re.split(one_or_more_spaces, ids)
        # publication id
        for identifier in idslist:
            if 'meta:' in identifier:
                identifier = identifier.replace('meta:', '')
                preexisting_entity = identifier in self.preexisting_entities
                self.row_meta = identifier.replace('br/', '')
                url = URIRef(self.url + identifier)
                preexisting_graph = self.finder.get_preexisting_graph(
                    url, self.preexisting_graphs) if preexisting_entity else None
                self.br_graph = self.setgraph.add_br(
                    self.resp_agent, source=self.src, res=url,
                    preexisting_graph=preexisting_graph)
        for identifier in idslist:
            self.id_creator(self.br_graph, identifier, ra=False)

    def title_action(self, title):
        """Attach the title, if any, to the current BR."""
        if title:
            self.br_graph.has_title(title)

    def author_action(self, authors):
        """Create RA and AR entities for every author and chain their roles."""
        if authors:
            authorslist = re.split(semicolon_in_people_field, authors)
            aut_role_list = list()
            for aut in authorslist:
                aut_and_ids = re.search(name_and_ids, aut)
                aut_id = aut_and_ids.group(2)
                aut_id_list = aut_id.split(' ')
                for identifier in aut_id_list:
                    if 'meta:' in identifier:
                        identifier = str(identifier).replace('meta:', '')
                        preexisting_entity = identifier in self.preexisting_entities
                        url = URIRef(self.url + identifier)
                        aut_meta = identifier.replace('ra/', '')
                        preexisting_graph = self.finder.get_preexisting_graph(
                            url, self.preexisting_graphs
                        ) if preexisting_entity else None
                        pub_aut = self.setgraph.add_ra(
                            self.resp_agent, source=self.src, res=url,
                            preexisting_graph=preexisting_graph)
                        author_name = aut_and_ids.group(1)
                        if ',' in author_name:
                            author_name_splitted = re.split(
                                comma_and_spaces, author_name)
                            first_name = author_name_splitted[1]
                            last_name = author_name_splitted[0]
                            if first_name.strip():
                                pub_aut.has_given_name(first_name)
                            pub_aut.has_family_name(last_name)
                        else:
                            pub_aut.has_name(author_name)
                # lists of authors' IDs
                for identifier in aut_id_list:
                    self.id_creator(pub_aut, identifier, ra=True)
                # Author ROLE
                AR = self.ar_index[self.row_meta]['author'][aut_meta]
                ar_id = 'ar/' + str(AR)
                preexisting_entity = ar_id in self.preexisting_entities
                url_ar = URIRef(self.url + ar_id)
                preexisting_graph = self.finder.get_preexisting_graph(
                    url_ar, self.preexisting_graphs) if preexisting_entity else None
                pub_aut_role = self.setgraph.add_ar(
                    self.resp_agent, source=self.src, res=url_ar,
                    preexisting_graph=preexisting_graph)
                pub_aut_role.create_author()
                self.br_graph.has_contributor(pub_aut_role)
                pub_aut_role.is_held_by(pub_aut)
                aut_role_list.append(pub_aut_role)
                # Keep the author list ordered through has_next links
                if len(aut_role_list) > 1:
                    aut_role_list[aut_role_list.index(pub_aut_role) -
                                  1].has_next(pub_aut_role)

    def pub_date_action(self, pub_date):
        """Attach the publication date ('Y', 'Y-M' or 'Y-M-D') to the BR."""
        if pub_date:
            datelist = list()
            datesplit = pub_date.split('-')
            # NOTE(review): str.split always returns a non-empty list, so the
            # else branch below is unreachable; kept for behavioral parity.
            if datesplit:
                for x in datesplit:
                    datelist.append(int(x))
            else:
                datelist.append(int(pub_date))
            str_date = create_date(datelist)
            self.br_graph.has_pub_date(str_date)

    def vvi_action(self, venue, vol, issue):
        """Create venue, volume and issue BRs and link the containment chain."""
        if venue:
            venue_and_ids = re.search(name_and_ids, venue)
            venue_ids = venue_and_ids.group(2)
            venue_ids_list = venue_ids.split()
            for identifier in venue_ids_list:
                if 'meta:' in identifier:
                    ven_id = str(identifier).replace('meta:', '')
                    preexisting_entity = ven_id in self.preexisting_entities
                    url = URIRef(self.url + ven_id)
                    venue_title = venue_and_ids.group(1)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    self.venue_graph = self.setgraph.add_br(
                        self.resp_agent, source=self.src, res=url,
                        preexisting_graph=preexisting_graph)
                    try:
                        venue_type = self.get_venue_type(
                            self.type, venue_ids_list)
                    except UnboundLocalError:
                        error_message = f"[INFO:Creator] I found the venue {venue} for the resource of type {self.type}, but I don't know how to handle it"
                        # FIX: UnboundLocalError accepts no 'msg' keyword; the
                        # original raise(msg=...) raised TypeError instead.
                        raise UnboundLocalError(error_message)
                    if venue_type:
                        venue_type = venue_type.replace(' ', '_')
                        getattr(self.venue_graph, f'create_{venue_type}')()
                    self.venue_graph.has_title(venue_title)
            for identifier in venue_ids_list:
                self.id_creator(self.venue_graph, identifier, ra=False)
            # ven_id intentionally leaks from the loop above: it holds the
            # last 'meta:' identifier of the venue
            if self.type == 'journal article' or self.type == 'journal issue' or self.type == 'journal volume':
                meta_ven = ven_id.replace('br/', '')
                if vol:
                    vol_meta = self.vi_index[meta_ven]['volume'][vol]['id']
                    vol_meta = 'br/' + vol_meta
                    preexisting_entity = vol_meta in self.preexisting_entities
                    vol_url = URIRef(self.url + vol_meta)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        vol_url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    self.vol_graph = self.setgraph.add_br(
                        self.resp_agent, source=self.src, res=vol_url,
                        preexisting_graph=preexisting_graph)
                    self.vol_graph.create_volume()
                    self.vol_graph.has_number(vol)
                if issue:
                    if vol:
                        issue_meta = self.vi_index[meta_ven]['volume'][vol][
                            'issue'][issue]['id']
                    else:
                        issue_meta = self.vi_index[meta_ven]['issue'][issue][
                            'id']
                    issue_meta = 'br/' + issue_meta
                    preexisting_entity = issue_meta in self.preexisting_entities
                    issue_url = URIRef(self.url + issue_meta)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        issue_url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    self.issue_graph = self.setgraph.add_br(
                        self.resp_agent, source=self.src, res=issue_url,
                        preexisting_graph=preexisting_graph)
                    self.issue_graph.create_issue()
                    self.issue_graph.has_number(issue)
        # Containment chain: br -> issue -> volume -> venue (as available).
        # NOTE(review): assumes vol/issue are only populated for journal-like
        # types, otherwise vol_graph/issue_graph would still be None here.
        if venue and vol and issue:
            self.br_graph.is_part_of(self.issue_graph)
            self.issue_graph.is_part_of(self.vol_graph)
            self.vol_graph.is_part_of(self.venue_graph)
        elif venue and vol and not issue:
            self.br_graph.is_part_of(self.vol_graph)
            self.vol_graph.is_part_of(self.venue_graph)
        elif venue and not vol and not issue:
            self.br_graph.is_part_of(self.venue_graph)
        elif venue and not vol and issue:
            self.br_graph.is_part_of(self.issue_graph)
            self.issue_graph.is_part_of(self.venue_graph)

    @classmethod
    def get_venue_type(cls, br_type: str, venue_ids: list) -> str:
        """Infer the venue's type from the resource type and id schemes.

        Returns '' when the type is undecidable. Deliberately raises
        UnboundLocalError for resource types with no known venue mapping
        (venue_type never gets assigned); vvi_action catches it.

        :param br_type: the textual type of the bibliographic resource
        :param venue_ids: the list of the venue's identifiers (scheme:value)
        """
        schemas = {venue_id.split(':')[0] for venue_id in venue_ids}
        if br_type in {'journal article', 'journal volume', 'journal issue'}:
            venue_type = 'journal'
        elif br_type in {
                'book chapter', 'book part', 'book section', 'book track'
        }:
            venue_type = 'book'
        elif br_type in {'book', 'edited book', 'monograph', 'reference book'}:
            venue_type = 'book series'
        elif br_type == 'proceedings article':
            venue_type = 'proceedings'
        elif br_type in {'proceedings', 'report', 'standard', 'series'}:
            venue_type = 'series'
        elif br_type == 'reference entry':
            venue_type = 'reference book'
        # elif br_type == 'report series':
        #     venue_type = 'report series'
        elif not br_type or br_type in {'dataset', 'data file'}:
            venue_type = ''
        # Check the type based on the identifier scheme
        if any(identifier for identifier in venue_ids
               if not identifier.startswith('meta:')):
            if venue_type in {
                    'journal', 'book series', 'series', 'report series'
            }:
                if 'isbn' in schemas or 'issn' not in schemas:
                    # It is undecidable
                    venue_type = ''
            elif venue_type in {'book', 'proceedings'}:
                if 'issn' in schemas or 'isbn' not in schemas:
                    venue_type = ''
            elif venue_type == 'reference book':
                if 'isbn' in schemas and 'issn' not in schemas:
                    venue_type = 'reference book'
                elif 'issn' in schemas and 'isbn' not in schemas:
                    venue_type = 'journal'
                elif 'issn' in schemas and 'isbn' in schemas:
                    venue_type = ''
        return venue_type

    def page_action(self, page):
        """Create the RE holding the page range and attach it to the BR."""
        if page:
            res_em = self.re_index[self.row_meta]
            re_id = 're/' + str(res_em)
            preexisting_entity = re_id in self.preexisting_entities
            url_re = URIRef(self.url + re_id)
            preexisting_graph = self.finder.get_preexisting_graph(
                url_re, self.preexisting_graphs) if preexisting_entity else None
            form = self.setgraph.add_re(self.resp_agent,
                                        source=self.src,
                                        res=url_re,
                                        preexisting_graph=preexisting_graph)
            # The whole page string is used for both boundaries
            form.has_starting_page(page)
            form.has_ending_page(page)
            self.br_graph.has_format(form)

    def type_action(self, entity_type):
        """Assign the OCDM class matching the textual type to the current BR."""
        # Dispatch table: type label -> oc_ocdm create_* method name.
        # Types without an OCDM mapping (e.g. 'edited book', 'monograph',
        # 'proceedings series', 'standard series') are deliberately absent,
        # so unknown labels remain a no-op exactly as before.
        dispatch = {
            'archival document': 'create_archival_document',
            'book': 'create_book',
            'book chapter': 'create_book_chapter',
            'book part': 'create_book_part',
            'book section': 'create_book_section',
            'book series': 'create_book_series',
            'book set': 'create_book_set',
            'data file': 'create_dataset',
            'dataset': 'create_dataset',
            'dissertation': 'create_dissertation',
            'journal': 'create_journal',
            'journal article': 'create_journal_article',
            'journal issue': 'create_issue',
            'journal volume': 'create_volume',
            'peer review': 'create_peer_review',
            'proceedings': 'create_proceedings',
            'proceedings article': 'create_proceedings_article',
            'reference book': 'create_reference_book',
            'reference entry': 'create_reference_entry',
            'report': 'create_report',
            'report series': 'create_report_series',
            'standard': 'create_standard',
            'series': 'create_series',
            'web content': 'create_web_content',
        }
        method_name = dispatch.get(entity_type)
        if method_name:
            getattr(self.br_graph, method_name)()

    def publisher_action(self, publisher):
        """Create the publisher RA and its AR role for the current BR."""
        publ_and_ids = re.search(name_and_ids, publisher)
        publ_id = publ_and_ids.group(2)
        publ_id_list = publ_id.split()
        for identifier in publ_id_list:
            if 'meta:' in identifier:
                identifier = str(identifier).replace('meta:', '')
                preexisting_entity = identifier in self.preexisting_entities
                pub_meta = identifier.replace('ra/', '')
                url = URIRef(self.url + identifier)
                publ_name = publ_and_ids.group(1)
                preexisting_graph = self.finder.get_preexisting_graph(
                    url, self.preexisting_graphs) if preexisting_entity else None
                publ = self.setgraph.add_ra(
                    self.resp_agent, source=self.src, res=url,
                    preexisting_graph=preexisting_graph)
                publ.has_name(publ_name)
        for identifier in publ_id_list:
            self.id_creator(publ, identifier, ra=True)
        # publisherRole
        AR = self.ar_index[self.row_meta]['publisher'][pub_meta]
        ar_id = 'ar/' + str(AR)
        preexisting_entity = ar_id in self.preexisting_entities
        url_ar = URIRef(self.url + ar_id)
        # FIX: guard the triplestore lookup like every other call site; the
        # original queried the finder even for brand-new AR entities.
        preexisting_graph = self.finder.get_preexisting_graph(
            url_ar, self.preexisting_graphs) if preexisting_entity else None
        publ_role = self.setgraph.add_ar(self.resp_agent,
                                         source=self.src,
                                         res=url_ar,
                                         preexisting_graph=preexisting_graph)
        publ_role.create_publisher()
        self.br_graph.has_contributor(publ_role)
        publ_role.is_held_by(publ)

    def editor_action(self, editor):
        """Create RA and AR entities for every editor and chain their roles."""
        editorslist = re.split(semicolon_in_people_field, editor)
        edit_role_list = list()
        for ed in editorslist:
            ed_and_ids = re.search(name_and_ids, ed)
            ed_id = ed_and_ids.group(2)
            ed_id_list = ed_id.split(' ')
            for identifier in ed_id_list:
                if 'meta:' in identifier:
                    identifier = str(identifier).replace('meta:', '')
                    preexisting_entity = identifier in self.preexisting_entities
                    ed_meta = identifier.replace('ra/', '')
                    url = URIRef(self.url + identifier)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    pub_ed = self.setgraph.add_ra(
                        self.resp_agent, source=self.src, res=url,
                        preexisting_graph=preexisting_graph)
                    editor_name = ed_and_ids.group(1)
                    if ',' in editor_name:
                        editor_name_splitted = re.split(
                            comma_and_spaces, editor_name)
                        firstName = editor_name_splitted[1]
                        lastName = editor_name_splitted[0]
                        if firstName.strip():
                            pub_ed.has_given_name(firstName)
                        pub_ed.has_family_name(lastName)
                    else:
                        pub_ed.has_name(editor_name)
            # lists of editor's IDs
            for identifier in ed_id_list:
                self.id_creator(pub_ed, identifier, ra=True)
            # editorRole
            AR = self.ar_index[self.row_meta]['editor'][ed_meta]
            ar_id = 'ar/' + str(AR)
            preexisting_entity = ar_id in self.preexisting_entities
            url_ar = URIRef(self.url + ar_id)
            preexisting_graph = self.finder.get_preexisting_graph(
                url_ar, self.preexisting_graphs) if preexisting_entity else None
            pub_ed_role = self.setgraph.add_ar(
                self.resp_agent, source=self.src, res=url_ar,
                preexisting_graph=preexisting_graph)
            # Editors of container-level types are attached to the venue
            if self.type == 'proceedings article' and self.venue_graph:
                pub_ed_role.create_editor()
                self.venue_graph.has_contributor(pub_ed_role)
            elif (self.type == 'book chapter' or
                  self.type == 'book part') and self.venue_graph:
                pub_ed_role.create_editor()
                self.venue_graph.has_contributor(pub_ed_role)
            else:
                pub_ed_role.create_editor()
                self.br_graph.has_contributor(pub_ed_role)
            pub_ed_role.is_held_by(pub_ed)
            edit_role_list.append(pub_ed_role)
            if len(edit_role_list) > 1:
                edit_role_list[edit_role_list.index(pub_ed_role) -
                               1].has_next(pub_ed_role)

    def id_creator(self, graph: BibliographicEntity, identifier: str,
                   ra: bool) -> None:
        """Attach an external identifier entity (scheme:value) to ``graph``.

        :param graph: the entity receiving the identifier
        :param identifier: 'scheme:value' string; 'meta:' ids are ignored here
        :param ra: True to use RA schemes/index, False for BR schemes/index
        """
        new_id = None
        if ra:
            for ra_id_schema in self.ra_id_schemas:
                if identifier.startswith(ra_id_schema):
                    identifier = identifier.replace(f'{ra_id_schema}:', '')
                    res = self.ra_index[ra_id_schema][identifier]
                    preexisting_entity = f'id/{res}' in self.preexisting_entities
                    url = URIRef(self.url + 'id/' + res)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    new_id = self.setgraph.add_id(
                        self.resp_agent, source=self.src, res=url,
                        preexisting_graph=preexisting_graph)
                    getattr(new_id, f'create_{ra_id_schema}')(identifier)
        else:
            for br_id_schema in self.br_id_schemas:
                if identifier.startswith(br_id_schema):
                    identifier = identifier.replace(f'{br_id_schema}:', '')
                    res = self.br_index[br_id_schema][identifier]
                    preexisting_entity = f'id/{res}' in self.preexisting_entities
                    url = URIRef(self.url + 'id/' + res)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    new_id = self.setgraph.add_id(
                        self.resp_agent, source=self.src, res=url,
                        preexisting_graph=preexisting_graph)
                    getattr(new_id, f'create_{br_id_schema}')(identifier)
        if new_id:
            graph.has_identifier(new_id)
class Creator(object): def __init__(self, data, base_iri, info_dir, supplier_prefix, ra_index, br_index, re_index_csv, ar_index_csv, vi_index): self.url = base_iri self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False) self.ra_index = self.indexer_id(ra_index) self.br_index = self.indexer_id(br_index) self.re_index = self.index_re(re_index_csv) self.ar_index = self.index_ar(ar_index_csv) self.vi_index = vi_index self.data = data def creator(self, source=None): self.src = source for row in self.data: self.row_meta = "" ids = row['id'] title = row['title'] authors = row['author'] pub_date = row['pub_date'] venue = row['venue'] vol = row['volume'] issue = row['issue'] page = row['page'] self.type = row['type'] publisher = row['publisher'] editor = row['editor'] self.venue_graph = None self.vol_graph = None self.issue_graph = None self.id_action(ids) self.title_action(title) self.author_action(authors) self.pub_date_action(pub_date) self.vvi_action(venue, vol, issue) self.page_action(page) self.type_action(self.type) if publisher: self.publisher_action(publisher) if editor: self.editor_action(editor) return self.setgraph @staticmethod def index_re(id_index): index = dict() for row in id_index: index[row["br"]] = row["re"] return index @staticmethod def index_ar(id_index): index = dict() for row in id_index: index[row["meta"]] = dict() index[row["meta"]]["author"] = Creator.ar_worker(row["author"]) index[row["meta"]]["editor"] = Creator.ar_worker(row["editor"]) index[row["meta"]]["publisher"] = Creator.ar_worker(row["publisher"]) return index @staticmethod def ar_worker(s): if s: ar_dict = dict() couples = s.split("; ") for c in couples: cou = c.split(", ") ar_dict[cou[1]] = cou[0] return ar_dict else: return dict() @staticmethod def indexer_id(csv_index): index = dict() index['crossref'] = dict() index["doi"] = dict() index["issn"] = dict() index["isbn"] = dict() index["orcid"] = dict() index["pmid"] = dict() index['pmcid'] = dict() 
index['url'] = dict() index['viaf'] = dict() index['wikidata'] = dict() index['wikipedia'] = dict() for row in csv_index: if row["id"].startswith("crossref"): identifier = row["id"].replace('crossref:', '') index['crossref'][identifier] = row["meta"] elif row["id"].startswith("doi"): identifier = row["id"].replace('doi:', '') index['doi'][identifier] = row["meta"] elif row["id"].startswith("issn"): identifier = row["id"].replace('issn:', '') index['issn'][identifier] = row["meta"] elif row["id"].startswith("isbn"): identifier = row["id"].replace('isbn:', '') index['isbn'][identifier] = row["meta"] elif row["id"].startswith("orcid"): identifier = row["id"].replace('orcid:', '') index['orcid'][identifier] = row["meta"] elif row["id"].startswith("pmid"): identifier = row["id"].replace('pmid:', '') index['pmid'][identifier] = row["meta"] elif row["id"].startswith("pmcid"): identifier = row["id"].replace('pmcid:', '') index['pmcid'][identifier] = row["meta"] elif row["id"].startswith("url"): identifier = row["id"].replace('url:', '') index['url'][identifier] = row["meta"] elif row["id"].startswith("viaf"): identifier = row["id"].replace('viaf:', '') index['viaf'][identifier] = row["meta"] elif row["id"].startswith("wikidata"): identifier = row["id"].replace('wikidata:', '') index['wikidata'][identifier] = row["meta"] elif row["id"].startswith("wikipedia"): identifier = row["id"].replace('wikipedia:', '') index['wikipedia'][identifier] = row["meta"] return index def id_action(self, ids): idslist = re.split(r'\s+', ids) # publication id for identifier in idslist: if "meta:" in identifier: identifier = identifier.replace("meta:", "") self.row_meta = identifier.replace("br/", "") url = URIRef(self.url + identifier) self.br_graph = self.setgraph.add_br(resp_agent, source=self.src, res=url) for identifier in idslist: self.id_creator(self.br_graph, identifier, ra=False) def title_action(self, title): if title: self.br_graph.has_title(title) def author_action(self, authors): if 
authors: authorslist = re.split(r'\s*;\s*(?=[^]]*(?:\[|$))', authors) aut_role_list = list() for aut in authorslist: aut_id = re.search(r'\[\s*(.*?)\s*]', aut).group(1) aut_id_list = aut_id.split(" ") for identifier in aut_id_list: if "meta:" in identifier: identifier = str(identifier).replace('meta:', "") url = URIRef(self.url + identifier) aut_meta = identifier.replace('ra/', "") pub_aut = self.setgraph.add_ra(resp_agent, source=self.src, res=url) author_name = re.search(r'(.*?)\s*\[.*?]', aut).group(1) if "," in author_name: author_name_splitted = re.split(r'\s*,\s*', author_name) firstName = author_name_splitted[1] lastName = author_name_splitted[0] if firstName.strip(): pub_aut.has_given_name(firstName) pub_aut.has_family_name(lastName) else: pub_aut.has_name(author_name) # lists of authors' IDs for identifier in aut_id_list: self.id_creator(pub_aut, identifier, ra=True) # Author ROLE AR = self.ar_index[self.row_meta]["author"][aut_meta] ar_id = "ar/" + str(AR) url_ar = URIRef(self.url + ar_id) pub_aut_role = self.setgraph.add_ar(resp_agent, source=self.src, res=url_ar) pub_aut_role.create_author() self.br_graph.has_contributor(pub_aut_role) pub_aut_role.is_held_by(pub_aut) aut_role_list.append(pub_aut_role) if len(aut_role_list) > 1: aut_role_list[aut_role_list.index(pub_aut_role)-1].has_next(pub_aut_role) def pub_date_action(self, pub_date): if pub_date: datelist = list() datesplit = pub_date.split("-") if datesplit: for x in datesplit: datelist.append(int(x)) else: datelist.append(int(pub_date)) str_date = create_date(datelist) self.br_graph.has_pub_date(str_date) def vvi_action(self, venue, vol, issue): if venue: venue_id = re.search(r'\[\s*(.*?)\s*]', venue).group(1) venue_id_list = venue_id.split(" ") for identifier in venue_id_list: if "meta:" in identifier: ven_id = str(identifier).replace("meta:", "") url = URIRef(self.url + ven_id) venue_title = re.search(r'(.*?)\s*\[.*?]', venue).group(1) self.venue_graph = self.setgraph.add_br(resp_agent, 
source=self.src, res=url) if self.type == "journal article" or self.type == "journal volume" or self.type == "journal issue": self.venue_graph.create_journal() elif self.type == "book chapter" or self.type == "book part": self.venue_graph.create_book() elif self.type == "proceedings article": self.venue_graph.create_proceedings() elif self.type == "report": self.venue_graph.create_report_series() elif self.type == "standard": self.venue_graph.create_standard_series() self.venue_graph.has_title(venue_title) for identifier in venue_id_list: self.id_creator(self.venue_graph, identifier, ra=False) if (self.type == "journal article" or self.type == "journal issue") and vol: meta_ven = ven_id.replace("br/", "") vol_meta = self.vi_index[meta_ven]["volume"][vol]["id"] vol_meta = "br/" + vol_meta url = URIRef(self.url + vol_meta) self.vol_graph = self.setgraph.add_br(resp_agent, source=self.src, res=url) self.vol_graph.create_volume() self.vol_graph.has_number(vol) if self.type == "journal article" and issue: meta_ven = ven_id.replace("br/", "") if vol: iss_meta = self.vi_index[meta_ven]["volume"][vol]["issue"][issue]["id"] else: iss_meta = self.vi_index[meta_ven]["issue"][issue]["id"] iss_meta = "br/" + iss_meta url = URIRef(self.url + iss_meta) self.issue_graph = self.setgraph.add_br(resp_agent, source=self.src, res=url) self.issue_graph.create_issue() self.issue_graph.has_number(issue) if venue and vol and issue: self.br_graph.is_part_of(self.issue_graph) self.issue_graph.is_part_of(self.vol_graph) self.vol_graph.is_part_of(self.venue_graph) elif venue and vol and not issue: self.br_graph.is_part_of(self.vol_graph) self.vol_graph.is_part_of(self.venue_graph) elif venue and not vol and not issue: self.br_graph.is_part_of(self.venue_graph) elif venue and not vol and issue: self.br_graph.is_part_of(self.issue_graph) self.issue_graph.is_part_of(self.venue_graph) def page_action(self, page): if page: res_em = self.re_index[self.row_meta] re_id = "re/" + str(res_em) url_re = 
URIRef(self.url + re_id) form = self.setgraph.add_re(resp_agent, source=self.src, res=url_re) form.has_starting_page(page) form.has_ending_page(page) self.br_graph.has_format(form) def type_action(self, entity_type): if entity_type == "archival document": self.br_graph.create_archival_document() elif entity_type == "book": self.br_graph.create_book() elif entity_type == "book chapter": self.br_graph.create_book_chapter() elif entity_type == "book part": self.br_graph.create_book_part() elif entity_type == "book section": self.br_graph.create_book_section() elif entity_type == "book series": self.br_graph.create_book_series() elif entity_type == "book set": self.br_graph.create_book_set() elif entity_type == "data file": self.br_graph.create_dataset() elif entity_type == "dissertation": self.br_graph.create_dissertation() elif entity_type == "journal": self.br_graph.create_journal() elif entity_type == "journal article": self.br_graph.create_journal_article() elif entity_type == "journal issue": self.br_graph.create_issue() elif entity_type == "journal volume": self.br_graph.create_volume() elif entity_type == "proceedings article": self.br_graph.create_proceedings_article() elif entity_type == "proceedings": self.br_graph.create_proceedings() elif entity_type == "reference book": self.br_graph.create_reference_book() elif entity_type == "reference entry": self.br_graph.create_reference_entry() elif entity_type == "report": self.br_graph.create_report() elif entity_type == "standard": self.br_graph.create_standard() elif entity_type == "series": self.br_graph.create_series() def publisher_action(self, publisher): publ_id = re.search(r'\[\s*(.*?)\s*]', publisher).group(1) publ_id_list = publ_id.split(" ") for identifier in publ_id_list: if "meta:" in identifier: identifier = str(identifier).replace("meta:", "") pub_meta = identifier.replace("ra/", "") url = URIRef(self.url + identifier) publ_name = re.search(r'(.*?)\s*\[.*?]', publisher).group(1) publ = 
self.setgraph.add_ra(resp_agent, source=self.src, res=url) publ.has_name(publ_name) for identifier in publ_id_list: self.id_creator(publ, identifier, ra=True) # publisherRole AR = self.ar_index[self.row_meta]["publisher"][pub_meta] ar_id = "ar/" + str(AR) url_ar = URIRef(self.url + ar_id) publ_role = self.setgraph.add_ar(resp_agent, source=self.src, res=url_ar) publ_role.create_publisher() self.br_graph.has_contributor(publ_role) publ_role.is_held_by(publ) def editor_action(self, editor): editorslist = re.split(r'\s*;\s*(?=[^]]*(?:\[|$))', editor) edit_role_list = list() for ed in editorslist: ed_id = re.search(r'\[\s*(.*?)\s*]', ed).group(1) ed_id_list = ed_id.split(" ") for identifier in ed_id_list: if "meta:" in identifier: identifier = str(identifier).replace("meta:", "") ed_meta = identifier.replace("ra/", "") url = URIRef(self.url + identifier) pub_ed = self.setgraph.add_ra(resp_agent, source=self.src, res=url) editor_name = re.search(r'(.*?)\s*\[.*?]', ed).group(1) if "," in editor_name: editor_name_splitted = re.split(r'\s*,\s*', editor_name) firstName = editor_name_splitted[1] lastName = editor_name_splitted[0] if firstName.strip(): pub_ed.has_given_name(firstName) pub_ed.has_family_name(lastName) else: pub_ed.has_name(editor_name) # lists of editor's IDs for identifier in ed_id_list: self.id_creator(pub_ed, identifier, ra=True) # editorRole AR = self.ar_index[self.row_meta]["editor"][ed_meta] ar_id = "ar/" + str(AR) url_ar = URIRef(self.url + ar_id) pub_ed_role = self.setgraph.add_ar(resp_agent, source=self.src, res=url_ar) if self.type == "proceedings article" and self.venue_graph: pub_ed_role.create_editor() self.venue_graph.has_contributor(pub_ed_role) elif (self.type == "book chapter" or self.type == "book part") and self.venue_graph: pub_ed_role.create_editor() self.venue_graph.has_contributor(pub_ed_role) else: pub_ed_role.create_editor() self.br_graph.has_contributor(pub_ed_role) pub_ed_role.is_held_by(pub_ed) edit_role_list.append(pub_ed_role) if 
len(edit_role_list) > 1: edit_role_list[edit_role_list.index(pub_ed_role)-1].has_next(pub_ed_role) def id_creator(self, graph, identifier, ra): new_id = None if ra: if identifier.startswith("crossref"): identifier = identifier.replace('crossref:', '') res = self.ra_index['crossref'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_crossref(identifier) elif identifier.startswith("orcid"): identifier = identifier.replace("orcid:", "") res = self.ra_index['orcid'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_orcid(identifier) elif identifier.startswith("viaf"): identifier = identifier.replace("viaf:", "") res = self.ra_index['viaf'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_viaf(identifier) elif identifier.startswith("wikidata"): identifier = identifier.replace("wikidata:", "") res = self.ra_index['wikidata'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_wikidata(identifier) elif identifier.startswith("doi"): identifier = identifier.replace("doi:", "") res = self.br_index['doi'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_doi(identifier) elif identifier.startswith("issn"): identifier = identifier.replace("issn:", "") res = self.br_index['issn'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_issn(identifier) elif identifier.startswith("isbn"): identifier = identifier.replace("isbn:", "") res = self.br_index['isbn'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_isbn(identifier) elif 
identifier.startswith("pmid"): identifier = identifier.replace("pmid:", "") res = self.br_index['pmid'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_pmid(identifier) elif identifier.startswith("pmcid"): identifier = identifier.replace("pmcid:", "") res = self.br_index['pmcid'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_pmcid(identifier) elif identifier.startswith("url"): identifier = identifier.replace("url:", "") res = self.br_index['url'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_url(identifier) elif identifier.startswith("wikidata"): identifier = identifier.replace("wikidata:", "") res = self.br_index['wikidata'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_wikidata(identifier) elif identifier.startswith("wikipedia"): identifier = identifier.replace("wikipedia:", "") res = self.br_index['wikipedia'][identifier] url = URIRef(self.url + "id/" + res) new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url) new_id.create_wikipedia(identifier) if new_id: graph.has_identifier(new_id)
class RespAgentsCreator(Creator):
    """
    Creator specialised for responsible agents only: it walks the input rows
    and materialises RA entities (authors, publishers, editors) together with
    their external identifiers, reusing preexisting graphs for entities that
    already exist in the triplestore. No bibliographic resources are created.
    """

    def __init__(self, data: list, endpoint: str, base_iri: str, info_dir: str,
                 supplier_prefix: str, resp_agent: str, ra_index: dict,
                 preexisting_entities: set):
        self.url = base_iri
        self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False)
        self.finder = ResourceFinder(ts_url=endpoint, base_iri=base_iri)
        self.resp_agent = resp_agent
        self.ra_id_schemas = {'crossref', 'orcid', 'viaf', 'wikidata'}
        self.br_id_schemas = {
            'doi', 'issn', 'isbn', 'pmid', 'pmcid', 'url', 'wikidata', 'wikipedia'
        }
        self.schemas = self.ra_id_schemas.union(self.br_id_schemas)
        self.ra_index = self.indexer_id(ra_index)
        self.preexisting_entities = preexisting_entities
        self.data = data

    def creator(self, source=None):
        """Process every row's author/publisher/editor fields and return the GraphSet.

        :param source: optional provenance source attached to every created entity.
        """
        self.src = source
        for row in self.data:
            authors = row['author']
            publisher = row['publisher']
            editor = row['editor']
            self.author_action(authors)
            if publisher:
                self.publisher_action(publisher)
            if editor:
                self.editor_action(editor)
        return self.setgraph

    def _add_or_reuse_ra(self, identifier):
        """Resolve a 'meta:ra/...' identifier into an RA entity.

        When the identifier denotes a preexisting entity, its current graph is
        fetched from the triplestore and passed along so the entity is updated
        rather than recreated.
        """
        identifier = str(identifier).replace('meta:', '')
        url = URIRef(self.url + identifier)
        preexisting_graph = (self.finder.get_preexisting_graph(url)
                             if identifier in self.preexisting_entities else None)
        return self.setgraph.add_ra(
            self.resp_agent, source=self.src, res=url,
            preexisting_graph=preexisting_graph)

    def _name_agent(self, agent, full_name):
        """Attach given/family names for 'Family, Given' strings, else one full name."""
        if ',' in full_name:
            name_parts = re.split(comma_and_spaces, full_name)
            given, family = name_parts[1], name_parts[0]
            if given.strip():
                agent.has_given_name(given)
            agent.has_family_name(family)
        else:
            agent.has_name(full_name)

    def author_action(self, authors):
        """Create one RA (plus its identifiers) per author in the ';'-separated list."""
        if authors:
            for aut in re.split(semicolon_in_people_field, authors):
                aut_and_ids = re.search(name_and_ids, aut)
                aut_id_list = aut_and_ids.group(2).split()
                for identifier in aut_id_list:
                    if 'meta:' in identifier:
                        pub_aut = self._add_or_reuse_ra(identifier)
                        self._name_agent(pub_aut, aut_and_ids.group(1))
                # lists of authors' IDs
                for identifier in aut_id_list:
                    self.id_creator(pub_aut, identifier, ra=True)

    def publisher_action(self, publisher):
        """Create the publisher RA (plus its identifiers) from a 'Name [ids]' string."""
        publ_and_ids = re.search(name_and_ids, publisher)
        publ_id_list = publ_and_ids.group(2).split()
        for identifier in publ_id_list:
            if 'meta:' in identifier:
                publ = self._add_or_reuse_ra(identifier)
                publ.has_name(publ_and_ids.group(1))
        for identifier in publ_id_list:
            self.id_creator(publ, identifier, ra=True)

    def editor_action(self, editor):
        """Create one RA (plus its identifiers) per editor in the ';'-separated list."""
        for ed in re.split(semicolon_in_people_field, editor):
            ed_and_ids = re.search(name_and_ids, ed)
            # split(' ') (not split()) kept on purpose: it mirrors the original
            # behavior on runs of multiple spaces.
            ed_id_list = ed_and_ids.group(2).split(' ')
            for identifier in ed_id_list:
                if 'meta:' in identifier:
                    pub_ed = self._add_or_reuse_ra(identifier)
                    self._name_agent(pub_ed, ed_and_ids.group(1))
            # lists of editor's IDs
            for identifier in ed_id_list:
                self.id_creator(pub_ed, identifier, ra=True)
def process(cur_citations_file: str, conversion_dict: Dict[str, str]) -> None:
    """
    This function takes care of generating an OCDM compliant RDF file containing
    the Citation entities that describe the relations between citing Wikipedia pages
    and cited bibliographic resources. Additionally, a CSV file compliant with other
    OpenCitations tools is produced. Since this is not strictly needed for the
    'Wikipedia Citations in Wikidata' workflow, those files can be safely ignored.

    Please note: the bool flag 'rdf_output_in_chunks' from conf/conf_citations.py MUST
    be set to True, otherwise the following scripts of the workflow (Enricher and Pusher)
    won't be able to import the intermediate RDF files produced by this script.

    :param cur_citations_file: The filename (without the path) of the CSV file to be converted
    :param conversion_dict: The dictionary that maps 'tmp' identifiers onto their respective 'meta' identifiers
    """
    filepath: str = os.path.join(citations_csv_dir, cur_citations_file)
    df: pd.DataFrame = pd.read_csv(filepath, usecols=['citing', 'cited'], low_memory=False)

    # 'tmp-to-meta' mapping is applied to each column of the DataFrame
    tmp_to_meta_mapping(df['citing'], conversion_dict)
    tmp_to_meta_mapping(df['cited'], conversion_dict)

    # Rows containing None values are dropped: we cannot generate valid Citation entities for them
    df = df.dropna(axis=0, how='any', subset=['citing', 'cited'])
    df = df.reset_index(drop=True)

    # The DataFrame is enriched with additional columns that are needed for achieving full
    # compliance with other OpenCitations tools. This is not strictly needed for our workflow:
    df['id'] = None
    df['oci'] = None
    df['creation'] = None  # not applicable (the citation comes from a Wikipedia page)
    df['timespan'] = None  # not applicable (the citation comes from a Wikipedia page)
    df['journal_sc'] = 'no'
    df['author_sc'] = 'no'

    # A temporary GraphSet is used to instantiate BibliographicResource entities that are needed
    # for the creation of Citation entities but that won't be kept in the output RDF file:
    temp_gs: GraphSet = GraphSet(base_iri)
    # The actual GraphSet that will contain the Citation entities to be stored in the output RDF file:
    ci_gs: GraphSet = GraphSet(base_iri, info_dir=info_dir,
                               supplier_prefix=supplier_prefix, wanted_label=False)

    # Here the DataFrame columns are converted into Numpy arrays
    # so that we can iterate way faster over their elements:
    citing_col = df['citing'].to_numpy(copy=False)
    cited_col = df['cited'].to_numpy(copy=False)
    id_col = df['id'].to_numpy(copy=False)
    oci_col = df['oci'].to_numpy(copy=False)

    # The SPARQL client configuration is loop-invariant: build and configure it
    # once, then only the query string changes per iteration.
    tp: SPARQLWrapper = SPARQLWrapper(triplestore_url)
    tp.setTimeout(query_timeout)
    tp.setMethod('GET')
    tp.setReturnFormat(JSON)

    for i, (citing_meta_id, cited_meta_id) in enumerate(zip(citing_col, cited_col)):
        citing_res: URIRef = URIRef(base_iri + citing_meta_id)
        cited_res: URIRef = URIRef(base_iri + cited_meta_id)

        # A query is performed to discover if the current citation has already been processed:
        query_string: str = f'''
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX cito: <http://purl.org/spar/cito/>
        PREFIX datacite: <http://purl.org/spar/datacite/>
        PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>
        SELECT ?ci_res ?oci
        FROM <https://w3id.org/oc/meta/ci/>
        WHERE {{
            ?ci_res rdf:type cito:Citation ;
                cito:hasCitingEntity <{citing_res}> ;
                cito:hasCitedEntity <{cited_res}> .
            OPTIONAL {{
                ?ci_res datacite:hasIdentifier ?id .
                ?id datacite:usesIdentifierScheme datacite:oci ;
                    literal:hasLiteralValue ?oci .
            }}
        }} LIMIT 1
        '''
        tp.setQuery(query_string)
        results = tp.queryAndConvert()
        bindings = results["results"]["bindings"]
        if len(bindings) >= 1:
            # 'LIMIT 1' in the query string should guarantee a maximum of 1 returned binding
            # This citation is already stored in the triplestore!
            row: Dict = bindings[0]
            # Update the output dataframe
            ci_res: URIRef = URIRef(bindings[0]["ci_res"]["value"])
            id_col[i] = str(ci_res)[len(base_iri):]
            if "oci" in row:
                oci_col[i] = row["oci"]["value"]
        else:
            # This citation is currently missing from the triplestore!
            # Create BR entities in "append mode" by providing 'res' without 'preexisting_graph'
            citing_br: BibliographicResource = temp_gs.add_br(
                resp_agent, res=citing_res, preexisting_graph=None)
            cited_br: BibliographicResource = temp_gs.add_br(
                resp_agent, res=cited_res, preexisting_graph=None)

            # Create OCI identifier
            oci_str: str = str(citing_res)[len(base_iri + 'br/'):] + '-' + str(
                cited_res)[len(base_iri + 'br/'):]
            oci: Identifier = ci_gs.add_id(resp_agent)
            oci.create_oci(oci_str)

            # Create citation
            ci: Citation = ci_gs.add_ci(resp_agent)
            ci.has_identifier(oci)
            ci.has_citing_entity(citing_br)
            ci.has_cited_entity(cited_br)

            # Update the output dataframe
            id_col[i] = str(ci.res)[len(base_iri):]
            oci_col[i] = oci_str

    # Store the dataframe as a CSV file that's compliant with OpenCitations tools:
    output_filepath: str = os.path.join(converter_citations_csv_output_dir, cur_citations_file)
    df.to_csv(output_filepath, index=False, chunksize=100000,
              columns=['id', 'oci', 'citing', 'cited', 'creation',
                       'timespan', 'journal_sc', 'author_sc'])

    # Store new citations in an RDF file (together with the related provenance).
    # They should also be uploaded to the triplestore so to update the current state
    # of execution: by this way, they won't be created again since they will already
    # be present inside the triplestore.
    ci_ps: ProvSet = ProvSet(ci_gs, base_iri)
    ci_ps.generate_provenance()
    ci_storer: Storer = Storer(ci_gs, dir_split=dir_split_number,
                               n_file_item=items_per_file,
                               default_dir=default_dir, output_format='nt11')
    ci_prov_storer: Storer = Storer(ci_ps, dir_split=dir_split_number,
                                    n_file_item=items_per_file,
                                    default_dir=default_dir, output_format='nquads')
    if rdf_output_in_chunks:
        # The RDF files are stored WITHOUT following the folder structure
        # adopted by OpenCitations: all the newly created citations are kept
        # in a single file.
        # In the following steps of the workflow, every script assumes that data was produced in chunks:
        # this means that this modality should always be chosen and that 'rdf_output_in_chunks' must
        # be set to True.
        filename_without_csv: str = cur_citations_file[:-4]

        # Data
        f: str = os.path.join(converter_citations_rdf_output_dir, filename_without_csv + ".nt")
        os.makedirs(os.path.dirname(f), exist_ok=True)  # race-safe, replaces exists()+makedirs()
        ci_storer.store_graphs_in_file(f, context_path)
        ci_storer.upload_all(triplestore_url, converter_citations_rdf_output_dir, batch_size=100)

        # Provenance
        prov_dir: str = os.path.join(converter_citations_rdf_output_dir, 'prov')
        f_prov: str = os.path.join(prov_dir, filename_without_csv + '.nq')
        os.makedirs(os.path.dirname(f_prov), exist_ok=True)
        ci_prov_storer.store_graphs_in_file(f_prov, context_path)
    else:
        # The RDF files are stored following the folder structure adopted by OpenCitations.
        # Newly created citations could be split into different files based on
        # various conditions.
        # In the following steps of the workflow, every script assumes that data was produced in chunks:
        # this means that this modality should never be chosen and that 'rdf_output_in_chunks' must
        # be set to True.
        ci_storer.upload_and_store(converter_citations_rdf_output_dir,
                                   triplestore_url, base_iri, context_path,
                                   batch_size=100)
        ci_prov_storer.store_all(converter_citations_rdf_output_dir, base_iri, context_path)