Code example #1
    def __init__(self, data: list, endpoint: str, base_iri: str, info_dir: str,
                 supplier_prefix: str, resp_agent: str, ra_index: dict,
                 br_index: dict, re_index_csv: dict, ar_index_csv: dict,
                 vi_index: dict, preexisting_entities: set):
        self.url = base_iri
        self.setgraph = GraphSet(self.url,
                                 info_dir,
                                 supplier_prefix,
                                 wanted_label=False)
        self.resp_agent = resp_agent
        self.finder = ResourceFinder(ts_url=endpoint, base_iri=base_iri)

        self.ra_id_schemas = {'crossref', 'orcid', 'viaf', 'wikidata'}
        self.br_id_schemas = {
            'doi', 'issn', 'isbn', 'pmid', 'pmcid', 'url', 'wikidata',
            'wikipedia'
        }
        self.schemas = self.ra_id_schemas.union(self.br_id_schemas)

        self.ra_index = self.indexer_id(ra_index)
        self.br_index = self.indexer_id(br_index)
        self.re_index = self.index_re(re_index_csv)
        self.ar_index = self.index_ar(ar_index_csv)
        self.vi_index = vi_index
        self.preexisting_entities = preexisting_entities
        self.preexisting_graphs = dict()
        self.data = data
Code example #2
File: creator.py  Project: opencitations/wcw
    def __init__(self, data, base_iri, info_dir, supplier_prefix, ra_index, br_index, re_index_csv,
                 ar_index_csv, vi_index):
        self.url = base_iri

        self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False)

        self.ra_index = self.indexer_id(ra_index)

        self.br_index = self.indexer_id(br_index)

        self.re_index = self.index_re(re_index_csv)

        self.ar_index = self.index_ar(ar_index_csv)

        self.vi_index = vi_index
        self.data = data
Code example #3
def process_chunk(filename: str) -> None:
    """
    This function wraps the functionality of the external library 'oc_graphenricher'.
    It imports an OCDM-compliant RDF chunk file, tries to enrich it with external
    identifiers, and then deduplicates its entities.

    :param filename: a string representing the filename (without the path) of the chunk file to be processed
    """
    filepath: str = os.path.join(rdf_input_dir, filename)
    filename_without_extension: str = os.path.splitext(filename)[0]

    g: Graph = Graph()
    g = g.parse(filepath, format='nt11')

    reader: Reader = Reader()
    g_set: GraphSet = GraphSet(base_iri=base_iri,
                               info_dir=info_dir,
                               supplier_prefix=supplier_prefix,
                               wanted_label=False)
    reader.import_entities_from_graph(g_set,
                                      g,
                                      enable_validation=False,
                                      resp_agent=resp_agent)

    # Enrichment
    enriched_filepath: str = rdf_output_dir + os.sep + 'enriched' + os.sep +\
        filename_without_extension + '.nt'
    enriched_prov: str = rdf_output_dir + os.sep + 'enriched' + os.sep + 'prov' + os.sep +\
        filename_without_extension + '.nq'
    # Output folders are created if not already existing
    if not os.path.exists(os.path.dirname(enriched_filepath)):
        os.makedirs(os.path.dirname(enriched_filepath))
    if not os.path.exists(os.path.dirname(enriched_prov)):
        os.makedirs(os.path.dirname(enriched_prov))

    enricher: GraphEnricher = GraphEnricher(g_set,
                                            graph_filename=enriched_filepath,
                                            provenance_filename=enriched_prov,
                                            info_dir=info_dir,
                                            debug=False,
                                            serialize_in_the_middle=False)
    enricher.enrich()

    # Deduplication
    deduplicated_filepath: str = rdf_output_dir + os.sep + 'deduplicated' + os.sep +\
        filename_without_extension + '.nt'
    deduplicated_prov: str = rdf_output_dir + os.sep + 'deduplicated' + os.sep + 'prov' + os.sep + \
        filename_without_extension + '.nq'
    # Output folders are created if not already existing
    if not os.path.exists(os.path.dirname(deduplicated_filepath)):
        os.makedirs(os.path.dirname(deduplicated_filepath))
    if not os.path.exists(os.path.dirname(deduplicated_prov)):
        os.makedirs(os.path.dirname(deduplicated_prov))

    matcher = InstanceMatching(g_set,
                               graph_filename=deduplicated_filepath,
                               provenance_filename=deduplicated_prov,
                               info_dir=info_dir,
                               debug=False)
    matcher.match()
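
A minimal driver sketch (not part of the original snippet) showing how process_chunk could be invoked over a directory of chunk files. It assumes rdf_input_dir is the same module-level setting read inside the function and that chunk files carry the '.nt' extension; since each chunk is processed independently, a process pool can parallelize the work.

import os
from multiprocessing import Pool

if __name__ == '__main__':
    # Hypothetical driver; rdf_input_dir is assumed to come from the same
    # configuration module used by process_chunk above.
    chunk_files = [f for f in os.listdir(rdf_input_dir) if f.endswith('.nt')]
    with Pool() as pool:
        pool.map(process_chunk, sorted(chunk_files))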
Code example #4
    def test_extract_ids(self):
        resp_agent = 'http://w3c.org/oc/meta/pa/999'
        g_set = GraphSet('http://w3c.org/oc/meta/')
        br = g_set.add_br(resp_agent)

        isbn = g_set.add_id(resp_agent)
        isbn.create_isbn('978-88-515-2159-2')

        orcid = g_set.add_id(resp_agent)
        orcid.create_orcid('0000-0002-1825-0097')

        wikidata = g_set.add_id(resp_agent)
        wikidata.create_wikidata('Q9')

        br.has_identifier(isbn)
        br.has_identifier(orcid)
        br.has_identifier(wikidata)

        result = extract_ids(br)
        self.assertIsNotNone(result)

        self.assertDictEqual(
            result,
            {
                'isbn13': '978-88-515-2159-2',
                'isbn10': '88-515-2159-X',  # this is automatically inferred
                'orcid': '0000-0002-1825-0097',
                'wikidata': 'Q9'
            })
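
The expected isbn10 value is derived from the ISBN-13, as the inline comment notes. A hypothetical helper (not part of the shown API) illustrating that inference: drop the '978' prefix, keep the next nine digits, and recompute the ISBN-10 check digit modulo 11.

def isbn13_to_isbn10(isbn13: str) -> str:
    # Illustrative only: valid for the '978' prefix, where the ISBN-10 body
    # is digits 4-12 of the ISBN-13.
    digits = isbn13.replace('-', '')
    assert digits.startswith('978') and len(digits) == 13
    body = digits[3:12]
    # ISBN-10 check digit: weights 10..2 over the body, modulo 11, 10 -> 'X'.
    total = sum(w * int(d) for w, d in zip(range(10, 1, -1), body))
    check = (11 - total % 11) % 11
    return body + ('X' if check == 10 else str(check))

# isbn13_to_isbn10('978-88-515-2159-2') == '885152159X'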
Code example #5
def process_chunk(chunk_filepath: str, citations_mapping: Dict[URIRef, str]):
    """
    This function handles all the steps which are needed to fully process a single
    chunk of the citations input dataset.
      - Firstly, the RDF graph serialized inside the chunk file
        is imported as an oc_ocdm GraphSet.
      - Secondly, a loop over each CI entity
        is performed: the citing and cited OCDM IRIs are extracted and then mapped
        to the Wikidata IDs contained inside the given mapping dictionary (when possible).
        A TSV statement is created for each citation to be uploaded.
      - Lastly, the collected list of statements is appended to the output file.

    :param chunk_filepath: A string representing the filesystem path to the chunk to be imported
    :param citations_mapping: A dictionary mapping OCDM IRIs to the corresponding Wikidata IDs
    """
    # PROCESS INITIALIZATION
    statements: List[str] = []

    # DATA IMPORT PHASE
    graph_chunk: Graph = Graph().parse(location=chunk_filepath, format='nt11')

    g_set: GraphSet = GraphSet(base_iri, wanted_label=False)
    Reader.import_entities_from_graph(g_set,
                                      graph_chunk,
                                      resp_agent,
                                      enable_validation=False)

    # TSV STATEMENTS GENERATION
    for ci in g_set.get_ci():
        citing_uri: Optional[URIRef] = ci.get_citing_entity().res
        cited_uri: Optional[URIRef] = ci.get_cited_entity().res

        if citing_uri in citations_mapping and cited_uri in citations_mapping:
            citing_qid: str = citations_mapping[citing_uri]
            cited_qid: str = citations_mapping[cited_uri]
            statements.append('\t'.join(
                [citing_qid, "P2860", cited_qid, "S248", "Q328"]))

    # TSV STATEMENTS EXPORT
    if len(statements) > 0:
        store_batch(statements)
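
The store_batch helper called above is not shown in this snippet. A plausible minimal sketch, assuming a module-level output_filepath and single-process execution: each row is a tab-separated QuickStatements claim (P2860 is Wikidata's 'cites work' property, sourced via S248 with Q328, the English Wikipedia).

from typing import List

def store_batch(statements: List[str]) -> None:
    # Hypothetical sketch: append the batch of TSV rows to the output file.
    # output_filepath is an assumed module-level setting.
    with open(output_filepath, 'a', encoding='utf-8') as out_file:
        out_file.write('\n'.join(statements) + '\n')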
Code example #6
class Creator(object):
    def __init__(self, data: list, endpoint: str, base_iri: str, info_dir: str,
                 supplier_prefix: str, resp_agent: str, ra_index: dict,
                 br_index: dict, re_index_csv: dict, ar_index_csv: dict,
                 vi_index: dict, preexisting_entities: set):
        self.url = base_iri
        self.setgraph = GraphSet(self.url,
                                 info_dir,
                                 supplier_prefix,
                                 wanted_label=False)
        self.resp_agent = resp_agent
        self.finder = ResourceFinder(ts_url=endpoint, base_iri=base_iri)

        self.ra_id_schemas = {'crossref', 'orcid', 'viaf', 'wikidata'}
        self.br_id_schemas = {
            'doi', 'issn', 'isbn', 'pmid', 'pmcid', 'url', 'wikidata',
            'wikipedia'
        }
        self.schemas = self.ra_id_schemas.union(self.br_id_schemas)

        self.ra_index = self.indexer_id(ra_index)
        self.br_index = self.indexer_id(br_index)
        self.re_index = self.index_re(re_index_csv)
        self.ar_index = self.index_ar(ar_index_csv)
        self.vi_index = vi_index
        self.preexisting_entities = preexisting_entities
        self.preexisting_graphs = dict()
        self.data = data

    def creator(self, source=None):
        self.src = source
        for row in self.data:
            self.row_meta = ''
            ids = row['id']
            title = row['title']
            authors = row['author']
            pub_date = row['pub_date']
            venue = row['venue']
            vol = row['volume']
            issue = row['issue']
            page = row['page']
            self.type = row['type']
            publisher = row['publisher']
            editor = row['editor']
            self.venue_graph = None
            self.vol_graph = None
            self.issue_graph = None
            self.id_action(ids)
            self.title_action(title)
            self.author_action(authors)
            self.pub_date_action(pub_date)
            self.vvi_action(venue, vol, issue)
            self.page_action(page)
            self.type_action(self.type)
            if publisher:
                self.publisher_action(publisher)
            if editor:
                self.editor_action(editor)
        return self.setgraph

    @staticmethod
    def index_re(id_index):
        index = dict()
        for row in id_index:
            index[row['br']] = row['re']
        return index

    @staticmethod
    def index_ar(id_index):
        index = dict()
        for row in id_index:
            index[row['meta']] = dict()
            index[row['meta']]['author'] = Creator.__ar_worker(row['author'])
            index[row['meta']]['editor'] = Creator.__ar_worker(row['editor'])
            index[row['meta']]['publisher'] = Creator.__ar_worker(
                row['publisher'])
        return index

    @staticmethod
    def __ar_worker(s: str) -> dict:
        if s:
            ar_dict = dict()
            couples = s.split('; ')
            for c in couples:
                cou = c.split(', ')
                ar_dict[cou[1]] = cou[0]
            return ar_dict
        else:
            return dict()

    def indexer_id(self, csv_index):
        index = dict()
        for schema in self.schemas:
            index[schema] = dict()
        for row in csv_index:
            for schema in self.schemas:
                if row['id'].startswith(schema):
                    identifier = row['id'].replace(f'{schema}:', '')
                    index[schema][identifier] = row['meta']
        return index

    def id_action(self, ids):
        idslist = re.split(one_or_more_spaces, ids)
        # publication id
        for identifier in idslist:
            if 'meta:' in identifier:
                identifier = identifier.replace('meta:', '')
                preexisting_entity = True if identifier in self.preexisting_entities else False
                self.row_meta = identifier.replace('br/', '')
                url = URIRef(self.url + identifier)
                preexisting_graph = self.finder.get_preexisting_graph(
                    url,
                    self.preexisting_graphs) if preexisting_entity else None
                self.br_graph = self.setgraph.add_br(
                    self.resp_agent,
                    source=self.src,
                    res=url,
                    preexisting_graph=preexisting_graph)
        for identifier in idslist:
            self.id_creator(self.br_graph, identifier, ra=False)

    def title_action(self, title):
        if title:
            self.br_graph.has_title(title)

    def author_action(self, authors):
        if authors:
            authorslist = re.split(semicolon_in_people_field, authors)
            aut_role_list = list()
            for aut in authorslist:
                aut_and_ids = re.search(name_and_ids, aut)
                aut_id = aut_and_ids.group(2)
                aut_id_list = aut_id.split(' ')
                for identifier in aut_id_list:
                    if 'meta:' in identifier:
                        identifier = str(identifier).replace('meta:', '')
                        preexisting_entity = True if identifier in self.preexisting_entities else False
                        url = URIRef(self.url + identifier)
                        aut_meta = identifier.replace('ra/', '')
                        preexisting_graph = self.finder.get_preexisting_graph(
                            url, self.preexisting_graphs
                        ) if preexisting_entity else None
                        pub_aut = self.setgraph.add_ra(
                            self.resp_agent,
                            source=self.src,
                            res=url,
                            preexisting_graph=preexisting_graph)
                        author_name = aut_and_ids.group(1)
                        if ',' in author_name:
                            author_name_splitted = re.split(
                                comma_and_spaces, author_name)
                            first_name = author_name_splitted[1]
                            last_name = author_name_splitted[0]
                            if first_name.strip():
                                pub_aut.has_given_name(first_name)
                            pub_aut.has_family_name(last_name)
                        else:
                            pub_aut.has_name(author_name)
                # lists of authors' IDs
                for identifier in aut_id_list:
                    self.id_creator(pub_aut, identifier, ra=True)
                # Author ROLE
                AR = self.ar_index[self.row_meta]['author'][aut_meta]
                ar_id = 'ar/' + str(AR)
                preexisting_entity = True if ar_id in self.preexisting_entities else False
                url_ar = URIRef(self.url + ar_id)
                preexisting_graph = self.finder.get_preexisting_graph(
                    url_ar,
                    self.preexisting_graphs) if preexisting_entity else None
                pub_aut_role = self.setgraph.add_ar(
                    self.resp_agent,
                    source=self.src,
                    res=url_ar,
                    preexisting_graph=preexisting_graph)
                pub_aut_role.create_author()
                self.br_graph.has_contributor(pub_aut_role)
                pub_aut_role.is_held_by(pub_aut)
                aut_role_list.append(pub_aut_role)
                if len(aut_role_list) > 1:
                    aut_role_list[aut_role_list.index(pub_aut_role) -
                                  1].has_next(pub_aut_role)

    def pub_date_action(self, pub_date):
        if pub_date:
            datelist = list()
            datesplit = pub_date.split('-')
            if datesplit:
                for x in datesplit:
                    datelist.append(int(x))
            else:
                datelist.append(int(pub_date))
            str_date = create_date(datelist)
            self.br_graph.has_pub_date(str_date)

    def vvi_action(self, venue, vol, issue):
        if venue:
            venue_and_ids = re.search(name_and_ids, venue)
            venue_ids = venue_and_ids.group(2)
            venue_ids_list = venue_ids.split()
            for identifier in venue_ids_list:
                if 'meta:' in identifier:
                    ven_id = str(identifier).replace('meta:', '')
                    preexisting_entity = True if ven_id in self.preexisting_entities else False
                    url = URIRef(self.url + ven_id)
                    venue_title = venue_and_ids.group(1)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    self.venue_graph = self.setgraph.add_br(
                        self.resp_agent,
                        source=self.src,
                        res=url,
                        preexisting_graph=preexisting_graph)
                    try:
                        venue_type = self.get_venue_type(
                            self.type, venue_ids_list)
                    except UnboundLocalError:
                        error_message = f"[INFO:Creator] I found the venue {venue} for the resource of type {self.type}, but I don't know how to handle it"
                        raise UnboundLocalError(error_message)
                    if venue_type:
                        venue_type = venue_type.replace(' ', '_')
                        getattr(self.venue_graph, f'create_{venue_type}')()
                    self.venue_graph.has_title(venue_title)
            for identifier in venue_ids_list:
                self.id_creator(self.venue_graph, identifier, ra=False)
            if self.type == 'journal article' or self.type == 'journal issue' or self.type == 'journal volume':
                meta_ven = ven_id.replace('br/', '')
                if vol:
                    vol_meta = self.vi_index[meta_ven]['volume'][vol]['id']
                    vol_meta = 'br/' + vol_meta
                    preexisting_entity = True if vol_meta in self.preexisting_entities else False
                    vol_url = URIRef(self.url + vol_meta)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        vol_url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    self.vol_graph = self.setgraph.add_br(
                        self.resp_agent,
                        source=self.src,
                        res=vol_url,
                        preexisting_graph=preexisting_graph)
                    self.vol_graph.create_volume()
                    self.vol_graph.has_number(vol)
                if issue:
                    if vol:
                        issue_meta = self.vi_index[meta_ven]['volume'][vol][
                            'issue'][issue]['id']
                    else:
                        issue_meta = self.vi_index[meta_ven]['issue'][issue][
                            'id']
                    issue_meta = 'br/' + issue_meta
                    preexisting_entity = True if issue_meta in self.preexisting_entities else False
                    issue_url = URIRef(self.url + issue_meta)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        issue_url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    self.issue_graph = self.setgraph.add_br(
                        self.resp_agent,
                        source=self.src,
                        res=issue_url,
                        preexisting_graph=preexisting_graph)
                    self.issue_graph.create_issue()
                    self.issue_graph.has_number(issue)
        if venue and vol and issue:
            self.br_graph.is_part_of(self.issue_graph)
            self.issue_graph.is_part_of(self.vol_graph)
            self.vol_graph.is_part_of(self.venue_graph)
        elif venue and vol and not issue:
            self.br_graph.is_part_of(self.vol_graph)
            self.vol_graph.is_part_of(self.venue_graph)
        elif venue and not vol and not issue:
            self.br_graph.is_part_of(self.venue_graph)
        elif venue and not vol and issue:
            self.br_graph.is_part_of(self.issue_graph)
            self.issue_graph.is_part_of(self.venue_graph)

    @classmethod
    def get_venue_type(cls, br_type: str, venue_ids: list) -> str:
        schemas = {venue_id.split(':')[0] for venue_id in venue_ids}
        if br_type in {'journal article', 'journal volume', 'journal issue'}:
            venue_type = 'journal'
        elif br_type in {
                'book chapter', 'book part', 'book section', 'book track'
        }:
            venue_type = 'book'
        elif br_type in {'book', 'edited book', 'monograph', 'reference book'}:
            venue_type = 'book series'
        elif br_type == 'proceedings article':
            venue_type = 'proceedings'
        elif br_type in {'proceedings', 'report', 'standard', 'series'}:
            venue_type = 'series'
        elif br_type == 'reference entry':
            venue_type = 'reference book'
        # elif br_type == 'report series':
        #     venue_type = 'report series'
        elif not br_type or br_type in {'dataset', 'data file'}:
            venue_type = ''
        # Check the type based on the identifier scheme
        if any(identifier for identifier in venue_ids
               if not identifier.startswith('meta:')):
            if venue_type in {
                    'journal', 'book series', 'series', 'report series'
            }:
                if 'isbn' in schemas or 'issn' not in schemas:
                    # It is undecidable
                    venue_type = ''
            elif venue_type in {'book', 'proceedings'}:
                if 'issn' in schemas or 'isbn' not in schemas:
                    venue_type = ''
            elif venue_type == 'reference book':
                if 'isbn' in schemas and 'issn' not in schemas:
                    venue_type = 'reference book'
                elif 'issn' in schemas and 'isbn' not in schemas:
                    venue_type = 'journal'
                elif 'issn' in schemas and 'isbn' in schemas:
                    venue_type = ''
        return venue_type

    def page_action(self, page):
        if page:
            res_em = self.re_index[self.row_meta]
            re_id = 're/' + str(res_em)
            preexisting_entity = True if re_id in self.preexisting_entities else False
            url_re = URIRef(self.url + re_id)
            preexisting_graph = self.finder.get_preexisting_graph(
                url_re,
                self.preexisting_graphs) if preexisting_entity else None
            form = self.setgraph.add_re(self.resp_agent,
                                        source=self.src,
                                        res=url_re,
                                        preexisting_graph=preexisting_graph)
            form.has_starting_page(page)
            form.has_ending_page(page)
            self.br_graph.has_format(form)

    def type_action(self, entity_type):
        if entity_type == 'archival document':
            self.br_graph.create_archival_document()
        elif entity_type == 'book':
            self.br_graph.create_book()
        elif entity_type == 'book chapter':
            self.br_graph.create_book_chapter()
        elif entity_type == 'book part':
            self.br_graph.create_book_part()
        elif entity_type == 'book section':
            self.br_graph.create_book_section()
        elif entity_type == 'book series':
            self.br_graph.create_book_series()
        elif entity_type == 'book set':
            self.br_graph.create_book_set()
        elif entity_type in {'data file', 'dataset'}:
            self.br_graph.create_dataset()
        elif entity_type == 'dissertation':
            self.br_graph.create_dissertation()
        # elif entity_type == 'edited book':
        #     self.br_graph.create_edited_book()
        elif entity_type == 'journal':
            self.br_graph.create_journal()
        elif entity_type == 'journal article':
            self.br_graph.create_journal_article()
        elif entity_type == 'journal issue':
            self.br_graph.create_issue()
        elif entity_type == 'journal volume':
            self.br_graph.create_volume()
        # elif entity_type == 'monograph':
        #     self.br_graph.create_monograph()
        elif entity_type == 'peer review':
            self.br_graph.create_peer_review()
        elif entity_type == 'proceedings':
            self.br_graph.create_proceedings()
        elif entity_type == 'proceedings article':
            self.br_graph.create_proceedings_article()
        # elif entity_type == 'proceedings series':
        #     self.br_graph.create_proceedings_series()
        elif entity_type == 'reference book':
            self.br_graph.create_reference_book()
        elif entity_type == 'reference entry':
            self.br_graph.create_reference_entry()
        elif entity_type == 'report':
            self.br_graph.create_report()
        elif entity_type == 'report series':
            self.br_graph.create_report_series()
        elif entity_type == 'standard':
            self.br_graph.create_standard()
        elif entity_type == 'series':
            self.br_graph.create_series()
        # elif entity_type == 'standard series':
        #     self.br_graph.create_standard_series()
        elif entity_type == 'web content':
            self.br_graph.create_web_content()

    def publisher_action(self, publisher):
        publ_and_ids = re.search(name_and_ids, publisher)
        publ_id = publ_and_ids.group(2)
        publ_id_list = publ_id.split()
        for identifier in publ_id_list:
            if 'meta:' in identifier:
                identifier = str(identifier).replace('meta:', '')
                preexisting_entity = True if identifier in self.preexisting_entities else False
                pub_meta = identifier.replace('ra/', '')
                url = URIRef(self.url + identifier)
                publ_name = publ_and_ids.group(1)
                preexisting_graph = self.finder.get_preexisting_graph(
                    url,
                    self.preexisting_graphs) if preexisting_entity else None
                publ = self.setgraph.add_ra(
                    self.resp_agent,
                    source=self.src,
                    res=url,
                    preexisting_graph=preexisting_graph)
                publ.has_name(publ_name)
        for identifier in publ_id_list:
            self.id_creator(publ, identifier, ra=True)
        # publisherRole
        AR = self.ar_index[self.row_meta]['publisher'][pub_meta]
        ar_id = 'ar/' + str(AR)
        preexisting_entity = True if ar_id in self.preexisting_entities else False
        url_ar = URIRef(self.url + ar_id)
        preexisting_graph = self.finder.get_preexisting_graph(
            url_ar, self.preexisting_graphs)
        publ_role = self.setgraph.add_ar(self.resp_agent,
                                         source=self.src,
                                         res=url_ar,
                                         preexisting_graph=preexisting_graph)
        publ_role.create_publisher()
        self.br_graph.has_contributor(publ_role)
        publ_role.is_held_by(publ)

    def editor_action(self, editor):
        editorslist = re.split(semicolon_in_people_field, editor)
        edit_role_list = list()
        for ed in editorslist:
            ed_and_ids = re.search(name_and_ids, ed)
            ed_id = ed_and_ids.group(2)
            ed_id_list = ed_id.split(' ')
            for identifier in ed_id_list:
                if 'meta:' in identifier:
                    identifier = str(identifier).replace('meta:', '')
                    preexisting_entity = True if identifier in self.preexisting_entities else False
                    ed_meta = identifier.replace('ra/', '')
                    url = URIRef(self.url + identifier)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    pub_ed = self.setgraph.add_ra(
                        self.resp_agent,
                        source=self.src,
                        res=url,
                        preexisting_graph=preexisting_graph)
                    editor_name = ed_and_ids.group(1)
                    if ',' in editor_name:
                        editor_name_splitted = re.split(
                            comma_and_spaces, editor_name)
                        firstName = editor_name_splitted[1]
                        lastName = editor_name_splitted[0]
                        if firstName.strip():
                            pub_ed.has_given_name(firstName)
                        pub_ed.has_family_name(lastName)
                    else:
                        pub_ed.has_name(editor_name)
            # lists of editor's IDs
            for identifier in ed_id_list:
                self.id_creator(pub_ed, identifier, ra=True)
            # editorRole
            AR = self.ar_index[self.row_meta]['editor'][ed_meta]
            ar_id = 'ar/' + str(AR)
            preexisting_entity = True if ar_id in self.preexisting_entities else False
            url_ar = URIRef(self.url + ar_id)
            preexisting_graph = self.finder.get_preexisting_graph(
                url_ar,
                self.preexisting_graphs) if preexisting_entity else None
            pub_ed_role = self.setgraph.add_ar(
                self.resp_agent,
                source=self.src,
                res=url_ar,
                preexisting_graph=preexisting_graph)
            if self.type == 'proceedings article' and self.venue_graph:
                pub_ed_role.create_editor()
                self.venue_graph.has_contributor(pub_ed_role)
            elif (self.type == 'book chapter'
                  or self.type == 'book part') and self.venue_graph:
                pub_ed_role.create_editor()
                self.venue_graph.has_contributor(pub_ed_role)
            else:
                pub_ed_role.create_editor()
                self.br_graph.has_contributor(pub_ed_role)
            pub_ed_role.is_held_by(pub_ed)
            edit_role_list.append(pub_ed_role)
            if len(edit_role_list) > 1:
                edit_role_list[edit_role_list.index(pub_ed_role) -
                               1].has_next(pub_ed_role)

    def id_creator(self, graph: BibliographicEntity, identifier: str,
                   ra: bool) -> None:
        new_id = None
        if ra:
            for ra_id_schema in self.ra_id_schemas:
                if identifier.startswith(ra_id_schema):
                    identifier = identifier.replace(f'{ra_id_schema}:', '')
                    res = self.ra_index[ra_id_schema][identifier]
                    preexisting_entity = True if f'id/{res}' in self.preexisting_entities else False
                    url = URIRef(self.url + 'id/' + res)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    new_id = self.setgraph.add_id(
                        self.resp_agent,
                        source=self.src,
                        res=url,
                        preexisting_graph=preexisting_graph)
                    getattr(new_id, f'create_{ra_id_schema}')(identifier)
        else:
            for br_id_schema in self.br_id_schemas:
                if identifier.startswith(br_id_schema):
                    identifier = identifier.replace(f'{br_id_schema}:', '')
                    res = self.br_index[br_id_schema][identifier]
                    preexisting_entity = True if f'id/{res}' in self.preexisting_entities else False
                    url = URIRef(self.url + 'id/' + res)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url, self.preexisting_graphs
                    ) if preexisting_entity else None
                    new_id = self.setgraph.add_id(
                        self.resp_agent,
                        source=self.src,
                        res=url,
                        preexisting_graph=preexisting_graph)
                    getattr(new_id, f'create_{br_id_schema}')(identifier)
        if new_id:
            graph.has_identifier(new_id)
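
A hypothetical instantiation sketch for the class above; every value is illustrative, and the index arguments are assumed to be the CSV-derived rows consumed by indexer_id, index_re and index_ar.

creator = Creator(
    data=meta_rows,                   # list of dicts: 'id', 'title', 'author', ...
    endpoint='http://localhost:9999/sparql',
    base_iri='https://w3id.org/oc/meta/',
    info_dir='counters/',
    supplier_prefix='060',
    resp_agent='https://orcid.org/0000-0000-0000-0000',
    ra_index=ra_rows,                 # rows like {'id': 'orcid:...', 'meta': '...'}
    br_index=br_rows,                 # rows like {'id': 'doi:...', 'meta': '...'}
    re_index_csv=re_rows,             # rows like {'br': '...', 're': '...'}
    ar_index_csv=ar_rows,             # rows with 'meta', 'author', 'editor', 'publisher'
    vi_index=vi_dict,
    preexisting_entities=set())
graph_set = creator.creator(source='https://api.crossref.org/')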
Code example #7
File: creator.py  Project: opencitations/wcw
class Creator(object):
    def __init__(self, data, base_iri, info_dir, supplier_prefix, ra_index, br_index, re_index_csv,
                 ar_index_csv, vi_index):
        self.url = base_iri

        self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False)

        self.ra_index = self.indexer_id(ra_index)

        self.br_index = self.indexer_id(br_index)

        self.re_index = self.index_re(re_index_csv)

        self.ar_index = self.index_ar(ar_index_csv)

        self.vi_index = vi_index
        self.data = data

    def creator(self, source=None):
        self.src = source
        for row in self.data:
            self.row_meta = ""
            ids = row['id']
            title = row['title']
            authors = row['author']
            pub_date = row['pub_date']
            venue = row['venue']
            vol = row['volume']
            issue = row['issue']
            page = row['page']
            self.type = row['type']
            publisher = row['publisher']
            editor = row['editor']

            self.venue_graph = None
            self.vol_graph = None
            self.issue_graph = None

            self.id_action(ids)
            self.title_action(title)
            self.author_action(authors)
            self.pub_date_action(pub_date)
            self.vvi_action(venue, vol, issue)
            self.page_action(page)
            self.type_action(self.type)
            if publisher:
                self.publisher_action(publisher)
            if editor:
                self.editor_action(editor)

        return self.setgraph

    @staticmethod
    def index_re(id_index):
        index = dict()
        for row in id_index:
            index[row["br"]] = row["re"]
        return index

    @staticmethod
    def index_ar(id_index):
        index = dict()
        for row in id_index:
            index[row["meta"]] = dict()
            index[row["meta"]]["author"] = Creator.ar_worker(row["author"])
            index[row["meta"]]["editor"] = Creator.ar_worker(row["editor"])
            index[row["meta"]]["publisher"] = Creator.ar_worker(row["publisher"])
        return index

    @staticmethod
    def ar_worker(s):
        if s:
            ar_dict = dict()
            couples = s.split("; ")
            for c in couples:
                cou = c.split(", ")
                ar_dict[cou[1]] = cou[0]
            return ar_dict
        else:
            return dict()

    @staticmethod
    def indexer_id(csv_index):
        index = dict()
        index['crossref'] = dict()
        index["doi"] = dict()
        index["issn"] = dict()
        index["isbn"] = dict()
        index["orcid"] = dict()
        index["pmid"] = dict()
        index['pmcid'] = dict()
        index['url'] = dict()
        index['viaf'] = dict()
        index['wikidata'] = dict()
        index['wikipedia'] = dict()

        for row in csv_index:
            if row["id"].startswith("crossref"):
                identifier = row["id"].replace('crossref:', '')
                index['crossref'][identifier] = row["meta"]

            elif row["id"].startswith("doi"):
                identifier = row["id"].replace('doi:', '')
                index['doi'][identifier] = row["meta"]

            elif row["id"].startswith("issn"):
                identifier = row["id"].replace('issn:', '')
                index['issn'][identifier] = row["meta"]

            elif row["id"].startswith("isbn"):
                identifier = row["id"].replace('isbn:', '')
                index['isbn'][identifier] = row["meta"]

            elif row["id"].startswith("orcid"):
                identifier = row["id"].replace('orcid:', '')
                index['orcid'][identifier] = row["meta"]

            elif row["id"].startswith("pmid"):
                identifier = row["id"].replace('pmid:', '')
                index['pmid'][identifier] = row["meta"]

            elif row["id"].startswith("pmcid"):
                identifier = row["id"].replace('pmcid:', '')
                index['pmcid'][identifier] = row["meta"]

            elif row["id"].startswith("url"):
                identifier = row["id"].replace('url:', '')
                index['url'][identifier] = row["meta"]

            elif row["id"].startswith("viaf"):
                identifier = row["id"].replace('viaf:', '')
                index['viaf'][identifier] = row["meta"]

            elif row["id"].startswith("wikidata"):
                identifier = row["id"].replace('wikidata:', '')
                index['wikidata'][identifier] = row["meta"]

            elif row["id"].startswith("wikipedia"):
                identifier = row["id"].replace('wikipedia:', '')
                index['wikipedia'][identifier] = row["meta"]

        return index

    def id_action(self, ids):
        idslist = re.split(r'\s+', ids)

        # publication id
        for identifier in idslist:
            if "meta:" in identifier:
                identifier = identifier.replace("meta:", "")
                self.row_meta = identifier.replace("br/", "")
                url = URIRef(self.url + identifier)
                self.br_graph = self.setgraph.add_br(resp_agent, source=self.src, res=url)

        for identifier in idslist:
            self.id_creator(self.br_graph, identifier, ra=False)

    def title_action(self, title):
        if title:
            self.br_graph.has_title(title)

    def author_action(self, authors):
        if authors:
            authorslist = re.split(r'\s*;\s*(?=[^]]*(?:\[|$))', authors)

            aut_role_list = list()
            for aut in authorslist:
                aut_id = re.search(r'\[\s*(.*?)\s*]', aut).group(1)
                aut_id_list = aut_id.split(" ")

                for identifier in aut_id_list:
                    if "meta:" in identifier:
                        identifier = str(identifier).replace('meta:', "")
                        url = URIRef(self.url + identifier)
                        aut_meta = identifier.replace('ra/', "")
                        pub_aut = self.setgraph.add_ra(resp_agent, source=self.src, res=url)
                        author_name = re.search(r'(.*?)\s*\[.*?]', aut).group(1)
                        if "," in author_name:
                            author_name_splitted = re.split(r'\s*,\s*', author_name)
                            firstName = author_name_splitted[1]
                            lastName = author_name_splitted[0]
                            if firstName.strip():
                                pub_aut.has_given_name(firstName)
                            pub_aut.has_family_name(lastName)
                        else:
                            pub_aut.has_name(author_name)

                # lists of authors' IDs
                for identifier in aut_id_list:
                    self.id_creator(pub_aut, identifier, ra=True)

                # Author ROLE
                AR = self.ar_index[self.row_meta]["author"][aut_meta]
                ar_id = "ar/" + str(AR)
                url_ar = URIRef(self.url + ar_id)
                pub_aut_role = self.setgraph.add_ar(resp_agent, source=self.src, res=url_ar)
                pub_aut_role.create_author()
                self.br_graph.has_contributor(pub_aut_role)
                pub_aut_role.is_held_by(pub_aut)
                aut_role_list.append(pub_aut_role)
                if len(aut_role_list) > 1:
                    aut_role_list[aut_role_list.index(pub_aut_role)-1].has_next(pub_aut_role)

    def pub_date_action(self, pub_date):
        if pub_date:
            datelist = list()
            datesplit = pub_date.split("-")
            if datesplit:
                for x in datesplit:
                    datelist.append(int(x))
            else:
                datelist.append(int(pub_date))
            str_date = create_date(datelist)
            self.br_graph.has_pub_date(str_date)

    def vvi_action(self, venue, vol, issue):

        if venue:
            venue_id = re.search(r'\[\s*(.*?)\s*]', venue).group(1)
            venue_id_list = venue_id.split(" ")

            for identifier in venue_id_list:
                if "meta:" in identifier:
                    ven_id = str(identifier).replace("meta:", "")
                    url = URIRef(self.url + ven_id)
                    venue_title = re.search(r'(.*?)\s*\[.*?]', venue).group(1)
                    self.venue_graph = self.setgraph.add_br(resp_agent, source=self.src, res=url)
                    if self.type == "journal article" or self.type == "journal volume" or self.type == "journal issue":
                        self.venue_graph.create_journal()
                    elif self.type == "book chapter" or self.type == "book part":
                        self.venue_graph.create_book()
                    elif self.type == "proceedings article":
                        self.venue_graph.create_proceedings()
                    elif self.type == "report":
                        self.venue_graph.create_report_series()
                    elif self.type == "standard":
                        self.venue_graph.create_standard_series()

                    self.venue_graph.has_title(venue_title)

            for identifier in venue_id_list:
                self.id_creator(self.venue_graph, identifier, ra=False)

        if (self.type == "journal article" or self.type == "journal issue") and vol:

            meta_ven = ven_id.replace("br/", "")
            vol_meta = self.vi_index[meta_ven]["volume"][vol]["id"]
            vol_meta = "br/" + vol_meta
            url = URIRef(self.url + vol_meta)
            self.vol_graph = self.setgraph.add_br(resp_agent, source=self.src, res=url)
            self.vol_graph.create_volume()
            self.vol_graph.has_number(vol)

        if self.type == "journal article" and issue:

            meta_ven = ven_id.replace("br/", "")
            if vol:
                iss_meta = self.vi_index[meta_ven]["volume"][vol]["issue"][issue]["id"]
            else:
                iss_meta = self.vi_index[meta_ven]["issue"][issue]["id"]

            iss_meta = "br/" + iss_meta
            url = URIRef(self.url + iss_meta)
            self.issue_graph = self.setgraph.add_br(resp_agent, source=self.src, res=url)
            self.issue_graph.create_issue()
            self.issue_graph.has_number(issue)

        if venue and vol and issue:
            self.br_graph.is_part_of(self.issue_graph)
            self.issue_graph.is_part_of(self.vol_graph)
            self.vol_graph.is_part_of(self.venue_graph)

        elif venue and vol and not issue:
            self.br_graph.is_part_of(self.vol_graph)
            self.vol_graph.is_part_of(self.venue_graph)

        elif venue and not vol and not issue:
            self.br_graph.is_part_of(self.venue_graph)

        elif venue and not vol and issue:
            self.br_graph.is_part_of(self.issue_graph)
            self.issue_graph.is_part_of(self.venue_graph)

    def page_action(self, page):
        if page:
            res_em = self.re_index[self.row_meta]
            re_id = "re/" + str(res_em)
            url_re = URIRef(self.url + re_id)
            form = self.setgraph.add_re(resp_agent, source=self.src, res=url_re)
            form.has_starting_page(page)
            form.has_ending_page(page)
            self.br_graph.has_format(form)

    def type_action(self, entity_type):
        if entity_type == "archival document":
            self.br_graph.create_archival_document()
        elif entity_type == "book":
            self.br_graph.create_book()
        elif entity_type == "book chapter":
            self.br_graph.create_book_chapter()
        elif entity_type == "book part":
            self.br_graph.create_book_part()
        elif entity_type == "book section":
            self.br_graph.create_book_section()
        elif entity_type == "book series":
            self.br_graph.create_book_series()
        elif entity_type == "book set":
            self.br_graph.create_book_set()
        elif entity_type == "data file":
            self.br_graph.create_dataset()
        elif entity_type == "dissertation":
            self.br_graph.create_dissertation()
        elif entity_type == "journal":
            self.br_graph.create_journal()
        elif entity_type == "journal article":
            self.br_graph.create_journal_article()
        elif entity_type == "journal issue":
            self.br_graph.create_issue()
        elif entity_type == "journal volume":
            self.br_graph.create_volume()
        elif entity_type == "proceedings article":
            self.br_graph.create_proceedings_article()
        elif entity_type == "proceedings":
            self.br_graph.create_proceedings()
        elif entity_type == "reference book":
            self.br_graph.create_reference_book()
        elif entity_type == "reference entry":
            self.br_graph.create_reference_entry()
        elif entity_type == "report":
            self.br_graph.create_report()
        elif entity_type == "standard":
            self.br_graph.create_standard()
        elif entity_type == "series":
            self.br_graph.create_series()

    def publisher_action(self, publisher):

        publ_id = re.search(r'\[\s*(.*?)\s*]', publisher).group(1)
        publ_id_list = publ_id.split(" ")

        for identifier in publ_id_list:
            if "meta:" in identifier:
                identifier = str(identifier).replace("meta:", "")
                pub_meta = identifier.replace("ra/", "")
                url = URIRef(self.url + identifier)
                publ_name = re.search(r'(.*?)\s*\[.*?]', publisher).group(1)
                publ = self.setgraph.add_ra(resp_agent, source=self.src, res=url)
                publ.has_name(publ_name)

        for identifier in publ_id_list:
            self.id_creator(publ, identifier, ra=True)

        # publisherRole
        AR = self.ar_index[self.row_meta]["publisher"][pub_meta]
        ar_id = "ar/" + str(AR)
        url_ar = URIRef(self.url + ar_id)
        publ_role = self.setgraph.add_ar(resp_agent, source=self.src, res=url_ar)
        publ_role.create_publisher()
        self.br_graph.has_contributor(publ_role)
        publ_role.is_held_by(publ)

    def editor_action(self, editor):
        editorslist = re.split(r'\s*;\s*(?=[^]]*(?:\[|$))', editor)

        edit_role_list = list()
        for ed in editorslist:
            ed_id = re.search(r'\[\s*(.*?)\s*]', ed).group(1)
            ed_id_list = ed_id.split(" ")

            for identifier in ed_id_list:
                if "meta:" in identifier:
                    identifier = str(identifier).replace("meta:", "")
                    ed_meta = identifier.replace("ra/", "")
                    url = URIRef(self.url + identifier)
                    pub_ed = self.setgraph.add_ra(resp_agent, source=self.src, res=url)
                    editor_name = re.search(r'(.*?)\s*\[.*?]', ed).group(1)
                    if "," in editor_name:
                        editor_name_splitted = re.split(r'\s*,\s*', editor_name)
                        firstName = editor_name_splitted[1]
                        lastName = editor_name_splitted[0]
                        if firstName.strip():
                            pub_ed.has_given_name(firstName)
                        pub_ed.has_family_name(lastName)
                    else:
                        pub_ed.has_name(editor_name)

            # lists of editor's IDs
            for identifier in ed_id_list:
                self.id_creator(pub_ed, identifier, ra=True)

            # editorRole
            AR = self.ar_index[self.row_meta]["editor"][ed_meta]
            ar_id = "ar/" + str(AR)
            url_ar = URIRef(self.url + ar_id)
            pub_ed_role = self.setgraph.add_ar(resp_agent, source=self.src, res=url_ar)

            if self.type == "proceedings article" and self.venue_graph:
                pub_ed_role.create_editor()
                self.venue_graph.has_contributor(pub_ed_role)
            elif (self.type == "book chapter" or self.type == "book part") and self.venue_graph:
                pub_ed_role.create_editor()
                self.venue_graph.has_contributor(pub_ed_role)
            else:
                pub_ed_role.create_editor()
                self.br_graph.has_contributor(pub_ed_role)

            pub_ed_role.is_held_by(pub_ed)
            edit_role_list.append(pub_ed_role)
            if len(edit_role_list) > 1:
                edit_role_list[edit_role_list.index(pub_ed_role)-1].has_next(pub_ed_role)

    def id_creator(self, graph, identifier, ra):

        new_id = None

        if ra:
            if identifier.startswith("crossref"):
                identifier = identifier.replace('crossref:', '')
                res = self.ra_index['crossref'][identifier]
                url = URIRef(self.url + "id/" + res)
                new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
                new_id.create_crossref(identifier)

            elif identifier.startswith("orcid"):
                identifier = identifier.replace("orcid:", "")
                res = self.ra_index['orcid'][identifier]
                url = URIRef(self.url + "id/" + res)
                new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
                new_id.create_orcid(identifier)

            elif identifier.startswith("viaf"):
                identifier = identifier.replace("viaf:", "")
                res = self.ra_index['viaf'][identifier]
                url = URIRef(self.url + "id/" + res)
                new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
                new_id.create_viaf(identifier)

            elif identifier.startswith("wikidata"):
                identifier = identifier.replace("wikidata:", "")
                res = self.ra_index['wikidata'][identifier]
                url = URIRef(self.url + "id/" + res)
                new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
                new_id.create_wikidata(identifier)

        elif identifier.startswith("doi"):
            identifier = identifier.replace("doi:", "")
            res = self.br_index['doi'][identifier]
            url = URIRef(self.url + "id/" + res)
            new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
            new_id.create_doi(identifier)

        elif identifier.startswith("issn"):
            identifier = identifier.replace("issn:", "")
            res = self.br_index['issn'][identifier]
            url = URIRef(self.url + "id/" + res)
            new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
            new_id.create_issn(identifier)

        elif identifier.startswith("isbn"):
            identifier = identifier.replace("isbn:", "")
            res = self.br_index['isbn'][identifier]
            url = URIRef(self.url + "id/" + res)
            new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
            new_id.create_isbn(identifier)

        elif identifier.startswith("pmid"):
            identifier = identifier.replace("pmid:", "")
            res = self.br_index['pmid'][identifier]
            url = URIRef(self.url + "id/" + res)
            new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
            new_id.create_pmid(identifier)

        elif identifier.startswith("pmcid"):
            identifier = identifier.replace("pmcid:", "")
            res = self.br_index['pmcid'][identifier]
            url = URIRef(self.url + "id/" + res)
            new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
            new_id.create_pmcid(identifier)

        elif identifier.startswith("url"):
            identifier = identifier.replace("url:", "")
            res = self.br_index['url'][identifier]
            url = URIRef(self.url + "id/" + res)
            new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
            new_id.create_url(identifier)

        elif identifier.startswith("wikidata"):
            identifier = identifier.replace("wikidata:", "")
            res = self.br_index['wikidata'][identifier]
            url = URIRef(self.url + "id/" + res)
            new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
            new_id.create_wikidata(identifier)

        elif identifier.startswith("wikipedia"):
            identifier = identifier.replace("wikipedia:", "")
            res = self.br_index['wikipedia'][identifier]
            url = URIRef(self.url + "id/" + res)
            new_id = self.setgraph.add_id(resp_agent, source=self.src, res=url)
            new_id.create_wikipedia(identifier)

        if new_id:
            graph.has_identifier(new_id)
Code example #8
class RespAgentsCreator(Creator):
    def __init__(self, data: list, endpoint: str, base_iri: str, info_dir: str,
                 supplier_prefix: str, resp_agent: str, ra_index: dict,
                 preexisting_entities: set):
        self.url = base_iri
        self.setgraph = GraphSet(self.url,
                                 info_dir,
                                 supplier_prefix,
                                 wanted_label=False)
        self.finder = ResourceFinder(ts_url=endpoint, base_iri=base_iri)
        self.resp_agent = resp_agent
        self.ra_id_schemas = {'crossref', 'orcid', 'viaf', 'wikidata'}
        self.br_id_schemas = {
            'doi', 'issn', 'isbn', 'pmid', 'pmcid', 'url', 'wikidata',
            'wikipedia'
        }
        self.schemas = self.ra_id_schemas.union(self.br_id_schemas)
        self.ra_index = self.indexer_id(ra_index)
        self.preexisting_entities = preexisting_entities
        self.data = data

    def creator(self, source=None):
        self.src = source
        for row in self.data:
            authors = row['author']
            publisher = row['publisher']
            editor = row['editor']
            self.author_action(authors)
            if publisher:
                self.publisher_action(publisher)
            if editor:
                self.editor_action(editor)
        return self.setgraph

    def author_action(self, authors):
        if authors:
            authorslist = re.split(semicolon_in_people_field, authors)
            for aut in authorslist:
                aut_and_ids = re.search(name_and_ids, aut)
                aut_id = aut_and_ids.group(2)
                aut_id_list = aut_id.split()
                for identifier in aut_id_list:
                    if 'meta:' in identifier:
                        identifier = str(identifier).replace('meta:', '')
                        preexisting_entity = True if identifier in self.preexisting_entities else False
                        url = URIRef(self.url + identifier)
                        preexisting_graph = self.finder.get_preexisting_graph(
                            url) if preexisting_entity else None
                        pub_aut = self.setgraph.add_ra(
                            self.resp_agent,
                            source=self.src,
                            res=url,
                            preexisting_graph=preexisting_graph)
                        author_name = aut_and_ids.group(1)
                        if ',' in author_name:
                            author_name_splitted = re.split(
                                comma_and_spaces, author_name)
                            first_name = author_name_splitted[1]
                            last_name = author_name_splitted[0]
                            if first_name.strip():
                                pub_aut.has_given_name(first_name)
                            pub_aut.has_family_name(last_name)
                        else:
                            pub_aut.has_name(author_name)
                # lists of authors' IDs
                for identifier in aut_id_list:
                    self.id_creator(pub_aut, identifier, ra=True)

    def publisher_action(self, publisher):
        publ_and_ids = re.search(name_and_ids, publisher)
        publ_id = publ_and_ids.group(2)
        publ_id_list = publ_id.split()
        for identifier in publ_id_list:
            if 'meta:' in identifier:
                identifier = str(identifier).replace('meta:', '')
                preexisting_entity = True if identifier in self.preexisting_entities else False
                url = URIRef(self.url + identifier)
                publ_name = publ_and_ids.group(1)
                preexisting_graph = self.finder.get_preexisting_graph(
                    url) if preexisting_entity else None
                publ = self.setgraph.add_ra(
                    self.resp_agent,
                    source=self.src,
                    res=url,
                    preexisting_graph=preexisting_graph)
                publ.has_name(publ_name)
        for identifier in publ_id_list:
            self.id_creator(publ, identifier, ra=True)

    def editor_action(self, editor):
        editorslist = re.split(semicolon_in_people_field, editor)
        for ed in editorslist:
            ed_and_ids = re.search(name_and_ids, ed)
            ed_id = ed_and_ids.group(2)
            ed_id_list = ed_id.split()  # whitespace split, consistent with author/publisher handling
            for identifier in ed_id_list:
                if 'meta:' in identifier:
                    identifier = identifier.replace('meta:', '')
                    preexisting_entity = identifier in self.preexisting_entities
                    url = URIRef(self.url + identifier)
                    preexisting_graph = self.finder.get_preexisting_graph(
                        url) if preexisting_entity else None
                    pub_ed = self.setgraph.add_ra(
                        self.resp_agent,
                        source=self.src,
                        res=url,
                        preexisting_graph=preexisting_graph)
                    editor_name = ed_and_ids.group(1)
                    if ',' in editor_name:
                        editor_name_splitted = re.split(
                            comma_and_spaces, editor_name)
                        first_name = editor_name_splitted[1]
                        last_name = editor_name_splitted[0]
                        if first_name.strip():
                            pub_ed.has_given_name(first_name)
                        pub_ed.has_family_name(last_name)
                    else:
                        pub_ed.has_name(editor_name)
            # attach all of the editor's identifiers (pub_ed is expected to be
            # bound by a 'meta:' identifier found above)
            for identifier in ed_id_list:
                self.id_creator(pub_ed, identifier, ra=True)
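
For orientation, here is a minimal usage sketch for the class shown above. The class name (`Creator`), the leading constructor parameters (`data`, `endpoint`, `base_iri`, `info_dir`, which are not visible in this excerpt but are implied by the attribute assignments), and all concrete values are assumptions for illustration only, not taken from the example itself.

# A minimal usage sketch, assuming the class above is named 'Creator' and that its
# constructor also takes 'data', 'endpoint', 'base_iri' and 'info_dir' (implied by
# the attribute assignments). All concrete values below are hypothetical.
data = [{
    'author': 'Doe, Jane [meta:ra/0601 orcid:0000-0002-1825-0097]',
    'publisher': 'ACME Press [meta:ra/0602 crossref:78]',
    'editor': ''
}]
creator = Creator(data=data,
                  endpoint='http://localhost:9999/blazegraph/sparql',  # assumed triplestore
                  base_iri='https://w3id.org/oc/meta/',
                  info_dir='info_dir/',
                  supplier_prefix='060',
                  resp_agent='https://orcid.org/0000-0002-1825-0097',
                  ra_index={},                 # empty index for the sketch
                  preexisting_entities=set())  # no preexisting entities
graphset = creator.creator(source='https://en.wikipedia.org/')  # returns the populated GraphSet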
Code example #9
def process(cur_citations_file: str, conversion_dict: Dict[str, str]) -> None:
    """
    This function takes care of generating an OCDM compliant RDF file containing
    the Citation entities that describe the relations between citing Wikipedia pages
    and cited bibliographic resources.

    Additionally, a CSV file compliant with other OpenCitations tools is produced. Since it
    is not strictly needed for the 'Wikipedia Citations in Wikidata' workflow, it can be
    safely ignored.

    Please note: the bool flag 'rdf_output_in_chunks' from conf/conf_citations.py MUST be set to True,
    otherwise the following scripts of the workflow (Enricher and Pusher) won't be able to import
    the intermediate RDF files produced by this script.

    :param cur_citations_file: The filename (without the path) of the CSV file to be converted
    :param conversion_dict: The dictionary that maps 'tmp' identifiers onto their respective 'meta' identifiers
    """
    filepath: str = os.path.join(citations_csv_dir, cur_citations_file)
    df: pd.DataFrame = pd.read_csv(filepath,
                                   usecols=['citing', 'cited'],
                                   low_memory=False)

    # 'tmp-to-meta' mapping is applied to each column of the DataFrame
    tmp_to_meta_mapping(df['citing'], conversion_dict)
    tmp_to_meta_mapping(df['cited'], conversion_dict)
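    # e.g. a provisional 'tmp/br/123' identifier would be rewritten in place to its
    # assigned 'br/0601234' meta identifier (both values here are hypothetical)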

    # Rows containing None values are dropped: we cannot generate valid Citation entities for them
    df = df.dropna(axis=0, how='any', subset=['citing', 'cited'])
    df = df.reset_index(drop=True)

    # The DataFrame is enriched with additional columns that are needed for achieving full
    # compliance with other OpenCitations tools. This is not strictly needed for our workflow:
    df['id'] = None
    df['oci'] = None
    df['creation'] = None  # not applicable (the citation comes from a Wikipedia page)
    df['timespan'] = None  # not applicable (the citation comes from a Wikipedia page)
    df['journal_sc'] = 'no'
    df['author_sc'] = 'no'

    # A temporary GraphSet is used to instantiate BibliographicResource entities that are needed
    # for the creation of Citation entities but that won't be kept in the output RDF file:
    temp_gs: GraphSet = GraphSet(base_iri)

    # The actual GraphSet that will contain the Citation entities to be stored in the output RDF file:
    ci_gs: GraphSet = GraphSet(base_iri,
                               info_dir=info_dir,
                               supplier_prefix=supplier_prefix,
                               wanted_label=False)

    # Here the DataFrame columns are converted into NumPy arrays
    # so that we can iterate much faster over their elements:
    citing_col = df['citing'].to_numpy(copy=False)
    cited_col = df['cited'].to_numpy(copy=False)
    id_col = df['id'].to_numpy(copy=False)
    oci_col = df['oci'].to_numpy(copy=False)

    for i, (citing_meta_id,
            cited_meta_id) in enumerate(zip(citing_col, cited_col)):
        citing_res: URIRef = URIRef(base_iri + citing_meta_id)
        cited_res: URIRef = URIRef(base_iri + cited_meta_id)

        # A query is performed to discover if the current citation has already been processed:
        query_string: str = f'''
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX cito: <http://purl.org/spar/cito/>
        PREFIX datacite: <http://purl.org/spar/datacite/>
        PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>

        SELECT ?ci_res ?oci
        FROM <https://w3id.org/oc/meta/ci/>
        WHERE {{
            ?ci_res rdf:type cito:Citation ;
                    cito:hasCitingEntity <{citing_res}> ;
                    cito:hasCitedEntity <{cited_res}> .
            OPTIONAL {{
                ?ci_res datacite:hasIdentifier ?id .
                ?id datacite:usesIdentifierScheme datacite:oci ;
                    literal:hasLiteralValue ?oci .
            }}
        }}
        LIMIT 1
        '''
        tp: SPARQLWrapper = SPARQLWrapper(triplestore_url)
        tp.setTimeout(query_timeout)
        tp.setMethod('GET')
        tp.setQuery(query_string)
        tp.setReturnFormat(JSON)
        results = tp.queryAndConvert()
        bindings = results["results"]["bindings"]

        # 'LIMIT 1' in the query string guarantees at most one returned binding
        if len(bindings) >= 1:
            # This citation is already stored in the triplestore!
            row: Dict = bindings[0]

            # Update the output dataframe
            ci_res: URIRef = URIRef(bindings[0]["ci_res"]["value"])
            id_col[i] = str(ci_res)[len(base_iri):]

            if "oci" in row:
                oci_col[i] = row["oci"]["value"]
        else:
            # This citation is currently missing from the triplestore!

            # Create BR entities in "append mode" by providing 'res' without 'preexisting_graph'
            citing_br: BibliographicResource = temp_gs.add_br(
                resp_agent, res=citing_res, preexisting_graph=None)
            cited_br: BibliographicResource = temp_gs.add_br(
                resp_agent, res=cited_res, preexisting_graph=None)

            # Create OCI identifier
            oci_str: str = str(citing_res)[len(base_iri + 'br/'):] + '-' + str(
                cited_res)[len(base_iri + 'br/'):]
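            # e.g. citing 'br/0601' and cited 'br/0602' would produce the OCI string
            # '0601-0602' (hypothetical meta identifiers)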
            oci: Identifier = ci_gs.add_id(resp_agent)
            oci.create_oci(oci_str)

            # Create citation
            ci: Citation = ci_gs.add_ci(resp_agent)
            ci.has_identifier(oci)
            ci.has_citing_entity(citing_br)
            ci.has_cited_entity(cited_br)

            # Update the output dataframe
            id_col[i] = str(ci.res)[len(base_iri):]
            oci_col[i] = oci_str

    # Store the dataframe as a CSV file that's compliant with OpenCitations tools:
    output_filepath: str = os.path.join(converter_citations_csv_output_dir,
                                        cur_citations_file)
    df.to_csv(output_filepath,
              index=False,
              chunksize=100000,
              columns=[
                  'id', 'oci', 'citing', 'cited', 'creation', 'timespan',
                  'journal_sc', 'author_sc'
              ])
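    # A row of the resulting CSV might look like this (hypothetical values):
    #   ci/0601,0601-0602,br/0601,br/0602,,,no,no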

    # Store new citations in an RDF file (together with the related provenance).
    # They should also be uploaded to the triplestore so as to update the current
    # state of execution: this way, they won't be created again, since they will
    # already be present inside the triplestore.
    ci_ps: ProvSet = ProvSet(ci_gs, base_iri)
    ci_ps.generate_provenance()

    ci_storer: Storer = Storer(ci_gs,
                               dir_split=dir_split_number,
                               n_file_item=items_per_file,
                               default_dir=default_dir,
                               output_format='nt11')
    ci_prov_storer: Storer = Storer(ci_ps,
                                    dir_split=dir_split_number,
                                    n_file_item=items_per_file,
                                    default_dir=default_dir,
                                    output_format='nquads')

    if rdf_output_in_chunks:
        # The RDF files are stored WITHOUT following the folder structure
        # adopted by OpenCitations: all the newly created citations are kept
        # in a single file.
        # Every following script of the workflow (Enricher and Pusher) assumes that
        # data was produced in chunks: this is the modality that MUST be used, which
        # is why 'rdf_output_in_chunks' has to be set to True.
        filename_without_csv: str = cur_citations_file[:-4]

        # Data
        f: str = os.path.join(converter_citations_rdf_output_dir,
                              filename_without_csv + ".nt")
        if not os.path.exists(os.path.dirname(f)):
            os.makedirs(os.path.dirname(f))
        ci_storer.store_graphs_in_file(f, context_path)
        ci_storer.upload_all(triplestore_url,
                             converter_citations_rdf_output_dir,
                             batch_size=100)

        # Provenance
        prov_dir: str = os.path.join(converter_citations_rdf_output_dir,
                                     'prov')
        f_prov: str = os.path.join(prov_dir, filename_without_csv + '.nq')
        if not os.path.exists(os.path.dirname(f_prov)):
            os.makedirs(os.path.dirname(f_prov))
        ci_prov_storer.store_graphs_in_file(f_prov, context_path)
    else:
        # The RDF files are stored following the folder structure adopted by OpenCitations.
        # Newly created citations could be split into different files based on
        # various conditions.
        # In the following steps of the workflow, every script assumes that data was produced in chunks:
        # this means that this modality should never be chosen and that 'rdf_output_in_chunks' must
        # be set to True.
        ci_storer.upload_and_store(converter_citations_rdf_output_dir,
                                   triplestore_url,
                                   base_iri,
                                   context_path,
                                   batch_size=100)

        ci_prov_storer.store_all(converter_citations_rdf_output_dir, base_iri,
                                 context_path)