Example 1
    def __init__(self, data: list, endpoint: str, base_iri: str, info_dir: str,
                 supplier_prefix: str, resp_agent: str, ra_index: dict,
                 br_index: dict, re_index_csv: dict, ar_index_csv: dict,
                 vi_index: dict, preexisting_entities: set):
        self.url = base_iri
        self.setgraph = GraphSet(self.url,
                                 info_dir,
                                 supplier_prefix,
                                 wanted_label=False)
        self.resp_agent = resp_agent
        self.finder = ResourceFinder(ts_url=endpoint, base_iri=base_iri)

        self.ra_id_schemas = {'crossref', 'orcid', 'viaf', 'wikidata'}
        self.br_id_schemas = {
            'doi', 'issn', 'isbn', 'pmid', 'pmcid', 'url', 'wikidata',
            'wikipedia'
        }
        self.schemas = self.ra_id_schemas.union(self.br_id_schemas)

        self.ra_index = self.indexer_id(ra_index)
        self.br_index = self.indexer_id(br_index)
        self.re_index = self.index_re(re_index_csv)
        self.ar_index = self.index_ar(ar_index_csv)
        self.vi_index = vi_index
        self.preexisting_entities = preexisting_entities
        self.preexisting_graphs = dict()
        self.data = data
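
A hedged instantiation sketch for the constructor above. The class name 'Creator' and every value below are placeholders; only the parameter names come from the signature shown above.

# Hypothetical usage: class name, paths, IRIs and index contents are placeholders
empty_index: dict = {}
creator = Creator(
    data=[],                                             # rows parsed from the input CSV
    endpoint='http://localhost:9999/blazegraph/sparql',  # SPARQL endpoint of the triplestore
    base_iri='https://w3id.org/oc/meta/',
    info_dir='counters/',
    supplier_prefix='060',
    resp_agent='https://w3id.org/oc/meta/prov/pa/1',
    ra_index=empty_index,
    br_index=empty_index,
    re_index_csv=empty_index,
    ar_index_csv=empty_index,
    vi_index=empty_index,
    preexisting_entities=set())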
Example 2
    def test_extract_ids(self):
        resp_agent = 'http://w3c.org/oc/meta/pa/999'
        g_set = GraphSet('http://w3c.org/oc/meta/')
        br = g_set.add_br(resp_agent)

        isbn = g_set.add_id(resp_agent)
        isbn.create_isbn('978-88-515-2159-2')

        orcid = g_set.add_id(resp_agent)
        orcid.create_orcid('0000-0002-1825-0097')

        wikidata = g_set.add_id(resp_agent)
        wikidata.create_wikidata('Q9')

        br.has_identifier(isbn)
        br.has_identifier(orcid)
        br.has_identifier(wikidata)

        result = extract_ids(br)
        self.assertIsNotNone(result)

        self.assertDictEqual(
            result,
            {
                'isbn13': '978-88-515-2159-2',
                'isbn10': '88-515-2159-X',  # this is automatically inferred
                'orcid': '0000-0002-1825-0097',
                'wikidata': 'Q9'
            })
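
The helper exercised by this test is not shown here. The following is a minimal sketch, assuming oc_ocdm's standard accessors (get_identifiers, get_scheme, get_literal_value), of how a bibliographic resource's identifiers could be collected into a plain dictionary; the real extract_ids may differ, e.g. in how it infers the ISBN-10 from the ISBN-13.

from typing import Dict, Optional

from oc_ocdm.graph.entities.bibliographic.bibliographic_resource import BibliographicResource


def collect_ids(br: BibliographicResource) -> Dict[str, str]:
    """Sketch only: map each identifier scheme of a BR to its literal value."""
    result: Dict[str, str] = {}
    for identifier in br.get_identifiers():
        scheme = identifier.get_scheme()            # e.g. http://purl.org/spar/datacite/isbn
        value: Optional[str] = identifier.get_literal_value()
        if scheme is not None and value is not None:
            result[scheme.split('/')[-1]] = value   # keep only the scheme's local name, e.g. 'isbn'
    return result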
Example 3
def process_chunk(filename: str) -> None:
    """
    This function wraps the functionality of the external library 'oc_graphenricher'.
    It imports an OCDM-compliant RDF chunk file, tries to enrich it with external identifiers,
    and then deduplicates its entities.

    :param filename: a string representing the filename (without the path) of the chunk file to be processed
    """
    filepath: str = os.path.join(rdf_input_dir, filename)
    filename_without_extension: str = os.path.splitext(filename)[0]

    g: Graph = Graph()
    g = g.parse(filepath, format='nt11')

    reader: Reader = Reader()
    g_set: GraphSet = GraphSet(base_iri=base_iri,
                               info_dir=info_dir,
                               supplier_prefix=supplier_prefix,
                               wanted_label=False)
    reader.import_entities_from_graph(g_set,
                                      g,
                                      enable_validation=False,
                                      resp_agent=resp_agent)

    # Enrichment
    enriched_filepath: str = rdf_output_dir + os.sep + 'enriched' + os.sep +\
        filename_without_extension + '.nt'
    enriched_prov: str = rdf_output_dir + os.sep + 'enriched' + os.sep + 'prov' + os.sep +\
        filename_without_extension + '.nq'
    # Output folders are created if not already existing
    if not os.path.exists(os.path.dirname(enriched_filepath)):
        os.makedirs(os.path.dirname(enriched_filepath))
    if not os.path.exists(os.path.dirname(enriched_prov)):
        os.makedirs(os.path.dirname(enriched_prov))

    enricher: GraphEnricher = GraphEnricher(g_set,
                                            graph_filename=enriched_filepath,
                                            provenance_filename=enriched_prov,
                                            info_dir=info_dir,
                                            debug=False,
                                            serialize_in_the_middle=False)
    enricher.enrich()

    # Deduplication
    deduplicated_filepath: str = rdf_output_dir + os.sep + 'deduplicated' + os.sep +\
        filename_without_extension + '.nt'
    deduplicated_prov: str = rdf_output_dir + os.sep + 'deduplicated' + os.sep + 'prov' + os.sep + \
        filename_without_extension + '.nq'
    # Output folders are created if not already existing
    if not os.path.exists(os.path.dirname(deduplicated_filepath)):
        os.makedirs(os.path.dirname(deduplicated_filepath))
    if not os.path.exists(os.path.dirname(deduplicated_prov)):
        os.makedirs(os.path.dirname(deduplicated_prov))

    matcher = InstanceMatching(g_set,
                               graph_filename=deduplicated_filepath,
                               provenance_filename=deduplicated_prov,
                               info_dir=info_dir,
                               debug=False)
    matcher.match()
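
A hedged driver sketch for process_chunk, assuming the same module-level configuration used above (rdf_input_dir, info_dir, etc.) and that chunk files carry a '.nt' extension. Chunks are processed sequentially so that the shared info_dir counters are not written to concurrently.

import os

if __name__ == '__main__':
    # 'rdf_input_dir' is the same module-level setting assumed by process_chunk
    for chunk_filename in sorted(os.listdir(rdf_input_dir)):
        if chunk_filename.endswith('.nt'):
            process_chunk(chunk_filename)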
Example 4
    def __init__(self, data, base_iri, info_dir, supplier_prefix, ra_index, br_index, re_index_csv,
                 ar_index_csv, vi_index):
        self.url = base_iri
        self.setgraph = GraphSet(self.url, info_dir, supplier_prefix, wanted_label=False)
        self.ra_index = self.indexer_id(ra_index)
        self.br_index = self.indexer_id(br_index)
        self.re_index = self.index_re(re_index_csv)
        self.ar_index = self.index_ar(ar_index_csv)
        self.vi_index = vi_index
        self.data = data
Example 5
def process_chunk(chunk_filepath: str, citations_mapping: Dict[URIRef, str]):
    """
    This function handles all the steps which are needed to fully process a single
    chunk of the citations input dataset.
      - Firstly, the RDF graph serialized inside the chunk file
        is imported as an oc_ocdm GraphSet.
      - Secondly, a loop over each CI entity
        is performed: the citing and cited OCDM IRIs are extracted and then mapped
        to the Wikidata IDs contained inside the given mapping dictionary (when possible).
        A TSV statement is created for each citation to be uploaded.
      - Lastly, the collected list of statements is appended to the output file.

    :param chunk_filepath: A string representing the filesystem path to the chunk to be imported
    :param citations_mapping: A dictionary mapping OCDM IRIs to the corresponding Wikidata IDs
    """
    # PROCESS INITIALIZATION
    statements: List[str] = []

    # DATA IMPORT PHASE
    graph_chunk: Graph = Graph().parse(location=chunk_filepath, format='nt11')

    g_set: GraphSet = GraphSet(base_iri, wanted_label=False)
    Reader.import_entities_from_graph(g_set,
                                      graph_chunk,
                                      resp_agent,
                                      enable_validation=False)

    # TSV STATEMENTS GENERATION
    for ci in g_set.get_ci():
        citing_uri: Optional[URIRef] = ci.get_citing_entity().res
        cited_uri: Optional[URIRef] = ci.get_cited_entity().res

        if citing_uri in citations_mapping and cited_uri in citations_mapping:
            citing_qid: str = citations_mapping[citing_uri]
            cited_qid: str = citations_mapping[cited_uri]
            statements.append('\t'.join(
                [citing_qid, "P2860", cited_qid, "S248", "Q328"]))

    # TSV STATEMENTS EXPORT
    if len(statements) > 0:
        store_batch(statements)
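
A hedged driver sketch, assuming the OCDM-to-Wikidata mapping is available as a two-column TSV file (OCDM IRI, Wikidata QID) and that the chunk files live in a flat directory; the file and directory names below are placeholders.

import csv
import glob
from typing import Dict

from rdflib import URIRef


def load_citations_mapping(tsv_path: str) -> Dict[URIRef, str]:
    # Each row is expected to hold an OCDM IRI and the corresponding Wikidata QID
    mapping: Dict[URIRef, str] = {}
    with open(tsv_path, newline='', encoding='utf-8') as tsv_file:
        for ocdm_iri, wikidata_qid in csv.reader(tsv_file, delimiter='\t'):
            mapping[URIRef(ocdm_iri)] = wikidata_qid
    return mapping


if __name__ == '__main__':
    citations_mapping = load_citations_mapping('ocdm_to_wikidata.tsv')  # placeholder path
    for chunk_filepath in sorted(glob.glob('rdf_chunks/*.nt')):         # placeholder directory
        process_chunk(chunk_filepath, citations_mapping)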
Example 6
def process(cur_citations_file: str, conversion_dict: Dict[str, str]) -> None:
    """
    This function takes care of generating an OCDM compliant RDF file containing
    the Citation entities that describe the relations between citing Wikipedia pages
    and cited bibliographic resources.

    Additionally, a CSV file compliant with other OpenCitations tools is produced. Since it
    is not strictly needed for the 'Wikipedia Citations in Wikidata' workflow, it can be
    safely ignored.

    Please note: the bool flag 'rdf_output_in_chunks' from conf/conf_citations.py MUST be set to True,
    otherwise the following scripts of the workflow (Enricher and Pusher) won't be able to import
    the intermediate RDF files produced by this script.

    :param cur_citations_file: The filename (without the path) of the CSV file to be converted
    :param conversion_dict: The dictionary that maps 'tmp' identifiers onto their respective 'meta' identifiers
    """
    filepath: str = os.path.join(citations_csv_dir, cur_citations_file)
    df: pd.DataFrame = pd.read_csv(filepath,
                                   usecols=['citing', 'cited'],
                                   low_memory=False)

    # 'tmp-to-meta' mapping is applied to each column of the DataFrame
    tmp_to_meta_mapping(df['citing'], conversion_dict)
    tmp_to_meta_mapping(df['cited'], conversion_dict)

    # Rows containing None values are dropped: we cannot generate valid Citation entities for them
    df = df.dropna(axis=0, how='any', subset=['citing', 'cited'])
    df = df.reset_index(drop=True)

    # The DataFrame is enriched with additional columns that are needed for achieving full
    # compliance with other OpenCitations tools. This is not strictly needed for our workflow:
    df['id'] = None
    df['oci'] = None
    df['creation'] = None  # not applicable (the citation comes from a Wikipedia page)
    df['timespan'] = None  # not applicable (the citation comes from a Wikipedia page)
    df['journal_sc'] = 'no'
    df['author_sc'] = 'no'

    # A temporary GraphSet is used to instantiate BibliographicResource entities that are needed
    # for the creation of Citation entities but that won't be kept in the output RDF file:
    temp_gs: GraphSet = GraphSet(base_iri)

    # The actual GraphSet that will contain the Citation entities to be stored in the output RDF file:
    ci_gs: GraphSet = GraphSet(base_iri,
                               info_dir=info_dir,
                               supplier_prefix=supplier_prefix,
                               wanted_label=False)

    # Here the DataFrame columns are converted into Numpy arrays
    # so that we can iterate way faster over their elements:
    citing_col = df['citing'].to_numpy(copy=False)
    cited_col = df['cited'].to_numpy(copy=False)
    id_col = df['id'].to_numpy(copy=False)
    oci_col = df['oci'].to_numpy(copy=False)

    for i, (citing_meta_id,
            cited_meta_id) in enumerate(zip(citing_col, cited_col)):
        citing_res: URIRef = URIRef(base_iri + citing_meta_id)
        cited_res: URIRef = URIRef(base_iri + cited_meta_id)

        # A query is performed to discover if the current citation has already been processed:
        query_string: str = f'''
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX cito: <http://purl.org/spar/cito/>
        PREFIX datacite: <http://purl.org/spar/datacite/>
        PREFIX literal: <http://www.essepuntato.it/2010/06/literalreification/>

        SELECT ?ci_res ?oci
        FROM <https://w3id.org/oc/meta/ci/>
        WHERE {{
            ?ci_res rdf:type cito:Citation ;
                    cito:hasCitingEntity <{citing_res}> ;
                    cito:hasCitedEntity <{cited_res}> .
            OPTIONAL {{
                ?ci_res datacite:hasIdentifier ?id .
                ?id datacite:usesIdentifierScheme datacite:oci ;
                    literal:hasLiteralValue ?oci .
            }}
        }}
        LIMIT 1
        '''
        tp: SPARQLWrapper = SPARQLWrapper(triplestore_url)
        tp.setTimeout(query_timeout)
        tp.setMethod('GET')
        tp.setQuery(query_string)
        tp.setReturnFormat(JSON)
        results = tp.queryAndConvert()
        bindings = results["results"]["bindings"]

        # 'LIMIT 1' in the query string should guarantee at most one returned binding
        if len(bindings) >= 1:
            # This citation is already stored in the triplestore!
            row: Dict = bindings[0]

            # Update the output dataframe
            ci_res: URIRef = URIRef(bindings[0]["ci_res"]["value"])
            id_col[i] = str(ci_res)[len(base_iri):]

            if "oci" in row:
                oci_col[i] = row["oci"]["value"]
        else:
            # This citation is currently missing from the triplestore!

            # Create BR entities in "append mode" by providing 'res' without 'preexisting_graph'
            citing_br: BibliographicResource = temp_gs.add_br(
                resp_agent, res=citing_res, preexisting_graph=None)
            cited_br: BibliographicResource = temp_gs.add_br(
                resp_agent, res=cited_res, preexisting_graph=None)

            # Create OCI identifier
            oci_str: str = str(citing_res)[len(base_iri + 'br/'):] + '-' + str(
                cited_res)[len(base_iri + 'br/'):]
            oci: Identifier = ci_gs.add_id(resp_agent)
            oci.create_oci(oci_str)

            # Create citation
            ci: Citation = ci_gs.add_ci(resp_agent)
            ci.has_identifier(oci)
            ci.has_citing_entity(citing_br)
            ci.has_cited_entity(cited_br)

            # Update the output dataframe
            id_col[i] = str(ci.res)[len(base_iri):]
            oci_col[i] = oci_str

    # Store the dataframe as a CSV file that's compliant with OpenCitations tools:
    output_filepath: str = os.path.join(converter_citations_csv_output_dir,
                                        cur_citations_file)
    df.to_csv(output_filepath,
              index=False,
              chunksize=100000,
              columns=[
                  'id', 'oci', 'citing', 'cited', 'creation', 'timespan',
                  'journal_sc', 'author_sc'
              ])

    # Store new citations in an RDF file (together with the related provenance).
    # They are also uploaded to the triplestore so as to update the current state
    # of execution: this way, they won't be created again, since they will already
    # be present inside the triplestore.
    ci_ps: ProvSet = ProvSet(ci_gs, base_iri)
    ci_ps.generate_provenance()

    ci_storer: Storer = Storer(ci_gs,
                               dir_split=dir_split_number,
                               n_file_item=items_per_file,
                               default_dir=default_dir,
                               output_format='nt11')
    ci_prov_storer: Storer = Storer(ci_ps,
                                    dir_split=dir_split_number,
                                    n_file_item=items_per_file,
                                    default_dir=default_dir,
                                    output_format='nquads')

    if rdf_output_in_chunks:
        # The RDF files are stored WITHOUT following the folder structure
        # adopted by OpenCitations: all the newly created citations are kept
        # in a single file.
        # The following scripts of the workflow assume that data was produced in chunks:
        # this is therefore the modality to be chosen, i.e. 'rdf_output_in_chunks' must
        # be set to True.
        filename_without_csv: str = cur_citations_file[:-4]

        # Data
        f: str = os.path.join(converter_citations_rdf_output_dir,
                              filename_without_csv + ".nt")
        if not os.path.exists(os.path.dirname(f)):
            os.makedirs(os.path.dirname(f))
        ci_storer.store_graphs_in_file(f, context_path)
        ci_storer.upload_all(triplestore_url,
                             converter_citations_rdf_output_dir,
                             batch_size=100)

        # Provenance
        prov_dir: str = os.path.join(converter_citations_rdf_output_dir,
                                     'prov')
        f_prov: str = os.path.join(prov_dir, filename_without_csv + '.nq')
        if not os.path.exists(os.path.dirname(f_prov)):
            os.makedirs(os.path.dirname(f_prov))
        ci_prov_storer.store_graphs_in_file(f_prov, context_path)
    else:
        # The RDF files are stored following the folder structure adopted by OpenCitations.
        # Newly created citations could be split into different files based on
        # various conditions.
        # In the following steps of the workflow, every script assumes that data was produced in chunks:
        # this means that this modality should never be chosen and that 'rdf_output_in_chunks' must
        # be set to True.
        ci_storer.upload_and_store(converter_citations_rdf_output_dir,
                                   triplestore_url,
                                   base_iri,
                                   context_path,
                                   batch_size=100)

        ci_prov_storer.store_all(converter_citations_rdf_output_dir, base_iri,
                                 context_path)
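
A hedged driver sketch for process(), assuming the 'tmp-to-meta' mapping is stored as a two-column CSV file and reusing the module-level citations_csv_dir setting; the mapping file name is a placeholder.

import csv
import os
from typing import Dict


def load_conversion_dict(csv_path: str) -> Dict[str, str]:
    # Each row is expected to map a 'tmp' identifier to its 'meta' identifier
    conversion: Dict[str, str] = {}
    with open(csv_path, newline='', encoding='utf-8') as csv_file:
        for tmp_id, meta_id in csv.reader(csv_file):
            conversion[tmp_id] = meta_id
    return conversion


if __name__ == '__main__':
    conversion_dict = load_conversion_dict('tmp_to_meta.csv')  # placeholder path
    for citations_file in sorted(os.listdir(citations_csv_dir)):
        if citations_file.endswith('.csv'):
            process(citations_file, conversion_dict)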