def __load_citations_from_csv_file(data_f_path, prov_f_path, baseurl, service_name,
                                   id_type, id_shape, citation_type):
    citation_data = OrderedDict()
    with open(data_f_path, encoding="utf8") as f:
        for row in DictReader(f):
            citation_data[row["oci"]] = row

    citation_prov = OrderedDict()
    with open(prov_f_path, encoding="utf8") as f:
        for row in DictReader(f):
            citation_prov[row["oci"]] = row

    for oci in citation_data:
        dent = citation_data[oci]
        pent = citation_prov[oci]

        c = Citation(oci,
                     baseurl + quote(dent["citing"]), None,
                     baseurl + quote(dent["cited"]), None,
                     dent["creation"], dent["timespan"],
                     int(pent["snapshot"]), pent["agent"], pent["source"], pent["created"],
                     service_name, id_type, id_shape, citation_type,
                     dent["journal_sc"] == "yes", dent["author_sc"] == "yes",
                     pent["invalidated"], pent["description"], pent["update"])
        yield c
def get_next_citation_data(self):
    row = self._get_next_in_file()

    while row is not None:
        citing = self.doi.normalise(row.get("citing_id"))
        cited = self.doi.normalise(row.get("cited_id"))

        if citing is not None and cited is not None:
            created = row.get("citing_publication_date")
            if not created:
                created = None

            cited_pub_date = row.get("cited_publication_date")
            if not cited_pub_date:
                timespan = None
            else:
                c = Citation(None, None, created, None, cited_pub_date, None, None,
                             None, None, "", None, None, None, None, None)
                timespan = c.duration

            self.update_status_file()
            return citing, cited, created, timespan, None, None

        self.update_status_file()
        row = self._get_next_in_file()

    remove(self.status_file)
def __load_citations_from_slx_file(data_f_path, oci, service_name, id_type, id_shape,
                                   citation_type, agent, source):
    with open(data_f_path, encoding="utf8") as f:
        citation_data = load(f)
        for obj in citation_data:
            c = Citation(oci,
                         obj["Source"]["Identifier"]["IDURL"],
                         obj["Source"].get("PublicationDate"),
                         obj["Target"]["Identifier"]["IDURL"],
                         obj["Target"].get("PublicationDate"),
                         None, None,
                         1, agent, source, obj["LinkPublicationDate"],
                         service_name, id_type, id_shape, citation_type,
                         False, False, None, None, None)
            yield c
def __create_citation(self, citing, cited, created, timespan, journal_sc, author_sc):
    return Citation(self.oci.get_oci(citing, cited, "020"),
                    "http://dx.doi.org/" + quote(citing), None,
                    "http://dx.doi.org/" + quote(cited), None,
                    created, timespan,
                    1, "https://w3id.org/oc/index/prov/ra/1",
                    "https://api.crossref.org/works/" + quote(citing), "2018-01-01T00:00:00",
                    "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
                    None, journal_sc, author_sc,
                    prov_description="Creation of the citation")
def process(input_dir, output_dir):
    if not exists(output_dir):
        makedirs(output_dir)

    citing_doi_with_no_date = set()
    valid_doi = CSVManager(output_dir + sep + "valid_doi.csv")
    id_date = CSVManager(output_dir + sep + "id_date.csv")
    id_issn = CSVManager(output_dir + sep + "id_issn.csv")
    id_orcid = CSVManager(output_dir + sep + "id_orcid.csv")

    doi_manager = DOIManager(valid_doi)
    issn_manager = ISSNManager()
    orcid_manager = ORCIDManager()

    all_files, opener = get_all_files(input_dir)
    len_all_files = len(all_files)

    # Read all the JSON files in the Crossref dump to create the main information of all the indexes
    print("\n\n# Add valid DOIs from Crossref metadata")
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            try:
                data = load(f)
            # When using a tar.gz file or a zip file, a stream of bytes is returned by the
            # opener. Thus, it must be converted into a utf-8 string before loading it into a JSON.
            except TypeError:
                utf8reader = codecs.getreader("utf-8")
                data = load(utf8reader(f))

            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj:
                        citing_doi = doi_manager.normalise(obj["DOI"], True)
                        doi_manager.set_valid(citing_doi)

                        if id_date.get_value(citing_doi) is None:
                            citing_date = Citation.check_date(build_pubdate(obj))
                            if citing_date is not None:
                                id_date.add_value(citing_doi, citing_date)
                                if citing_doi in citing_doi_with_no_date:
                                    citing_doi_with_no_date.remove(citing_doi)
                            else:
                                citing_doi_with_no_date.add(citing_doi)

                        if id_issn.get_value(citing_doi) is None:
                            if "type" in obj:
                                cur_type = obj["type"]
                                if cur_type is not None and "journal" in cur_type and "ISSN" in obj:
                                    cur_issn = obj["ISSN"]
                                    if cur_issn is not None:
                                        for issn in [issn_manager.normalise(issn) for issn in cur_issn]:
                                            if issn is not None:
                                                id_issn.add_value(citing_doi, issn)

                        if id_orcid.get_value(citing_doi) is None:
                            if "author" in obj:
                                cur_author = obj['author']
                                if cur_author is not None:
                                    for author in cur_author:
                                        if "ORCID" in author:
                                            orcid = orcid_manager.normalise(author["ORCID"])
                                            if orcid is not None:
                                                id_orcid.add_value(citing_doi, orcid)

    # Do it again for updating the dates of the cited DOIs, if these are valid
    print("\n\n# Check cited DOIs from Crossref reference field")
    doi_date = {}
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            data = load(f)
            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj and "reference" in obj:
                        for ref in obj['reference']:
                            if "DOI" in ref:
                                cited_doi = doi_manager.normalise(ref["DOI"], True)
                                if doi_manager.is_valid(cited_doi) and id_date.get_value(cited_doi) is None:
                                    if cited_doi not in doi_date:
                                        doi_date[cited_doi] = []
                                    cited_date = Citation.check_date(build_pubdate(ref))
                                    if cited_date is not None:
                                        doi_date[cited_doi].append(cited_date)
                                        if cited_doi in citing_doi_with_no_date:
                                            citing_doi_with_no_date.remove(cited_doi)

    # Add the date to the DOI if such date is the most adopted one in the various references.
    # In case two distinct dates are used the most, select the older one.
    for doi in doi_date:
        count = Counter(doi_date[doi])
        if len(count):
            top_value = count.most_common(1)[0][1]
            selected_dates = []
            for date in count:
                if count[date] == top_value:
                    selected_dates.append(date)
            best_date = sorted(selected_dates)[0]
            id_date.add_value(doi, best_date)
        else:
            id_date.add_value(doi, "")

    # Add empty dates for the remaining DOIs
    for doi in citing_doi_with_no_date:
        id_date.add_value(doi, "")
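# Standalone sketch of the date-selection rule applied at the end of process() above:
# keep the most frequent publication date reported across references and, on a tie,
# prefer the oldest one (ISO 8601 strings sort chronologically). `pick_best_date` is
# an illustrative helper written for this sketch, not a function of the repository.
from collections import Counter


def pick_best_date(dates):
    """Return the most common date; break ties by choosing the older date."""
    count = Counter(dates)
    if not count:
        return ""
    top_value = count.most_common(1)[0][1]
    selected_dates = [date for date, n in count.items() if n == top_value]
    return sorted(selected_dates)[0]


assert pick_best_date(["2011", "2011", "2010"]) == "2011"   # clear majority wins
assert pick_best_date(["2011", "2010"]) == "2010"           # tie: the older date wins
assert pick_best_date([]) == ""                             # no dates: empty value stored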
def __load_citations_from_rdf_file(data_f_path, prov_f_path, service_name, id_type,
                                   id_shape, citation_type):
    citation_data = Graph()
    citation_data.load(data_f_path, format="nt11")

    citation_prov = ConjunctiveGraph()
    citation_prov.load(prov_f_path, format="nquads")

    for cit_ent in citation_data.subjects(RDF.type, Citation.citation):
        prov_entity = None
        snapshot = 0

        for entity in citation_prov.subjects(Citation.specialization_of, cit_ent):
            entity_snapshot = int(sub("^.+/se/(.+)$", "\\1", entity))
            if prov_entity is None or snapshot < entity_snapshot:
                prov_entity = entity
                snapshot = entity_snapshot

        invalidated = None
        update = None
        creation_date = None
        timespan = None
        for en in citation_prov.objects(prov_entity, Citation.invalidated_at_time):
            invalidated = str(en)
        for en in citation_prov.objects(prov_entity, Citation.has_update_query):
            update = str(en)
        for en in citation_data.objects(cit_ent, Citation.has_citation_creation_date):
            creation_date = str(en)
        for en in citation_data.objects(cit_ent, Citation.has_citation_time_span):
            timespan = str(en)

        c = Citation(
            sub("^.+/ci/(.+)$", "\\1", str(cit_ent)),
            str(list(citation_data.objects(cit_ent, Citation.has_citing_entity))[0]), None,
            str(list(citation_data.objects(cit_ent, Citation.has_cited_entity))[0]), None,
            creation_date, timespan,
            entity_snapshot,
            str(list(citation_prov.objects(prov_entity, Citation.was_attributed_to))[0]),
            str(list(citation_prov.objects(prov_entity, Citation.had_primary_source))[0]),
            str(list(citation_prov.objects(prov_entity, Citation.generated_at_time))[0]),
            service_name, id_type, id_shape, citation_type,
            Citation.journal_self_citation in citation_data.objects(cit_ent, RDF.type),
            Citation.author_self_citation in citation_data.objects(cit_ent, RDF.type),
            invalidated,
            str(list(citation_prov.objects(prov_entity, Citation.description))[0]),
            update)
        yield c
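# Minimal sketch of the snapshot-number extraction used above: the function keeps the
# latest provenance snapshot by parsing the trailing "/se/N" segment of each snapshot
# IRI with the same regular expression. The IRI below is illustrative, not taken from
# the repository's data.
from re import sub

example_snapshot_iri = "https://w3id.org/oc/index/coci/ci/0200-0200/prov/se/3"
assert int(sub("^.+/se/(.+)$", "\\1", example_snapshot_iri)) == 3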
def __store_rdf_on_file(f_path, rdf_obj, format="nt"):
    with open(f_path, "a", encoding="utf8") as f:
        rdf_string = Citation.format_rdf(rdf_obj, format)
        f.write(rdf_string)
class CitationTest(unittest.TestCase):
    """This class aims at testing the methods of the class Citation."""

    def setUp(self):
        self.citation_data_csv_path = "index%stest_data%scitations_data.csv" % (sep, sep)
        self.citation_prov_csv_path = "index%stest_data%scitations_prov.csv" % (sep, sep)
        self.citation_data_ttl_path = "index%stest_data%scitations_data.ttl" % (sep, sep)
        self.citation_prov_ttl_path = "index%stest_data%scitations_prov.ttl" % (sep, sep)
        self.citation_data_prov_scholix_path = "index%stest_data%scitations_data_prov.scholix" % (sep, sep)
        self.base_url = "https://w3id.org/oc/index/coci/"

        self.citation_1 = Citation(
            "02001000308362819371213133704040001020809-020010009063615193700006300030306151914",
            "http://dx.doi.org/10.1038/sj.cdd.4401289", "2003-10-24",
            "http://dx.doi.org/10.1096/fj.00-0336fje", "2001-01",
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1038/sj.cdd.4401289", "2018-11-01T09:14:03",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")

        self.citation_2 = Citation(
            "02001000002361927283705040000-02001000002361927283705030002",
            "http://dx.doi.org/10.1002/jrs.5400", "2018-06",
            "http://dx.doi.org/10.1002/jrs.5302", "2017-12-05",
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1002/jrs.5400", "2018-11-01T14:51:52",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=True, author_sc=True,
            prov_description="Creation of the citation")

        self.citation_3 = Citation(
            "02001000002361927283705040000-020010003093612062710020603000720",
            "http://dx.doi.org/10.1002/jrs.5400", "2018-06",
            "http://dx.doi.org/10.1039/c6ra26307k", "2017",
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1002/jrs.5400", "2018-11-01T14:51:52",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=True,
            prov_description="Creation of the citation")

        self.citation_4 = Citation(
            "02001000308362819371213133704040001020804-02001000308362819371213133704040000030707",
            "http://dx.doi.org/10.1038/sj.cdd.4401284", "2003-08-22",
            "http://dx.doi.org/10.1038/sj.cdd.4400377", "1998-05",
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1038/sj.cdd.4401284", "2018-11-01T09:14:03",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=True, author_sc=False,
            prov_description="Creation of the citation")

        self.citation_5 = Citation(
            "020010000023625242110370100030001-02001010009361222251430273701090809370903040403",
            "http://dx.doi.org/10.1002/pola.10301", "2002-06-21",
            "http://dx.doi.org/10.1109/cmpeur.1989.93443", None,
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1002/pola.10301", "2018-10-31T16:13:26",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")

        self.citation_6 = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29", None,
            "http://dx.doi.org/10.1134/s1028334x09040217", None,
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29", "2018-10-31T16:17:07",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")

        # Hack for correct handling of date datatypes
        if XSD.gYear in _toPythonMapping:
            _toPythonMapping.pop(XSD.gYear)
        if XSD.gYearMonth in _toPythonMapping:
            _toPythonMapping.pop(XSD.gYearMonth)

    def test_inferred_leap_year_dates(self):
        cit = Citation(
            None,
            "http://dx.doi.org/10.1002/1097-0142%2820010815%2992%3A4%3C796%3A%3Aaid-cncr1385%3E3.0.co%3B2-3",
            "2001",
            "http://dx.doi.org/10.1002/%28sici%291097-0258%2819960229%2915%3A4%3C361%3A%3Aaid-sim168%3E3.0.co%3B2-4",
            "1996-02-29",
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1002/1097-0142%2820010815%2992%3A4%3C796%3A%3Aaid-cncr1385%3E3.0.co%3B2-3",
            "2018-10-31T16:17:07",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")
        self.assertEqual(cit.duration, "P5Y")

    def test_invalid_date_for_citation(self):
        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29", "0000",
            "http://dx.doi.org/10.1134/s1028334x09040217", None,
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29", "2018-10-31T16:17:07",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNone(cit.citing_pub_date)
        self.assertIsNone(cit.creation_date)
        self.assertIsNone(cit.cited_pub_date)
        self.assertIsNone(cit.duration)

        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29", "2019",
            "http://dx.doi.org/10.1134/s1028334x09040217", "0000",
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29", "2018-10-31T16:17:07",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNotNone(cit.citing_pub_date)
        self.assertIsNotNone(cit.creation_date)
        self.assertIsNone(cit.cited_pub_date)
        self.assertIsNone(cit.duration)

        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29", None,
            "http://dx.doi.org/10.1134/s1028334x09040217", "2011",
            None, None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29", "2018-10-31T16:17:07",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNone(cit.citing_pub_date)
        self.assertIsNone(cit.creation_date)
        self.assertIsNotNone(cit.cited_pub_date)
        self.assertIsNone(cit.duration)

        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29", None,
            "http://dx.doi.org/10.1134/s1028334x09040217", "2011",
            "2019", None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29", "2018-10-31T16:17:07",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNotNone(cit.citing_pub_date)
        self.assertIsNotNone(cit.creation_date)
        self.assertIsNotNone(cit.cited_pub_date)
        self.assertIsNotNone(cit.duration)

        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29", None,
            "http://dx.doi.org/10.1134/s1028334x09040217", "2011",
            "2019-02-29", None,
            1, "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29", "2018-10-31T16:17:07",
            "OpenCitations Index: COCI", "doi", "http://dx.doi.org/([[XXX__decode]])",
            None, journal_sc=False, author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNone(cit.citing_pub_date)
        self.assertIsNone(cit.creation_date)
        self.assertIsNotNone(cit.cited_pub_date)
        self.assertIsNone(cit.duration)

    def test_citation_data_csv(self):
        citation_data_csv = None
        with open(self.citation_data_csv_path) as f:
            citation_data_csv = list(DictReader(f))

        self.assertEqual(
            list(DictReader(StringIO(self.citation_1.get_citation_csv())))[0],
            citation_data_csv[0])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_2.get_citation_csv())))[0],
            citation_data_csv[1])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_3.get_citation_csv())))[0],
            citation_data_csv[2])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_4.get_citation_csv())))[0],
            citation_data_csv[3])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_5.get_citation_csv())))[0],
            citation_data_csv[4])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_6.get_citation_csv())))[0],
            citation_data_csv[5])

    def test_citation_prov_csv(self):
        citation_prov_csv = None
        with open(self.citation_prov_csv_path) as f:
            citation_prov_csv = list(DictReader(f))

        self.assertEqual(
            list(DictReader(StringIO(self.citation_1.get_citation_prov_csv())))[0],
            citation_prov_csv[0])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_2.get_citation_prov_csv())))[0],
            citation_prov_csv[1])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_3.get_citation_prov_csv())))[0],
            citation_prov_csv[2])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_4.get_citation_prov_csv())))[0],
            citation_prov_csv[3])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_5.get_citation_prov_csv())))[0],
            citation_prov_csv[4])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_6.get_citation_prov_csv())))[0],
            citation_prov_csv[5])

    def test_citation_data_ttl(self):
        g1 = ConjunctiveGraph()
        g1.load(self.citation_data_ttl_path, format="nt11")

        g2 = ConjunctiveGraph()
        for c in [self.citation_1, self.citation_2, self.citation_3,
                  self.citation_4, self.citation_5, self.citation_6]:
            for s, p, o in c.get_citation_rdf(self.base_url, False, False, False):
                g2.add((s, p, o))

        self.assertTrue(isomorphic(g1, g2))

    def test_citation_prov_ttl(self):
        g1 = ConjunctiveGraph()
        g1.load(self.citation_prov_ttl_path, format="nquads")

        g2 = ConjunctiveGraph()
        for c in [self.citation_1, self.citation_2, self.citation_3,
                  self.citation_4, self.citation_5, self.citation_6]:
            for s, p, o, g in c.get_citation_prov_rdf(self.base_url).quads((None, None, None, None)):
                g2.add((s, p, o, g))

        self.assertTrue(isomorphic(g1, g2))

    def test_citation_data_prov_scholix(self):
        citation_data_prov_scholix = None
        with open(self.citation_data_prov_scholix_path) as f:
            citation_data_prov_scholix = load(f)

        self.assertEqual(loads(self.citation_1.get_citation_scholix()),
                         citation_data_prov_scholix[0])
        self.assertEqual(loads(self.citation_2.get_citation_scholix()),
                         citation_data_prov_scholix[1])
        self.assertEqual(loads(self.citation_3.get_citation_scholix()),
                         citation_data_prov_scholix[2])
        self.assertEqual(loads(self.citation_4.get_citation_scholix()),
                         citation_data_prov_scholix[3])
        self.assertEqual(loads(self.citation_5.get_citation_scholix()),
                         citation_data_prov_scholix[4])
        self.assertEqual(loads(self.citation_6.get_citation_scholix()),
                         citation_data_prov_scholix[5])

    def test_lookup(self):
        doi_1 = "10.1038/sj.cdd.4401289"
        doi_2 = "10.1096/fj.00-0336fje"
        doi_3 = "10.1002/jrs.5400"
        doi_4 = "10.1039/c6ra26307k"
        doi_5 = "10.1234/456789qwertyuiopasdfghjklzxcvbnmè+òàù,.-åß∂ƒ∞∆ªº¬∑≤†©√∫˜≥»”’¢‰"
        doi_6 = "10.1234/!\"£$%&/()=?^é*ç°§;:_<>«“‘¥~‹÷´`ˆ[]@#¶…•–„Ω€®™æ¨øπ"

        # Test conversion without any file
        oci_man = OCIManager()
        oci = oci_man.get_oci(doi_1, doi_2, "020")
        self.assertEqual(
            doi_1.replace("10.", "", 1),
            "".join([oci_man.lookup[code] for code in findall(
                "(9*[0-8][0-9])",
                oci.replace("oci:", "").split("-")[0].replace("020", "", 1))]))
        self.assertEqual(
            doi_2.replace("10.", "", 1),
            "".join([oci_man.lookup[code] for code in findall(
                "(9*[0-8][0-9])",
                oci.replace("oci:", "").split("-")[1].replace("020", "", 1))]))
        self.assertEqual(len(oci_man.lookup.keys()), len(set(doi_1 + doi_2)))

        # Test conversion with full file
        oci_12 = "oci:02001000308362819371213133704040001020809-020010009063615193700006300030306151914"
        oci_man = OCIManager(lookup_file="index%stest_data%slookup_full.csv" % (sep, sep))
        self.assertEqual(oci_man.get_oci(doi_1, doi_2, "020"), oci_12)

        # Test conversion with new file
        new_file_path = "index%stest_data%slookup_new.csv" % (sep, sep)
        if exists(new_file_path):
            remove(new_file_path)
        oci_man = OCIManager(lookup_file=new_file_path)
        oci = oci_man.get_oci(doi_1, doi_2, "020")
        self.assertEqual(
            doi_1.replace("10.", "", 1),
            "".join([oci_man.lookup[code] for code in findall(
                "(9*[0-8][0-9])",
                oci.replace("oci:", "").split("-")[0].replace("020", "", 1))]))
        self.assertEqual(
            doi_2.replace("10.", "", 1),
            "".join([oci_man.lookup[code] for code in findall(
                "(9*[0-8][0-9])",
                oci.replace("oci:", "").split("-")[1].replace("020", "", 1))]))
        self.assertEqual(len(oci_man.lookup.keys()), len(set(doi_1 + doi_2)))

        # Test conversion with incomplete file (ver 1, existing DOIs)
        oci_man = OCIManager(lookup_file=new_file_path)
        oci = oci_man.get_oci(doi_3, doi_4, "020")
        self.assertEqual(
            doi_3.replace("10.", "", 1),
            "".join([oci_man.lookup[code] for code in findall(
                "(9*[0-8][0-9])",
                oci.replace("oci:", "").split("-")[0].replace("020", "", 1))]))
        self.assertEqual(
            doi_4.replace("10.", "", 1),
            "".join([oci_man.lookup[code] for code in findall(
                "(9*[0-8][0-9])",
                oci.replace("oci:", "").split("-")[1].replace("020", "", 1))]))
        self.assertEqual(len(oci_man.lookup.keys()),
                         len(set(doi_1 + doi_2 + doi_3 + doi_4)))

        # Test conversion with incomplete file (ver 2, non-existing DOIs)
        oci_man = OCIManager(lookup_file=new_file_path)
        oci = oci_man.get_oci(doi_5, doi_6, "020")
        self.assertEqual(
            doi_5.replace("10.", "", 1),
            "".join([oci_man.lookup[code] for code in findall(
                "(9*[0-8][0-9])",
                oci.replace("oci:", "").split("-")[0].replace("020", "", 1))]))
        self.assertEqual(
            doi_6.replace("10.", "", 1),
            "".join([oci_man.lookup[code] for code in findall(
                "(9*[0-8][0-9])",
                oci.replace("oci:", "").split("-")[1].replace("020", "", 1))]))
        self.assertEqual(
            len(oci_man.lookup.keys()),
            len(set(doi_1 + doi_2 + doi_3 + doi_4 + doi_5 + doi_6)))
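# Illustrative helper mirroring the decoding assertions in test_lookup above: each
# half of an OCI (the two parts joined by "-") is a sequence of two-digit codes,
# optionally padded with leading "9"s, that maps back to the DOI suffix through the
# OCIManager lookup table. `decode_oci_half` is a hypothetical name introduced for
# this sketch, not a function of the repository; `lookup` is expected to be the
# dictionary exposed as OCIManager().lookup.
from re import findall


def decode_oci_half(oci_half, lookup, prefix="020"):
    """Strip the identifier prefix and map each code back to a DOI character."""
    return "".join(lookup[code] for code in
                   findall("(9*[0-8][0-9])", oci_half.replace(prefix, "", 1)))


# Usage sketch, following the same steps as test_lookup:
# oci_man = OCIManager()
# citing_half, cited_half = oci_man.get_oci(doi_1, doi_2, "020").replace("oci:", "").split("-")
# assert "10." + decode_oci_half(citing_half, oci_man.lookup) == doi_1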
def extract_citations(idbaseurl, baseurl, python, pclass, input, lookup, data, prefix,
                      agent, source, service, verbose, doi_manager, rf_handler,
                      oci_to_do=None):
    BASE_URL = idbaseurl
    DATASET_URL = baseurl + "/" if not baseurl.endswith("/") else baseurl

    oci_manager = OCIManager(lookup_file=lookup)
    # TODO: we need to specify the dir carefully, e.g. by adding an additional flag to
    # distinguish between the files belonging to a particular process, and it should be
    # aligned with the storer.
    exi_ocis = CSVManager.load_csv_column_as_set(data + sep + "data", "oci")
    if oci_to_do is not None:
        oci_to_do.difference_update(exi_ocis)
    cit_storer = CitationStorer(data, DATASET_URL)

    citations_already_present = 0
    new_citations_added = 0
    error_in_dois_existence = 0

    cs = import_citation_source(python, pclass, input)

    next_citation = cs.get_next_citation_data()
    while next_citation is not None:
        citing, cited, created, timespan, journal_sc, author_sc = next_citation

        oci = oci_manager.get_oci(citing, cited, prefix)
        oci_noprefix = oci.replace("oci:", "")
        if oci_noprefix not in exi_ocis and (oci_to_do is None or oci_noprefix in oci_to_do):
            if doi_manager.is_valid(citing) and doi_manager.is_valid(cited):
                if created is None:
                    citing_date = rf_handler.get_date(citing)
                else:
                    citing_date = created
                cited_date = rf_handler.get_date(cited)
                if journal_sc is None or type(journal_sc) is not bool:
                    journal_sc = rf_handler.share_issn(citing, cited)
                if author_sc is None or type(author_sc) is not bool:
                    author_sc = rf_handler.share_orcid(citing, cited)

                if created is not None and timespan is not None:
                    cit = Citation(oci,
                                   BASE_URL + quote(citing), None,
                                   BASE_URL + quote(cited), None,
                                   created, timespan,
                                   1, agent, source,
                                   datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
                                   service, "doi", BASE_URL + "([[XXX__decode]])",
                                   "reference", journal_sc, author_sc,
                                   None, "Creation of the citation", None)
                else:
                    cit = Citation(oci,
                                   BASE_URL + quote(citing), citing_date,
                                   BASE_URL + quote(cited), cited_date,
                                   None, None,
                                   1, agent, source,
                                   datetime.now().strftime('%Y-%m-%dT%H:%M:%S'),
                                   service, "doi", BASE_URL + "([[XXX__decode]])",
                                   "reference", journal_sc, author_sc,
                                   None, "Creation of the citation", None)

                cit_storer.store_citation(cit)

                if verbose:
                    print("Create citation data for '%s' between DOI '%s' and DOI '%s'" %
                          (oci, citing, cited))
                new_citations_added += 1
                exi_ocis.add(oci_noprefix)
            else:
                if verbose:
                    print("WARNING: some DOIs, among '%s' and '%s', do not exist" %
                          (citing, cited))
                error_in_dois_existence += 1

            if oci_to_do is not None:
                oci_to_do.remove(oci_noprefix)
        else:
            if verbose:
                print("WARNING: the citation between DOI '%s' and DOI '%s' has been already processed" %
                      (citing, cited))
            citations_already_present += 1

        next_citation = cs.get_next_citation_data()

    return new_citations_added, citations_already_present, error_in_dois_existence
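# Hypothetical invocation sketch for extract_citations (kept as comments because every
# argument value below is an assumption for illustration only: the paths, the source
# module "index/crossref_source.py", the class name "CrossrefCitationSource", and the
# pre-built doi_manager and rf_handler objects are not taken from the repository).
#
# doi_manager = DOIManager(CSVManager("coci_data" + sep + "valid_doi.csv"))
# new_added, already_present, doi_errors = extract_citations(
#     "http://dx.doi.org/", "https://w3id.org/oc/index/coci/",
#     "index/crossref_source.py", "CrossrefCitationSource", "crossref_dump",
#     "lookup.csv", "coci_data", "020",
#     "https://w3id.org/oc/index/prov/ra/1", "https://api.crossref.org/works/",
#     "OpenCitations Index: COCI", True, doi_manager, rf_handler)
# print(new_added, already_present, doi_errors)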