Ejemplo n.º 1
0
    def __load_citations_from_csv_file(data_f_path, prov_f_path, baseurl,
                                       service_name, id_type, id_shape,
                                       citation_type):
        """Yield one Citation per row of a citation-data CSV file.

        Both CSV files are read in full and indexed by their "oci" column.
        Each citation is built from its data row plus the provenance row
        sharing the same OCI; a data row whose OCI is absent from the
        provenance file raises KeyError when that citation is reached.
        The "citing" and "cited" values are URL-quoted and appended to
        `baseurl`.
        """
        with open(data_f_path, encoding="utf8") as data_file:
            data_by_oci = OrderedDict(
                (record["oci"], record) for record in DictReader(data_file))
        with open(prov_f_path, encoding="utf8") as prov_file:
            prov_by_oci = OrderedDict(
                (record["oci"], record) for record in DictReader(prov_file))

        for oci, data in data_by_oci.items():
            prov = prov_by_oci[oci]
            citing_url = baseurl + quote(data["citing"])
            cited_url = baseurl + quote(data["cited"])

            yield Citation(
                oci,
                citing_url,
                None,
                cited_url,
                None,
                data["creation"],
                data["timespan"],
                int(prov["snapshot"]),
                prov["agent"],
                prov["source"],
                prov["created"],
                service_name,
                id_type,
                id_shape,
                citation_type,
                # Self-citation flags are stored as the literal string "yes".
                data["journal_sc"] == "yes",
                data["author_sc"] == "yes",
                prov["invalidated"],
                prov["description"],
                prov["update"])
    def get_next_citation_data(self):
        """Return the data of the next usable citation row, or None when done.

        Rows are pulled with `_get_next_in_file` until one is found whose
        "citing_id" and "cited_id" both normalise to a non-None DOI. For that
        row a 6-tuple `(citing, cited, created, timespan, None, None)` is
        returned, where `created` is the citing publication date (None when
        empty) and `timespan` is the duration computed by a throw-away
        Citation (None when the cited publication date is empty).

        The status file is updated after every row consumed; once the rows
        are exhausted the status file is removed and None is returned.
        """
        row = self._get_next_in_file()

        while row is not None:
            citing = self.doi.normalise(row.get("citing_id"))
            cited = self.doi.normalise(row.get("cited_id"))

            if citing is None or cited is None:
                # Unusable row: record progress and move on to the next one.
                self.update_status_file()
                row = self._get_next_in_file()
                continue

            created = row.get("citing_publication_date") or None

            timespan = None
            cited_pub_date = row.get("cited_publication_date")
            if cited_pub_date:
                # The Citation is built only to obtain the computed duration.
                timespan = Citation(
                    None, None, created, None, cited_pub_date,
                    None, None, None, None, "", None, None, None,
                    None, None).duration

            self.update_status_file()
            return citing, cited, created, timespan, None, None

        remove(self.status_file)
Ejemplo n.º 3
0
    def __load_citations_from_slx_file(data_f_path, oci, service_name, id_type,
                                       id_shape, citation_type, agent, source):
        """Yield one Citation per link object stored in a Scholix JSON file.

        The file is parsed as JSON and iterated as a sequence of objects with
        "Source", "Target" and "LinkPublicationDate" keys. Every Citation
        receives the same `oci`, snapshot number 1, and the given `agent` and
        `source`; the self-citation flags are always False.
        """
        with open(data_f_path, encoding="utf8") as scholix_file:
            links = load(scholix_file)

            # Kept inside the `with` block so the file stays open while the
            # generator is being consumed, as in the original implementation.
            for link in links:
                citing_url = link["Source"]["Identifier"]["IDURL"]
                citing_date = link["Source"].get("PublicationDate")
                cited_url = link["Target"]["Identifier"]["IDURL"]
                cited_date = link["Target"].get("PublicationDate")
                link_date = link["LinkPublicationDate"]

                yield Citation(
                    oci, citing_url, citing_date, cited_url, cited_date,
                    None, None, 1, agent, source, link_date,
                    service_name, id_type, id_shape, citation_type,
                    False, False, None, None, None)
Ejemplo n.º 4
0
 def __create_citation(self, citing, cited, created, timespan, journal_sc,
                       author_sc):
     """Build the COCI Citation that links the `citing` DOI to the `cited` DOI.

     The OCI is obtained from `self.oci.get_oci` with the "020" prefix; both
     DOIs are URL-quoted and turned into dx.doi.org URLs. The provenance
     fields (snapshot 1, agent, Crossref source URL, fixed creation time)
     are hard-coded.
     """
     doi_base = "http://dx.doi.org/"
     oci = self.oci.get_oci(citing, cited, "020")
     citing_url = doi_base + quote(citing)
     cited_url = doi_base + quote(cited)
     source_url = "https://api.crossref.org/works/" + quote(citing)

     return Citation(
         oci, citing_url, None, cited_url, None,
         created, timespan,
         1,
         "https://w3id.org/oc/index/prov/ra/1",
         source_url,
         "2018-01-01T00:00:00",
         "OpenCitations Index: COCI",
         "doi",
         "http://dx.doi.org/([[XXX__decode]])",
         None,
         journal_sc,
         author_sc,
         prov_description="Creation of the citation")
Ejemplo n.º 5
0
 def test_inferred_leap_year_dates(self):
     """A "2001" citing date against a 29 February 1996 cited date gives "P5Y"."""
     citing_doi = ("10.1002/1097-0142%2820010815%2992%3A4%3C796%3A%3A"
                   "aid-cncr1385%3E3.0.co%3B2-3")
     cited_doi = ("10.1002/%28sici%291097-0258%2819960229%2915%3A4%3C361%3A%3A"
                  "aid-sim168%3E3.0.co%3B2-4")

     positional = (
         None,
         "http://dx.doi.org/" + citing_doi,
         "2001",
         "http://dx.doi.org/" + cited_doi,
         "1996-02-29",
         None,
         None,
         1,
         "https://w3id.org/oc/index/prov/ra/1",
         "https://api.crossref.org/works/" + citing_doi,
         "2018-10-31T16:17:07",
         "OpenCitations Index: COCI",
         "doi",
         "http://dx.doi.org/([[XXX__decode]])",
         None,
     )
     cit = Citation(*positional,
                    journal_sc=False,
                    author_sc=False,
                    prov_description="Creation of the citation")

     self.assertEqual(cit.duration, "P5Y")
Ejemplo n.º 6
0
def _load_json(f):
    """Parse the JSON document in the open stream `f`.

    If the parser raises TypeError the stream is wrapped in a UTF-8 reader
    and parsed again.
    """
    try:
        return load(f)
    # When using a tar.gz or zip file the opener returns a stream of bytes.
    # Thus, it must be converted into a UTF-8 string before loading it as JSON.
    except TypeError:
        utf8reader = codecs.getreader("utf-8")
        return load(utf8reader(f))


def process(input_dir, output_dir):
    """Build the DOI support tables from the files found in `input_dir`.

    Four CSV-backed tables are created in `output_dir` (which is created when
    missing): "valid_doi.csv", "id_date.csv", "id_issn.csv" and
    "id_orcid.csv". The input files are scanned twice:

    1. every item with a "DOI" is marked as valid and, when not already
       known, its publication date, journal ISSNs and author ORCIDs are
       recorded;
    2. for every valid DOI found in a "reference" list that still has no
       date, the dates given by the references are collected.

    Each such cited DOI then receives the date used most often, the oldest
    one on ties, or an empty date when no reference supplied a usable one.
    DOIs seen as items that never obtained a date also get an empty date.
    """
    if not exists(output_dir):
        makedirs(output_dir)

    citing_doi_with_no_date = set()
    valid_doi = CSVManager(output_dir + sep + "valid_doi.csv")
    id_date = CSVManager(output_dir + sep + "id_date.csv")
    id_issn = CSVManager(output_dir + sep + "id_issn.csv")
    id_orcid = CSVManager(output_dir + sep + "id_orcid.csv")

    doi_manager = DOIManager(valid_doi)
    issn_manager = ISSNManager()
    orcid_manager = ORCIDManager()

    all_files, opener = get_all_files(input_dir)
    len_all_files = len(all_files)

    # Read all the JSON files in the Crossref dump to create the main
    # information of all the indexes
    print("\n\n# Add valid DOIs from Crossref metadata")
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            data = _load_json(f)

            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj:
                        citing_doi = doi_manager.normalise(obj["DOI"], True)
                        doi_manager.set_valid(citing_doi)

                        if id_date.get_value(citing_doi) is None:
                            citing_date = Citation.check_date(build_pubdate(obj))
                            if citing_date is not None:
                                id_date.add_value(citing_doi, citing_date)
                                citing_doi_with_no_date.discard(citing_doi)
                            else:
                                citing_doi_with_no_date.add(citing_doi)

                        if id_issn.get_value(citing_doi) is None:
                            if "type" in obj:
                                cur_type = obj["type"]
                                if cur_type is not None and "journal" in cur_type and "ISSN" in obj:
                                    cur_issn = obj["ISSN"]
                                    if cur_issn is not None:
                                        normalised = [issn_manager.normalise(raw_issn)
                                                      for raw_issn in cur_issn]
                                        for issn in normalised:
                                            if issn is not None:
                                                id_issn.add_value(citing_doi, issn)

                        if id_orcid.get_value(citing_doi) is None:
                            if "author" in obj:
                                cur_author = obj['author']
                                if cur_author is not None:
                                    for author in cur_author:
                                        if "ORCID" in author:
                                            orcid = orcid_manager.normalise(author["ORCID"])
                                            if orcid is not None:
                                                id_orcid.add_value(citing_doi, orcid)

    # Do it again for updating the dates of the cited DOIs, if these are valid
    print("\n\n# Check cited DOIs from Crossref reference field")
    doi_date = {}
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            # Same loader as the first pass, so byte streams are handled
            # identically in both passes.
            data = _load_json(f)
            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj and "reference" in obj:
                        for ref in obj['reference']:
                            if "DOI" in ref:
                                cited_doi = doi_manager.normalise(ref["DOI"], True)
                                if doi_manager.is_valid(cited_doi) and id_date.get_value(cited_doi) is None:
                                    # The entry is created even when no date is
                                    # found, so the DOI later gets an empty date.
                                    dates = doi_date.setdefault(cited_doi, [])
                                    cited_date = Citation.check_date(build_pubdate(ref))
                                    if cited_date is not None:
                                        dates.append(cited_date)
                                        citing_doi_with_no_date.discard(cited_doi)

    # Add the date to the DOI if such date is the most adopted one in the various references.
    # In case two distinct dates are used the most, select the older one.
    for doi, dates in doi_date.items():
        count = Counter(dates)
        if count:
            top_value = count.most_common(1)[0][1]
            best_date = min(date for date, occurrences in count.items()
                            if occurrences == top_value)
            id_date.add_value(doi, best_date)
        else:
            id_date.add_value(doi, "")

    # Add empty dates for the remaining DOIs
    for doi in citing_doi_with_no_date:
        id_date.add_value(doi, "")
Ejemplo n.º 7
0
    def __load_citations_from_rdf_file(data_f_path, prov_f_path, service_name,
                                       id_type, id_shape, citation_type):
        """Yield a Citation for every citation entity found in an RDF data file.

        Citation data is loaded from `data_f_path` (format "nt11") and
        provenance from `prov_f_path` (format "nquads"). For each citation,
        the provenance snapshot with the highest number in its ".../se/<n>"
        URL is selected, and its number and statements fill the provenance
        fields of the Citation.

        Raises:
            ValueError: if a citation has no provenance snapshot.
            IndexError: if a mandatory statement (citing entity, cited entity,
                agent, primary source, generation time or description) is
                missing.
        """
        def first_object(graph, subject, predicate):
            # Mandatory property: an empty result raises IndexError.
            return str(list(graph.objects(subject, predicate))[0])

        def last_object(graph, subject, predicate):
            # Optional property: None when absent, else the last value iterated.
            value = None
            for node in graph.objects(subject, predicate):
                value = str(node)
            return value

        citation_data = Graph()
        citation_data.load(data_f_path, format="nt11")

        citation_prov = ConjunctiveGraph()
        citation_prov.load(prov_f_path, format="nquads")

        for cit_ent in citation_data.subjects(RDF.type, Citation.citation):
            prov_entity = None
            snapshot = 0

            # Keep the provenance snapshot with the highest number.
            for entity in citation_prov.subjects(Citation.specialization_of,
                                                 cit_ent):
                entity_snapshot = int(sub(r"^.+/se/(.+)$", r"\1", entity))
                if prov_entity is None or snapshot < entity_snapshot:
                    prov_entity = entity
                    snapshot = entity_snapshot

            if prov_entity is None:
                # Querying with a None subject would match the statements of
                # every provenance entity, so fail loudly instead.
                raise ValueError(
                    "No provenance snapshot found for citation %s" % cit_ent)

            invalidated = last_object(citation_prov, prov_entity,
                                      Citation.invalidated_at_time)
            update = last_object(citation_prov, prov_entity,
                                 Citation.has_update_query)
            creation_date = last_object(citation_data, cit_ent,
                                        Citation.has_citation_creation_date)
            timespan = last_object(citation_data, cit_ent,
                                   Citation.has_citation_time_span)

            oci = sub(r"^.+/ci/(.+)$", r"\1", str(cit_ent))
            citing = first_object(citation_data, cit_ent,
                                  Citation.has_citing_entity)
            cited = first_object(citation_data, cit_ent,
                                 Citation.has_cited_entity)
            agent = first_object(citation_prov, prov_entity,
                                 Citation.was_attributed_to)
            source = first_object(citation_prov, prov_entity,
                                  Citation.had_primary_source)
            generated = first_object(citation_prov, prov_entity,
                                     Citation.generated_at_time)
            cit_types = set(citation_data.objects(cit_ent, RDF.type))
            description = first_object(citation_prov, prov_entity,
                                       Citation.description)

            # `snapshot` is the number of the selected `prov_entity`; the loop
            # variable `entity_snapshot` only reflects the last entity visited.
            yield Citation(
                oci, citing, None, cited, None, creation_date, timespan,
                snapshot, agent, source, generated,
                service_name, id_type, id_shape, citation_type,
                Citation.journal_self_citation in cit_types,
                Citation.author_self_citation in cit_types,
                invalidated, description, update)
Ejemplo n.º 8
0
 def __store_rdf_on_file(f_path, rdf_obj, format="nt"):
     """Append `rdf_obj`, serialised by `Citation.format_rdf`, to `f_path`.

     The file is opened in append mode as UTF-8 before the serialisation is
     computed; `format` is passed through to `Citation.format_rdf`.
     """
     with open(f_path, mode="a", encoding="utf8") as out:
         out.write(Citation.format_rdf(rdf_obj, format))
Ejemplo n.º 9
0
    def setUp(self):
        """Set the fixture paths and build the six reference citations."""
        fixture_dir = "index%stest_data%s" % (sep, sep)
        self.citation_data_csv_path = fixture_dir + "citations_data.csv"
        self.citation_prov_csv_path = fixture_dir + "citations_prov.csv"
        self.citation_data_ttl_path = fixture_dir + "citations_data.ttl"
        self.citation_prov_ttl_path = fixture_dir + "citations_prov.ttl"
        self.citation_data_prov_scholix_path = (
            fixture_dir + "citations_data_prov.scholix")
        self.base_url = "https://w3id.org/oc/index/coci/"

        def coci_citation(oci, citing_doi, citing_date, cited_doi, cited_date,
                          prov_date, journal_sc, author_sc):
            # All fixtures share the same COCI provenance and identifier
            # settings; only the values passed in differ between them.
            return Citation(
                oci,
                "http://dx.doi.org/" + citing_doi,
                citing_date,
                "http://dx.doi.org/" + cited_doi,
                cited_date,
                None,
                None,
                1,
                "https://w3id.org/oc/index/prov/ra/1",
                "https://api.crossref.org/works/" + citing_doi,
                prov_date,
                "OpenCitations Index: COCI",
                "doi",
                "http://dx.doi.org/([[XXX__decode]])",
                None,
                journal_sc=journal_sc,
                author_sc=author_sc,
                prov_description="Creation of the citation")

        self.citation_1 = coci_citation(
            "02001000308362819371213133704040001020809-020010009063615193700006300030306151914",
            "10.1038/sj.cdd.4401289", "2003-10-24",
            "10.1096/fj.00-0336fje", "2001-01",
            "2018-11-01T09:14:03", False, False)

        self.citation_2 = coci_citation(
            "02001000002361927283705040000-02001000002361927283705030002",
            "10.1002/jrs.5400", "2018-06",
            "10.1002/jrs.5302", "2017-12-05",
            "2018-11-01T14:51:52", True, True)

        self.citation_3 = coci_citation(
            "02001000002361927283705040000-020010003093612062710020603000720",
            "10.1002/jrs.5400", "2018-06",
            "10.1039/c6ra26307k", "2017",
            "2018-11-01T14:51:52", False, True)

        self.citation_4 = coci_citation(
            "02001000308362819371213133704040001020804-02001000308362819371213133704040000030707",
            "10.1038/sj.cdd.4401284", "2003-08-22",
            "10.1038/sj.cdd.4400377", "1998-05",
            "2018-11-01T09:14:03", True, False)

        self.citation_5 = coci_citation(
            "020010000023625242110370100030001-02001010009361222251430273701090809370903040403",
            "10.1002/pola.10301", "2002-06-21",
            "10.1109/cmpeur.1989.93443", None,
            "2018-10-31T16:13:26", False, False)

        self.citation_6 = coci_citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "10.1130/2015.2513%2800%29", None,
            "10.1134/s1028334x09040217", None,
            "2018-10-31T16:17:07", False, False)

        # Hack for correct handling of date datatypes
        for datatype in (XSD.gYear, XSD.gYearMonth):
            if datatype in _toPythonMapping:
                _toPythonMapping.pop(datatype)
Ejemplo n.º 10
0
class CitationTest(unittest.TestCase):
    """This class aim at testing the methods of the class CSVManager."""
    def setUp(self):
        self.citation_data_csv_path = "index%stest_data%scitations_data.csv" % (
            sep, sep)
        self.citation_prov_csv_path = "index%stest_data%scitations_prov.csv" % (
            sep, sep)
        self.citation_data_ttl_path = "index%stest_data%scitations_data.ttl" % (
            sep, sep)
        self.citation_prov_ttl_path = "index%stest_data%scitations_prov.ttl" % (
            sep, sep)
        self.citation_data_prov_scholix_path = "index%stest_data%scitations_data_prov.scholix" % (
            sep, sep)
        self.base_url = "https://w3id.org/oc/index/coci/"

        self.citation_1 = Citation(
            "02001000308362819371213133704040001020809-020010009063615193700006300030306151914",
            "http://dx.doi.org/10.1038/sj.cdd.4401289",
            "2003-10-24",
            "http://dx.doi.org/10.1096/fj.00-0336fje",
            "2001-01",
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1038/sj.cdd.4401289",
            "2018-11-01T09:14:03",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")

        self.citation_2 = Citation(
            "02001000002361927283705040000-02001000002361927283705030002",
            "http://dx.doi.org/10.1002/jrs.5400",
            "2018-06",
            "http://dx.doi.org/10.1002/jrs.5302",
            "2017-12-05",
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1002/jrs.5400",
            "2018-11-01T14:51:52",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=True,
            author_sc=True,
            prov_description="Creation of the citation")

        self.citation_3 = Citation(
            "02001000002361927283705040000-020010003093612062710020603000720",
            "http://dx.doi.org/10.1002/jrs.5400",
            "2018-06",
            "http://dx.doi.org/10.1039/c6ra26307k",
            "2017",
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1002/jrs.5400",
            "2018-11-01T14:51:52",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=True,
            prov_description="Creation of the citation")

        self.citation_4 = Citation(
            "02001000308362819371213133704040001020804-02001000308362819371213133704040000030707",
            "http://dx.doi.org/10.1038/sj.cdd.4401284",
            "2003-08-22",
            "http://dx.doi.org/10.1038/sj.cdd.4400377",
            "1998-05",
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1038/sj.cdd.4401284",
            "2018-11-01T09:14:03",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=True,
            author_sc=False,
            prov_description="Creation of the citation")

        self.citation_5 = Citation(
            "020010000023625242110370100030001-02001010009361222251430273701090809370903040403",
            "http://dx.doi.org/10.1002/pola.10301",
            "2002-06-21",
            "http://dx.doi.org/10.1109/cmpeur.1989.93443",
            None,
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1002/pola.10301",
            "2018-10-31T16:13:26",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")

        self.citation_6 = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29",
            None,
            "http://dx.doi.org/10.1134/s1028334x09040217",
            None,
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29",
            "2018-10-31T16:17:07",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")

        # Hack for correct handling of date datatypes
        if XSD.gYear in _toPythonMapping:
            _toPythonMapping.pop(XSD.gYear)
        if XSD.gYearMonth in _toPythonMapping:
            _toPythonMapping.pop(XSD.gYearMonth)

    def test_inferred_leap_year_dates(self):
        cit = Citation(
            None,
            "http://dx.doi.org/10.1002/1097-0142%2820010815%2992%3A4%3C796%3A%3Aaid-cncr1385%3E3.0.co%3B2-3",
            "2001",
            "http://dx.doi.org/10.1002/%28sici%291097-0258%2819960229%2915%3A4%3C361%3A%3Aaid-sim168%3E3.0.co%3B2-4",
            "1996-02-29",
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1002/1097-0142%2820010815%2992%3A4%3C796%3A%3Aaid-cncr1385%3E3.0.co%3B2-3",
            "2018-10-31T16:17:07",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")
        self.assertEqual(cit.duration, "P5Y")

    def test_invalid_date_for_citation(self):
        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29",
            "0000",
            "http://dx.doi.org/10.1134/s1028334x09040217",
            None,
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29",
            "2018-10-31T16:17:07",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNone(cit.citing_pub_date)
        self.assertIsNone(cit.creation_date)
        self.assertIsNone(cit.cited_pub_date)
        self.assertIsNone(cit.duration)

        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29",
            "2019",
            "http://dx.doi.org/10.1134/s1028334x09040217",
            "0000",
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29",
            "2018-10-31T16:17:07",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNotNone(cit.citing_pub_date)
        self.assertIsNotNone(cit.creation_date)
        self.assertIsNone(cit.cited_pub_date)
        self.assertIsNone(cit.duration)

        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29",
            None,
            "http://dx.doi.org/10.1134/s1028334x09040217",
            "2011",
            None,
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29",
            "2018-10-31T16:17:07",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNone(cit.citing_pub_date)
        self.assertIsNone(cit.creation_date)
        self.assertIsNotNone(cit.cited_pub_date)
        self.assertIsNone(cit.duration)

        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29",
            None,
            "http://dx.doi.org/10.1134/s1028334x09040217",
            "2011",
            "2019",
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29",
            "2018-10-31T16:17:07",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNotNone(cit.citing_pub_date)
        self.assertIsNotNone(cit.creation_date)
        self.assertIsNotNone(cit.cited_pub_date)
        self.assertIsNotNone(cit.duration)

        cit = Citation(
            "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
            "http://dx.doi.org/10.1130/2015.2513%2800%29",
            None,
            "http://dx.doi.org/10.1134/s1028334x09040217",
            "2011",
            "2019-02-29",
            None,
            1,
            "https://w3id.org/oc/index/prov/ra/1",
            "https://api.crossref.org/works/10.1130/2015.2513%2800%29",
            "2018-10-31T16:17:07",
            "OpenCitations Index: COCI",
            "doi",
            "http://dx.doi.org/([[XXX__decode]])",
            None,
            journal_sc=False,
            author_sc=False,
            prov_description="Creation of the citation")
        self.assertIsNone(cit.citing_pub_date)
        self.assertIsNone(cit.creation_date)
        self.assertIsNotNone(cit.cited_pub_date)
        self.assertIsNone(cit.duration)

    def test_citation_data_csv(self):
        citation_data_csv = None

        with open(self.citation_data_csv_path) as f:
            citation_data_csv = list(DictReader(f))

        self.assertEqual(
            list(DictReader(StringIO(self.citation_1.get_citation_csv())))[0],
            citation_data_csv[0])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_2.get_citation_csv())))[0],
            citation_data_csv[1])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_3.get_citation_csv())))[0],
            citation_data_csv[2])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_4.get_citation_csv())))[0],
            citation_data_csv[3])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_5.get_citation_csv())))[0],
            citation_data_csv[4])
        self.assertEqual(
            list(DictReader(StringIO(self.citation_6.get_citation_csv())))[0],
            citation_data_csv[5])

    def test_citation_prov_csv(self):
        citation_prov_csv = None

        with open(self.citation_prov_csv_path) as f:
            citation_prov_csv = list(DictReader(f))

        self.assertEqual(
            list(DictReader(StringIO(
                self.citation_1.get_citation_prov_csv())))[0],
            citation_prov_csv[0])
        self.assertEqual(
            list(DictReader(StringIO(
                self.citation_2.get_citation_prov_csv())))[0],
            citation_prov_csv[1])
        self.assertEqual(
            list(DictReader(StringIO(
                self.citation_3.get_citation_prov_csv())))[0],
            citation_prov_csv[2])
        self.assertEqual(
            list(DictReader(StringIO(
                self.citation_4.get_citation_prov_csv())))[0],
            citation_prov_csv[3])
        self.assertEqual(
            list(DictReader(StringIO(
                self.citation_5.get_citation_prov_csv())))[0],
            citation_prov_csv[4])
        self.assertEqual(
            list(DictReader(StringIO(
                self.citation_6.get_citation_prov_csv())))[0],
            citation_prov_csv[5])

    def test_citation_data_ttl(self):
        """Compare the RDF data of the fixture citations with the reference.

        The expected graph is read from ``self.citation_data_ttl_path``
        (format ``nt11``). The produced graph collects every triple returned
        by ``get_citation_rdf`` for the six fixture citations. The test
        passes when the two graphs are isomorphic.
        """
        expected_graph = ConjunctiveGraph()
        # parse() is used instead of the legacy load() wrapper: in rdflib,
        # load() only forwarded to parse() and it is no longer available in
        # rdflib 6 and later, whereas parse() exists in every release.
        expected_graph.parse(self.citation_data_ttl_path, format="nt11")

        produced_graph = ConjunctiveGraph()
        fixtures = (
            self.citation_1, self.citation_2, self.citation_3,
            self.citation_4, self.citation_5, self.citation_6,
        )
        for citation in fixtures:
            triples = citation.get_citation_rdf(self.base_url, False, False,
                                                False)
            for s, p, o in triples:
                produced_graph.add((s, p, o))

        self.assertTrue(isomorphic(expected_graph, produced_graph))

    def test_citation_prov_ttl(self):
        """Compare the provenance RDF of the fixture citations with the reference.

        The expected graph is read from ``self.citation_prov_ttl_path``
        (format ``nquads``). The produced graph collects every quad exposed
        by ``get_citation_prov_rdf`` for the six fixture citations. The test
        passes when the two graphs are isomorphic.
        """
        expected_graph = ConjunctiveGraph()
        # parse() is used instead of the legacy load() wrapper: in rdflib,
        # load() only forwarded to parse() and it is no longer available in
        # rdflib 6 and later, whereas parse() exists in every release.
        expected_graph.parse(self.citation_prov_ttl_path, format="nquads")

        produced_graph = ConjunctiveGraph()
        fixtures = (
            self.citation_1, self.citation_2, self.citation_3,
            self.citation_4, self.citation_5, self.citation_6,
        )
        any_quad = (None, None, None, None)  # wildcard pattern: match all
        for citation in fixtures:
            prov_graph = citation.get_citation_prov_rdf(self.base_url)
            for s, p, o, g in prov_graph.quads(any_quad):
                produced_graph.add((s, p, o, g))

        self.assertTrue(isomorphic(expected_graph, produced_graph))

    def test_citation_data_prov_scholix(self):
        """Check the Scholix JSON produced by each fixture citation.

        The reference file at ``self.citation_data_prov_scholix_path`` is
        loaded as JSON; its i-th item must equal the parsed JSON text
        returned by ``get_citation_scholix`` of the i-th fixture citation.
        """
        with open(self.citation_data_prov_scholix_path) as f:
            expected_items = load(f)

        fixtures = (
            self.citation_1, self.citation_2, self.citation_3,
            self.citation_4, self.citation_5, self.citation_6,
        )
        # Index the reference explicitly (instead of zipping) so that a
        # reference file with too few items raises rather than passing.
        for position, citation in enumerate(fixtures):
            produced = loads(citation.get_citation_scholix())
            self.assertEqual(produced, expected_items[position])

    def test_lookup(self):
        """Exercise OCI generation with several lookup-table configurations.

        In every round-trip scenario a DOI pair is turned into an OCI, both
        halves of the OCI are decoded again through the manager's ``lookup``
        mapping, and the result must equal the DOI without its leading
        ``10.``. The size of the lookup mapping must also equal the number
        of distinct characters seen so far in the DOIs involved.
        """
        doi_1 = "10.1038/sj.cdd.4401289"
        doi_2 = "10.1096/fj.00-0336fje"
        doi_3 = "10.1002/jrs.5400"
        doi_4 = "10.1039/c6ra26307k"
        doi_5 = "10.1234/456789qwertyuiopasdfghjklzxcvbnmè+òàù,.-åß∂ƒ∞∆ªº¬∑≤†©√∫˜≥»”’¢‰"
        doi_6 = "10.1234/!\"£$%&/()=?^é*ç°§;:_<>«“‘¥~‹÷´`￿ˆ[]@#¶…•–„Ω€®™æ¨øπ"

        def decode(manager, oci_value, side):
            # Take the citing (side 0) or cited (side 1) half of the OCI,
            # drop the "oci:" scheme and the first "020" prefix, then map
            # every code found by the pattern back through the lookup.
            encoded = oci_value.replace("oci:", "").split("-")[side]
            encoded = encoded.replace("020", "", 1)
            return "".join([
                manager.lookup[code]
                for code in findall("(9*[0-8][0-9])", encoded)
            ])

        def check_round_trip(manager, citing_doi, cited_doi, seen_text):
            # Encode the pair, decode both halves, then verify the number
            # of entries in the lookup against the distinct characters of
            # all DOIs handled so far (passed concatenated in seen_text).
            oci_value = manager.get_oci(citing_doi, cited_doi, "020")
            self.assertEqual(
                citing_doi.replace("10.", "", 1),
                decode(manager, oci_value, 0))
            self.assertEqual(
                cited_doi.replace("10.", "", 1),
                decode(manager, oci_value, 1))
            self.assertEqual(len(manager.lookup.keys()), len(set(seen_text)))

        # Test conversion without any file
        check_round_trip(OCIManager(), doi_1, doi_2, doi_1 + doi_2)

        # Test conversion with full file
        oci_12 = "oci:02001000308362819371213133704040001020809-020010009063615193700006300030306151914"
        full_manager = OCIManager(
            lookup_file="index%stest_data%slookup_full.csv" % (sep, sep))
        self.assertEqual(full_manager.get_oci(doi_1, doi_2, "020"), oci_12)

        # Test conversion with new file
        new_file_path = "index%stest_data%slookup_new.csv" % (sep, sep)
        if exists(new_file_path):
            remove(new_file_path)
        check_round_trip(OCIManager(lookup_file=new_file_path),
                         doi_1, doi_2, doi_1 + doi_2)

        # Test conversion with incomplete file (ver 1, existing DOIs)
        check_round_trip(OCIManager(lookup_file=new_file_path),
                         doi_3, doi_4, doi_1 + doi_2 + doi_3 + doi_4)

        # Test conversion with incomplete file (ver 2, non-existing DOIs)
        check_round_trip(
            OCIManager(lookup_file=new_file_path), doi_5, doi_6,
            doi_1 + doi_2 + doi_3 + doi_4 + doi_5 + doi_6)
Ejemplo n.º 11
0
    def test_invalid_date_for_citation(self):
        """Check which date attributes a Citation exposes for given inputs.

        Every scenario builds a Citation that differs only in the citing
        publication date, the cited publication date and the creation date
        passed to the constructor. It then verifies which of
        ``citing_pub_date``, ``creation_date``, ``cited_pub_date`` and
        ``duration`` are ``None`` on the resulting object.
        """
        def build(citing_pub_date, cited_pub_date, creation):
            # Only the three date arguments vary between scenarios; all the
            # remaining constructor arguments are fixed.
            return Citation(
                "020010103003602000105370205010358000059-02001010304362801000208030304330009000400020107",
                "http://dx.doi.org/10.1130/2015.2513%2800%29",
                citing_pub_date,
                "http://dx.doi.org/10.1134/s1028334x09040217",
                cited_pub_date,
                creation,
                None,
                1,
                "https://w3id.org/oc/index/prov/ra/1",
                "https://api.crossref.org/works/10.1130/2015.2513%2800%29",
                "2018-10-31T16:17:07",
                "OpenCitations Index: COCI",
                "doi",
                "http://dx.doi.org/([[XXX__decode]])",
                None,
                journal_sc=False,
                author_sc=False,
                prov_description="Creation of the citation")

        inspected = ("citing_pub_date", "creation_date", "cited_pub_date",
                     "duration")
        # Each entry: (citing date, cited date, creation) followed by one
        # flag per inspected attribute, True when it must NOT be None.
        scenarios = (
            (("0000", None, None), (False, False, False, False)),
            (("2019", "0000", None), (True, True, False, False)),
            ((None, "2011", None), (False, False, True, False)),
            ((None, "2011", "2019"), (True, True, True, True)),
            # 29 February does not exist in 2019.
            ((None, "2011", "2019-02-29"), (False, False, True, False)),
        )

        for dates, expectations in scenarios:
            cit = build(*dates)
            for attribute, must_be_set in zip(inspected, expectations):
                if must_be_set:
                    self.assertIsNotNone(getattr(cit, attribute))
                else:
                    self.assertIsNone(getattr(cit, attribute))
Ejemplo n.º 12
0
def extract_citations(idbaseurl,
                      baseurl,
                      python,
                      pclass,
                      input,
                      lookup,
                      data,
                      prefix,
                      agent,
                      source,
                      service,
                      verbose,
                      doi_manager,
                      rf_handler,
                      oci_to_do=None):
    """Create and store the new citations provided by a citation source.

    The citation source is obtained through
    ``import_citation_source(python, pclass, input)`` and consumed one item
    at a time with ``get_next_citation_data()`` until it returns ``None``.
    For each item an OCI is computed; the citation is stored only when that
    OCI is not already known and (if ``oci_to_do`` is given) is listed in
    it, and when both DOIs are accepted by ``doi_manager.is_valid``.

    Args:
        idbaseurl: prefix prepended to the quoted DOIs to build the citing
            and cited URLs, and used to build the identifier shape.
        baseurl: dataset base URL handed to the storer; a trailing ``/`` is
            appended when missing.
        python, pclass, input: forwarded unchanged to
            ``import_citation_source``.
        lookup: path passed to ``OCIManager`` as ``lookup_file``.
        data: directory passed to ``CitationStorer``; the already existing
            OCIs are read from the ``oci`` column found under its ``data``
            sub-path.
        prefix: prefix passed to ``OCIManager.get_oci``.
        agent, source, service: provenance/service values forwarded to
            every created ``Citation``.
        verbose: when true, print a message for every processed item.
        doi_manager: object exposing ``is_valid(doi)``.
        rf_handler: object exposing ``get_date``, ``share_issn`` and
            ``share_orcid``; used to fill in the information the source did
            not provide.
        oci_to_do: optional set of OCIs (without the ``oci:`` prefix) to
            restrict the run to. NOTE: it is modified in place — existing
            OCIs are removed up front and each handled OCI is removed as
            soon as it has been processed.

    Returns:
        A tuple ``(new_citations_added, citations_already_present,
        error_in_dois_existence)``.
    """
    BASE_URL = idbaseurl
    DATASET_URL = baseurl if baseurl.endswith("/") else baseurl + "/"

    oci_manager = OCIManager(lookup_file=lookup)
    # TODO: the directory must be specified more carefully, e.g. by adding
    # an additional flag to distinguish the files belonging to a particular
    # process, and it should be aligned with the storer.
    exi_ocis = CSVManager.load_csv_column_as_set(data + sep + "data", "oci")
    if oci_to_do is not None:
        oci_to_do.difference_update(exi_ocis)
    cit_storer = CitationStorer(data, DATASET_URL)

    citations_already_present = 0
    new_citations_added = 0
    error_in_dois_existence = 0

    cs = import_citation_source(python, pclass, input)
    next_citation = cs.get_next_citation_data()

    while next_citation is not None:
        citing, cited, created, timespan, journal_sc, author_sc = next_citation
        oci = oci_manager.get_oci(citing, cited, prefix)
        oci_noprefix = oci.replace("oci:", "")
        if oci_noprefix not in exi_ocis and (oci_to_do is None
                                             or oci_noprefix in oci_to_do):
            if doi_manager.is_valid(citing) and doi_manager.is_valid(cited):
                if created is not None and timespan is not None:
                    # The source already supplies creation date and
                    # timespan, so the publication dates are not needed:
                    # skip the (unused) date lookups entirely.
                    citing_date = cited_date = None
                    creation, duration = created, timespan
                else:
                    # Let the Citation work from the publication dates,
                    # looking up whatever the source did not provide.
                    if created is None:
                        citing_date = rf_handler.get_date(citing)
                    else:
                        citing_date = created
                    cited_date = rf_handler.get_date(cited)
                    creation = duration = None

                # Anything that is not an actual boolean (None included)
                # means the source gave no self-citation information.
                if not isinstance(journal_sc, bool):
                    journal_sc = rf_handler.share_issn(citing, cited)
                if not isinstance(author_sc, bool):
                    author_sc = rf_handler.share_orcid(citing, cited)

                cit = Citation(
                    oci, BASE_URL + quote(citing), citing_date,
                    BASE_URL + quote(cited), cited_date, creation, duration,
                    1, agent, source,
                    datetime.now().strftime('%Y-%m-%dT%H:%M:%S'), service,
                    "doi", BASE_URL + "([[XXX__decode]])", "reference",
                    journal_sc, author_sc, None,
                    "Creation of the citation", None)

                cit_storer.store_citation(cit)

                if verbose:
                    print(
                        "Create citation data for '%s' between DOI '%s' and DOI '%s'"
                        % (oci, citing, cited))
                new_citations_added += 1
                exi_ocis.add(oci_noprefix)
            else:
                if verbose:
                    print(
                        "WARNING: some DOIs, among '%s' and '%s', do not exist"
                        % (citing, cited))
                error_in_dois_existence += 1
            # Membership was checked above, so remove() cannot fail here.
            if oci_to_do is not None:
                oci_to_do.remove(oci_noprefix)
        else:
            if verbose:
                print(
                    "WARNING: the citation between DOI '%s' and DOI '%s' has been already processed"
                    % (citing, cited))
            citations_already_present += 1

        next_citation = cs.get_next_citation_data()

    return new_citations_added, citations_already_present, error_in_dois_existence