Ejemplo n.º 1
0
    def test_get_value(self):
        """Check CSVManager.get_value for a stored key and for a missing one."""
        manager = CSVManager(self.initial_path)

        # This DOI is expected to be in the fixture, mapped to a single date.
        self.assertEqual(
            {"2015-03-09"},
            manager.get_value("doi:10.1108/jd-12-2013-0166"),
        )

        # This DOI is expected to be absent, so the lookup must yield None.
        self.assertIsNone(manager.get_value("doi:10.1108/jd-12-2013-0167"))
Ejemplo n.º 2
0
def _load_crossref_json(f):
    """Parse the JSON document held by the open file object ``f``.

    When using a tar.gz or zip file, the opener returns a stream of bytes
    and ``load`` may raise ``TypeError``; in that case the stream is wrapped
    in a UTF-8 reader and loaded again.
    """
    try:
        return load(f)
    except TypeError:
        return load(codecs.getreader("utf-8")(f))


def process(input_dir, output_dir):
    """Build the DOI, date, ISSN and ORCID CSV indexes from a Crossref dump.

    The files found in ``input_dir`` are read twice:

    1. The first pass marks the DOI of every item as valid and stores, for
       each DOI not yet indexed, its publication date, its ISSNs (journal
       types only) and the ORCIDs of its authors.
    2. The second pass looks at the ``reference`` field of every item and
       collects the dates given for valid cited DOIs that still lack a date.

    For each such cited DOI the most frequently used date is stored (the
    smallest one in case of a tie), or an empty string if no usable date
    was found. DOIs of citing items left without a date also get an empty
    string.

    :param input_dir: location of the Crossref dump, handed to
        ``get_all_files``.
    :param output_dir: directory where ``valid_doi.csv``, ``id_date.csv``,
        ``id_issn.csv`` and ``id_orcid.csv`` are stored; it is created if it
        does not exist.
    """
    if not exists(output_dir):
        makedirs(output_dir)

    citing_doi_with_no_date = set()
    valid_doi = CSVManager(output_dir + sep + "valid_doi.csv")
    id_date = CSVManager(output_dir + sep + "id_date.csv")
    id_issn = CSVManager(output_dir + sep + "id_issn.csv")
    id_orcid = CSVManager(output_dir + sep + "id_orcid.csv")

    doi_manager = DOIManager(valid_doi)
    issn_manager = ISSNManager()
    orcid_manager = ORCIDManager()

    all_files, opener = get_all_files(input_dir)
    len_all_files = len(all_files)

    # Read all the JSON files in the Crossref dump to create the main
    # information of all the indexes.
    print("\n\n# Add valid DOIs from Crossref metadata")
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            data = _load_crossref_json(f)

            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj:
                        citing_doi = doi_manager.normalise(obj["DOI"], True)
                        doi_manager.set_valid(citing_doi)

                        # Only DOIs without a stored date are (re)examined.
                        if id_date.get_value(citing_doi) is None:
                            citing_date = Citation.check_date(build_pubdate(obj))
                            if citing_date is not None:
                                id_date.add_value(citing_doi, citing_date)
                                citing_doi_with_no_date.discard(citing_doi)
                            else:
                                citing_doi_with_no_date.add(citing_doi)

                        # ISSNs are recorded only for journal-like types.
                        if id_issn.get_value(citing_doi) is None:
                            if "type" in obj:
                                cur_type = obj["type"]
                                if cur_type is not None and "journal" in cur_type and "ISSN" in obj:
                                    cur_issn = obj["ISSN"]
                                    if cur_issn is not None:
                                        for issn in [issn_manager.normalise(issn) for issn in cur_issn]:
                                            if issn is not None:
                                                id_issn.add_value(citing_doi, issn)

                        if id_orcid.get_value(citing_doi) is None:
                            if "author" in obj:
                                cur_author = obj['author']
                                if cur_author is not None:
                                    for author in cur_author:
                                        if "ORCID" in author:
                                            orcid = orcid_manager.normalise(author["ORCID"])
                                            if orcid is not None:
                                                id_orcid.add_value(citing_doi, orcid)

    # Do it again for updating the dates of the cited DOIs, if these are valid
    print("\n\n# Check cited DOIs from Crossref reference field")
    doi_date = {}
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            # Same byte-stream fallback as in the first pass; a plain
            # load(f) here failed on inputs the first pass could read.
            data = _load_crossref_json(f)
            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj and "reference" in obj:
                        for ref in obj['reference']:
                            if "DOI" in ref:
                                cited_doi = doi_manager.normalise(ref["DOI"], True)
                                if doi_manager.is_valid(cited_doi) and id_date.get_value(cited_doi) is None:
                                    # The entry is created even when no date
                                    # is found, so that the DOI receives an
                                    # empty date below.
                                    found_dates = doi_date.setdefault(cited_doi, [])
                                    cited_date = Citation.check_date(build_pubdate(ref))
                                    if cited_date is not None:
                                        found_dates.append(cited_date)
                                        citing_doi_with_no_date.discard(cited_doi)

    # Add the date to the DOI if such date is the most adopted one in the various references.
    # In case two distinct dates are used the most, select the older one.
    for doi in doi_date:
        count = Counter(doi_date[doi])
        if len(count):
            top_value = count.most_common(1)[0][1]
            best_date = min(date for date in count if count[date] == top_value)
            id_date.add_value(doi, best_date)
        else:
            id_date.add_value(doi, "")

    # Add empty dates for the remaining DOIs
    for doi in citing_doi_with_no_date:
        id_date.add_value(doi, "")