def test_add_value(self):
    if exists(self.addition_path):
        remove(self.addition_path)

    csv_m = CSVManager(self.addition_path)
    csv_m.add_value("doi:10.1108/jd-12-2013-0166", "orcid:0000-0003-0530-4305")
    csv_m.add_value("doi:10.7717/peerj.4375", "orcid:0000-0003-1613-5981")
    csv_m.add_value("doi:10.1108/jd-12-2013-0166", "orcid:0000-0001-5506-523X")
    self.assertDictEqual(
        csv_m.data,
        {
            "doi:10.1108/jd-12-2013-0166": {
                "orcid:0000-0003-0530-4305",
                "orcid:0000-0001-5506-523X",
            },
            "doi:10.7717/peerj.4375": {"orcid:0000-0003-1613-5981"},
        },
    )
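
# --- Behaviour sketch (assumption, not part of the original test module) ---
# The test above relies on CSVManager.add_value accumulating values per key in
# a dict of sets exposed as .data. A minimal stand-in with that behaviour could
# look like the class below; the real CSVManager presumably also persists the
# rows to the CSV file at the given path, which is omitted here.
class _CSVManagerSketch:
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.data = {}

    def add_value(self, id_string, value):
        # Accumulate values in a set so repeated additions are deduplicated
        self.data.setdefault(id_string, set()).add(value)
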
def process(input_dir, output_dir):
    if not exists(output_dir):
        makedirs(output_dir)

    citing_doi_with_no_date = set()
    valid_doi = CSVManager(output_dir + sep + "valid_doi.csv")
    id_date = CSVManager(output_dir + sep + "id_date.csv")
    id_issn = CSVManager(output_dir + sep + "id_issn.csv")
    id_orcid = CSVManager(output_dir + sep + "id_orcid.csv")

    doi_manager = DOIManager(valid_doi)
    issn_manager = ISSNManager()
    orcid_manager = ORCIDManager()

    all_files, opener = get_all_files(input_dir)
    len_all_files = len(all_files)

    # Read all the JSON files in the Crossref dump to create the main information of all the indexes
    print("\n\n# Add valid DOIs from Crossref metadata")
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            try:
                data = load(f)
            # When a tar.gz or zip file is used, the opener returns a stream of bytes. Thus,
            # it must be converted into a UTF-8 string before being loaded as JSON.
            except TypeError:
                utf8reader = codecs.getreader("utf-8")
                data = load(utf8reader(f))

            if "items" in data:
                for obj in data["items"]:
                    if "DOI" in obj:
                        citing_doi = doi_manager.normalise(obj["DOI"], True)
                        doi_manager.set_valid(citing_doi)

                        if id_date.get_value(citing_doi) is None:
                            citing_date = Citation.check_date(build_pubdate(obj))
                            if citing_date is not None:
                                id_date.add_value(citing_doi, citing_date)
                                if citing_doi in citing_doi_with_no_date:
                                    citing_doi_with_no_date.remove(citing_doi)
                            else:
                                citing_doi_with_no_date.add(citing_doi)

                        if id_issn.get_value(citing_doi) is None:
                            if "type" in obj:
                                cur_type = obj["type"]
                                if cur_type is not None and "journal" in cur_type and "ISSN" in obj:
                                    cur_issn = obj["ISSN"]
                                    if cur_issn is not None:
                                        for issn in [issn_manager.normalise(issn) for issn in cur_issn]:
                                            if issn is not None:
                                                id_issn.add_value(citing_doi, issn)

                        if id_orcid.get_value(citing_doi) is None:
                            if "author" in obj:
                                cur_author = obj["author"]
                                if cur_author is not None:
                                    for author in cur_author:
                                        if "ORCID" in author:
                                            orcid = orcid_manager.normalise(author["ORCID"])
                                            if orcid is not None:
                                                id_orcid.add_value(citing_doi, orcid)

    # Iterate the dump again to update the dates of the cited DOIs, if these are valid
    print("\n\n# Check cited DOIs from Crossref reference field")
    doi_date = {}
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            data = load(f)
            if "items" in data:
                for obj in data["items"]:
                    if "DOI" in obj and "reference" in obj:
                        for ref in obj["reference"]:
                            if "DOI" in ref:
                                cited_doi = doi_manager.normalise(ref["DOI"], True)
                                if doi_manager.is_valid(cited_doi) and id_date.get_value(cited_doi) is None:
                                    if cited_doi not in doi_date:
                                        doi_date[cited_doi] = []
                                    cited_date = Citation.check_date(build_pubdate(ref))
                                    if cited_date is not None:
                                        doi_date[cited_doi].append(cited_date)
                                        if cited_doi in citing_doi_with_no_date:
                                            citing_doi_with_no_date.remove(cited_doi)

    # Assign a date to a cited DOI only if that date is the most frequent one among the
    # various references. If two or more distinct dates are equally frequent, select the oldest one.
    for doi in doi_date:
        count = Counter(doi_date[doi])
        if len(count):
            top_value = count.most_common(1)[0][1]
            selected_dates = []
            for date in count:
                if count[date] == top_value:
                    selected_dates.append(date)
            best_date = sorted(selected_dates)[0]
            id_date.add_value(doi, best_date)
        else:
            id_date.add_value(doi, "")

    # Add empty dates for the remaining DOIs
    for doi in citing_doi_with_no_date:
        id_date.add_value(doi, "")
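
# --- Usage sketch (assumption, not part of the original module) ---
# A minimal command-line wrapper showing how process() could be invoked on a
# Crossref dump directory. The option names, flags, and help strings below are
# illustrative assumptions and do not come from the original code.
if __name__ == "__main__":
    from argparse import ArgumentParser

    arg_parser = ArgumentParser(
        description="Extract valid-DOI, date, ISSN, and ORCID CSV indexes "
                    "from a Crossref JSON dump"
    )
    arg_parser.add_argument("-i", "--input_dir", required=True,
                            help="Directory (or archive) containing the Crossref JSON files")
    arg_parser.add_argument("-o", "--output_dir", required=True,
                            help="Directory where the CSV indexes will be written")
    args = arg_parser.parse_args()

    process(args.input_dir, args.output_dir)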