def __init__(self, date=None, orcid=None, issn=None, doi=None, **params):
    """Initialise the resource finder with its CSV-backed lookup caches.

    Any of `date`, `orcid`, `issn`, `doi` left as None is replaced by an
    empty, non-persisting CSVManager so that lookups simply miss instead
    of failing. Extra keyword arguments are stored as attributes
    (e.g. `use_api_service`, consumed below).
    """
    # Expose any extra configuration as instance attributes first, so the
    # `use_api_service` check below can see a flag passed via **params.
    for key, value in params.items():
        setattr(self, key, value)

    self.date = date if date is not None else CSVManager(store_new=False)
    self.orcid = orcid if orcid is not None else CSVManager(store_new=False)
    self.issn = issn if issn is not None else CSVManager(store_new=False)
    if doi is None:
        doi = CSVManager(store_new=False)

    # Honour an explicit use_api_service flag when one was supplied;
    # otherwise fall back on DOIManager's own default.
    if hasattr(self, "use_api_service"):
        self.dm = DOIManager(doi, self.use_api_service)
    else:
        self.dm = DOIManager(doi)

    self.im = ISSNManager()
    self.om = ORCIDManager()
    self.headers = {
        "User-Agent": "ResourceFinder / OpenCitations Indexes "
                      "(http://opencitations.net; mailto:[email protected])"
    }
class CrowdsourcedCitationSource(CSVFileCitationSource):
    """Citation source that reads crowdsourced citation rows from CSV files."""

    def __init__(self, src, local_name=""):
        self.doi = DOIManager()
        super(CrowdsourcedCitationSource, self).__init__(src, local_name)

    def get_next_citation_data(self):
        """Return the next citation as a
        (citing, cited, created, timespan, None, None) tuple,
        or None once the input rows are exhausted."""
        row = self._get_next_in_file()
        while row is not None:
            citing = self.doi.normalise(row.get("citing_id"))
            cited = self.doi.normalise(row.get("cited_id"))
            if citing is not None and cited is not None:
                # Empty strings are treated the same as missing values.
                created = row.get("citing_publication_date") or None
                cited_pub_date = row.get("cited_publication_date")
                if cited_pub_date:
                    # A throwaway Citation is built only to compute the
                    # citing->cited timespan from the two dates.
                    c = Citation(None, None, created, None, cited_pub_date,
                                 None, None, None, None, "", None, None,
                                 None, None, None)
                    timespan = c.duration
                else:
                    timespan = None
                self.update_status_file()
                return citing, cited, created, timespan, None, None
            self.update_status_file()
            row = self._get_next_in_file()
        # All rows consumed: the progress/status file is no longer needed.
        remove(self.status_file)
def test_doi_normalise(self):
    """Normalisation must strip common DOI prefixes and lowercase the value."""
    dm = DOIManager()
    upper_doi = self.valid_doi_1.upper()
    # Each variant substitutes the "10." start with a decorated form that
    # normalise() is expected to strip back to the canonical DOI.
    for decorated_prefix in ("doi: 10. ", "doi:10.", "https://doi.org/10."):
        self.assertEqual(
            self.valid_doi_1,
            dm.normalise(upper_doi.replace("10.", decorated_prefix)))
def execute_workflow(idbaseurl, baseurl, python, pclass, input, doi_file,
                     date_file, orcid_file, issn_file, orcid, lookup, data,
                     prefix, agent, source, service, verbose, no_api):
    """Build the resource finders and run the citation-extraction workflow.

    Creates the CSV support files for bibliographic-resource information,
    wires up the Crossref, DataCite and ORCID finders, and delegates to
    extract_citations().
    """
    # Support files holding cached information about bibliographic resources.
    valid_doi, id_date, id_orcid, id_issn = create_csv(
        doi_file, date_file, orcid_file, issn_file)

    use_api = not no_api
    doi_manager = DOIManager(valid_doi, use_api_service=use_api)
    crossref_rf = CrossrefResourceFinder(
        date=id_date, orcid=id_orcid, issn=id_issn, doi=valid_doi,
        use_api_service=use_api)
    datacite_rf = DataCiteResourceFinder(
        date=id_date, orcid=id_orcid, issn=id_issn, doi=valid_doi,
        use_api_service=use_api)
    # The ORCID API is only queried when an API key was actually provided.
    orcid_rf = ORCIDResourceFinder(
        date=id_date, orcid=id_orcid, issn=id_issn, doi=valid_doi,
        use_api_service=orcid is not None and use_api,
        key=orcid)
    rf_handler = ResourceFinderHandler([crossref_rf, datacite_rf, orcid_rf])

    return extract_citations(idbaseurl, baseurl, python, pclass, input,
                             lookup, data, prefix, agent, source, service,
                             verbose, doi_manager, rf_handler)
class CrossrefCitationSource(DirCitationSource):
    """Citation source that walks a directory of Crossref JSON dump files.

    Each JSON row is an *article*; its "reference" list yields one citation
    per valid cited DOI. `last_ref` tracks how far into the current
    article's reference list previous calls got, so iteration can resume.
    """

    def __init__(self, src, local_name=""):
        # -1 means no reference of the current article has been emitted yet.
        self.last_ref = -1
        self.doi = DOIManager()
        super(CrossrefCitationSource, self).__init__(src, local_name)

    def load(self, file_path):
        """Load one Crossref JSON file; return (items, item count)."""
        result = []
        with open(file_path, encoding="utf8") as f:
            j = load(f)
            if "items" in j:
                result.extend(j["items"])
        return result, len(result)

    def select_file(self, file_path):
        # Only .json files in the dump directory are processed.
        return file_path.endswith(".json")

    def get_next_citation_data(self):
        """Return the next (citing, cited, None, None, None, None) tuple,
        or None when all files are exhausted.

        Resumes mid-article: references with index <= last_ref were already
        emitted by earlier calls and are skipped.
        """
        row = self._get_next_in_file()
        while row is not None:
            citing = self.doi.normalise(row.get("DOI"))
            if citing is not None and "reference" in row:
                for idx, ref in enumerate(row["reference"]):
                    if idx > self.last_ref:
                        self.last_ref = idx
                        cited = self.doi.normalise(ref.get("DOI"))
                        if cited is not None:
                            # Compensate the row counter: this article will be
                            # fetched again on the next call to finish its
                            # remaining references.
                            self.last_row -= 1
                            # self.update_status_file() # In Crossref, this should not be
                            # needed since I modify the row only when I finished to process
                            # all the references of a certain row (since here, a row is an
                            # article, not a citation)
                            return citing, cited, None, None, None, None
            # Article fully processed: persist progress and move on.
            self.update_status_file()
            row = self._get_next_in_file()
            self.last_ref = -1
        # Input exhausted: the progress/status file is no longer needed.
        remove(self.status_file)
class CrossrefCitationSource(DirCitationSource):
    """Citation source that walks a directory of Crossref JSON dump files.

    Each JSON row is an *article*; its "reference" list yields one citation
    per valid cited DOI. `last_ref` tracks how far into the current
    article's reference list previous calls got, so iteration can resume.
    """

    def __init__(self, src, local_name=""):
        # -1 means no reference of the current article has been emitted yet.
        self.last_ref = -1
        self.doi = DOIManager()
        super(CrossrefCitationSource, self).__init__(src, local_name)

    def load(self, file_path):
        """Load one Crossref JSON file; return (items, item count)."""
        result = []
        # Fix: Crossref dumps are UTF-8; without an explicit encoding the
        # platform default is used, which breaks on non-UTF-8 locales
        # (e.g. Windows cp1252).
        with open(file_path, encoding="utf8") as f:
            j = load(f)
            if "items" in j:
                result.extend(j["items"])
        return result, len(result)

    def select_file(self, file_path):
        # Only .json files in the dump directory are processed.
        return file_path.endswith(".json")

    def get_next_citation_data(self):
        """Return the next (citing, cited, None, None, None, None) tuple,
        or None when all files are exhausted.

        Resumes mid-article: references with index <= last_ref were already
        emitted by earlier calls and are skipped.
        """
        row = self._get_next_in_file()
        while row is not None:
            citing = self.doi.normalise(row.get("DOI"))
            if citing is not None and "reference" in row:
                for idx, ref in enumerate(row["reference"]):
                    if idx > self.last_ref:
                        self.last_ref = idx
                        cited = self.doi.normalise(ref.get("DOI"))
                        if cited is not None:
                            # Compensate the row counter: this article will be
                            # fetched again on the next call to finish its
                            # remaining references.
                            self.last_row -= 1
                            self.update_status_file()
                            return citing, cited, None, None, None, None
            # Article fully processed: persist progress and move on.
            self.update_status_file()
            row = self._get_next_in_file()
            self.last_ref = -1
        # Input exhausted: the progress/status file is no longer needed.
        remove(self.status_file)
def test_doi_is_valid(self):
    """Validity checks with and without a local DOI cache / the API service."""
    # No cache, API enabled: validity is decided via the API service.
    api_only = DOIManager()
    self.assertTrue(api_only.is_valid(self.valid_doi_1))
    self.assertTrue(api_only.is_valid(self.valid_doi_2))
    self.assertFalse(api_only.is_valid(self.invalid_doi_1))
    self.assertFalse(api_only.is_valid(self.invalid_doi_2))

    # Cache only, API disabled: validity comes from the stored CSV data.
    doi_cache = CSVManager(self.valid_doi_path)
    cache_only = DOIManager(valid_doi=doi_cache, use_api_service=False)
    self.assertTrue(cache_only.is_valid(self.valid_doi_1))
    self.assertFalse(cache_only.is_valid(self.invalid_doi_1))

    # Neither cache nor API: no DOI can be confirmed valid.
    offline = DOIManager(use_api_service=False)
    self.assertFalse(offline.is_valid(self.valid_doi_1))
    self.assertFalse(offline.is_valid(self.invalid_doi_1))
def process(input_dir, output_dir):
    """Scan a Crossref JSON dump in `input_dir` and build the index support
    CSV files (valid DOIs, DOI->date, DOI->ISSN, DOI->ORCID) in `output_dir`.

    Two passes are made over the dump: the first registers citing articles
    and their metadata, the second harvests publication dates for cited
    DOIs from the reference lists.
    """
    if not exists(output_dir):
        makedirs(output_dir)
    # DOIs seen as citing articles that (so far) have no publication date.
    citing_doi_with_no_date = set()
    valid_doi = CSVManager(output_dir + sep + "valid_doi.csv")
    id_date = CSVManager(output_dir + sep + "id_date.csv")
    id_issn = CSVManager(output_dir + sep + "id_issn.csv")
    id_orcid = CSVManager(output_dir + sep + "id_orcid.csv")
    doi_manager = DOIManager(valid_doi)
    issn_manager = ISSNManager()
    orcid_manager = ORCIDManager()
    all_files, opener = get_all_files(input_dir)
    len_all_files = len(all_files)
    # Read all the JSON file in the Crossref dump to create the main information of all the indexes
    print("\n\n# Add valid DOIs from Crossref metadata")
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            try:
                data = load(f)
            # When using tar.gz file or zip file a stream of byte is returned by the opener. Thus,
            # it must be converted into an utf-8 string before loading it into a JSON.
            except TypeError:
                utf8reader = codecs.getreader("utf-8")
                data = load(utf8reader(f))
            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj:
                        citing_doi = doi_manager.normalise(obj["DOI"], True)
                        doi_manager.set_valid(citing_doi)
                        # Record the publication date only once per DOI.
                        if id_date.get_value(citing_doi) is None:
                            citing_date = Citation.check_date(build_pubdate(obj))
                            if citing_date is not None:
                                id_date.add_value(citing_doi, citing_date)
                                if citing_doi in citing_doi_with_no_date:
                                    citing_doi_with_no_date.remove(citing_doi)
                            else:
                                citing_doi_with_no_date.add(citing_doi)
                        # ISSNs are stored only for journal-typed items.
                        if id_issn.get_value(citing_doi) is None:
                            if "type" in obj:
                                cur_type = obj["type"]
                                if cur_type is not None and "journal" in cur_type and "ISSN" in obj:
                                    cur_issn = obj["ISSN"]
                                    if cur_issn is not None:
                                        for issn in [issn_manager.normalise(issn) for issn in cur_issn]:
                                            if issn is not None:
                                                id_issn.add_value(citing_doi, issn)
                        # Record any ORCID identifiers of the authors.
                        if id_orcid.get_value(citing_doi) is None:
                            if "author" in obj:
                                cur_author = obj['author']
                                if cur_author is not None:
                                    for author in cur_author:
                                        if "ORCID" in author:
                                            orcid = orcid_manager.normalise(author["ORCID"])
                                            if orcid is not None:
                                                id_orcid.add_value(citing_doi, orcid)
    # Do it again for updating the dates of the cited DOIs, if these are valid
    print("\n\n# Check cited DOIs from Crossref reference field")
    # Maps each cited DOI to every date the various references claim for it.
    doi_date = {}
    for file_idx, file in enumerate(all_files, 1):
        with opener(file) as f:
            print("Open file %s of %s" % (file_idx, len_all_files))
            data = load(f)
            if "items" in data:
                for obj in data['items']:
                    if "DOI" in obj and "reference" in obj:
                        for ref in obj['reference']:
                            if "DOI" in ref:
                                cited_doi = doi_manager.normalise(ref["DOI"], True)
                                if doi_manager.is_valid(cited_doi) and id_date.get_value(cited_doi) is None:
                                    if cited_doi not in doi_date:
                                        doi_date[cited_doi] = []
                                    cited_date = Citation.check_date(build_pubdate(ref))
                                    if cited_date is not None:
                                        doi_date[cited_doi].append(cited_date)
                                        if cited_doi in citing_doi_with_no_date:
                                            citing_doi_with_no_date.remove(cited_doi)
    # Add the date to the DOI if such date is the most adopted one in the various references.
    # In case two distinct dates are used the most, select the older one.
    for doi in doi_date:
        count = Counter(doi_date[doi])
        if len(count):
            top_value = count.most_common(1)[0][1]
            selected_dates = []
            for date in count:
                if count[date] == top_value:
                    selected_dates.append(date)
            # Sorting ISO dates lexicographically puts the oldest first.
            best_date = sorted(selected_dates)[0]
            id_date.add_value(doi, best_date)
        else:
            id_date.add_value(doi, "")
    # Add empty dates for the remaining DOIs
    for doi in citing_doi_with_no_date:
        id_date.add_value(doi, "")
def __init__(self, src, local_name=""):
    """Create the Crossref source and reset the per-article reference cursor."""
    # -1 means no reference of the current article has been consumed yet.
    self.last_ref = -1
    self.doi = DOIManager()
    # State above must exist before the parent constructor runs.
    super(CrossrefCitationSource, self).__init__(src, local_name)
result = {} threshold = 10000 existing_ocis = set() all_files = [] if isdir(args.input_file): for cur_dir, cur_subdir, cur_files in walk(args.input_file): for cur_file in [f for f in cur_files if f.endswith(".csv")]: all_files.append(cur_dir + sep + cur_file) else: all_files.append(args.input_file) dois = None if args.doi_file is not None and exists(args.doi_file): with open(args.doi_file, encoding="utf8") as f: dm = DOIManager() dois = set() csv_reader = reader(f) for row in csv_reader: doi = dm.normalise(row[0]) if doi: dois.add(doi) for cur_file in all_files: with open(cur_file, encoding="utf8") as f: csv_content = "" for idx, line in enumerate(f.readlines()): if header is None: header = line csv_content = header else:
def __init__(self, src, local_name=""):
    """Create the crowdsourced source with a DOI manager for normalisation."""
    # The manager must exist before the parent constructor runs.
    self.doi = DOIManager()
    super(CrowdsourcedCitationSource, self).__init__(src, local_name)