def process_doi(self, doi, doi_curator, doi_source_provider):
    """Resolve a DOI to a resource, asking Crossref only when it is unknown.

    The DOI is first looked up through ``self.rf.retrieve_from_doi``. When
    that lookup yields something, the result is handed to
    ``self.process_existing_by_id`` together with ``self.id``. Otherwise the
    Crossref item is fetched with ``self.get_crossref_item`` and, if one is
    obtained, passed on to ``self.process_crossref_json``.

    :param doi: the DOI to resolve.
    :param doi_curator: forwarded unchanged to ``process_crossref_json``.
    :param doi_source_provider: forwarded unchanged to
        ``process_crossref_json``.
    :return: the value returned by ``process_existing_by_id`` or
        ``process_crossref_json``; ``None`` when the DOI is unknown locally
        and no Crossref item could be obtained.
    """
    known_resource = self.rf.retrieve_from_doi(doi)
    if known_resource is not None:
        return self.process_existing_by_id(known_resource, self.id)

    crossref_record = self.get_crossref_item(
        self.__process_entity(doi, self.crossref_api_works))
    if crossref_record is None:
        # Nothing stored locally and nothing retrieved from Crossref.
        return None

    return self.process_crossref_json(
        crossref_record,
        self.crossref_api_works + encode_url(doi),
        doi_curator,
        doi_source_provider,
        self.source)
def process_article(self, cur_id, cur_source, cur_doi, cur_pmid, cur_pmcid,
                    oa=False, intext_refs=False):
    """Retrieve and store the reference list of one article.

    The article is skipped when any of its identifiers (DOI, PMID, PMCID or
    the local id ``<cur_source>-<cur_id>``) is reported as already stored by
    ``self.rs.is_any_stored``. Otherwise the reference list is obtained
    through ``self.process_xml_source`` (when ``oa`` is true) or
    ``self.process_references`` (when it is not) and stored with
    ``self.rs.store``. Progress is reported on ``self.repok`` and problems
    on ``self.reper``.

    :param cur_id: identifier of the article within ``cur_source``.
    :param cur_source: source code used as prefix of the local id.
    :param cur_doi: DOI of the article, or ``None``. When it is ``None`` and
        a PMID is available, ``__get_doi_from_xml_source`` is asked for it.
    :param cur_pmid: PMID of the article, or ``None``.
    :param cur_pmcid: PMCID of the article, or ``None``.
    :param oa: when true, the references come from ``process_xml_source``.
    :param intext_refs: forwarded (as a boolean) to ``process_xml_source``
        when ``oa`` is true; ignored otherwise.
    :return: ``None``.
    """
    if cur_doi is None and cur_pmid is not None:
        cur_doi = self.__get_doi_from_xml_source(cur_pmid)

    cur_localid = cur_source + "-" + cur_id
    # The PMCID used to be missing here (the PMID was listed twice), so an
    # article known only by its PMCID was never recognised as stored.
    id_list = [cur_doi, cur_pmid, cur_pmcid, cur_localid]

    if self.rs.is_any_stored(id_list):
        self.repok.add_sentence(
            "The article '%s' has been already stored." % cur_localid)
        return

    self.repok.new_article()
    self.repok.add_sentence(
        "Processing article with local id '%s'." % cur_localid)

    if oa:
        ref_list_url = self.process_xml_source(
            cur_pmid, cur_doi, intext_refs=bool(intext_refs))
    else:
        ref_list_url = self.process_references(cur_source, cur_id)

    if ref_list_url is None:
        self.reper.add_sentence(
            "The article '%s' has no references or its PubMed Central "
            "ID is not defined." % cur_localid)
        return

    # cur_localid is never None, so a primary identifier is always found.
    primary_id = next(item for item in id_list if item is not None)
    stored = self.rs.store(
        primary_id, cur_localid, cur_doi, cur_pmid, cur_pmcid,
        self.name, self.provider, encode_url(ref_list_url), True)
    if stored:
        self.repok.add_sentence(
            "References of '%s' have been stored." % cur_localid)
    else:
        self.repok.add_sentence(
            "Something went wrong in storing the references of '%s'."
            % cur_localid)
def process_doi(self, doi: str, doi_curator: str, doi_source_provider: str,
                check=False, result=None, typ='both'):
    """Process a DOI, looking it up on Crossref when it is not known yet.

    :param doi: The DOI to be searched.
    :param doi_curator: The curator (URL), e.g.
        https://api.crossref.org/works/
    :param doi_source_provider: The source provider, e.g. Europe PubMed
        Central.
    :param check: Set it to True only in the tests, in order to get back the
        Crossref JSON instead of processing it.
    :param result: A result already retrieved with the query interface
        during the Bibentry creation process; when given, no new query is
        issued.
    :param typ: A string that can be 'both', 'only_local', or
        'only_blazegraph'. Forwarded to ``self.rf.retrieve_from_doi`` to
        restrict the lookup to a specific kind of store.
    :return: The value of ``process_existing_by_id`` when the DOI is already
        known; otherwise the Crossref JSON (if ``check`` is true) or the
        value of ``process_crossref_json``; ``None`` when no Crossref data
        is available.
    """
    # A resource we already hold short-circuits everything else.
    known_resource = self.rf.retrieve_from_doi(doi, typ=typ)
    if known_resource is not None:
        return self.process_existing_by_id(known_resource, self.id)

    # Reuse the caller-supplied result, querying only when there is none.
    crossref_record = (
        self.query_interface.get_data_crossref_doi(doi)
        if result is None else result)
    if crossref_record is None:
        return None
    if check:
        return crossref_record

    return self.process_crossref_json(
        crossref_record,
        self.crossref_api_works + encode_url(doi),
        doi_curator,
        doi_source_provider,
        self.source)
def create_url(self, string):
    """Associate a URL identifier with this entity.

    The given string is lower-cased and passed through ``encode_url`` before
    being registered under the ``GraphEntity.url`` scheme.

    :param string: the URL to associate.
    :return: whatever ``_associate_identifier_with_scheme`` returns.
    """
    normalized_url = encode_url(string.lower())
    return self._associate_identifier_with_scheme(
        normalized_url, GraphEntity.url)
def process_article(self, paper, oa=False, intext_refs=False):
    """Extract the references of one paper and dump them to a JSON file.

    ``paper`` is read through the keys ``cur_doi``, ``cur_pmid``,
    ``cur_pmcid``, ``cur_name`` and ``references`` (a JSON-encoded string).
    A local id ``MED-<id>`` is derived from the first usable identifier
    (PMID, then PMCID, then DOI). Unless the paper is already stored
    according to ``self.rs.is_any_stored``, the references are obtained via
    ``self.process_xml_source`` and written, together with the paper's
    identifiers, to a new JSON file under ``self.rs.ref_dir``; the local id
    is then appended to ``self.rs.csv_file`` and added to ``self.rs.stored``.

    :param paper: mapping describing the paper (see the keys above).
    :param oa: when true, references are taken from ``process_xml_source``.
        When false no reference source is available, so the paper is
        reported on ``self.reper`` as having no references.
    :param intext_refs: forwarded (as a boolean) to ``process_xml_source``.
    :return: ``True`` when the JSON file has been written; ``None`` in every
        other case (no id, already stored, no references, write failure).
    """
    cur_source = "MED"
    cur_doi = paper["cur_doi"]
    cur_pmid = int(paper["cur_pmid"])
    cur_pmcid = paper["cur_pmcid"]
    cur_name = paper["cur_name"]

    # NOTE(review): the PMCID is compared with 0 without any conversion,
    # so a None or "0" value would be accepted as an id -- confirm the
    # shape of the incoming data.
    if cur_pmid != 0:
        cur_id = "PMID{}".format(cur_pmid)
    elif cur_pmcid != 0:
        cur_id = "{}".format(cur_pmcid)
    elif cur_doi is not None and cur_doi != "":
        cur_id = "DOI{}".format(cur_doi)
    else:
        self.repok.add_sentence("No id for this paper")
        return

    references = json.loads(paper["references"])
    cur_localid = "{}-{}".format(cur_source, cur_id)
    # The PMCID used to be missing here (the PMID was listed twice).
    id_list = [str(cur_doi), str(cur_pmid), str(cur_pmcid), cur_localid]

    if self.rs.is_any_stored(id_list):
        self.repok.add_sentence(
            "The article '%s' has been already stored." % cur_localid)
        return

    self.repok.new_article()
    self.repok.add_sentence(
        "Processing article with local id {}".format(cur_localid))

    # Defaults cover the non-OA case, which previously left these names
    # unbound and crashed with UnboundLocalError.
    ref_list_url, ref_list, ref_pointer_list = None, [], []
    if oa:
        ref_list_url, ref_list, ref_pointer_list = self.process_xml_source(
            cur_pmid, cur_name, cur_doi, references,
            intext_refs=bool(intext_refs))

    if ref_list_url is None:
        self.reper.add_sentence(
            "The article '%s' has no references or its PubMed Central "
            "ID is not defined." % cur_localid)
        return

    # Normalise missing identifiers to the empty string.
    if cur_pmid == 0 or cur_pmid is None:
        cur_pmid = ""
    if cur_pmcid == 0 or cur_pmcid is None:
        cur_pmcid = ""

    # cur_localid is never empty at this point; the stored value is the id
    # without the source prefix, as before.
    json_item = {"references": ref_list, "localid": cur_id}
    if cur_doi != "" and cur_doi != "nan" and cur_doi is not None:
        json_item["doi"] = str(cur_doi)
    if cur_pmid != "" and cur_pmid != "0" and cur_pmid is not None:
        json_item["pmid"] = str(cur_pmid)
    if cur_pmcid != "" and cur_pmcid != "0" and cur_pmcid is not None:
        json_item["pmcid"] = str(cur_pmcid)
    if self.name is not None:
        json_item["curator"] = str(self.name)
    if self.provider is not None:
        json_item["source_provider"] = str(self.provider)
    encoded_source = encode_url(ref_list_url)
    if encoded_source is not None:
        json_item["source"] = str(encoded_source)
    if len(ref_pointer_list):
        json_item["reference_pointers"] = ref_pointer_list

    # Files are grouped in one directory per supplier and per hour: the
    # regex keeps only the year-month-day-hour part of the timestamp.
    cur_time = datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f_')
    local_file_name = str(uuid.uuid4()) + ".json"
    local_dir_name = self.rs.new_supplier() + re.sub(
        "^([0-9]+-[0-9]+-[0-9]+-[0-9]+).+$", "\\1", cur_time)
    new_dir_path = self.rs.ref_dir + os.sep + local_dir_name
    new_file_path = new_dir_path + os.sep + local_file_name
    os.makedirs(new_dir_path, exist_ok=True)

    try:
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding must not depend on the platform locale.
        with open(new_file_path, "w", encoding="utf-8") as f:
            json.dump(json_item, f, indent=4, ensure_ascii=False)
        if cur_localid not in self.rs.stored:
            with open(self.rs.csv_file, "a") as name_f:
                name_f.write(cur_localid + "\n")
            self.rs.stored.add(cur_localid)
        return True
    except Exception as e:
        # Best effort: report the failure and carry on with the next paper.
        print(e, "\n\n", json_item, "\n\n")