def get_orcid_ids(self, doi_string, family_names=None):
    """Return author records (ORCID id plus name variants) for a DOI.

    :param doi_string: the DOI used to query the ORCID records.
    :param family_names: optional list of family names used to filter the
        ORCID query (defaults to no filtering).
    :return: a list of `da` records with keys "orcid", "given", "family",
        "credit" and "other"; empty if nothing was found.
    """
    # Fix: the original used a mutable default argument (family_names=[]),
    # which is shared across calls; use None and create a fresh list instead.
    if family_names is None:
        family_names = []
    result = []
    records = self.get_orcid_records(doi_string, family_names)
    if records is not None:
        for orcid_id in dg(records, ["result", "orcid-identifier", "path"]):
            personal_details = self.get_orcid_data(orcid_id)
            if personal_details is not None:
                given_name = dg(personal_details, ["name", "given-names", "value"])
                family_name = dg(personal_details, ["name", "family-name", "value"])
                credit_name = dg(personal_details, ["name", "credit-name", "value"])
                other_names = dg(personal_details, ["other-names", "other-name", "content"])
                result += [
                    da({
                        "orcid": orcid_id,
                        "given": given_name,
                        "family": family_name,
                        "credit": credit_name,
                        "other": other_names
                    })
                ]
    return result
def __get_paper_data(self, source, paper_id):
    """Look up a paper on the API and return its DOI/PMID/PMCID triple.

    All three values default to None when the paper is not found.
    """
    ids = {"doi": None, "pmid": None, "pmcid": None}
    query_url = self.paper_api + "ext_id:%s+src:%s" % (paper_id, source)
    response = self.__get_data(query_url)
    hits = dg(response, ["resultList", "result"])
    if hits is not None and len(hits) > 0:
        # Only the first matched record is taken into account.
        first_hit = hits[0]
        for id_key in ("doi", "pmid", "pmcid"):
            ids[id_key] = dg(first_hit, [id_key])
    return ids
def process_citing_entity(self):
    """Retrieve or create the citing bibliographic resource, attach its
    identifiers, then link it to every cited entity produced by
    `process_references`.

    :return: the updated graph set.
    """
    citing_entity = None
    # First try to reuse an entity already known by its OCC URI.
    if self.occ is not None:
        citing_resource = self.rf.retrieve_entity(self.occ, GraphEntity.expression)
        citing_entity = self.g_set.add_br(self.name, self.id, self.source_provider, citing_resource)
    # Otherwise, try to resolve it through its DOI.
    if citing_entity is None and self.doi is not None:
        citing_entity = self.process_doi(self.doi, self.curator, self.source_provider)
    # As a last resort, mint a brand-new entity carrying the DOI.
    if citing_entity is None:
        citing_entity = self.g_set.add_br(self.name)
        self.__add_doi(citing_entity, self.doi, self.curator)
        self.rf.update_graph_set(self.g_set)
        self.repok.add_sentence(
            self.message("The citing entity has been created even if no results have "
                         "been returned by the API.", "doi", self.doi))
    # Add other ids if they exist
    self.__add_pmid(citing_entity, self.pmid)
    self.__add_pmcid(citing_entity, self.pmcid)
    cited_entities = self.process_references()
    if cited_entities is not None:
        # cited_entities is assumed to be aligned index-by-index with
        # self.entries (see the self.entries[idx] lookup below).
        for idx, cited_entity in enumerate(cited_entities):
            citing_entity.has_citation(cited_entity)
            cur_bibentry = dg(self.entries[idx], ["bibentry"])
            if cur_bibentry is not None and cur_bibentry.strip():
                # Materialise the textual reference and link it both ways.
                cur_be = self.g_set.add_be(self.curator, self.source_provider, self.source)
                citing_entity.contains_in_reference_list(cur_be)
                cited_entity.has_reference(cur_be)
                cur_be.create_content(cur_bibentry.strip())
    return self.g_set
def get_orcid_ids(self, doi_string, family_names=None):
    """Return author records (ORCID id plus name variants) for a DOI.

    When the query interface is a `RemoteQuery`, the full personal details
    are fetched from the ORCID API; otherwise the records returned by the
    local interface only carry the given and family names.

    :param doi_string: the DOI used to query the ORCID records.
    :param family_names: optional list of family names used to filter the
        query (defaults to no filtering).
    :return: a list of `da` records with keys "orcid", "given", "family",
        "credit" and "other"; empty if nothing was found.
    """
    # Fix: the original used a mutable default argument (family_names=[]),
    # which is shared across calls; use None and create a fresh list instead.
    if family_names is None:
        family_names = []
    result = []
    records = self.get_orcid_records(doi_string, family_names)
    if records is not None:
        if isinstance(self.query_interface, RemoteQuery):
            for orcid_id in dg(records, ["result", "orcid-identifier", "path"]):
                personal_details = self.get_orcid_data(orcid_id)
                if personal_details is not None:
                    given_name = dg(personal_details, ["name", "given-names", "value"])
                    family_name = dg(personal_details, ["name", "family-name", "value"])
                    credit_name = dg(personal_details, ["name", "credit-name", "value"])
                    other_names = dg(
                        personal_details, ["other-names", "other-name", "content"])
                    result += [
                        da({
                            "orcid": orcid_id,
                            "given": given_name,
                            "family": family_name,
                            "credit": credit_name,
                            "other": other_names
                        })
                    ]
        else:
            for author in records:
                result += [
                    da({
                        "orcid": author['orcid'],
                        "given": author['given_names'],
                        "family": author['family_name'],
                        "credit": "",  # actually we don't manage this
                        "other": ""  # actually we don't manage this
                    })
                ]
    return result
def process_references(self, cur_source, cur_id):
    """Fetch the reference list of a paper and feed every reference into
    the reference storer (`self.rs`).

    :param cur_source: the source database of the citing paper.
    :param cur_id: the identifier of the citing paper in that source.
    :return: the URL that was queried for the reference list (the
        references themselves are accumulated as a side effect on self.rs).
    """
    ref_list_url = self.ref_list_api.replace("XXX", cur_source).replace(
        "YYY", cur_id)
    paper_references = self.__get_data(ref_list_url)
    references = dg(paper_references, ["referenceList", "reference"])
    if references is not None:
        self.rs.new_ref_list()
        for reference in references:
            # __create_entry returns (entry_text, process_flag) or None.
            ref_entry = self.__create_entry(reference)
            entry_text = None if ref_entry is None else ref_entry[0]
            process_entry_text = \
                True if ref_entry is None else ref_entry[1]
            # Add special data if the reference matches with
            # the ePMC database
            if reference["match"] == "Y":
                ref_id = reference["id"]
                ref_source = reference["source"]
                ref_localid = ref_source + "-" + ref_id
                # A matched reference gets its canonical ids from ePMC.
                paper_ids = self.__get_paper_data(ref_source, ref_id)
                ref_doi = self.normalise_doi(paper_ids["doi"])
                ref_pmid = paper_ids["pmid"]
                ref_pmcid = paper_ids["pmcid"]
            else:
                # Unmatched references only carry what the record itself has.
                ref_localid = None
                ref_doi = self.normalise_doi(dg(reference, ["doi"]))
                ref_pmid = dg(reference, ["pmid"])
                ref_pmcid = dg(reference, ["pmcid"])
            ref_url = dg(reference, ["externalLink"])
            self.rs.add_reference(entry_text, process_entry_text,
                                  ref_localid, ref_doi, ref_pmid,
                                  ref_pmcid, ref_url, None)  # TODO none xmlid
    return ref_list_url
def __init__(self, full_entry, repok, reperr, query_interface, resourcefinder,
             get_bib_entry_doi, message, process_existing_by_id,
             do_process_entry=True):
    """Set up the processor state for one bibliographic entry and start
    the remote lookup (`process_remote`, called at the end).

    :param full_entry: JSON record with the entry text ("bibentry"), any
        provided identifiers ("doi", "pmid", "pmcid", "url") and the
        optional "process_entry" flag.
    :param repok: reporter collecting success messages.
    :param reperr: reporter collecting error messages.
    :param query_interface: interface used to run the queries.
    :param resourcefinder: finder used to look up already-stored resources.
    :param get_bib_entry_doi: whether DOIs extracted from the entry text
        may be used for lookups.
    :param message: helper producing formatted report sentences.
    :param process_existing_by_id: policy for resources already known by id.
    :param do_process_entry: whether the plain entry text should be
        processed; overridden by the "process_entry" field when present.
    """
    self.id = "Crossref"
    self.repok = repok
    self.reperr = reperr
    self.query_interface = query_interface
    self.rf = resourcefinder
    self.get_bib_entry_doi = get_bib_entry_doi
    self.message = message
    self.process_existing_by_id = process_existing_by_id
    self.extracted_doi_used = False
    self.do_process_entry = do_process_entry
    # Data coming straight from the input record.
    self.entry = dg(full_entry, ["bibentry"])
    self.provided_doi = dg(full_entry, ["doi"])
    self.provided_pmid = dg(full_entry, ["pmid"])
    self.provided_pmcid = dg(full_entry, ["pmcid"])
    self.provided_url = dg(full_entry, ["url"])
    self.process_string = dg(full_entry, ["process_entry"])
    # Variables used to store results
    self.process_doi_result = None
    self.process_pmid_result = None
    self.process_pmcid_result = None
    self.process_url_result = None
    self.existing_bibref_entry = None
    self.extracted_doi = None
    self.extracted_url = None
    self.cur_res = None
    self.existing_res_on_blazegraph = None
    self.cur_res_obtained_via = None
    self.cur_json_obtained_via = None
    # Variable to use for disambiguation purposes
    self.to_be_considered = True
    # The curator-supplied "process_entry" string overrides the default.
    if self.process_string is not None:
        self.do_process_entry = self.process_string.lower().strip() == "true"
    # Either normalise the provided URL, or extract URL/DOI from the text.
    if self.provided_url is not None:
        self.provided_url = FormatProcessor.extract_url(self.provided_url)
    else:
        self.extracted_url = FormatProcessor.extract_url(self.entry)
    self.extracted_doi = FormatProcessor.extract_doi(self.entry)
    # Start to query for data
    self.process_remote()
def process(self, oa=False, intext_refs=False):
    """Iterate over all result pages of the API (cursor-based pagination)
    and process every article found, until the pages are exhausted, an
    error occurs, or the external stopper asks to halt.

    :param oa: flag forwarded to the page fetch and to process_article.
    :param intext_refs: flag forwarded to process_article.
    """
    while True:
        if self.stopper.can_proceed():
            cur_page = self.__get_next_page()
            result, cur_get_url = self.__get_data_from_page(cur_page, oa)
            # Re-run the query with the first page,
            # since a wrong page can be specified
            if result is None:
                result, cur_get_url = self.__get_data_from_page("*", oa)
            # Proceed only if there were no problems in getting the data, otherwise stop
            if result is not None:
                papers_retrieved = dg(result, ["resultList", "result"])
                if papers_retrieved is not None and papers_retrieved:
                    for paper in papers_retrieved:
                        if self.stopper.can_proceed():
                            cur_id = dg(paper, ["id"])
                            cur_source = dg(paper, ["source"])
                            cur_doi = self.normalise_doi(dg(
                                paper, ["doi"]))
                            cur_pmid = dg(paper, ["pmid"])
                            cur_pmcid = dg(paper, ["pmcid"])
                            self.process_article(cur_id, cur_source, cur_doi,
                                                 cur_pmid, cur_pmcid, oa,
                                                 intext_refs)
                        else:
                            break
                    # Remember the cursor only if the page was fully handled.
                    if self.stopper.can_proceed():
                        self.__store_page_number(
                            dg(result, ["nextCursorMark"]))
                else:
                    # We have browsed all the pages with results, and thus the counting is reset
                    self.__reset_page_number()
                    self.repok.add_sentence(
                        "All the pages have been processed.")
                    break
            else:
                # NOTE(review): `self.reper` looks like a typo for the error
                # reporter `self.reperr` used elsewhere in this file —
                # confirm which attribute this class actually defines.
                self.reper.add_sentence(
                    "Problems in retrieving data for '%s'" % cur_get_url)
                break
        else:
            # Process stopped due to external reasons
            self.repok.add_sentence(
                "Process stopped due to external reasons.")
            break
def __create_entry(entry):
    """Build a plain-text bibliographic string from an ePMC reference record.

    :param entry: the JSON reference record returned by the API.
    :return: a tuple (entry_text, process_flag) where process_flag says
        whether the text carries all the key fields (author, year, title),
        or None when no usable text could be assembled.
    """
    # Fix: use a raw string for the regex — "[\.\?\!]$" contains invalid
    # escape sequences — and compile it once instead of on every field.
    ends_with_punct = re.compile(r"[.?!]$")

    def sep_after_eds(cur):
        # Separator rule shared by journal/container/series: a comma right
        # after "(Eds.)", nothing after sentence punctuation, a dot otherwise.
        return "," if cur.endswith("(Eds.)") else \
            "" if ends_with_punct.search(cur) is not None else "."

    result = None
    author = dg(entry, ["authorString"])
    unstructured = dg(entry, ["unstructuredInformation"])
    if author is not None and author.lower() != "author unknown":
        to_process = True
        entry_string = author
        year = dg(entry, ["pubYear"])
        # Fix: the API may return the year as a string; "year > 0" would
        # raise TypeError on str, so normalise through int() first.
        try:
            valid_year = year is not None and int(year) > 0
        except (TypeError, ValueError):
            valid_year = False
        if valid_year:
            entry_string += " (%s)" % str(year)
        else:
            to_process &= False
        title = dg(entry, ["title"])
        if title is not None and title.strip() != "":
            # endswith() instead of entry_string[-1] is safe on empty strings.
            entry_string += "%s %s" % ("" if entry_string.endswith(".") else ".",
                                       title.strip())
        else:
            to_process &= False
        editors = dg(entry, ["editors"])
        if editors is not None and editors.strip() != "":
            entry_string += "%s %s (Eds.)" % ("" if entry_string.endswith(".") else ".",
                                              editors.strip())
        # Journal abbreviation, container title and series share the same
        # appending rule, so handle them uniformly.
        for field in ("journalAbbreviation", "publicationTitle", "seriesName"):
            value = dg(entry, [field])
            if value is not None and value.strip() != "":
                entry_string += "%s %s" % (sep_after_eds(entry_string), value.strip())
        volume = dg(entry, ["volume"])
        if volume is not None and volume.strip() != "":
            entry_string += "%s %s" % ("" if ends_with_punct.search(entry_string) is not None else ",",
                                       volume.strip())
        issue = dg(entry, ["issue"])
        if issue is not None and issue.strip() != "":
            # Parenthesise the issue only when it directly follows a volume
            # number; the [-1:] slice is safe even on an empty string.
            is_digit = entry_string[-1:].isdigit()
            entry_string += "%s%s%s" % (" (" if is_digit else " ",
                                        issue.strip(),
                                        ")" if is_digit else "")
        page = dg(entry, ["pageInfo"])
        if page is not None and page.strip() != "":
            entry_string += "%s %s" % ("" if ends_with_punct.search(entry_string) is not None else ":",
                                       page.strip())
        edition = dg(entry, ["edition"])
        if edition is not None and edition.strip() != "":
            entry_string += "%s %s" % ("" if ends_with_punct.search(entry_string) is not None else ".",
                                       edition.strip())
        doi = dg(entry, ["doi"])
        if doi is not None and doi.strip() != "":
            entry_string += "%s https://doi.org/%s" % \
                ("" if entry_string.endswith(".") else ".", doi.strip())
        result = (entry_string, to_process)
    elif unstructured is not None and len(unstructured.strip()):
        # Fall back on the free-text reference, stripped of HTML markup.
        result = (html.document_fromstring(
            unstructured.strip()).text_content(), True)
    return result
def process_references(self):
    """Resolve every reference in self.entries into a bibliographic
    resource, trying in order: provided DOI, PMID, PMCID, URL, the entry
    text itself, then DOI/URL extracted from the text; finally creating a
    new resource when nothing matched.

    :return: the list of resolved/created resources, aligned with
        self.entries, or None as soon as an entry produced errors.
    """
    result = []
    for full_entry in self.entries:
        self.repok.new_article()
        self.reperr.new_article()
        cur_res = None
        entry = dg(full_entry, ["bibentry"])
        do_process_entry = True
        process_string = dg(full_entry, ["process_entry"])
        if process_string is not None:
            do_process_entry = process_string.lower().strip() == "true"
        provided_doi = dg(full_entry, ["doi"])
        provided_pmid = dg(full_entry, ["pmid"])
        provided_pmcid = dg(full_entry, ["pmcid"])
        provided_url = dg(full_entry, ["url"])
        # This is useful if additional data are stored in the field URL, e.g.:
        # 'http://pub.stat.ee/px/web.2001/dialog/statfile1.asp. Accessed on 2009'
        if provided_url is not None:
            provided_url = FormatProcessor.extract_url(provided_url)
        extracted_doi = FormatProcessor.extract_doi(entry)
        extracted_doi_used = False
        extracted_url = FormatProcessor.extract_url(entry)
        # Lookup cascade: each step runs only if the previous ones failed.
        if provided_doi is not None:
            cur_res = self.process_doi(provided_doi, self.curator,
                                       self.source_provider)
            if cur_res is not None:
                self.repok.add_sentence(
                    self.message(
                        "The entity has been found by means of the "
                        "DOI provided as input by %s." % self.source_provider,
                        "DOI", provided_doi))
        if cur_res is None and provided_pmid is not None:
            cur_res = self.process_pmid(provided_pmid)
            if cur_res is not None:
                self.repok.add_sentence(
                    self.message(
                        "The entity has been found by means of the "
                        "PMID provided as input by %s." % self.source_provider,
                        "PMID", provided_pmid))
        if cur_res is None and provided_pmcid is not None:
            cur_res = self.process_pmcid(provided_pmcid)
            if cur_res is not None:
                self.repok.add_sentence(
                    self.message(
                        "The entity has been found by means of the "
                        "PMCID provided as input by %s." % self.source_provider,
                        "PMCID", provided_pmcid))
        if cur_res is None and provided_url is not None:
            cur_res = self.process_url(provided_url)
            if cur_res is not None:
                self.repok.add_sentence(
                    self.message(
                        "The entity has been found by means of the "
                        "URL provided as input by %s."
                        % self.source_provider, "URL", provided_url))
        if cur_res is None and entry is not None:
            if do_process_entry:
                cur_res = self.process_entry(entry)
            if cur_res is None:
                # Last chance: identifiers extracted from the entry text.
                if self.get_bib_entry_doi and extracted_doi is not None:
                    extracted_doi_used = True
                    cur_res = self.process_doi(extracted_doi, self.name,
                                               self.source_provider)
                    if cur_res is not None:
                        self.repok.add_sentence(
                            self.message(
                                "The entity for '%s' has been found by means of the "
                                "DOI extracted from it." % entry,
                                "DOI", extracted_doi))
                if cur_res is None and self.get_bib_entry_url and extracted_url is not None:
                    existing_res = self.rf.retrieve_from_url(extracted_url)
                    if existing_res is not None:
                        cur_res = self.g_set.add_br(
                            self.name, self.source_provider, self.source,
                            existing_res)
                        self.repok.add_sentence(
                            self.message(
                                "The entity for '%s' has been found by means of the "
                                "URL extracted from it." % entry,
                                "URL", extracted_url))
            else:
                self.repok.add_sentence(
                    self.message(
                        "The entity has been retrieved by using the search API.",
                        "entry", entry))
        # If no errors were generated, proceed
        if self.reperr.is_empty():
            # If it is none
            if cur_res is None:
                cur_res = self.g_set.add_br(self.name)
                self.rf.update_graph_set(self.g_set)
                self.repok.add_sentence(
                    self.message(
                        "The entity has been created even if no results have "
                        "been returned by the API.", "entry", entry))
            # Add the DOI, the PMID and the PMCID if they have been provided by the curator
            # (if they are not already associated to the resource)
            self.__add_doi(cur_res, provided_doi, self.curator)
            self.__add_pmid(cur_res, provided_pmid)
            self.__add_pmcid(cur_res, provided_pmcid)
            self.__add_url(cur_res, provided_url)
            # Add any DOI extracted from the entry if it is not already included (and only if
            # a resource has not been retrieved by a DOI specified in the entry explicitly, or
            # by a Crossref search.
            if self.get_bib_entry_doi and extracted_doi_used:
                self.__add_doi(cur_res, extracted_doi, self.name)
            # Add any URL extracted from the entry if it is not already included
            if self.get_bib_entry_url:
                self.__add_url(cur_res, extracted_url)
            result += [cur_res]
            self.rf.update_graph_set(self.g_set)
        else:
            # If errors have been raised, stop the process for this entry (by returning None)
            return None
    # If the process comes here, then everything worked correctly
    return result
def author(self, cur_br, key, json, source, *args):
    """Create agent and role entities for every author (or editor, for
    edited books) of the given bibliographic resource, matching authors
    to ORCID records by family and given name when possible.

    :param cur_br: the bibliographic resource the people contributed to.
    :param key: the key of `json` holding the contributor list.
    :param json: the Crossref JSON record of the resource.
    :param source: provenance source passed to every created entity.
    """
    # Get all ORCID of the authors (if any)
    all_authors = json[key]
    all_family_names = dg(all_authors, ["family"])
    author_orcid = []
    if "DOI" in json and all_family_names:
        doi_string = json["DOI"]
        if self.of is not None:
            author_orcid = self.of.get_orcid_ids(doi_string, all_family_names)
    # Used to create ordered list of authors/editors of bibliographic entities
    prev_role = None
    # Analyse all authors
    for author in json["author"]:
        given_name_string = None
        if "given" in author:
            given_name_string = author["given"]
        family_name_string = None
        if "family" in author:
            family_name_string = author["family"]
        cur_orcid_record = None
        # TODO: handle if ORCID in Crossref
        if family_name_string:
            # Get all the ORCID/author records retrieved that share the
            # family name into consideration
            orcid_with_such_family = dgt(author_orcid, "family", family_name_string)
            author_with_such_family = dgt(all_authors, "family", family_name_string)
            if len(orcid_with_such_family) == 1 and len(author_with_such_family) == 1:
                # Unambiguous: one record on each side with this family name.
                cur_orcid_record = orcid_with_such_family[0]
            elif given_name_string is not None and \
                    len(orcid_with_such_family) >= 1 and len(author_with_such_family) >= 1:
                # From the previous lists of ORCID/author record, get the list
                # of all the given name defined
                orcid_given_with_such_family = dg(orcid_with_such_family, ["given"])
                author_given_with_such_family = dg(author_with_such_family, ["given"])
                # Get the indexes of the previous list that best match with the
                # given name of the author we are considering
                closest_orcid_matches_idx = \
                    slc(orcid_given_with_such_family, given_name_string)
                closest_author_matches_idx = \
                    slc(author_given_with_such_family, given_name_string)
                if len(closest_orcid_matches_idx) == 1 and \
                        len(closest_author_matches_idx) == 1:
                    # Cross-check: the ORCID given name must map back onto the
                    # same author record.
                    # NOTE(review): index [0] here takes the FIRST given name
                    # rather than the matched one
                    # (orcid_given_with_such_family[closest_orcid_matches_idx[0]])
                    # — confirm this is intended.
                    closest_author_orcid_matches_idx = slc(
                        author_given_with_such_family,
                        orcid_given_with_such_family[0])
                    if closest_author_orcid_matches_idx == closest_author_matches_idx:
                        cur_orcid_record = \
                            orcid_with_such_family[closest_orcid_matches_idx[0]]
        # An ORCID has been found to match with such author record, and we try to
        # see if such orcid (and thus, the author) has been already added in the
        # store
        retrieved_agent = None
        if cur_orcid_record is not None and self.rf is not None:
            # TODO: handle if ORCID in Crossref
            retrieved_agent = self.rf.retrieve_from_orcid(cur_orcid_record["orcid"])
        # If the resource does not exist already, create a new one
        if retrieved_agent is None:
            cur_agent = self.g_set.add_ra(self.name, self.id, source)
            if cur_orcid_record is not None and self.of is not None:
                cur_agent_orcid = self.g_set.add_id(self.of.name, self.of.id,
                                                    self.of.get_last_query())
                cur_agent_orcid.create_orcid(cur_orcid_record["orcid"])
                cur_agent.has_id(cur_agent_orcid)
                self.rf.add_orcid_to_store(cur_agent, cur_agent_orcid,
                                           cur_orcid_record["orcid"])
            # Prefer the names from the Crossref record, falling back on the
            # names stored in the matched ORCID record.
            if given_name_string is not None:
                cur_agent.create_given_name(given_name_string)
            elif cur_orcid_record is not None and "given" in cur_orcid_record:
                cur_agent.create_given_name(cur_orcid_record["given"])
            if family_name_string is not None:
                cur_agent.create_family_name(family_name_string)
            elif cur_orcid_record is not None and "family" in cur_orcid_record:
                cur_agent.create_family_name(cur_orcid_record["family"])
        else:
            cur_agent = self.g_set.add_ra(self.name, self.id, source, retrieved_agent)
        # Add statements related to the author resource (that could or could not
        # exist in the store)
        cur_role = self.g_set.add_ar(self.name, self.id, source)
        if json["type"] == "edited-book":
            cur_role.create_editor(cur_br)
        else:
            cur_role.create_author(cur_br)
        cur_agent.has_role(cur_role)
        # Chain the roles to keep the contributors ordered.
        if prev_role is not None:
            cur_role.follows(prev_role)
        prev_role = cur_role
def process_citing_entity(self):
    """Retrieve or create the citing bibliographic resource (blazegraph
    lookups only), attach its identifiers, link it to every cited entity,
    and optionally process in-text reference pointers.

    :return: the updated graph set.
    """
    # This method let us process the citing entity: this is the first step of the process, if the citing resource
    # hasn't been found in blazegraph.
    citing_entity = None
    if self.occ is not None:
        citing_resource = self.rf.retrieve_entity(self.occ, GraphEntity.expression,
                                                  typ='only_blazegraph')
        citing_entity = self.g_set.add_br(self.name, self.id,
                                          self.source_provider, citing_resource)
    if citing_entity is None and self.doi is not None:
        citing_entity = self.process_doi_query(self.doi, self.curator,
                                               self.source_provider,
                                               typ='only_blazegraph')
    if citing_entity is None:
        # If the citing entity hasn't been found, then create one and update the graph
        citing_entity = self.g_set.add_br(self.name)
        self.__add_doi(citing_entity, self.doi, self.curator)
        # self.rf.update_graph_set(self.g_set)
        self.repok.add_sentence(
            self.message(
                "The citing entity has been created even if no results have "
                "been returned by the API.", "doi", self.doi))
    # Add other ids if they exist
    self.__add_pmid(citing_entity, self.pmid)
    self.__add_pmcid(citing_entity, self.pmcid)
    # Process all the references contained and return related entities
    cited_entities = self.process_references(citing_entity=citing_entity,
                                             citing_doi=self.doi)
    if cited_entities is not None:
        cited_entities_xmlid_be = []
        # cited_entities is assumed to be aligned index-by-index with
        # self.entries (see the self.entries[idx] lookups below).
        for idx, cited_entity in enumerate(cited_entities):
            citing_entity.has_citation(cited_entity)
            cur_bibentry = dg(self.entries[idx], ["bibentry"])
            cur_be_xmlid = dg(self.entries[idx], ["xmlid"])
            if cur_bibentry is not None and cur_bibentry.strip():
                cur_be = self.g_set.add_be(self.curator, self.source_provider,
                                           self.source)
                citing_entity.contains_in_reference_list(cur_be)
                cited_entity.has_reference(cur_be)
                self.__add_xmlid(cur_be, cur_be_xmlid)  # new
                cur_be.create_content(cur_bibentry.strip())
                cited_entities_xmlid_be.append(
                    (cited_entity, cur_be_xmlid, cur_be))
        # create rp, pl, de, ci, an
        if self.intext_refs:
            rp_entities = jt.process_reference_pointers(citing_entity, \
                cited_entities_xmlid_be, self.reference_pointers, self.g_set, \
                self.curator, self.source_provider, self.source)
    # self.rf.update_graph_set(self.g_set)
    return self.g_set