def enrich_from_crossref(bib, email):
    cr = Crossref()
    if email:
        # supplying a mailto address puts requests in Crossref's polite pool
        cr = Crossref(mailto=email)
    okcount = 0
    print("entries where no exact matching entry could be found on Crossref:")
    for entry in bib.entries:
        res = cr.works(query_bibliographic=entry["title"])
        item = get_matching_item(entry, res['message']['items'])
        if item:
            okcount += 1
            enrich_entry(entry, item)
    print(okcount, "of", len(bib.entries), "had matching titles")
def build_user_graph(graph, users, spinner, cursor):
    global crossref_email
    global vertex_dict
    global sqlite_cursor
    sqlite_cursor = cursor
    vertex_dict = {
        "paper": {},
        "journal": {},
        "subject": {},
        "author": {},
        "user": {}
    }
    total = len(users)
    counter = 1
    spinner.start()
    cr = Crossref(mailto=crossref_email)
    for uni in users:
        process_user(graph, uni, cr, counter, total, spinner)
        counter += 1
    spinner.succeed("All users inserted")
def CrossRefAPIfunc(keyPhrase):
    cr = Crossref()
    x = cr.works(query=keyPhrase)
    if x["message"]["total-results"] > 0:
        x = x['message']["items"][0]
        date = x["indexed"]["date-parts"][0]
        # referenceCount = x["is-referenced-by-count"]
        title = x["title"][0]
        if "author" in x:
            authors = x["author"][0]["given"] + " " + x["author"][0]["family"]
            if len(x["author"]) > 1:
                authors += " et al."
        else:
            authors = None
        url = x["URL"]
        # score = x["score"]
    else:
        date = None
        # referenceCount = None
        title = None
        authors = None
        url = None
        # score = -1
    return (url, title, authors, date)
def __init__(self, doi):
    cr = Crossref()
    try:
        message = cr.works(doi)['message']
    except Exception:
        message = None
    if message:
        metadata = {}
        metadata['doi'] = doi
        journal_info = {x['name']: x['value'] for x in message['assertion']}
        metadata.update(journal_info)
        metadata['page'] = message['page'].replace('-', '--')
        metadata['volume'] = message['volume']
        metadata['author'] = ' and '.join(
            [f"{x['given']} {x['family']}" for x in message['author']])
        ts = message['license'][0]['start']['date-time']
        metadata['date'] = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ').date()
        metadata['article_name'] = (
            f"{message['author'][0]['family']}_{metadata['date'].year}")
        self.metadata = metadata
def names_from_xref(doi):
    """
    Get the first names of the first and last authors for a given DOI.

    Inputs
    ------
    doi : string
        The DOI of the paper whose first and last author names you want to
        know. Here, it's usually a citing paper.

    Outputs
    -------
    first_author : string
        The first name of the first author of the given paper.
    last_author : string
        The first name of the last author of the given paper.
    """
    cr = Crossref()
    title = ""
    # default to empty strings so the function never returns undefined names
    first_author = ""
    last_author = ""
    works = cr.works(
        query=title,
        select=["DOI", "author"],
        limit=1,
        filter={"doi": doi}
    )
    if works["message"]["total-results"] > 0:
        item = works["message"]["items"][0]
        if "author" in item.keys():
            first_author = get_name_from_author_dict(item["author"][0])
            last_author = get_name_from_author_dict(item["author"][-1])
    return first_author, last_author
def lookup_data(
    doi: str = None,
    in_wikipedia: bool = False,
):  # -> Dict[str, str]:
    """Lookup data and return Dict"""
    # https://www.crossref.org/education/retrieve-metadata/rest-api/
    # async client here https://github.com/izihawa/aiocrossref but only 1 contributor
    # https://github.com/sckott/habanero >6 contributors not async
    if doi is None:
        print("Error. Got None instead of DOI. Report this error please.")
    else:
        print("Looking up from Crossref")
        cr = Crossref()
        # result = cr.works(doi=doi)
        result = cr.works(ids=doi)
        # print(result.keys())
        message = result["message"]
        object_type = message["type"]
        if object_type == "book":
            print("Book detected, we exclude those for now.")
            return None
        # print(message.keys())
        data = extract_data(message, in_wikipedia)
        print(data)
        if data.get("publisher") and data.get("publisher_location"):
            # TODO look up publisher via sparqldataframe
            print("Found both publisher and location")
def fetch_doi_from_crossref(item):
    """ link titles with dois """
    cr = Crossref()
    # goes through all the papers and checks via crossref
    try:
        query = '"' + item["title"] + '"' + " " + flatten(item["authors"])
    except TypeError:
        # No author information available (probably)
        query = '"' + item["title"] + '"'
    print(STD_INFO + query)
    server_reached = False
    while not server_reached:
        try:
            query_result = cr.works(query=query, limit=3)
            server_reached = True
        except Exception:  # HTTPError (Service Unavailable)
            print(STD_WARNING + "CrossRef server unavailable. Retry in 5 seconds")
            time.sleep(5)
    try:
        title = query_result['message']['items'][0]['title'][0]
    except KeyError:
        title = 'None'
    doi = query_result['message']['items'][0]['DOI']
    return doi, title
def crossref_api(login_data, sql_dataframe, e_mail, chunk_size):
    pd.options.mode.chained_assignment = None
    match_info = sql_dataframe
    sampledata = match_info[(match_info["match_id"] == "not_match") |
                            (match_info["match_id"] == "error")]
    sampledata["crossref"] = np.nan
    cr = Crossref(mailto=e_mail)
    i = 0
    while True:
        ns = i + chunk_size
        if i < len(sampledata):
            dict_cross = []
            for index, row in sampledata[i:ns].iterrows():
                tempdata = []
                # .loc replaces the DataFrame.ix accessor removed from pandas
                reftext = sampledata.loc[index, "ref_text"]
                try:
                    x = cr.works(
                        query=reftext,
                        limit=1,
                        select="DOI,title,issued,short-container-title,ISSN,score,URL,page,publisher,container-title,author,volume"
                    )
                    tempdata.append(row[0])
                    tempdata.append(x["message"]["items"])
                except Exception:
                    tempdata.append(row[0])
                    tempdata.append(np.nan)
                    print("error" + str(index))
                dict_cross.append(tempdata)
            crossref_to_db(login_data, dict_cross)
            i = i + chunk_size
        else:
            break
def get_papers(issn=ISSN, offset=0, per_page=PER_PAGE, username=HABANERO_USERNAME):
    crossref_api = Crossref(mailto=username)
    return crossref_api.works(filter={"issn": issn}, offset=offset, limit=per_page)


# get a first set of papers
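A minimal paging sketch building on get_papers, assuming the ISSN, PER_PAGE, and HABANERO_USERNAME constants referenced in the signature above are defined elsewhere in the module; note that Crossref caps offset-based paging, so deep traversals are better served by cursors:

def iter_all_papers():
    # hypothetical helper: walk a journal's records page by page via get_papers
    offset = 0
    while True:
        response = get_papers(offset=offset)
        items = response["message"]["items"]
        if not items:
            break
        for item in items:
            yield item
        offset += PER_PAGE  # assumes PER_PAGE matches the limit used above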
def pre_save_article(sender, instance, **kwargs):
    cr = Crossref()
    # strip whitespace before querying so the lookup uses the cleaned DOI
    instance.DOI = instance.DOI.strip()
    article_meta = cr.works(ids=instance.DOI)
    instance.title = get_title(article_meta)
    instance.description = get_description(article_meta)
    instance.keywords = get_keywords(article_meta)
    instance.article_url = get_url(article_meta)
def doi_valid(value):
    try:
        # the mailto address is necessary to end up in the polite pool
        cr = Crossref(mailto="*****@*****.**")
        work = cr.works(ids=value)
        work['message']['title']
    except Exception as e:
        raise ValidationError(f"Invalid DOI: {e}")
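A hypothetical usage sketch: since doi_valid raises Django's ValidationError, it can be attached to a model field as a validator. The Paper model below is illustrative and not part of the original code:

from django.db import models

class Paper(models.Model):
    # doi_valid runs during full_clean() and form validation
    doi = models.CharField(max_length=255, validators=[doi_valid])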
def __init__(self):
    # constants
    self.TRIALS = 3
    # init
    self.sh = SciHub()
    self.cr = Crossref()
def get_doi(dois, config):
    """Retrieve an article by doi.
    """
    crossref = Crossref(mailto=config['settings']['email'])
    cr_result = crossref.works(ids=dois)
    if len(dois) == 1:
        return DB_dict.parse_cr([cr_result['message']])
    else:
        return DB_dict.parse_cr([c['message'] for c in cr_result])
def __init__(self, rsrcmgr, pageno=1, laparams: LAParams = None):
    PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
    self.rows = []
    self.page_number = 0
    self.doi = None
    self.cr = Crossref(mailto='*****@*****.**')
def __init__(self, doi=None, parms=None):
    cr = Crossref()
    if doi:
        self.raw_data = cr.works(ids=doi, format="json")
        self.record = self.raw_data['message']
    elif parms:
        self.raw_data = cr.works(filter=parms, cursor="*", limit=500, format="json")
        self.record = None
        self.record_list = []
def crossref(doi):
    cr = Crossref(mailto="*****@*****.**")
    response = cr.works(ids=doi)
    # habanero returns a list if doi is a list of len > 1,
    # otherwise a single dict
    if isinstance(doi, (list, tuple, set)) and len(doi) > 1:
        D = [parse_crossref(i) for i in response]
        return {x.pop("doi"): x for x in D}
    else:
        return parse_crossref(response)
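A short usage sketch for the wrapper above, assuming parse_crossref returns a dict that includes a "doi" key; the DOIs are purely illustrative:

# single DOI -> one parsed record; list of DOIs -> dict keyed by DOI
single = crossref("10.1371/journal.pone.0000308")
batch = crossref(["10.1371/journal.pone.0000308", "10.1038/nature12373"])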
def get_crossref_metadata(title, path):
    """
    Gets Crossref metadata, given an article's title.
    Then puts the metadata on the clipboard
    :param title: Title to search for
    :param path: PDF path, not necessary
    """
    print "getting crossref"
    # Searches the Crossref API for the given title, gets best result
    cr = Crossref()
    query = cr.works(query=title, limit=1)
    doi = ''
    # Extract DOI out of Crossref answer
    for item in query['message']['items']:
        doi = item['DOI']
    # Not used, but useful. Gets metadata from isbnlib, given DOI
    # print isbnlib.doi2tex(doi)
    # Gets APA citation, given DOI
    apa_citation = cn.content_negotiation(ids=doi, format="text", style="apa")
    # We could get more formats this way, but they are not used at the moment;
    # performance is better without fetching them
    # rdf_citation = cn.content_negotiation(ids=doi, format="rdf-xml")
    # json_citation = cn.content_negotiation(ids=doi, format="citeproc-json")
    # bib_entry = cn.content_negotiation(ids=doi, format="bibentry")
    # Prettify APA citation
    apa_citation = prettify_UTF8_Strings(apa_citation).strip('\n')
    print apa_citation
    clp.OpenClipboard(None)
    citations = {}
    citations['APA'] = apa_citation
    try:
        citations['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT), errors='replace')
    except:
        citations['content'] = 'no text content available'
    # Puts the citations on the clipboard
    clp.SetClipboardData(citation_format, json.dumps(citations))
    sources = {}
    sources['source'] = path
    try:
        sources['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT), errors='replace')
    except:
        sources['content'] = 'no text content available'
    # Puts the sources on the clipboard
    clp.SetClipboardData(src_format, json.dumps(sources))
    clp.CloseClipboard()
def title_from_DOI(DOI):
    cr = Crossref(mailto=crossref_email)
    try:
        response = cr.works(ids=DOI)
    except HTTPError:
        return None
    if "message" in response:
        if "title" in response["message"]:
            if response["message"]["title"]:
                return response["message"]["title"][0]
    return None
def build_network_graph(graph, DOIs):
    global spinner
    spinner.start()
    cr = Crossref(mailto="*****@*****.**")
    for DOI in DOIs:
        process_paper(graph, DOI, cr)
    spinner.stop()
    print("Network built.")
def build_author_graph(graph, DOIs):
    global vertex_dict
    global spinner
    spinner.start()
    cr = Crossref(mailto="*****@*****.**")
    for DOI in DOIs:
        process_author_paper(graph, DOI, cr)
    spinner.succeed("Author network built.")
def update_from_cr(config):
    """Retrieve records from Crossref.
    """
    crossref = Crossref(mailto=config['settings']['email'])
    orcid_ids = [n for n in config['authors']]
    cr_results = crossref.works(
        filter={
            'orcid': orcid_ids,
            'type': [config['settings']['article_type']]
        },
        sort=config['settings']['sort_field'],
        order=config['settings']['order'],
        limit=config['settings']['num_records'])
    return DB_dict.parse_cr(cr_results['message']['items'])
def build_user_graph(graph, users):
    global vertex_dict
    global spinner
    spinner.start()
    cr = Crossref(mailto="*****@*****.**")
    for uni in users:
        process_user(graph, uni, cr)
    spinner.succeed("All users inserted")
    spinner.stop()
def make_references(publications, output_dir):
    """
    Create reference bib file

    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    cr = Crossref()
    lines = []
    references = []
    for i, publication in enumerate(publications):
        log.notice(
            f"Querying and formatting {i + 1} out of {len(publications)} publications"
        )
        link = publication[LINK]
        title = publication[TITLE]
        # Check if it is a DOI url
        if link and "doi.org" in link:
            doi = urlparse(link).path.strip("/")
        # Extract the DOI using the title
        else:
            results = cr.works(query_bibliographic=title, limit=1)
            if (results["message"]["total-results"] == 0
                    or results["message"]["items"][0]["title"][0].lower() != title.lower()):
                log.warn(f'Could not find the doi for "{title}"')
                continue
            doi = results["message"]["items"][0]["DOI"]
        try:
            reference = cn.content_negotiation(doi)
            lines.append(reference)
            references.append(
                re.sub("^@.*{", "", reference.split("\n")[0]).strip(","))
        except HTTPError:
            log.warn(f'Could not create reference for "{title}"')
    with open(os.path.join(output_dir, "references.bib"), "w") as f:
        f.write("\n\n".join(lines))
    return references
def get_crossref_results(query, index=10):
    cr = Crossref()
    filters = {"type": "journal-article"}
    limit = 10
    sort = "score"
    order = "desc"
    # habanero's works() takes the keyword `filter`, not `filters`
    results = cr.works(
        query_bibliographic=query,
        filter=filters,
        limit=limit,
        sort=sort,
        order=order,
    )
    results = results["message"]["items"]
    return results[:index]
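A brief, hypothetical usage example for the helper above; each returned item is a raw Crossref work record, so fields like the title arrive as lists:

top_hits = get_crossref_results("metagenomic sequencing methods", index=5)
for hit in top_hits:
    # DOI is a plain string, title is a list of strings (may be absent)
    print(hit.get("DOI"), hit.get("title", [""])[0])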
def __init__(self, email=""):
    self.mailto = email
    self.arXivApi = "http://export.arxiv.org/api/query?id_list="
    self.crossrefApi = Crossref(mailto=self.mailto)
    self.regString = r"\b(10\.[0-9]{4,}(?:\.[0-9]+)*\/(?:(?![\"&\'])\S)+)\b"
    self.metadata = {
        "doi": "",
        "url": "",
        "year": "",
        "journal": "",
        "author": "",
        "title": "",
        "abstract": "",
    }
    self.page0_text = ""
def __init__(self, pmidsListOrCSVfile, email):
    """
    Constructor. Used for initialization.

    :param pmidsListOrCSVfile: [list, set or string] a list (or a set) of PMIDs,
        or the name of a CSV file containing the list of PMIDs under the header 'PMID'.
    :param email: [string] an email address for Entrez and Crossref in case they
        need to contact you (e.g. to send warnings about download limits).

    Usage::

        from pubMedArticleGrabber import PubMedArticleGrabber

        wanted = PubMedArticleGrabber('name of csv file', '*****@*****.**')
        others = PubMedArticleGrabber(['1047458', '1050021'], '*****@*****.**')
        someOthers = PubMedArticleGrabber({'1047458', '1047458', '1050021'}, '*****@*****.**')
    """
    self.pmids = pmidsListOrCSVfile
    Entrez.email = email
    cr = Crossref(mailto=email)
def build_combined_graph(graph, DOIs):
    global vertex_dict
    global spinner
    spinner.start()
    cr = Crossref(mailto="*****@*****.**")
    for DOI in DOIs:
        combined_process_paper(graph, DOI, cr)
    spinner.succeed("All papers inserted")
    spinner.start("Building citation edges...")
    for DOI in vertex_dict["paper"]:
        process_citations(graph, DOI, cr)
    spinner.stop()
def async_post_save_article_info(self, doi):
    from .models import Article
    from .signals import get_abstract, get_image_url

    article = Article.objects.get(DOI=doi)
    if not article.image_url and not article.abstract:
        cr = Crossref()
        article_meta = cr.works(ids=article.DOI)
        article.abstract = get_abstract(article_meta)
        article.image_url = get_image_url(article_meta)
        if not validators.url(article.image_url):
            article.image_url = str()
        article.save()
def make_references(publications, output_dir):
    """
    Create reference bib file

    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    cr = Crossref()
    lines = []
    references = []
    for i, publication in enumerate(publications):
        log.notice(f'Querying and formatting {i + 1} out of {len(publications)} publications')
        link = publication[LINK]
        title = publication[TITLE]
        # Check if it is a DOI url
        if link and 'doi.org' in link:
            doi = urlparse(link).path.strip('/')
        # Extract the DOI using the title
        else:
            results = cr.works(query_title=title, limit=1)
            if results['message']['total-results'] == 0 or \
                    results['message']['items'][0]['title'][0].lower() != title.lower():
                log.warn(f'Could not find the doi for "{title}"')
                continue
            doi = results['message']['items'][0]['DOI']
        try:
            reference = cn.content_negotiation(doi)
            lines.append(reference)
            references.append(re.sub('^@.*{', '', reference.split('\n')[0]).strip(','))
        except HTTPError:
            log.warn(f'Could not create reference for "{title}"')
    with open(os.path.join(output_dir, 'references.bib'), 'w') as f:
        f.write('\n\n'.join(lines))
    return references
def my_view(request):
    references = []
    if 'DOI' in request.GET:
        doi = request.GET['DOI']
        cr = Crossref()
        logging.info(request.GET)
        # try:
        x = cr.works(doi)
        if 'message' in x and 'reference' in x['message']:
            if 'title' in x['message']:
                logging.info(u"Evaluating references for {}".format(
                    x['message']['title']))
            for r in x['message']['reference']:
                if 'DOI' not in r:
                    continue
                title = ""
                year = ""
                try:
                    y = cr.works(u'{}'.format(r['DOI']))
                    logging.info(r['DOI'])
                    if 'message' in y and 'title' in y['message']:
                        title = u"{}".format(u''.join(y['message']['title']))
                    if 'message' in y and 'author' in y['message']:
                        title += u"\n" + u", ".join(
                            [a['family'] for a in y['message']['author']])
                    if 'message' in y and 'issued' in y['message']:
                        year = y['message']['issued']['date-parts'][0][0]
                except Exception:
                    if 'unstructured' in r:
                        title = u"{}".format(r['unstructured'])
                logging.info(title)
                references.append([
                    u"{}/{}".format(SCIHUB_URL, r['DOI']),
                    title,
                    # authors,
                    year
                ])
        # except:
        #     pass
    logging.info(references)
    return {'refs': references}