def __init__(self, doi=None, parms=None):
    """Load Crossref works either for a single DOI or for a filter query.

    With ``doi`` the matched record is stored in ``self.record``; with
    ``parms`` a cursor-based bulk query is issued and ``self.record_list``
    is initialised empty for later accumulation.
    """
    client = Crossref()
    if doi:
        self.raw_data = client.works(ids=doi, format="json")
        self.record = self.raw_data['message']
    elif parms:
        self.raw_data = client.works(filter=parms, cursor="*", limit=500, format="json")
        self.record = None
        self.record_list = []
def lookup_data(
    doi: str = None,
    in_wikipedia: bool = False,
):  # -> Dict[str, str]:
    """Look up a DOI on Crossref and return the extracted data dict.

    Returns None when no DOI is given or when the work is a book
    (books are excluded for now).
    """
    # https://www.crossref.org/education/retrieve-metadata/rest-api/
    # async client here https://github.com/izihawa/aiocrossref but only 1 contributor
    # https://github.com/sckott/habanero >6 contributors not async
    if doi is None:
        print("Error. Got None instead of DOI. Report this error please.")
        return None
    print("Looking up from Crossref")
    cr = Crossref()
    result = cr.works(ids=doi)
    message = result["message"]
    object_type = message["type"]
    if object_type == "book":
        print("Book detected, we exclude those for now.")
        return None
    data = extract_data(message, in_wikipedia)
    print(data)
    if data.get("publisher") and data.get("publisher_location"):
        # TODO look up publisher via sparqldataframe
        print("Found both publisher and location")
    # Bug fix: `data` was computed but never returned, although both the
    # docstring and the "-> Dict[str, str]" annotation comment promise it.
    return data
def CrossRefAPIfunc(keyPhrase):
    """Query Crossref for keyPhrase and return (url, title, authors, date)
    of the best hit, or all-None when there are no results."""
    cr = Crossref()
    x = cr.works(query=keyPhrase)
    if x["message"]["total-results"] > 0:
        x = x['message']["items"][0]
        date = x["indexed"]["date-parts"][0]
        #referenceCount = x["is-referenced-by-count"]
        title = x["title"][0]
        if "author" in x:
            authors = x["author"][0]["given"] + x["author"][0]["family"]
            if len(x["author"]) > 1:
                authors += " et al."
        else:
            authors = None
        url = x["URL"]
        #score = x["score"]
    else:
        date = None
        #referenceCount = None
        title = None
        # Bug fix: this was `author = None`, so the `return` below raised
        # NameError on `authors` whenever the query had no results.
        authors = None
        url = None
        #score = -1
    return (url, title, authors, date)
def get_metadata_from_title(title):
    """
    Fetch the metadata for a document given its title and return a crossref
    item dict.

    :param title: title of document
    :return: crossref item dictionary
    """
    client = Crossref()
    response = client.works(query=title)
    candidates = response['message']['items'][:N_TOP_ITEMS]

    # Rank the top candidates by string similarity to the queried title.
    def similarity(item):
        return SequenceMatcher(None, item['title'][0], title).ratio()

    best = sorted(candidates, key=similarity)[-1]
    # Attempt to find an abstract for the winning item.
    best['abstract'] = get_abstract(best)
    return best
def names_from_xref(doi):
    """
    Get the first names of the first and last authors for a given DOI.

    Inputs
    ------
    doi : string
        The DOI of the paper whose first and last author names you want to
        know. Here, it's usually a citing paper.

    Outputs
    -------
    first_author : string
        The first name of the first author, or "" if unavailable.
    last_author : string
        The first name of the last author, or "" if unavailable.
    """
    cr = Crossref()
    # Bug fix: initialize defaults up front. Previously a DOI with zero
    # results left both names unbound and the return raised UnboundLocalError.
    first_author = ""
    last_author = ""
    title = ""
    works = cr.works(
        query=title, select=["DOI", "author"], limit=1, filter={"doi": doi}
    )
    if works["message"]["total-results"] > 0:
        item = works["message"]["items"][0]
        if "author" in item.keys():
            first_author = get_name_from_author_dict(item["author"][0])
            last_author = get_name_from_author_dict(item["author"][-1])
    return first_author, last_author
def fetch_doi_from_crossref(item):
    """Link a bibliography item's title (and authors, when available) to a
    DOI via a Crossref search.

    Returns (doi, title) of the top hit; title is the string 'None' when the
    hit carries no title field.
    """
    cr = Crossref()
    # Build the query: quoted title plus flattened author list when present.
    try:
        query = '"' + item["title"] + '"'\
            + " " + flatten(item["authors"])
    except TypeError:
        # No author information available (probably)
        query = '"' + item["title"] + '"'
    print(STD_INFO + query)
    server_reached = False
    while server_reached == False:
        try:
            query_result = cr.works(query=query, limit=3)
            server_reached = True
        except Exception:  # HTTPError (Service Unavailable)
            # Narrowed from a bare `except:` so Ctrl-C can still break the
            # retry loop; Crossref hiccups are retried every 5 seconds.
            print(STD_WARNING + "CrossRef server unavailable. Retry in 5 seconds")
            time.sleep(5)
    try:
        title = query_result['message']['items'][0]['title'][0]
    except KeyError:
        title = 'None'
    doi = query_result['message']['items'][0]['DOI']
    return doi, title
def crossref_api(login_data, sql_dataframe, e_mail, chunk_size):
    """Query Crossref for every unmatched/errored reference string in
    ``sql_dataframe`` (chunk by chunk) and hand the results to
    ``crossref_to_db`` for storage.

    :param login_data: DB credentials forwarded to crossref_to_db
    :param sql_dataframe: dataframe with 'match_id' and 'ref_text' columns
    :param e_mail: contact address for the Crossref polite pool
    :param chunk_size: number of rows to process per DB write
    """
    pd.options.mode.chained_assignment = None
    match_info = sql_dataframe
    sampledata = match_info[(match_info["match_id"] == "not_match")
                            | (match_info["match_id"] == "error")]
    sampledata["crossref"] = np.nan
    cr = Crossref(mailto=e_mail)
    i = 0
    # Walk the sample in chunk_size slices (while-True/break rewritten as a
    # plain condition; behavior identical).
    while i < len(sampledata):
        ns = i + chunk_size
        dict_cross = []
        for index, row in sampledata[i:ns].iterrows():
            tempdata = []
            # Bug fix: DataFrame.ix was deprecated and removed in pandas 1.0;
            # iterrows() yields index *labels*, so .loc is the correct drop-in.
            reftext = sampledata.loc[index]["ref_text"]
            try:
                x = cr.works(
                    query=reftext,
                    limit=1,
                    select=
                    "DOI,title,issued,short-container-title,ISSN,score,URL,title,page,publisher,container-title,DOI,author,volume,issued"
                )
                tempdata.append(row[0])
                tempdata.append(x["message"]["items"])
            except Exception:
                # Best-effort: record NaN for rows the API rejects.
                tempdata.append(row[0])
                tempdata.append(np.nan)
                print("error" + str(index))
            dict_cross.append(tempdata)
        crossref_to_db(login_data, dict_cross)
        i = i + chunk_size
def __init__(self, doi):
    """Build BibTeX-ready metadata for a DOI from its Crossref record.

    NOTE(review): when the lookup fails, ``self.metadata`` is never set —
    callers appear to rely on that attribute-absence; preserved as-is.
    """
    cr = Crossref()
    try:
        message = cr.works(doi)['message']
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt still works;
        # any lookup/parse failure is treated as "no record".
        message = None
    if message:
        metadata = {}
        metadata['doi'] = doi
        # Journal assertions come back as name/value pairs; flatten to a dict.
        journal_info = {
            x['name']: x['value']
            for x in message['assertion']
        }
        metadata.update(journal_info)
        # BibTeX page ranges use an en-dash (--).
        metadata['page'] = message['page'].replace('-', '--')
        metadata['volume'] = message['volume']
        metadata['author'] = ' and '.join(
            [f"{x['given']} {x['family']}" for x in message['author']])
        # Use the license start date as the publication date.
        ts = message['license'][0]['start']['date-time']
        metadata['date'] = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%SZ').date()
        metadata[
            'article_name'] = f"{message['author'][0]['family']}_{metadata['date'].year}"
        self.metadata = metadata
def get_papers(issn=ISSN, offset=0, per_page=PER_PAGE, username=HABANERO_USERNAME):
    """Fetch one page of works for a journal ISSN from Crossref."""
    client = Crossref(mailto=username)
    return client.works(filter={"issn": issn}, offset=offset, limit=per_page)
def pre_save_article(sender, instance, **kwargs):
    """Django pre_save hook: populate article fields from Crossref metadata.

    :param sender: model class emitting the signal (unused)
    :param instance: the Article about to be saved; mutated in place
    """
    # Bug fix: normalize the DOI *before* the lookup so Crossref sees the
    # cleaned value (it was previously stripped only after querying).
    instance.DOI = instance.DOI.strip()
    cr = Crossref()
    article_meta = cr.works(ids=instance.DOI)
    instance.title = get_title(article_meta)
    instance.description = get_description(article_meta)
    instance.keywords = get_keywords(article_meta)
    instance.article_url = get_url(article_meta)
def doi_valid(value):
    """Validate that ``value`` resolves on Crossref; raise ValidationError if not."""
    try:
        client = Crossref(mailto="*****@*****.**"
                          )  # necessary to end up in the polite pool
        record = client.works(ids=value)
        # Touch the title to make sure the payload is a real work record.
        record['message']['title']
    except Exception as e:
        raise ValidationError(f"Invalid DOI: {e}")
def my_view(request):
    """View handler: for a ?DOI=... request, fetch the work's reference list
    from Crossref and return Sci-Hub links plus title/author/year for each
    referenced DOI.

    Returns a dict {'refs': [[url, title, year], ...]} for the template.
    NOTE(review): reconstructed nesting — the reference loop sits alongside
    the title logging, inside the 'message'/'reference' guard; confirm
    against the original file.
    """
    references = []
    if 'DOI' in request.GET:
        doi = request.GET['DOI']
        cr = Crossref()
        logging.info(request.GET)
        # try:
        x = cr.works(doi)
        if 'message' in x and 'reference' in x['message']:
            if 'title' in x['message']:
                logging.info(u"Evaluating references for {}".format(
                    x['message']['title']))
            for r in x['message']['reference']:
                # Only references that carry their own DOI can be linked.
                if 'DOI' not in r:
                    continue
                title = ""
                year = ""
                try:
                    # Second round-trip: resolve each referenced DOI for its
                    # title, author surnames and issue year.
                    y = cr.works(u'{}'.format(r['DOI']))
                    logging.info(r['DOI'])
                    if 'message' in y and 'title' in y['message']:
                        title = u"{}".format(u''.join(y['message']['title']))
                    if 'message' in y and 'author' in y['message']:
                        title += u"\n" + u", ".join(
                            [a['family'] for a in y['message']['author']])
                    if 'message' in y and 'issued' in y['message']:
                        year = y['message']['issued']['date-parts'][0][0]
                except:
                    # Lookup failed: fall back to the raw citation string.
                    if 'unstructured' in r:
                        title = u"{}".format(r['unstructured'])
                logging.info(title)
                references.append([
                    u"{}/{}".format(SCIHUB_URL, r['DOI']),
                    title,
                    ##authors,
                    year
                ])
        # except:
        #     pass
    logging.info(references)
    return {'refs': references}
def get_doi(dois, config):
    """Retrieve an article by doi.
    """
    client = Crossref(mailto=config['settings']['email'])
    result = client.works(ids=dois)
    # A single DOI yields one response dict; several DOIs yield a list.
    if len(dois) == 1:
        return DB_dict.parse_cr([result['message']])
    return DB_dict.parse_cr([entry['message'] for entry in result])
def title2doi(title):
    """Resolve a paper title to its DOI via an exact (normalized) title match.

    Normalization lowercases and strips every non-alphanumeric character;
    returns None when no candidate matches exactly.
    """
    def normalize(text):
        return ''.join(ch for ch in text.lower() if ch.isalnum())

    wanted = normalize(title)
    client = Crossref()
    hits = client.works(query_title=title.lower(), select="title,DOI", limit=5)
    for hit in hits['message']['items']:
        if normalize(hit['title'][0]) == wanted:
            return hit['DOI']
def crossref(doi):
    """Look up one DOI (or several) on Crossref and return parsed records.

    A multi-DOI input returns {doi: parsed_fields}; a single DOI returns the
    parsed dict directly.
    """
    client = Crossref(mailto="*****@*****.**")
    response = client.works(ids=doi)
    # habanero returns a list if doi is a list of len > 1
    # otherwise a single dict
    if isinstance(doi, (list, tuple, set)) and len(doi) > 1:
        parsed = [parse_crossref(record) for record in response]
        return {record.pop("doi"): record for record in parsed}
    return parse_crossref(response)
def get_crossref_metadata(title, path):
    """
    Gets Crossref metadata, given an article's title. Then puts the metadata
    on the clipboard.

    NOTE(review): this is Python 2 code (print statements, unicode()); it
    cannot run under Python 3 as written.

    :param title: Title to search for
    :param path: PDF-Path, not necessary
    """
    print "getting crossref"
    # Searches the Crossref API for the given title, gets best result
    cr = Crossref()
    query = cr.works(query=title, limit=1)
    doi = ''
    # Extract DOI out of Crossref answer (limit=1, so at most one item)
    for item in query['message']['items']:
        doi = item['DOI']
    # Not used, but useful. Gets metadata from isbnlib, given DOI
    # print isbnlib.doi2tex(doi)
    # Gets APA citation, given DOI
    apa_citation = cn.content_negotiation(ids=doi, format="text", style="apa")
    # We could get more formats this way, but this is not used at the moment,
    # better performance without getting these formats
    # rdf_citation = cn.content_negotiation(ids=doi, format="rdf-xml")
    # json_citation = cn.content_negotiation(ids=doi, format="citeproc-json")
    # bib_entry = cn.content_negotiation(ids=doi, format="bibentry")
    # Prettify APA citation
    apa_citation = prettify_UTF8_Strings(apa_citation).strip('\n')
    print apa_citation
    clp.OpenClipboard(None)
    citations = {}
    citations['APA'] = apa_citation
    try:
        citations['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT), errors='replace')
    except:
        citations['content'] = 'no text content available'
    # Puts the citations on the clipboard
    clp.SetClipboardData(citation_format, json.dumps(citations))
    sources = {}
    sources['source'] = path
    try:
        sources['content'] = unicode(clp.GetClipboardData(clp.CF_TEXT), errors='replace')
    except:
        sources['content'] = 'no text content available'
    # Puts the sources on the clipboard
    clp.SetClipboardData(src_format, json.dumps(sources))
    clp.CloseClipboard()
def get_crossref(doc_id='', query=''):
    """Return data from crossref api.

    Tries a direct DOI lookup first, then a free-text query; returns the
    parsed entry or None on any failure (best-effort by design).
    """
    try:
        parser = CrossRefPaperParser()
        cr = Crossref()
        doi = doc_id
        if doi:
            try:
                entry = cr.works(ids=[doi]).get('message')
                return parser.parse(entry)
            except HTTPError:
                pass
        # Bug fix: habanero nests the result items under 'message', so the
        # old `.get('items')` was always None and the query path never hit.
        entries = cr.works(query=query, limit=1).get('message', {}).get('items')
        if entries:
            entry = entries[0]
            return parser.parse(entry)
    except Exception:
        # Deliberate best-effort: any other failure falls through to None.
        pass
    return None
def title_from_DOI(DOI):
    """Return the first title registered for ``DOI``, or None if unavailable."""
    client = Crossref(mailto=crossref_email)
    try:
        response = client.works(ids=DOI)
    except HTTPError:
        return None
    titles = response.get("message", {}).get("title")
    return titles[0] if titles else None
def query_crossref(title, author):
    """Query Crossref for extracted data

    Args:
        title (str): The title of the paper
        author (List(str)): A list of the authors of the paper

    Raises:
        ExtractionError: No suitable search criteria extracted
        ExtractionError: No suitable Crossref candidates
        ExtractionError: Crossref returned an error

    Returns:
        str: A BibTeX entry for the queried data
    """
    cr = Crossref(mailto="*****@*****.**")
    # print("Querying Crossref")
    if author and title:
        search_terms = title + " " + author[0]
    elif title:
        search_terms = title
    else:
        raise ExtractionError("No suitable search criteria extracted")
    r = cr.works(query=search_terms)
    # Debug dump of the raw response.
    print(json.dumps(r), file=open("cn.json", "w"))
    if r["status"] != "ok":
        raise ExtractionError("Crossref returned an error")
    for result in r["message"]["items"]:
        if "title" not in result:
            continue
        # Accept the first candidate whose title is similar enough, then
        # fetch its BibTeX via content negotiation.
        similarity = SequenceMatcher(None, result["title"][0].upper(),
                                     title.upper()).ratio()
        if similarity > 0.9:
            return cn.content_negotiation(ids=result["DOI"], format="bibentry")
    raise ExtractionError("No suitable Crossref candidates")
def enrich_from_crossref(bib, email):
    """Enrich every entry in ``bib`` with data from its matching Crossref
    record (matched by title); prints a tally of exact matches.

    :param bib: bibliography whose ``entries`` carry a "title" field
    :param email: optional contact address for the Crossref polite pool
    """
    # Bug fix: the mailto client was constructed and thrown away, so the
    # email was never used. Bind it to `cr` when provided.
    if email:
        cr = Crossref(mailto=email)
    else:
        cr = Crossref()
    okcount = 0
    print("entries where no exact matching entry could be found on Crossref:")
    for entry in bib.entries:
        res = cr.works(query_bibliographic=entry["title"])
        item = get_matching_item(entry, res['message']['items'])
        if item:
            okcount += 1
            enrich_entry(entry, item)
    print(okcount, "of", len(bib.entries), "had matching titles")
def update_from_cr(config):
    """Retrieve records from Crossref.
    """
    settings = config['settings']
    client = Crossref(mailto=settings['email'])
    # Filter by the configured authors' ORCID iDs and the article type.
    results = client.works(
        filter={
            'orcid': list(config['authors']),
            'type': [settings['article_type']],
        },
        sort=settings['sort_field'],
        order=settings['order'],
        limit=settings['num_records'],
    )
    return DB_dict.parse_cr(results['message']['items'])
def make_references(publications, output_dir):
    """
    Create reference bib file

    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    cr = Crossref()
    lines = []
    references = []
    for i, publication in enumerate(publications):
        log.notice(
            f"Querying and formatting {i + 1} out of {len(publications)} publications"
        )
        link = publication[LINK]
        title = publication[TITLE]
        # Check if it is a DOI url
        if link and "doi.org" in link:
            doi = urlparse(link).path.strip("/")
        # Extract the DOI using the title
        else:
            # Require an exact (case-insensitive) title match on the top hit.
            results = cr.works(query_bibliographic=title, limit=1)
            if (results["message"]["total-results"] == 0
                    or results["message"]["items"][0]["title"][0].lower()
                    != title.lower()):
                log.warn(f'Could not find the doi for "{title}"')
                continue
            doi = results["message"]["items"][0]["DOI"]
        try:
            # Fetch the formatted entry, then pull its citation key off the
            # first line (strip the "@type{" prefix and trailing comma).
            reference = cn.content_negotiation(doi)
            lines.append(reference)
            references.append(
                re.sub("^@.*{", "", reference.split("\n")[0]).strip(","))
        except HTTPError:
            log.warn(f'Could not Create reference for "{title}"')
    with open(os.path.join(output_dir, "references.bib"), "w") as f:
        f.write("\n\n".join(lines))
    return references
def get_crossref_results(query, index=10):
    """Search Crossref for journal articles matching ``query``.

    :param query: bibliographic query string
    :param index: maximum number of items to return (capped at 10 fetched)
    :return: list of work item dicts, best score first
    """
    cr = Crossref()
    limit = 10
    sort = "score"
    order = "desc"
    # Bug fix: habanero's works() takes `filter`, not `filters` — the old
    # keyword was silently passed through and the type filter never applied.
    results = cr.works(
        query_bibliographic=query,
        filter={"type": "journal-article"},
        limit=limit,
        sort=sort,
        order=order,
    )
    results = results["message"]["items"]
    return results[:index]
def query(request, query=None):
    """Django view: search Crossref for ``query`` and render up to 10 hits
    that carry a title; anonymous users get a JSON error instead.

    NOTE(review): uses the pre-Django-1.10 callable form
    request.user.is_authenticated().
    """
    if request.user.is_authenticated() == True:
        cr = Crossref()
        result = cr.works(query=query)['message']['items']
        response = []
        count = 0
        for r in result:
            if count == 10:
                break
            try:
                title = r['title'][0]
            except:
                title = None
            # If the document has a title it is added to the response;
            # otherwise it is skipped.
            if title:
                # Crossref timestamps are in milliseconds since the epoch.
                timestamp = time.gmtime(int(r['created']['timestamp'])/1000)
                r['date'] = time.strftime('%d-%m-%Y', timestamp)
                r['year'] = timestamp.tm_year
                count = count + 1
                authors = getAuthors(r)
                if(authors):
                    r['author'] = authors
                response.append(r)
            # (removed a large block of commented-out scratch code that built
            # an HTML <li> row by hand; superseded by the template below)
        return render(request, 'crossref/template.html', {'documents': response})
    else:
        return JsonResponse({'error': True, 'message': _(u'Debe iniciar sesión.')})
def make_references(publications, output_dir):
    """
    Create reference bib file

    Args:
        publications: the list of publications
        output_dir: the output directory

    Returns:
        A list of reference identifiers
    """
    log = Logger()
    cr = Crossref()
    lines = []
    references = []
    for i, publication in enumerate(publications):
        log.notice(f'Querying and formatting {i + 1} out of {len(publications)} publications')
        link = publication[LINK]
        title = publication[TITLE]
        # Check if it is a DOI url
        if link and 'doi.org' in link:
            doi = urlparse(link).path.strip('/')
        # Extract the DOI using the title
        else:
            # Require an exact (case-insensitive) title match on the top hit.
            results = cr.works(query_title=title, limit=1)
            if results['message']['total-results'] == 0 or \
                    results['message']['items'][0]['title'][0].lower() != title.lower():
                log.warn(f'Could not find the doi for "{title}"')
                continue
            doi = results['message']['items'][0]['DOI']
        try:
            # Fetch the formatted entry and pull its citation key off the
            # first line (strip the "@type{" prefix and trailing comma).
            reference = cn.content_negotiation(doi)
            lines.append(reference)
            references.append(re.sub('^@.*{', '', reference.split('\n')[0]).strip(','))
        except HTTPError:
            log.warn(f'Could not Create reference for "{title}"')
    with open(os.path.join(output_dir, 'references.bib'), 'w') as f:
        f.write('\n\n'.join(lines))
    return references
def async_post_save_article_info(self, doi):
    """Task: backfill abstract and image URL for the Article with this DOI.

    Runs only when both fields are still empty; an invalid image URL is
    blanked out before saving.
    """
    from .models import Article
    from .signals import get_abstract, get_image_url

    article = Article.objects.get(DOI=doi)
    # Skip articles that already carry either piece of metadata.
    if article.image_url or article.abstract:
        return
    meta = Crossref().works(ids=article.DOI)
    article.abstract = get_abstract(meta)
    article.image_url = get_image_url(meta)
    if not validators.url(article.image_url):
        article.image_url = str()
    article.save()
def titletodoi(keyword):
    """Return the DOI of the top Crossref hit iff its (joined) title equals
    ``keyword`` after lowercasing and stripping non-word characters;
    otherwise None."""
    def normalize(text):
        return re.sub(r'\W', '', text.replace(' ', '').lower())

    result = Crossref().works(query=keyword)
    items = result['message']['items']
    # Crossref titles are lists of fragments; join before comparing.
    candidate = ''.join(items[0]['title'])
    if normalize(keyword) == normalize(candidate):
        return items[0]['DOI']
    return None
def getCrossRefMetadata(self):
    """Populate bibliographic attributes (type, title, container, date,
    ISSN/ISBN, score) from this object's DOI via Crossref.

    Best-effort: any lookup or parsing failure is silently ignored and the
    attributes are simply left unset.
    """
    if hasattr(self, "doi") == True:
        c = Crossref(mailto="*****@*****.**")
        try:
            r = c.works(ids = [self.doi])
            r = r['message']
            self.confidenceScore = r.get("score")
            if r.get("issued") != None:
                # 'issued' date-parts -> "Y-M-D" string -> datetime.
                self.date = dateutil.parser.parse("-".join(str(x) for x in r.get("issued")['date-parts'][0]))
            # Map each Crossref work type onto our attribute set; books and
            # book-like types carry ISBN, articles carry ISSN.
            if r.get("type") == "journal-article":
                self.type = "journal-article"
                self.title = r['title'][0]
                self.containerTitle = r['container-title'][0]
                self.issn = r['ISSN']
            elif r.get("type") == "book-chapter":
                self.type = "book-chapter"
                self.title = r['title'][0]
                self.containerTitle = r['container-title'][0]
                self.isbn = r['ISBN']
            elif r.get("type") == "book" or r.get("type") == "monograph":
                self.type = "monograph"
                self.title = r['title'][0]
                self.isbn = r['ISBN']
            elif r.get("type") == "proceedings-article":
                self.type="proceedings-article"
                self.title = r['title'][0]
                self.containerTitle = r['container-title'][0]
                self.isbn = r['ISBN']
            elif r.get("type") == "reference-entry":
                self.type = "reference-entry"
                self.isbn = r['ISBN']
                self.title = r['title'][0]
            self.source = "CrossRef"
        except:
            # Deliberate best-effort swallow; see docstring.
            pass
def search(ids=None, member=None, filter=None, limit=500, **kwargs):
    '''
    Search Crossref

    :param ids: [Array] DOIs (digital object identifier) or other identifiers
    :param member: [String] member ids
    :param filter: [Hash] Filter options. See ...
    :param limit: [Fixnum] Number of results to return. Not relavant when
        searching with specific dois. Default: 20. Max: 1000
    :param kwargs: any additional arguments will be passed on to `requests.get`

    :return: A dictionary, of results

    Usage::

        from pyminer import search
        search.search(ids = "10.1371/journal.pone.0000308")
        search.search(filter = {'has_full_text': True})
        search.search(filter = {'full_text_type': 'text/plain'})
    '''
    client = Crossref()
    raw = client.works(ids=ids, limit=limit, filter=filter, **kwargs)
    return Response(raw)
def getReferencesDOI(self):
    """Fetch this work's reference list from Crossref and replace it with
    entries converted via ``self.processReference``.

    Sets ``self.work_type`` and ``self.references``; does nothing when the
    object has no DOI or the lookup fails.
    """
    if self.doi != None:
        c = Crossref(mailto="*****@*****.**")
        try:
            response = c.works(ids=[self.doi])
            response = response['message']
        except Exception:
            # Bug fix: the old code printed and fell through, then hit a
            # NameError using the unbound `response` below. Bail out instead.
            print("something went wrong")
            return
        self.work_type = response.get("type")
        self.references = response.get("reference")
        if self.references != None:
            citations = []
            for reference in self.references:
                citations.append(
                    self.processReference(reference, "CrossRef"))
            self.references = citations
def search(ids=None, member=None, filter=None, limit=500, **kwargs):
    '''
    Search Crossref to get text mining links

    :param ids: [Array] DOIs (digital object identifier) or other identifiers
    :param member: [String] member ids
    :param filter: [Hash] Filter options. See ...
    :param limit: [Fixnum] Number of results to return. Not relavant when
        searching with specific dois. Default: 20. Max: 1000
    :param kwargs: any additional arguments will be passed on to ``requests.get``

    :return: A dictionary, of results

    Usage::

        from pyminer import miner
        miner.search(filter = {'has_full_text': True}, limit = 5)
        miner.search(filter = {'full_text_type': 'text/plain', 'license_url': "http://creativecommons.org/licenses/by-nc-nd/3.0"})
        miner.search(filter = {'has_full_text': True, 'license_url': "http://creativecommons.org/licenses/by/4.0"})
    '''
    works_response = Crossref().works(ids=ids, limit=limit, filter=filter, **kwargs)
    return Response(works_response)
def populate_metadata_from_crossref(url, validated_data):
    """Fill paper metadata from Crossref, by DOI when present, otherwise by
    best-scoring title match.

    :param url: unused here; kept for signature compatibility
    :param validated_data: dict with optional "doi" and "paper_title"
    :return: (new_validated_data, True) on success, (None, False) on failure
    """
    try:
        doi = validated_data.get("doi")
        paper_title = validated_data.get("paper_title")
        cr = Crossref()
        # Bug fix: habanero's works() takes `filter`, not `filters` — the
        # journal-article restriction was previously never applied.
        params = {
            "filter": {"type": "journal-article"},
        }
        if doi:
            params["ids"] = [doi]
        else:
            params["query_bibliographic"] = paper_title
            params["limit"] = 1
            params["order"] = "desc"
            params["sort"] = "score"
        results = cr.works(**params)["message"]
        if "items" in results:
            data = results["items"][0]
        else:
            # Bug fix: an ids lookup returns the work itself under "message",
            # so the old `results["message"]` raised KeyError here.
            data = results
        validated_data = {}
        validated_data["doi"] = doi
        validated_data["abstract"] = clean_abstract(data.get("abstract", ""))
        validated_data["is_public"] = True
        validated_data["paper_title"] = data.get("title", [""])[0]
        validated_data["paper_publish_date"] = data.get("created").get("date-time")
        validated_data["raw_authors"] = get_raw_authors_from_csl_item(data)
        return validated_data, True
    except Exception as e:
        print(e)
        return None, False
def get_crossref(request, doi=""):
    """Django view: look up ``doi`` on Crossref and return a JSON summary
    (authors, title, publication date, doi, publisher, page).

    An unknown DOI yields an empty JSON object.
    """
    # Client object used for the lookups.
    cr = Crossref()
    # Bug fix: `data` must exist before the `if data:` check below — the old
    # code left it unbound when works() raised, causing a NameError.
    data = None
    # Fetch the data for the DOI; an unknown DOI raises, hence try/except.
    try:
        data = cr.works(ids=doi)
    except Exception:
        pass
    # Default response: an empty JSON object.
    d = {}
    # If the lookup succeeded, fill in the fields.
    if data:
        d = {
            'authors': '',
            'title': '',
            'published': '',
            'doi': '',
            'page': ''
        }
        m = data['message']
        if m.get('author'):
            d['authors'] = ":".join(
                ["%s %s" % (i['given'], i['family']) for i in m['author']])
        if m.get('issued'):
            d['published'] = "/".join(map(str, m['issued']['date-parts'][0]))
        if m.get('title'):
            d['title'] = m['title'][0]
        for k in ['DOI', 'publisher', 'page']:
            if m.get(k):
                d[k.lower()] = m[k]
    # just return a JsonResponse
    return JsonResponse(d)
with open(doiFile, 'r') as fp: for line in fp: doiList.append(line.rstrip('\n')) # setup logging environment logging.getLogger().setLevel(logging.DEBUG) requests_log = logging.getLogger("requests.packages.urllib3") requests_log.setLevel(logging.DEBUG) requests_log.propagate = True # retrieve links to DOIs through crossref: articleList = [] for item in doiList: print("searching for DOI:" + item) # filter={'has_full_text': True, 'type':'journal-article', 'full-text.application': 'text-mining'}) res = cr.works(ids=item) print("found" + res['message']['title'][0]) article = res['message'] print("getting link to: " + article['title'][0]) if article['link'][0][ 'intended-application'] == 'text-mining': #double check you get the right url #store as a tuple with (URL,title) print("success") articleList.append((article['link'][0]['URL'], re.sub(r'\W+', '', article['title'][0]))) else: print("url error") break # retrieve PDFs through Wiley API (Can fork this and change this to other API's e.g. elsevier if needed) header = {'CR-Clickthrough-Client-Token': clickThroughKey} for article in articleList:
def main():
    """
    NAME
        magic_geomagia.py

    DESCRIPTION
        Takes a MagIC file and outputs data for easier input into Max Brown's GEOMAGIA database

    SYNTAX
        magic_geomagia.py [command line options]

    OPTIONS
        -h: prints the help message and quits.
        -f FILE: the MagIC data file name that will be converted to GEOMAGIA files

    OUTPUT:
        print to stdout the GEOMAGIA insert command for the reference and all of the site level data

    EXAMPLE:
        magic_geomagia.py -f magic_contribution_16578.txt

        Nick Jarboe
    """
    if '-h' in sys.argv:  # check if help is needed
        print(main.__doc__)
        sys.exit()  # graceful quit
    if '-f' in sys.argv:
        ind = sys.argv.index('-f')
        file_name = sys.argv[ind + 1]
    else:
        print("MagIC file name needed. Please add the file name after the -f option.")
    # Create all the table files from the magic.txt file so they can be imported by the cb
    command = "download_magic.py -f " + file_name
    os.system(command)
    md = cb.Contribution()  # md stands for magic file data
    md.propagate_location_to_measurements()
    md.propagate_location_to_specimens()
    md.propagate_location_to_samples()
    if not md.tables:
        print('-E- No MagIC tables could be found in this directory')
        error_log("No MagIC tables found")
        return
    # Contribution-level metadata (first row of the contribution table).
    doi = md.tables['contribution'].df.iloc[0]['reference']
    id = md.tables['contribution'].df.iloc[0]['id']
    timestamp = md.tables['contribution'].df.iloc[0]['timestamp']
    contributor = md.tables['contribution'].df.iloc[0]['contributor']
    print("c=", contributor)
    contributor = contributor.replace('@', '')
    print("c=", contributor)
    # Resolve the reference DOI on Crossref to build the author list.
    cr = Crossref()
    ref = cr.works(doi)
    # authors = "Doe J.X., Alexander,T.G."
    status = ref["status"]
    message = ref["message"]
    # print("message=",message)
    authors = message["author"]
    # print("authors=",authors)
    authorList = ""
    for author in authors:
        # print ("Name:",author['given'], author['family'])
        # Initials: first letter of each given-name token, dot separated.
        author_given = ""
        names = author['given'].split(' ')
        for name in names:
            author_given += name[0] + "."
        authorList += author['family'] + " " + author_given + ", "
    # print(authorList)
    authorList = authorList[:-2]
    # print(authorList)
    title = message['title'][0]
    year = message['created']['date-parts'][0][0]
    # print(year)
    journal = message['short-container-title'][0]
    volume = message['volume']
    # print(volume)
    pages = '0'
    if "page" in message.keys():
        pages = message['page']
    # print(pages)
    url = "https://earthref.org/MagIC/doi/" + doi
    # Emit the SQL insert for the REFS table.
    print("REFS")
    print("Insert into REFS values(NULL,'", authorList, "','", title, "', ", year, ", '", journal, "', ", volume, ", '", pages, "', '", doi, "', '", url, "');", sep='')
    print()
    # Emit one CSV row per site for the ARCHEODIJ table.
    print("ARCHEODIJ")
    sites = md.tables['sites'].df
    locations = md.tables['locations'].df
    print("UID,NUM_SAMPLES,NUM_ACC_SPEC,NUM_MEAS_SPEC,BA,SIGMA_BA,AGE, AGE_MIN,AGE_MAX,NUM_SIGMAS,AGE_ERROR_TYPE_ID,SITE_LAT, SITE_LON,VADM,SIGMA_VADM,SITE_ID,PI_METHODS_ID,AC_ID,MD_CK_ ID,AN_CORR_ID,CR_CORR_ID,DM_METHOD_ID,AF_STEP,T_STEP,DM_ ANALYSIS_ID,SPECIMEN_TYPE_ID,MATERIAL_ID,REFERENCE_ID,NUM_ C14_SAMPLES,C14_ID,CALIB_C14_AGE,CALIB_C14_AGE_SIGMA_MIN, CALIB_C14_AGE_SIGMA_MAX,NUM_C14_SIGMAS,CALC_CALIB_C14_AGE, CALC_CALIB_C14_AGE_SIGMA_MIN,CALC_CALIB_C14_AGE_SIGMA_MAX, C14_CALIB_SOFTWARE_ID,CALC_C14_CALIB_SOFTWARE_ID,C14_CALIB_DATASET_ID,CALC_C14_ CALIB_DATASET_ID,DENDRO_ID,TOT_NUM_DENDRO,NUM_DENDRO_ USED,DATING_METHOD_ID,NUM_DIR_SAMPLES,NUM_DIR_SPECIMENS,NUM_ DIR_SPEC_COLLECTED,DECL,INCL,ALPHA_95,K,VDM,SIGMA_VDM,SAMPLE_ID,c_csv,SITE_NAME, SITE_HORIZON,1000,1001,1002,1003,1004,1005,1006,1007,1008,1009,1010,1011,1012,1013,1014, SUPERSEEDED,UPLOAD_YEAR,UPLOAD_MONTH,UPLOADER,EDITOR,EDIT_DATE,NOTES")
    for index, row in sites.iterrows():
        # GEOMAGIA uses -1 for "not available".
        int_n_samples, int_n_specimens, int_n_total_specimens, int_abs, int_abs_sigma = -1, -1, -1, -1, -1
        if 'int_n_samples' in sites.columns.values:
            int_n_samples = row['int_n_samples']
        if 'int_n_specimens' in sites.columns.values:
            int_n_specimens = row['int_n_specimens']
        if 'int_n_total_specimens' in sites.columns.values:
            int_n_total_specimens = row['int_n_total_specimens']
        if int_n_specimens == -1 and int_n_samples > 0:
            # NOTE(review): `int_n_spcimens` is a typo in the original, so
            # this fallback assignment has no effect — confirm and fix upstream.
            int_n_spcimens = int_n_samples
        if 'int_abs' in sites.columns.values:
            int_abs = row['int_abs']
            if int_abs is not None:
                int_abs = round(int_abs * 1e6, 1)  # T -> microtesla
        if 'int_abs_sigma' in sites.columns.values:
            int_abs_sigma = row['int_abs_sigma']
            if int_abs_sigma is not None:
                int_abs_sigma = round(row['int_abs_sigma'] * 1e6, 1)
        age, age_high, age_low = -1e9, -1e9, -1e9  # -1e9 marks "unset"
        age_error_type = '0'
        if 'age_unit' not in sites.columns.values:
            print("Malformed Magic sites data table. Required column row 'age_unit' is missing")
            sys.exit()
        age_unit = row['age_unit']
        if 'age' in sites.columns.values:
            age = row['age']
            age = pmag.age_to_BP(age, age_unit)
        if 'age_high' in sites.columns.values:
            age_high = row['age_high']
            age_high = pmag.age_to_BP(age_high, age_unit)
        if 'age_low' in sites.columns.values:
            age_low = row['age_low']
            age_low = pmag.age_to_BP(age_low, age_unit)
        if 'age_sigma' in sites.columns.values:
            age_sigma = row['age_sigma']
            age_sigma = pmag.age_to_BP(age_sigma, age_unit)
            age_high = age + age_sigma
            age_low = age - age_sigma
            age_error_type = '5'  # MagIC sigmas are one sigma
        if age_low > age_high:  # MagIC lets age_high and age_low be in any order. Fix that for GEOMAGIA
            temp = age_high
            age_high = age_low
            age_low = temp
        if age == -1e9:  # If only age_low and age_high are in the MagIC file then calculate the age.
            age = (age_high + age_low) / 2
            age_error_type = '8'  # If MagIC age only high and low then error type is "range"
        age_min = age - age_low  # GEOMAGIA has the max and min as differences from the age, not absolute.
        age_max = age_high - age
        age_BP = age
        age = 1950 - age  # GEOMAGIA want +-AD/BC so convert BP to AD/-BC
        lat = row['lat']
        lon = row['lon']
        vadm, vadm_sigma = -1, -1
        if 'vadm' in sites.columns.values:
            vadm = row['vadm']
            vadm = vadm / 1e22  # scale to 1e22 Am^2 units
        if 'vadm_sigma' in sites.columns.values:
            # NOTE(review): this reads row['vadm'] again instead of
            # row['vadm_sigma'] and overwrites `vadm` — looks like a copy-paste
            # bug; confirm against GEOMAGIA output before fixing.
            vadm = row['vadm']
            vadm = vadm / 1e22
        site_name = row['site']
        # For paleointensity codes just give the method code list and Max will decide on the right
        # GEOMAGIA code.
        method_codes = "No MagIC method codes available"
        if 'method_codes' in sites.columns.values:
            method_codes = row['method_codes']
        # Just give Max all the method codes for him to decide for now
        paleointensity_procedure = method_codes
        # Translate MagIC method codes into each GEOMAGIA code category.
        alteration_monitor = "0"
        alteration_monitor = method_codes_to_geomagia(method_codes, 'ALTERATION_MONIT_CORR')
        multidomain_check = "0"
        multidomain_check = method_codes_to_geomagia(method_codes, 'MD_CHECKS')
        anisotropy_correction = "0"
        anisotropy_correction = method_codes_to_geomagia(method_codes, 'ANISOTROPY_CORRECTION')
        cooling_rate = "0"
        cooling_rate = method_codes_to_geomagia(method_codes, 'COOLING_RATE')
        demag_method = "0"
        demag_method = method_codes_to_geomagia(method_codes, 'DM_METHODS')
        demag_analysis = "0"
        demag_analysis = method_codes_to_geomagia(method_codes, 'DM_ANALYSIS')
        specimen_shape = "0"
        specimen_shape = method_codes_to_geomagia(method_codes, 'SPECIMEN_TYPE_ID')
        # Map colon-separated geologic types to GEOMAGIA material codes.
        materials = ""
        geologic_types = ""
        if 'geologic_types' in sites.columns.values:
            geologic_types = row['geologic_types']
        if ":" in geologic_types:
            gtypes = geologic_types.split(":")
            for gtype in gtypes:
                materials = materials + pmag.vocab_convert(gtype, "geomagia") + ":"
            materials = materials[:-1]
        else:
            materials = pmag.vocab_convert(geologic_types, "geomagia")
        # Geochronology method codes ("GM-" prefixed) get their own field.
        geochron_codes = ""
        if ":" in method_codes:
            gcodes = method_codes.split(":")
            for gcode in gcodes:
                if "GM-" == gcode[:3]:
                    geochron_codes = geochron_codes + pmag.vocab_convert(gcode, "geomagia") + ":"
            geochron_codes = geochron_codes[:-1]
        else:
            geochron_codes = pmag.vocab_convert(geochron_codes, "geomagia")
        if geochron_codes == "":
            geochron_codes = "0"
        dir_n_samples = "-1"
        if 'dir_n_samples' in sites.columns.values:
            dir_n_samples = row['dir_n_samples']
        # NOTE(review): the two statements above are duplicated verbatim in
        # the original; kept as-is (harmless).
        dir_n_samples = "-1"
        if 'dir_n_samples' in sites.columns.values:
            dir_n_samples = row['dir_n_samples']
        # Not in MagIC
        dir_n_specimens = "-1"
        # using total number of samples for total specimen number
        dir_n_total_samples = "-1"
        if 'dir_n_total_samples' in sites.columns.values:
            dir_n_total_samples = row['dir_n_total_samples']
        dir_dec = "999"  # 999 marks a missing direction in GEOMAGIA
        if 'dir_dec' in sites.columns.values:
            dir_dec = row['dir_dec']
        dir_inc = "999"
        if 'dir_inc' in sites.columns.values:
            dir_inc = row['dir_inc']
        dir_alpha95 = "-1"
        if 'dir_alpha95' in sites.columns.values:
            dir_alpha95 = row['dir_alpha95']
        dir_k = "-1"
        if 'dir_k' in sites.columns.values:
            dir_k = row['dir_k']
        vdm = -1
        if 'vdm' in sites.columns.values:
            vdm = float(row['vdm'])
            vdm = vdm / 1e22
        vdm_sigma = -1
        if 'vdm_sigma' in sites.columns.values:
            vdm_sigma = float(row['vdm_sigma'])
            vdm_sigma = vdm_sigma / 1e22
        # Could try and get sample names from samples table (using Contribution object) but just taking the list
        # if it exists for now.
        sample_list = "-1"
        if 'samples' in sites.columns.values:
            sample_list = row['samples']
        # c_csv is in GEOMAGIA insert. What it is I don't know. Max said set to 0
        c_csv = '0'
        # This place_id is SITE_ID in GEOMAGIA; fall back from state/province
        # to country to continent/ocean.
        place_id = "0"
        location = row['location']
        if 'state_province' in locations.columns.values:
            place = locations.loc[location, 'state_province']
            if place != "":
                place_id = pmag.vocab_convert(place, 'GEOMAGIA')
        if place_id == "0":
            if 'country' in locations.columns.values:
                place = locations.loc[location, 'country']
                if place != "":
                    place_id = pmag.vocab_convert(place, 'GEOMAGIA')
        if place_id == "0":
            if 'continent_ocean' in locations.columns.values:
                place_id = locations.loc[location, 'continent_ocean']
                if place != "":
                    place_id = pmag.vocab_convert(place, 'GEOMAGIA')
        site = row['site']
        dt = dateutil.parser.parse(timestamp)
        description = "-1"
        if 'description' in sites.columns.values:
            description = row['description']
        # GEOMAGIA only takes records younger than 50 ka BP.
        if age_BP <= 50000:
            print("0",int_n_samples,int_n_specimens,int_n_total_specimens,int_abs,int_abs_sigma,age,age_min,age_max,"1",age_error_type,lat,lon,vadm,vadm_sigma,place_id,paleointensity_procedure,alteration_monitor,multidomain_check,anisotropy_correction,cooling_rate,demag_method,"0","0",demag_analysis,specimen_shape,materials,doi,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1",geochron_codes,dir_n_samples,dir_n_samples,dir_n_total_samples,dir_dec,dir_inc,dir_alpha95,dir_k,vdm,vdm_sigma,sample_list,c_csv,location,site,"-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1","-1",dt.year,dt.month,contributor,"-1,-1",description,sep=',')
def update_contents(self, new_store_contents):
    """
    Build a references panel for a crystal structure.

    Pipeline: Structure -> mpid -> BibTeX references from MP ->
    (optional doi lookup via Crossref) -> formatting.
    Formatting is very messy right now. DOI lookup and (possibly)
    formatting should be cached in a builder.

    :param new_store_contents: serialized structure data accepted by
        ``self.from_data`` — presumably a dcc.Store payload; confirm
        against the enclosing component class.
    :return: an ``html.Div`` containing a reference count label and the
        formatted reference list.
    :raises PreventUpdate: when the input is not a Structure, or when no
        matching Materials Project entry exists.
    """
    struct = self.from_data(new_store_contents)

    # Only crystallographic Structures are supported; molecules are
    # rejected with a user-facing message.
    if not isinstance(struct, Structure):
        raise PreventUpdate(
            "Literature mentions can only be retrieved for crystallographic "
            "structures at present and not molecules. Please make a feature "
            "request if this would be useful for you, and it will be "
            "prioritized."
        )

    with MPRester() as mpr:
        mpids = mpr.find_structure(struct)

        if len(mpids) == 0:
            raise PreventUpdate(
                "No structures in the Materials Project database match this "
                "crystal structure, so literature mentions cannot be retrieved. "
                "Please submit this structure to Materials Project if you'd "
                "like it to be added to the Materials Project database."
            )

        # One BibTeX string (possibly containing multiple entries separated
        # by blank lines) per matching Materials Project ID.
        all_references = []
        for mpid in mpids:
            all_references.append(mpr.get_materials_id_references(mpid))
            self.logger.debug(f"Retrieved references for {mpid}.")

    if self.use_crossref:
        cr = Crossref(mailto=CROSSREF_MAILTO)
        # Split each BibTeX blob into individual entries and de-duplicate.
        individual_references = set()
        for references in all_references:
            individual_references.update(set(references.split("\n\n")))

        # exclude Materials Project references (these are intended to be
        # references for the structure specifically)
        refs_to_remove = set()
        for ref in individual_references:
            if "Jain2013" in ref:
                refs_to_remove.add(ref)
        individual_references -= refs_to_remove

        # One Crossref free-text query per reference; limit=1 keeps only the
        # best-scoring match for each.
        works = [cr.works(query=ref, limit=1) for ref in individual_references]
        self.logger.debug(f"Retrieved {len(works)} works from Crossref.")

        items = [
            work["message"]["items"][0]
            for work in works
            if len(work["message"]["items"]) > 0
        ]

        # Keep only confident matches (Crossref relevance score > 40 — an
        # empirical threshold) and index the fields we display by DOI.
        dois_to_item = {
            item["DOI"]: {
                "cited-by": item.get("is-referenced-by-count", 0),
                "score": item["score"],
                "title": item.get("title", None),
                "authors": item.get("author", []),
                "journal": item.get("container-title", [None])[0],
                "issue": item.get("issue", None),
                "volume": item.get("volume", None),
                "pages": item.get("page", None),
                "date-parts": item.get("issued", {}).get("date-parts", [[None]]),
            }
            for item in items
            if item["score"] > 40
        }

        num_refs = len(dois_to_item)
        # Most-cited first.
        sorted_dois = sorted(
            list(dois_to_item.keys()),
            key=lambda doi: -dois_to_item[doi]["cited-by"],
        )

        if self.use_crossref_formatting:
            # use Crossref to retrieve pre-formatted text
            # remove leading "1. " from Science CSL style
            refs = {
                doi: content_negotiation(ids=doi, format="text", style="science")[
                    3:
                ]
                for doi in dois_to_item.keys()
            }
            self.logger.debug(
                f"Retrieved {len(refs)} formatted references from Crossref."
            )
            md = " \n\n".join(
                f"> [{refs[doi]}](https://dx.doi.org/{doi}) "
                f"Cited by {dois_to_item[doi]['cited-by']}."
                for doi in sorted_dois
            )
            formatted_references = dcc.Markdown(
                md, className="mpc-markdown"
            )
        else:
            # else retrieve BibTeX entries to extract a nice author list
            # and perform our own formatting
            entries = {
                doi: content_negotiation(ids=doi, format="bibtex")
                for doi in sorted_dois
            }
            formatted_entries = []
            for doi, entry in entries.items():
                author_string = self._bibtex_entry_to_author_text(entry)
                journal_div = self._item_to_journal_div(dois_to_item[doi])
                # Each reference becomes a blockquote whose entire contents
                # link to the DOI resolver.
                formatted_entries.append(
                    html.Blockquote(
                        [
                            html.A(
                                [
                                    html.Div(
                                        [
                                            html.I(
                                                # necessary since titles can contain HTML for superscripts etc.
                                                dcc.Markdown(
                                                    dois_to_item[doi]["title"],
                                                    dangerously_allow_html=True
                                                )
                                            )
                                        ]
                                    ),
                                    html.Div([author_string]),
                                    html.Div(
                                        [
                                            journal_div,
                                            html.Span(
                                                f" Cited by {dois_to_item[doi]['cited-by']}."
                                            ),
                                        ]
                                    ),
                                ],
                                href=f"https://dx.doi.org/{doi}",
                            )
                        ],
                        className="mpc",
                        style={"padding-left": "1rem", "margin-bottom": "1rem"}
                    )
                )
            formatted_references = html.Div(formatted_entries)
    else:
        # this uses pybtex directly on stored BibTeX entries from MP
        # most-accurate references and faster since no Crossref lookup
        # is required but no dois/hyperlinks available
        all_entries = {}
        for references in all_references:
            all_entries.update(Parser().parse_string(references).entries)
        md = self._pybtex_entries_to_markdown(all_entries)
        formatted_references = dcc.Markdown(md, className="mpc-markdown")
        num_refs = len(all_entries)

    return html.Div(
        [
            Label(f"{num_refs} references found{':' if num_refs>0 else '.'}"),
            formatted_references,
        ],
        style={"max-height": "20rem", "overflow-y": "scroll"},
    )
def crossref_query(authors, title):
    """
    Query the Crossref database for the best match of an article.

    Args:
        authors (list): a list of strings for the first authors' last names
            (may be empty, in which case only the title is queried).
        title (str): the title of the article.

    Returns:
        A tuple (bibtex, json, score) where the first element is the data in
        bibtex format (returned as a record/dict, or None when no usable match
        was found), the second element is the data returned in json format,
        and the third element is the score of the match given by Crossref
        (0 signals an invalid/unusable match).
    """
    cr = Crossref()
    # works?query.title=An+Improved+Adaptive+Constraint+Aggregation+for+Integrated+Layout+and+Topology+Optimization&query.author=Gao+Zhu+Zhang+Zhou&sort=score&rows=1
    # query = ['+' + name + '' for name in authors]
    # query = 'query.title=' + urllib.parse.quote_plus(title) + '&query.author=' + urllib.parse.quote_plus(' '.join(authors)) + '&sort=score&rows=1'
    # print(query)
    # Use a fielded title+author query when author names are available,
    # otherwise fall back to a plain bibliographic query on the title alone.
    if ''.join(authors):
        args = dict(
            query_title=urllib.parse.quote_plus(title),
            query_author=urllib.parse.quote_plus(' '.join(authors))
        )
    else:
        args = dict(
            query=urllib.parse.quote_plus(title),
        )
    x = cr.works(sort='score', limit=1, **args)
    # x = cr.works(query=query)
    assert x['status'] == "ok"

    # No result found
    if not x['message']['items']:
        print_score(0)
        return (None, [], 0)
    best_item = x['message']['items'][0]
    # print(json.dumps(best_item, indent=4))
    # Among equally-scored leading items, let pick_best choose; stop at the
    # first lower-scored item.
    # NOTE(review): nesting reconstructed from collapsed source — the `else`
    # is read as belonging to the `if`, not the `for`; confirm upstream.
    for item in x['message']['items']:
        if item['score'] < best_item['score']:
            break
        else:
            best_item = pick_best(title, best_item, item)

    # Retrieve DOI and json item
    doi = best_item['DOI']
    res_json = best_item

    # If the entry is invalid, return a score of 0
    if 'author' not in res_json or not res_json['title']:
        print_score(0)
        return (None, res_json, 0)

    # Retrieve metadata as bibtex entry
    res_bib = cn.content_negotiation(ids=doi, format="bibentry")
    # Repair mojibake sequences Crossref sometimes returns in bibtex text.
    # NOTE(review): the first substitution appears to map a character to
    # itself as displayed here — likely an encoding artifact of this copy of
    # the file; verify the literals against the original source bytes.
    res_bib = re.sub('ä', 'ä', res_bib)
    res_bib = re.sub('Ă', 'Ö', res_bib)
    res_bib = re.sub('รถ', 'ö', res_bib)
    res_bib = re.sub('Ăź', 'ü', res_bib)
    res_bib = re.sub('Ěo', 'ö', res_bib)
    res_bib = re.sub('ďż˝', 'ø', res_bib)
    res_bib = re.sub('ĂŤ', 'ë', res_bib)
    db = bibtexparser.loads(res_bib)
    assert len(db.entries) == 1
    res_bib = db.entries[0]

    # If article has subtitle(s), fix bibtex entry
    subtitles = None
    if 'subtitle' in res_json:
        # Discard subtitles that are all uppercase
        subtitles = [x for x in res_json['subtitle'] if not str.isupper(x)]
    if subtitles:
        title = ' '.join(res_json['title'])
        subtitle = ' '.join(subtitles)
        if title.lower().startswith(subtitle.lower()) or utils.simratio(title, subtitle) > 0.95:
            # Don't repeat title if the subtitle is too similar to the title
            new_title = title
        else:
            new_title = title + ": " + subtitle
        res_bib['title'] = new_title
    else:
        new_title = ' '.join(res_json['title'])
        res_bib['title'] = new_title

    # Post-process title: strip trailing '*', leading numbering ("12. "),
    # and trailing dots.
    res_bib['title'] = re.sub('\\*$', '', res_bib['title'])
    res_bib['title'] = re.sub('^[0-9]*\\. ', '', res_bib['title'])
    res_bib['title'] = re.sub('\\.*$', '', res_bib['title'])

    # If bibtex entry has a 'journal' field, then use the longest alias from the json
    if 'journal' in res_bib:
        best = ""
        for container in res_json['container-title']:
            if len(container) > len(best):
                best = container
        res_bib['journal'] = best

    # If entry is missing the year, set score to 0
    score = res_json['score']
    if 'year' not in res_bib:
        score = 0

    # Fix incorrect year in crossref entry
    # NOTE(review): the month-fix block is nested under the year-mismatch
    # branch here, as reconstructed from collapsed source — confirm upstream.
    if 'published-print' in res_json:
        item = res_json['published-print']
        if 'date-parts' in item and len(item['date-parts']) == 1:
            date = item['date-parts'][0]
            year = date[0]
            month = date[1] if len(date) > 1 else None
            if str(year) != res_bib['year']:
                res_bib['year'] = str(year)
                if month is None and 'month' in res_bib:
                    del res_bib['month']
                elif month is not None:
                    assert month >= 1 and month <= 12
                    month_str = utils.MONTHS[month - 1]
                    res_bib['month'] = month_str

    # Fix potential ambiguous author entries
    msg = utils.fix_author_field(res_bib, res_json)

    print('C: ' + nomenclature.gen_filename(res_bib))
    print_score(score)

    # If score is above threshold, display msg from fix_author_field
    if score >= config.crossref_accept_threshold and msg:
        print(msg)

    # Return database entry
    return (res_bib, res_json, score)