def fetch_bibtex_by_fulltext_crossref(txt, **kw):
    """Full-text search on Crossref and return the best-matching bibtex entry.

    Parameters
    ----------
    txt : str
        Free text (e.g. extracted from a PDF header) used as the query.
    **kw
        Extra keyword arguments forwarded to ``Works.query``.

    Returns
    -------
    str
        Stripped bibtex entry for the highest-scoring result.

    Raises
    ------
    ValueError
        If the Crossref query returns no results.
    """
    work = Works(etiquette=my_etiquette)
    logger.debug(six.u('crossref fulltext search:\n') + six.u(txt))

    # Ask Crossref for results sorted by its own relevance score; the raw
    # JSON is fetched directly so we can re-rank locally below.
    query = work.query(txt, **kw).sort('score')
    query_result = query.do_http_request(
        'get', query.url, custom_header=str(query.etiquette)).text
    results = json.loads(query_result)['message']['items']

    if not results:
        raise ValueError('crossref fulltext: no results')

    # Re-rank with our own scoring; the first result wins ties / zero scores.
    result = results[0]
    if len(results) > 1:
        maxscore = 0
        for res in results:
            score = _crossref_score(txt, res)
            if score > maxscore:
                maxscore = score
                result = res
        logger.info('score: ' + str(maxscore))

    # convert to bibtex
    return crossref_to_bibtex(result).strip()
def fetch_bibtex_by_fulltext_scholar(txt, assess_results=True):
    """Full-text search on Google Scholar and return the best-matching bibtex.

    Parameters
    ----------
    txt : str
        Free text used as the search query.
    assess_results : bool, optional
        If True (default) and several results are returned, re-rank them
        with ``_scholar_score``; otherwise take the first result as-is.

    Returns
    -------
    str
        Bibtex entry fetched from the result's ``url_scholarbib`` link.

    Raises
    ------
    ValueError
        If the query returns no results.
    NotImplementedError
        If the chosen result exposes no bibtex import link.
    """
    import scholarly.scholarly
    # Monkey-patch the page fetcher to skip scholarly's built-in wait time.
    scholarly._get_page = _get_page_fast
    logger.debug(txt)
    search_query = scholarly.search_pubs_query(txt)

    # get the most likely match of the first results
    results = list(search_query)
    if not results:
        # Guard added: the original indexed results[0] unconditionally and
        # would raise a bare IndexError on an empty result set.
        raise ValueError('scholar fulltext: no results')

    result = results[0]
    if len(results) > 1 and assess_results:
        maxscore = 0
        for res in results:
            score = _scholar_score(txt, res.bib)
            if score > maxscore:
                maxscore = score
                result = res

    # use url_scholarbib to get bibtex from google
    if getattr(result, 'url_scholarbib', ''):
        bibtex = scholarly._get_page(result.url_scholarbib).strip()
    else:
        raise NotImplementedError(
            'no bibtex import link. Make crossref request using title?')
    return bibtex
def pdfhead(pdf, maxpages=10, minwords=200):
    """Read the leading pages of *pdf* and return their text.

    Pages are read one at a time until the accumulated text contains at
    least *minwords* whitespace-separated words, or *maxpages* pages have
    been read, whichever comes first.

    NOTE(review): a later definition of ``pdfhead`` in this file shadows
    this one — confirm whether this version is still needed.
    """
    page = 0
    text = ''
    while page < maxpages and len(text.strip().split()) < minwords:
        page += 1
        logger.debug('read pdf page: ' + str(page))
        text += readpdf(pdf, first=page, last=page)
    return text
def pdfhead(pdf, maxpages=12, minwords=300, image=False):
    """Read the leading pages of *pdf* and return their text.

    Pages are accumulated one at a time until the text holds at least
    *minwords* whitespace-separated words or *maxpages* pages were read.
    When *image* is true, pages go through ``readpdf_image`` (OCR path)
    instead of ``readpdf``.
    """
    text = ''
    for page in range(1, maxpages + 1):
        if len(text.strip().split()) >= minwords:
            break
        logger.debug('read pdf page: ' + str(page))
        if image:
            text += readpdf_image(pdf, first=page, last=page)
        else:
            text += readpdf(pdf, first=page, last=page)
    return text
def decorated(doi):
    """Cached wrapper around ``fun``.

    Looks the query up in ``cache``; on a miss, calls ``fun`` and persists
    the updated cache to ``file`` as JSON (skipped under ``DRYRUN``).

    NOTE(review): ``hashed_key``, ``cache``, ``file``, ``fun`` and
    ``DRYRUN`` come from the enclosing decorator scope (not visible in
    this chunk) — confirm against the outer definition.
    """
    if hashed_key:
        # use hashed parameter as key (for full text query): a short,
        # stable digest rather than the full (possibly very long) text
        key = hashlib.sha256(doi.encode('utf-8')).hexdigest()[:6]
    else:
        key = doi
    if key in cache:
        logger.debug('load from cache: ' + repr((file, key)))
        return cache[key]
    res = cache[key] = fun(doi)
    if not DRYRUN:
        # Fix: the original passed open(file, 'w') straight to json.dump,
        # leaving the file handle to be closed by the GC.
        with open(file, 'w') as f:
            json.dump(cache, f)
    return res
def extract_txt_metadata(txt, search_doi=True, search_fulltext=False,
                         max_query_words=200, scholar=False):
    """Extract metadata from text, by parsing and doi-query, or by fulltext
    query in google scholar.

    Parameters
    ----------
    txt : str
        Text (e.g. a PDF header) to extract metadata from.
    search_doi : bool, optional
        Try to parse a DOI out of the text and query bibtex by DOI.
    search_fulltext : bool, optional
        Fall back to a full-text query (crossref, or scholar) when the
        DOI route yields nothing.
    max_query_words : int, optional
        Cap on the number of words sent in the full-text query.
    scholar : bool, optional
        Use Google Scholar instead of Crossref for the full-text query.

    Returns
    -------
    str
        Bibtex entry.

    Raises
    ------
    ValueError
        If all enabled search strategies fail.
    """
    assert search_doi or search_fulltext, 'no search criteria specified for metadata'

    bibtex = None

    if search_doi:
        try:
            logger.debug('parse doi')
            doi = parse_doi(txt)
            logger.info('found doi:' + doi)
            logger.debug('query bibtex by doi')
            bibtex = fetch_bibtex_by_doi(doi)
            logger.debug('doi query successful')
        except DOIParsingError as error:
            logger.debug(u'doi parsing error: ' + str(error))
        except DOIRequestError:
            # DOI was parsed but the bibtex request failed: return a
            # minimal stub entry pointing at the DOI resolver instead.
            return '''@misc{{{doi},
 doi = {{{doi}}},
 url = {{http://dx.doi.org/{doi}}},
}}'''.format(doi=doi)
        # (removed a no-op "except ValueError: raise" clause — it re-raised
        # unchanged and had no effect on behavior)

    if search_fulltext and not bibtex:
        logger.debug('query bibtex by fulltext')
        query_txt = query_text(txt, max_query_words)
        if scholar:
            bibtex = fetch_bibtex_by_fulltext_scholar(query_txt)
        else:
            bibtex = fetch_bibtex_by_fulltext_crossref(query_txt)
        logger.debug('fulltext query successful')

    if not bibtex:
        raise ValueError('failed to extract metadata')

    return bibtex