Beispiel #1
0
def fetch_bibtex_by_fulltext_crossref(txt, **kw):
    """Run a crossref free-text query and return the best hit as bibtex.

    The query is sorted by 'score' and sent as a single HTTP GET; the
    'items' list of the JSON 'message' is then re-ranked locally with
    ``_crossref_score`` when it holds more than one entry.

    Parameters
    ----------
    txt : text used both as the query and as the reference for scoring
    **kw : extra keyword arguments forwarded to ``Works.query``

    Returns
    -------
    The output of ``crossref_to_bibtex`` for the selected item, stripped.

    Raises
    ------
    ValueError : if the response contains no items
    """
    work = Works(etiquette=my_etiquette)
    logger.debug(six.u('crossref fulltext seach:\n') + six.u(txt))

    # Send the request by hand and decode the JSON payload ourselves.
    query = work.query(txt, **kw).sort('score')
    response = query.do_http_request(
        'get', query.url, custom_header=str(query.etiquette))
    candidates = json.loads(response.text)['message']['items']

    if len(candidates) == 0:
        raise ValueError('crossref fulltext: no results')

    # The first item is the fallback: it is kept unless another candidate
    # reaches a strictly higher (and strictly positive) local score.
    best = candidates[0]
    if len(candidates) > 1:
        best_score = 0
        for candidate in candidates:
            candidate_score = _crossref_score(txt, candidate)
            if candidate_score > best_score:
                best_score, best = candidate_score, candidate
        logger.info('score: ' + str(best_score))

    # convert to bibtex
    return crossref_to_bibtex(best).strip()
Beispiel #2
0
def fetch_bibtex_by_fulltext_scholar(txt, assess_results=True):
    """Run a google scholar free-text query and return the best hit as bibtex.

    Parameters
    ----------
    txt : text used both as the query and as the reference for scoring
    assess_results : if True and several results are returned, re-rank them
        with ``_scholar_score``; otherwise the first result is used as is

    Returns
    -------
    The stripped page content fetched from the ``url_scholarbib`` attribute
    of the selected result.

    Raises
    ------
    ValueError : if the search yields no result at all
    NotImplementedError : if the selected result has no ``url_scholarbib``
    """
    import scholarly.scholarly
    scholarly._get_page = _get_page_fast  # remove waiting time
    logger.debug(txt)
    search_query = scholarly.search_pubs_query(txt)

    results = list(search_query)
    if not results:
        # Fail with an explicit error instead of an IndexError on results[0],
        # mirroring the crossref full-text lookup.
        raise ValueError('scholar fulltext: no results')

    # get the most likely match of the first results; the first one is kept
    # unless another reaches a strictly higher (and strictly positive) score
    result = results[0]
    if len(results) > 1 and assess_results:
        maxscore = 0
        for res in results:
            score = _scholar_score(txt, res.bib)
            if score > maxscore:
                maxscore = score
                result = res

    # use url_scholarbib to get bibtex from google
    if getattr(result, 'url_scholarbib', ''):
        bibtex = scholarly._get_page(result.url_scholarbib).strip()
    else:
        raise NotImplementedError(
            'no bibtex import linke. Make crossref request using title?')
    return bibtex
Beispiel #3
0
def pdfhead(pdf, maxpages=10, minwords=200):
    """Read the beginning of a pdf, one page at a time.

    Pages are read with ``readpdf`` starting at page 1 and their text is
    concatenated until it holds at least `minwords` whitespace-separated
    words or `maxpages` pages have been read, whichever comes first.

    Returns the accumulated text ('' if no page was read).
    """
    text = ''
    page = 0
    while True:
        enough_words = len(text.split()) >= minwords
        if enough_words or page >= maxpages:
            break
        page += 1
        logger.debug('read pdf page: ' + str(page))
        text += readpdf(pdf, first=page, last=page)
    return text
Beispiel #4
0
def pdfhead(pdf, maxpages=12, minwords=300, image=False):
    """Read the beginning of a pdf, one page at a time.

    Pages are read starting at page 1 and their text is concatenated until
    it holds at least `minwords` whitespace-separated words or `maxpages`
    pages have been read, whichever comes first. Each page is read with
    ``readpdf_image`` when `image` is true and with ``readpdf`` otherwise.

    Returns the accumulated text ('' if no page was read).
    """
    text = ''
    page = 0
    while True:
        enough_words = len(text.split()) >= minwords
        if enough_words or page >= maxpages:
            break
        page += 1
        logger.debug('read pdf page: ' + str(page))
        # Choose the reader here so nothing is looked up when no page is read.
        read_page = readpdf_image if image else readpdf
        text += read_page(pdf, first=page, last=page)
    return text
Beispiel #5
0
 def decorated(doi):
     """Return ``fun(doi)``, served from `cache` when the key is known.

     `hashed_key`, `cache`, `file`, `fun` and `DRYRUN` are taken from the
     enclosing scope (not visible from this block). On a cache miss the
     result is stored in `cache` and, unless `DRYRUN` is set, the whole
     cache is written to `file` as JSON.
     """
     if hashed_key:  # use hashed parameter as key (for full text query)
         # NOTE(review): only the first 6 hex digits of the digest are kept,
         # so distinct inputs can share a key -- confirm this is acceptable.
         key = hashlib.sha256(doi.encode('utf-8')).hexdigest()[:6]
     else:
         key = doi

     if key in cache:
         logger.debug('load from cache: ' + repr((file, key)))
         return cache[key]

     res = cache[key] = fun(doi)
     if not DRYRUN:
         # Close the handle deterministically so the dump is flushed to disk
         # and the file descriptor is not leaked.
         with open(file, 'w') as cache_file:
             json.dump(cache, cache_file)
     return res
Beispiel #6
0
def extract_txt_metadata(txt,
                         search_doi=True,
                         search_fulltext=False,
                         max_query_words=200,
                         scholar=False):
    """Extract a bibtex entry from text, via a DOI lookup and/or a full-text query.

    Parameters
    ----------
    txt : text to extract the metadata from
    search_doi : parse a DOI out of `txt` and query bibtex for it
    search_fulltext : if no bibtex was obtained from the DOI, query with the
        text built by ``query_text(txt, max_query_words)``
    max_query_words : forwarded to ``query_text``
    scholar : use the google scholar full-text lookup instead of crossref

    Returns
    -------
    The bibtex string; when the DOI was parsed but its request failed, a
    minimal ``@misc`` entry holding only the doi and its url.

    Raises
    ------
    AssertionError : if both `search_doi` and `search_fulltext` are false
    ValueError : if no bibtex could be obtained
    """
    assert search_doi or search_fulltext, 'no search criteria specified for metadata'

    entry = None

    if search_doi:
        try:
            logger.debug('parse doi')
            doi = parse_doi(txt)
            logger.info('found doi:' + doi)
            logger.debug('query bibtex by doi')
            entry = fetch_bibtex_by_doi(doi)
            logger.debug('doi query successful')
        except DOIParsingError as error:
            # No usable DOI: not fatal, the full-text search may still run.
            logger.debug(u'doi parsing error: ' + str(error))
        except DOIRequestError:
            # The DOI is known but could not be resolved: return a stub entry.
            return '''@misc{{{doi},
             doi = {{{doi}}},
             url = {{http://dx.doi.org/{doi}}},
            }}'''.format(doi=doi)
        # Any other exception (ValueError included) propagates to the caller.

    if search_fulltext and not entry:
        logger.debug('query bibtex by fulltext')
        query_txt = query_text(txt, max_query_words)
        fetch = (fetch_bibtex_by_fulltext_scholar if scholar
                 else fetch_bibtex_by_fulltext_crossref)
        entry = fetch(query_txt)
        logger.debug('fulltext query successful')

    if entry:
        return entry
    raise ValueError('failed to extract metadata')