Exemple #1
0
 def gitcommit(self, branch=None, message=None):
     """Copy the bibtex file into the git directory and commit it there.

     Args:
         branch: optional branch name; when given it is checked out before
             the file is staged.
         message: commit message. When empty, a message is built from
             ``self.bibtex`` and the command-line arguments in ``sys.argv``.

     Raises:
         ValueError: if the path ``self._gitdir`` does not exist.
         subprocess.CalledProcessError: if ``git checkout`` or ``git add``
             exits with a non-zero status. A failing ``git commit`` is
             tolerated and simply not logged.
     """
     if not os.path.exists(self._gitdir):
         raise ValueError('git is not initialized in ' + self.gitdir)

     target = os.path.join(self.gitdir, os.path.basename(self.bibtex))
     # os.path.samefile() stats both paths and raises if the copy does not
     # exist yet (e.g. the very first commit), so only compare once the
     # target is actually present.
     if not os.path.exists(target) or not os.path.samefile(self.bibtex, target):
         shutil.copy(self.bibtex, target)

     message = message or 'save ' + self.bibtex + ' after command:\n\n    papers ' + ' '.join(
         sys.argv[1:])

     with open(os.devnull, 'w') as shutup:
         # Every git invocation is silenced and runs inside the git directory.
         quiet = dict(stdout=shutup, stderr=shutup, cwd=self.gitdir)
         if branch is not None:
             sp.check_call(['git', 'checkout', branch], **quiet)
         sp.check_call(['git', 'add', target], **quiet)
         # sp.call (not check_call): a non-zero status is not treated as an
         # error here.
         res = sp.call(['git', 'commit', '-m', message], **quiet)

     if res == 0:
         logger.info('git commit')
Exemple #2
0
def readpdf_image(pdf, first=None, last=None):
    """Extract text from a PDF by rendering it to PNG and running OCR on it.

    The PDF is rendered with ``pdftoppm -singlefile -png`` and the resulting
    image is passed to ``tesseract`` (English). Intermediate files live in a
    private temporary directory that is always removed.

    Args:
        pdf: path to the PDF file.
        first: optional first page, passed to ``pdftoppm`` as ``-f``.
        last: optional last page, passed to ``pdftoppm`` as ``-l``.

    Returns:
        The text written by tesseract.

    Raises:
        ValueError: if ``pdf`` is not an existing file.
        subprocess.CalledProcessError: if ``pdftoppm`` or ``tesseract`` fails.
    """
    import shutil

    if not os.path.isfile(pdf):
        raise ValueError(repr(pdf) + ": not a file")

    # mkdtemp() creates the directory atomically with private permissions,
    # unlike mktemp() which only returns a name somebody else could claim.
    tmpdir = tempfile.mkdtemp()
    tmpbase = os.path.join(tmpdir, 'page')
    tmppng = tmpbase + '.png'
    tmptxt = tmpbase + '.txt'

    try:
        # 1st create a .png image from the pdf file
        cmd = ['pdftoppm', '-singlefile', '-png', '-q']
        if first is not None:
            cmd.extend(['-f', str(first)])
        if last is not None:
            cmd.extend(['-l', str(last)])
        cmd.extend([pdf, tmpbase])
        logger.info(' '.join(cmd))
        sp.check_call(cmd)

        # 2nd extract text from .png using tesseract
        cmd = ["tesseract", tmppng, tmpbase, "-l", "eng", "quiet"]
        logger.info(' '.join(cmd))
        sp.check_call(cmd)

        with open(tmptxt) as stream:
            txt = stream.read()
    finally:
        # Clean up even when one of the external tools failed.
        shutil.rmtree(tmpdir, ignore_errors=True)

    return txt
Exemple #3
0
def extract_txt_metadata(txt, search_doi=True, search_fulltext=False, space_digit=True, max_query_words=200, scholar=False):
    """Return a bibtex entry for ``txt`` using a DOI lookup and/or a fulltext query.

    With ``search_doi`` a DOI is parsed out of ``txt`` (``space_digit`` is
    forwarded to ``parse_doi``) and resolved to bibtex; a ``ValueError`` raised
    along the way is only logged. With ``search_fulltext``, and if no bibtex
    was obtained yet, a query of at most ``max_query_words`` words is sent to
    google scholar (``scholar=True``) or crossref.

    Raises ``ValueError`` when no bibtex could be obtained.
    """
    assert search_doi or search_fulltext, 'no search criteria specified for metadata'

    bibtex = None

    if search_doi:
        try:
            logger.debug('parse doi')
            found = parse_doi(txt, space_digit=space_digit)
            logger.info('found doi:'+found)
            logger.debug('query bibtex by doi')
            bibtex = fetch_bibtex_by_doi(found)
            logger.debug('doi query successful')
        except ValueError as exc:
            # a failed doi lookup is not fatal: the fulltext search may follow
            logger.debug(u'failed to obtained bibtex by doi search: '+str(exc))

    if not bibtex and search_fulltext:
        logger.debug('query bibtex by fulltext')
        words = query_text(txt, max_query_words)
        fetch = fetch_bibtex_by_fulltext_scholar if scholar else fetch_bibtex_by_fulltext_crossref
        bibtex = fetch(words)
        logger.debug('fulltext query successful')

    if bibtex:
        return bibtex

    raise ValueError('failed to extract metadata')
Exemple #4
0
def fetch_bibtex_by_fulltext_crossref(txt, **kw):
    """Query crossref with free text and return the best match as bibtex.

    ``txt`` is the query text; extra keyword arguments are forwarded to
    ``Works.query``. When several items come back, each one is rated with
    ``_crossref_score`` and the first item with the strictly highest positive
    score wins; if no item scores above zero the first item is used.

    Raises ``ValueError`` when crossref returns no items.
    """
    logger_msg = six.u('crossref fulltext seach:\n') + six.u(txt)
    work = Works(etiquette=my_etiquette)
    logger.debug(logger_msg)

    # run the sorted query by hand and decode the raw JSON answer
    query = work.query(txt, **kw).sort('score')
    response = query.do_http_request('get',
                                     query.url,
                                     custom_header=str(query.etiquette))
    items = json.loads(response.text)['message']['items']

    count = len(items)
    if count == 0:
        raise ValueError('crossref fulltext: no results')

    best = items[0]
    if count > 1:
        best_score = 0
        for candidate in items:
            candidate_score = _crossref_score(txt, candidate)
            if candidate_score > best_score:
                best_score, best = candidate_score, candidate
        logger.info('score: ' + str(best_score))

    # convert to bibtex
    return crossref_to_bibtex(best).strip()
Exemple #5
0
def extract_txt_metadata(txt,
                         search_doi=True,
                         search_fulltext=False,
                         max_query_words=200,
                         scholar=False):
    """Return a bibtex entry for ``txt`` using a DOI lookup and/or a fulltext query.

    With ``search_doi`` a DOI is parsed out of ``txt`` and resolved to bibtex.
    If the lookup raises ``AttributeError`` or ``DOIRequestError``, a
    placeholder ``@misc`` entry built from the DOI is returned instead. A
    ``DOIParsingError`` is only logged; any other exception propagates.

    With ``search_fulltext``, and if no bibtex was obtained yet, a query of at
    most ``max_query_words`` words is sent to google scholar
    (``scholar=True``) or crossref.

    Raises ``ValueError`` when no bibtex could be obtained.
    """
    assert search_doi or search_fulltext, 'no search criteria specified for metadata'

    bibtex = None

    if search_doi:
        # Exceptions not matched by a handler below propagate to the caller.
        try:
            logger.debug('parse doi')
            doi = parse_doi(txt)
            logger.info('found doi:' + doi)
            print(" -- Found DOI    ", doi)
            logger.debug('query bibtex by doi')
            try:
                bibtex = fetch_bibtex_by_doi(doi)
                print(" -- Found Bibtex !")
            except AttributeError:
                # placeholder entry; the author field is a random 10-char tag
                return '''@misc{{{doi},
                doi = {{{doi}}},
                author = {{{author}}},
                title = "-- check-entry --",
                url = {{http://dx.doi.org/{doi}}},
                }}'''.format(doi=doi, author=str(uuid.uuid4())[0:10])

            logger.debug('doi query successful')

        except DOIParsingError as exc:
            logger.debug(u'doi parsing error: ' + str(exc))

        except DOIRequestError:
            # minimal placeholder entry carrying only the doi and its url
            return '''@misc{{{doi},
             doi = {{{doi}}},
             url = {{http://dx.doi.org/{doi}}},
            }}'''.format(doi=doi)

    if not bibtex and search_fulltext:
        logger.debug('query bibtex by fulltext')
        words = query_text(txt, max_query_words)
        fetch = fetch_bibtex_by_fulltext_scholar if scholar else fetch_bibtex_by_fulltext_crossref
        bibtex = fetch(words)
        logger.debug('fulltext query successful')

    if bibtex:
        return bibtex

    raise ValueError('failed to extract metadata')
Exemple #6
0
def move(f1, f2, copy=False, interactive=False, default="don't replace"):
    """Move (or copy) ``f1`` to ``f2``, creating the destination directory.

    Args:
        f1: source path.
        f2: destination path.
        copy: copy instead of move when true.
        interactive: when the destination exists, ask on the console whether
            to replace it.
        default: non-interactive policy for an existing destination. The
            value ``"don't replace"`` keeps the existing file; any other
            value replaces it.

    Returns:
        None. Nothing is done when source and destination are the same
        string, or when an existing destination must not be replaced. When
        the module-level ``DRYRUN`` flag is set, the operation is only logged.
    """
    dirname = os.path.dirname(f2)
    if dirname and not os.path.exists(dirname):
        logger.info('create directory: ' + dirname)
        os.makedirs(dirname)
    if f1 == f2:
        logger.info('dest is identical to src: ' + f1)
        return
    if os.path.exists(f2):
        if interactive:
            ans = raw_input('dest file already exists: ' + f2 +
                            '. Replace? (y/n) ')
        # Compare by value: ``is`` tests object identity and can be False
        # for an equal string that was built at runtime.
        elif default == "don't replace":
            ans = 'n'
        else:
            ans = 'y'
        if ans != 'y':
            return

    if copy:
        cmd = u'cp {} {}'.format(f1, f2)
        logger.info(cmd)
        if not DRYRUN:
            shutil.copy(f1, f2)
    else:
        cmd = u'mv {} {}'.format(f1, f2)
        logger.info(cmd)
        if not DRYRUN:
            shutil.move(f1, f2)
Exemple #7
0
def readpdf(pdf, first=None, last=None, keeptxt=False):
    """Extract the text of a PDF with ``pdftotext``.

    The text is written next to the PDF, to the same path with the extension
    replaced by ``.txt``.

    Args:
        pdf: path to the PDF file.
        first: optional first page, passed to ``pdftotext`` as ``-f``.
        last: optional last page, passed to ``pdftotext`` as ``-l``.
        keeptxt: keep the generated ``.txt`` file instead of deleting it.

    Returns:
        The extracted text.

    Raises:
        ValueError: if the output path would be the input file itself.
        subprocess.CalledProcessError: if ``pdftotext`` fails.
    """
    # Only swap the extension: str.replace() would rewrite every '.pdf' in
    # the path (directories included) and miss variants such as '.PDF'.
    txtfile = os.path.splitext(pdf)[0] + '.txt'
    if txtfile == pdf:
        # Reading and then deleting ``txtfile`` would destroy the input.
        raise ValueError(repr(pdf) + ": output would overwrite the input file")

    cmd = ['pdftotext']
    if first is not None:
        cmd.extend(['-f', str(first)])
    if last is not None:
        cmd.extend(['-l', str(last)])
    # Name the output explicitly so it cannot differ from the file read below.
    cmd.extend([pdf, txtfile])
    sp.check_call(cmd)

    with open(txtfile) as stream:
        txt = stream.read()
    if not keeptxt:
        os.remove(txtfile)
    return txt
Exemple #8
0
def readpdf(pdf, first=None, last=None):
    """Extract the text of a PDF with ``pdftotext`` via a temporary file.

    Args:
        pdf: path to the PDF file.
        first: optional first page, passed to ``pdftotext`` as ``-f``.
        last: optional last page, passed to ``pdftotext`` as ``-l``.

    Returns:
        The extracted text.

    Raises:
        ValueError: if ``pdf`` is not an existing file.
        subprocess.CalledProcessError: if ``pdftotext`` fails.
    """
    if not os.path.isfile(pdf):
        raise ValueError(repr(pdf) + ": not a file")

    # mkstemp() creates the file atomically; mktemp() only returns a name
    # that another process could claim first. The descriptor is not needed
    # because pdftotext writes to the path itself.
    fd, tmptxt = tempfile.mkstemp(suffix='.txt')
    os.close(fd)

    try:
        cmd = ['pdftotext']
        if first is not None:
            cmd.extend(['-f', str(first)])
        if last is not None:
            cmd.extend(['-l', str(last)])
        cmd.extend([pdf, tmptxt])
        logger.info(' '.join(cmd))
        sp.check_call(cmd)

        with open(tmptxt) as stream:
            txt = stream.read()
    finally:
        # Remove the temporary file even when pdftotext failed.
        os.remove(tmptxt)

    return txt
Exemple #9
0
def readpdf(pdf, first=None, last=None, keeptxt=False):
    """Extract text from a PDF by OCR on a uniquely named temporary copy.

    The PDF is copied to a temporary file, rendered to PNG with
    ``pdftoppm -singlefile -png`` and the image is passed to ``tesseract``
    (English).

    Args:
        pdf: path to the PDF file.
        first: optional first page, passed to ``pdftoppm`` as ``-f``.
        last: optional last page, passed to ``pdftoppm`` as ``-l``.
        keeptxt: when true, the temporary pdf, png and txt files are left
            in place instead of being deleted.

    Returns:
        The text written by tesseract.

    Raises:
        ValueError: if ``pdf`` is not an existing file.
        subprocess.CalledProcessError: if ``pdftoppm`` or ``tesseract`` fails.
    """
    import shutil
    import tempfile

    if not os.path.isfile(pdf):
        # Fail early; continuing would only hit an unbound variable later.
        raise ValueError(repr(pdf) + ": not a file")

    ext = os.path.splitext(pdf)[1]
    fd, uniq_pdf = tempfile.mkstemp(suffix=ext)
    # Only the unique name is needed; close the descriptor so it is not leaked.
    os.close(fd)

    uniq_name = os.path.splitext(uniq_pdf)[0]
    uniq_img = uniq_name + '.png'
    uniq_txt = uniq_name + '.txt'

    try:
        # Do not rebind uniq_pdf to copy2()'s return value: it is None on
        # Python 2.
        shutil.copy2(pdf, uniq_pdf)
        logger.info("\t-> %s", uniq_pdf)

        # 1st create a .png image from the uniq pdf file
        cmd = ['pdftoppm']
        if first is not None:
            cmd.extend(['-f', str(first)])
        if last is not None:
            cmd.extend(['-l', str(last)])
        cmd.extend(['-singlefile', '-png', '-q', uniq_pdf, uniq_name])
        sp.check_call(cmd)

        # 2nd extract text from .png using tesseract
        cmd = ["tesseract", uniq_img, uniq_name, "-l", "eng", "quiet"]
        sp.check_call(cmd)

        with open(uniq_txt) as stream:
            txt = stream.read()
    finally:
        if not keeptxt:
            # After a failure some of these files may never have been created.
            for path in (uniq_pdf, uniq_img, uniq_txt):
                if os.path.exists(path):
                    os.remove(path)

    return txt
Exemple #10
0
 def check_install(self):
     """Create the cache directory for DOI requests when it is missing."""
     cache_dir = self.cache
     if os.path.exists(cache_dir):
         # nothing to set up
         return
     logger.info('make cache directory for DOI requests: ' + cache_dir)
     os.makedirs(cache_dir)