def gitcommit(self, branch=None, message=None):
    """Commit the bibtex file into the git repository under ``self.gitdir``.

    branch : optional branch name to check out before committing
    message : commit message; defaults to a message recording the
        command line that triggered the save

    Raises ValueError when the git directory does not exist.

    NOTE(review): the existence check uses ``self._gitdir`` while every
    other access uses ``self.gitdir`` — confirm both names refer to the
    same path (e.g. via a property) or this check tests the wrong thing.
    """
    if os.path.exists(self._gitdir):
        # keep a copy of the bibtex file inside the git work tree
        target = os.path.join(self.gitdir, os.path.basename(self.bibtex))
        if not os.path.samefile(self.bibtex, target):
            shutil.copy(self.bibtex, target)
        message = message or 'save ' + self.bibtex + ' after command:\n\n papers ' + ' '.join(
            sys.argv[1:])
        # silence git's stdout/stderr for all calls below
        with open(os.devnull, 'w') as shutup:
            if branch is not None:
                sp.check_call(['git', 'checkout', branch],
                              stdout=shutup, stderr=shutup, cwd=self.gitdir)
            sp.check_call(['git', 'add', target],
                          stdout=shutup, stderr=shutup, cwd=self.gitdir)
            # sp.call (not check_call): "nothing to commit" exits non-zero
            # and is silently tolerated
            res = sp.call(['git', 'commit', '-m', message],
                          stdout=shutup, stderr=shutup, cwd=self.gitdir)
            if res == 0:
                logger.info('git commit')
    else:
        raise ValueError('git is not initialized in ' + self.gitdir)
def readpdf_image(pdf, first=None, last=None):
    """Extract text from a PDF by rasterizing it and running OCR.

    Renders the page(s) to PNG with the external ``pdftoppm`` tool, then
    extracts text from the image with ``tesseract``.

    pdf : path to the PDF file
    first, last : optional page range (passed as -f / -l to pdftoppm)

    Returns the extracted text; raises ValueError if *pdf* is not a file,
    and subprocess.CalledProcessError if either external tool fails.
    """
    if not os.path.isfile(pdf):
        raise ValueError(repr(pdf) + ": not a file")
    # mkdtemp gives a private directory, avoiding the race window of the
    # deprecated tempfile.mktemp, and makes cleanup a single rmtree even
    # when one of the subprocesses fails mid-way
    tmpdir = tempfile.mkdtemp()
    tmpbase = os.path.join(tmpdir, 'page')
    tmppng = tmpbase + '.png'
    tmptxt = tmpbase + '.txt'
    try:
        # 1st create a .png image from the pdf file
        cmd = ['pdftoppm', '-singlefile', '-png', '-q']
        if first is not None:
            cmd.extend(['-f', str(first)])
        if last is not None:
            cmd.extend(['-l', str(last)])
        cmd.extend([pdf, tmpbase])
        logger.info(' '.join(cmd))
        sp.check_call(cmd)
        # 2nd extract text from .png using tesseract
        cmd = ["tesseract", tmppng, tmpbase, "-l", "eng", "quiet"]
        logger.info(' '.join(cmd))
        sp.check_call(cmd)
        # context manager closes the handle (was left to the GC before)
        with open(tmptxt) as f:
            txt = f.read()
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)
    return txt
def extract_txt_metadata(txt, search_doi=True, search_fulltext=False, space_digit=True,
                         max_query_words=200, scholar=False):
    """extract metadata from text, by parsing and doi-query, or by fulltext
    query in google scholar

    NOTE(review): a later definition of ``extract_txt_metadata`` in this
    module shadows this one — confirm which is meant to be active.
    """
    assert search_doi or search_fulltext, 'no search criteria specified for metadata'

    bibtex = None

    # strategy 1: look for a DOI in the text and resolve it via crossref
    if search_doi:
        try:
            logger.debug('parse doi')
            doi = parse_doi(txt, space_digit=space_digit)
            logger.info('found doi:' + doi)
            logger.debug('query bibtex by doi')
            bibtex = fetch_bibtex_by_doi(doi)
            logger.debug('doi query successful')
        except ValueError as exc:
            logger.debug(u'failed to obtained bibtex by doi search: ' + str(exc))

    # strategy 2: fulltext search, only if the DOI route produced nothing
    if search_fulltext and not bibtex:
        logger.debug('query bibtex by fulltext')
        query_txt = query_text(txt, max_query_words)
        fetch = fetch_bibtex_by_fulltext_scholar if scholar else fetch_bibtex_by_fulltext_crossref
        bibtex = fetch(query_txt)
        logger.debug('fulltext query successful')

    if not bibtex:
        raise ValueError('failed to extract metadata')
    return bibtex
def fetch_bibtex_by_fulltext_crossref(txt, **kw):
    """Query crossref with a fulltext search and return the best match
    as a bibtex string.

    txt : query text
    **kw : extra keyword arguments forwarded to the crossref query

    Raises ValueError when crossref returns no results.
    """
    work = Works(etiquette=my_etiquette)
    logger.debug(six.u('crossref fulltext seach:\n') + six.u(txt))

    # run the score-sorted query and fetch the raw JSON payload directly
    query = work.query(txt, **kw).sort('score')
    payload = query.do_http_request(
        'get', query.url, custom_header=str(query.etiquette)).text
    items = json.loads(payload)['message']['items']

    if len(items) == 0:
        raise ValueError('crossref fulltext: no results')

    # default to the first (highest crossref-ranked) result; with several
    # candidates, re-score locally and keep the first strict improvement
    best = items[0]
    if len(items) > 1:
        top_score = 0
        for candidate in items:
            candidate_score = _crossref_score(txt, candidate)
            if candidate_score > top_score:
                top_score = candidate_score
                best = candidate
        logger.info('score: ' + str(top_score))

    # convert to bibtex
    return crossref_to_bibtex(best).strip()
def extract_txt_metadata(txt, search_doi=True, search_fulltext=False, max_query_words=200, scholar=False):
    """extract metadata from text, by parsing and doi-query, or by fulltext query in google scholar

    txt : text extracted from a document
    search_doi : try to parse a DOI from the text and resolve it
    search_fulltext : if the DOI route yields nothing, run a fulltext query
    max_query_words : truncate the fulltext query to this many words
    scholar : use google scholar instead of crossref for the fulltext query

    Returns a bibtex entry as a string; raises ValueError when every
    strategy fails.

    NOTE(review): this shadows an earlier ``extract_txt_metadata`` in this
    module (which also accepted ``space_digit``) — confirm the earlier one
    is intentionally dead code.
    """
    assert search_doi or search_fulltext, 'no search criteria specified for metadata'
    bibtex = None
    if search_doi:
        try:
            logger.debug('parse doi')
            doi = parse_doi(txt)
            logger.info('found doi:' + doi)
            print(" -- Found DOI ", doi)
            logger.debug('query bibtex by doi')
            try:
                bibtex = fetch_bibtex_by_doi(doi)
                print(" -- Found Bibtex !")
            except AttributeError:
                # DOI resolved but the bibtex came back unusable: return a
                # stub entry; the random "author" keeps the cite key unique
                return '''@misc{{{doi},
doi = {{{doi}}},
author = {{{author}}},
title = "-- check-entry --",
url = {{http://dx.doi.org/{doi}}},
}}'''.format(doi=doi, author=str(uuid.uuid4())[0:10])
            logger.debug('doi query successful')
        except DOIParsingError as error:
            # no DOI found in the text: fall through to fulltext search
            logger.debug(u'doi parsing error: ' + str(error))
        except DOIRequestError as error:
            # DOI parsed but the network request failed: minimal stub entry
            return '''@misc{{{doi},
doi = {{{doi}}},
url = {{http://dx.doi.org/{doi}}},
}}'''.format(doi=doi)
        except ValueError as error:
            # any other ValueError is propagated unchanged
            raise
            # logger.debug(u'failed to obtained bibtex by doi search: '+str(error))
    if search_fulltext and not bibtex:
        logger.debug('query bibtex by fulltext')
        query_txt = query_text(txt, max_query_words)
        if scholar:
            bibtex = fetch_bibtex_by_fulltext_scholar(query_txt)
        else:
            bibtex = fetch_bibtex_by_fulltext_crossref(query_txt)
        logger.debug('fulltext query successful')
    if not bibtex:
        raise ValueError('failed to extract metadata')
    return bibtex
def move(f1, f2, copy=False, interactive=False, default="don't replace"):
    """Move (or copy) file *f1* to *f2*, creating the target directory.

    copy : copy instead of moving
    interactive : prompt before overwriting an existing destination
    default : non-interactive policy when the destination exists; any
        value other than "don't replace" means overwrite

    Honors the module-level DRYRUN flag: the command is logged but not
    executed when DRYRUN is set.
    """
    dirname = os.path.dirname(f2)
    if dirname and not os.path.exists(dirname):
        logger.info('create directory: ' + dirname)
        os.makedirs(dirname)
    if f1 == f2:
        logger.info('dest is identical to src: ' + f1)
        return
    if os.path.exists(f2):
        if interactive:
            # raw_input exists only on Python 2; fall back to input on 3
            try:
                prompt = raw_input
            except NameError:
                prompt = input
            ans = prompt('dest file already exists: ' + f2 + '. Replace? (y/n) ')
        else:
            # fixed: compare with '==', not 'is' — identity of equal string
            # literals is implementation-dependent, so the original check
            # could silently pick the wrong branch
            if default == "don't replace":
                ans = 'n'
            else:
                ans = 'y'
        if ans != 'y':
            return
    if copy:
        cmd = u'cp {} {}'.format(f1, f2)
        logger.info(cmd)
        if not DRYRUN:
            shutil.copy(f1, f2)
    else:
        cmd = u'mv {} {}'.format(f1, f2)
        logger.info(cmd)
        if not DRYRUN:
            shutil.move(f1, f2)
def readpdf(pdf, first=None, last=None, keeptxt=False): txtfile = pdf.replace('.pdf','.txt') # txtfile = os.path.join(os.path.dirname(pdf), pdf.replace('.pdf','.txt')) if True: #not os.path.exists(txtfile): # logger.info(' '.join(['pdftotext','"'+pdf+'"', '"'+txtfile+'"'])) cmd = ['pdftotext'] if first is not None: cmd.extend(['-f',str(first)]) if last is not None: cmd.extend(['-l',str(last)]) cmd.append(pdf) sp.check_call(cmd) else: logger.info('file already present: '+txtfile) txt = open(txtfile).read() if not keeptxt: os.remove(txtfile) return txt
def readpdf(pdf, first=None, last=None):
    """Extract text from a PDF via the external ``pdftotext`` tool.

    pdf : path to the PDF file
    first, last : optional page range (passed as -f / -l)

    Returns the extracted text; raises ValueError if *pdf* is not a file,
    and subprocess.CalledProcessError if pdftotext fails.
    """
    if not os.path.isfile(pdf):
        raise ValueError(repr(pdf) + ": not a file")
    # mkstemp instead of the deprecated, race-prone tempfile.mktemp; close
    # the descriptor immediately since pdftotext reopens the path itself
    fd, tmptxt = tempfile.mkstemp(suffix='.txt')
    os.close(fd)
    try:
        cmd = ['pdftotext']
        if first is not None:
            cmd.extend(['-f', str(first)])
        if last is not None:
            cmd.extend(['-l', str(last)])
        cmd.extend([pdf, tmptxt])
        logger.info(' '.join(cmd))
        sp.check_call(cmd)
        # context manager closes the handle (was left to the GC before)
        with open(tmptxt) as f:
            txt = f.read()
    finally:
        # remove the temp file even when the subprocess fails
        if os.path.exists(tmptxt):
            os.remove(tmptxt)
    return txt
def readpdf(pdf, first=None, last=None, keeptxt=False):
    """OCR-based PDF text extraction.

    Copies the PDF to a unique temp path, rasterizes it to PNG with the
    external ``pdftoppm`` tool, then extracts text with ``tesseract``.

    pdf : path to the PDF file
    first, last : optional page range (passed as -f / -l to pdftoppm)
    keeptxt : keep the intermediate pdf/png/txt files on disk when True

    Returns the extracted text; raises ValueError if *pdf* is not a file.
    """
    import shutil
    import tempfile

    if not os.path.isfile(pdf):
        # fixed: the original only logged here and then crashed with an
        # unbound `uniq_pdf` NameError — fail explicitly instead
        raise ValueError(repr(pdf) + ": not a file")

    path, ext = os.path.splitext(pdf)
    fd, uniq_pdf = tempfile.mkstemp(suffix=ext)
    # fixed: close mkstemp's descriptor (it was leaked); copy2 reopens by name
    os.close(fd)
    uniq_pdf = shutil.copy2(pdf, uniq_pdf)
    # fixed: logging takes a single format string with lazy %-args, not
    # print-style positional arguments
    logger.info("\t-> %s", uniq_pdf)

    uniq_name, ext = os.path.splitext(uniq_pdf)
    uniq_img = uniq_name + '.png'
    uniq_txt = uniq_name + '.txt'

    # 1st create a .png image from the uniq pdf file
    cmd = ['pdftoppm']
    if first is not None:
        cmd.extend(['-f', str(first)])
    if last is not None:
        cmd.extend(['-l', str(last)])
    cmd.extend(['-singlefile', '-png', '-q', uniq_pdf, uniq_name])
    sp.check_call(cmd)

    # 2nd extract text from .png using tesseract
    cmd = ["tesseract", uniq_img, uniq_name, "-l", "eng", "quiet"]
    sp.check_call(cmd)

    with open(uniq_txt) as f:
        txt = f.read()
    if not keeptxt:
        os.remove(uniq_pdf)
        os.remove(uniq_img)
        os.remove(uniq_txt)
    return txt
def check_install(self):
    """Ensure the cache directory for DOI requests exists, creating it
    if necessary."""
    if not os.path.exists(self.cache):
        logger.info('make cache directory for DOI requests: ' + self.cache)
        # EAFP guard: another process may create the directory between the
        # exists() check and makedirs(); only re-raise if it truly failed
        try:
            os.makedirs(self.cache)
        except OSError:
            if not os.path.isdir(self.cache):
                raise