def cache_pdfs(self, cachepath="data/cache/", refresh=False):
    """Extract the text of every indexed PDF into a plain-text cache.

    For each entry in ``self.index_data`` the PDF path is looked up in
    ``self.pdf_index`` by the entry's ``'pmid'``; the PDF's base name
    (without extension) names the cache file ``<cachepath>/<name>.txt``.

    Args:
        cachepath: Directory holding the cached ``.txt`` files. It is
            created if missing. A trailing path separator is optional.
        refresh: When false (default) only PDFs without an existing
            ``.txt`` file in ``cachepath`` are processed; when true every
            indexed PDF is re-extracted and its cache file overwritten.

    Returns:
        None. Prints ``cache up to date`` when there is nothing to do.

    NOTE(review): the file contains a second, equivalent definition of
    ``cache_pdfs`` right after this one; if both live in the same scope
    the later definition is the one that takes effect.
    """
    # Create the directory without a check-then-create race: attempt the
    # creation and only re-raise when the path still is not a directory.
    try:
        os.makedirs(cachepath)
    except OSError:
        if not os.path.isdir(cachepath):
            raise

    all_pdfs = set(
        os.path.splitext(os.path.basename(self.pdf_index[entry['pmid']]))[0]
        for entry in self.index_data)

    if refresh:
        todo = list(all_pdfs)
    else:
        # os.path.join keeps the glob inside the cache directory even when
        # cachepath was given without a trailing separator.
        already_done = set(
            os.path.splitext(os.path.basename(filename))[0]
            for filename in glob(os.path.join(cachepath, "*.txt")))
        todo = list(all_pdfs - already_done)

    if not todo:
        # Parenthesised form prints the same text whether print is a
        # statement or a function.
        print("cache up to date")
        return

    pb = ProgressBar(len(todo), timer=True)
    for pdf_filename in todo:
        pb.tap()
        # NOTE(review): the PDF is re-located under PDF_PATH by base name
        # rather than via the path stored in self.pdf_index; PDF_PATH is
        # defined outside this block and presumably ends with a path
        # separator -- confirm.
        pm = PdfReader(PDF_PATH + pdf_filename + '.pdf')
        text = pm.get_text()
        with open(os.path.join(cachepath, pdf_filename + '.txt'), 'wb') as f:
            f.write(text)
def cache_pdfs(self, cachepath="data/cache/", refresh=False):
    """Extract the text of every indexed PDF into a plain-text cache.

    For each entry in ``self.index_data`` the PDF path is looked up in
    ``self.pdf_index`` by the entry's ``'pmid'``; the PDF's base name
    (without extension) names the cache file ``<cachepath>/<name>.txt``.

    Args:
        cachepath: Directory holding the cached ``.txt`` files. It is
            created if missing. A trailing path separator is optional.
        refresh: When false (default) only PDFs without an existing
            ``.txt`` file in ``cachepath`` are processed; when true every
            indexed PDF is re-extracted and its cache file overwritten.

    Returns:
        None. Prints ``cache up to date`` when there is nothing to do.

    NOTE(review): the file contains another, equivalent definition of
    ``cache_pdfs`` right before this one; if both live in the same scope
    this later definition is the one that takes effect.
    """
    # Create the directory without a check-then-create race: attempt the
    # creation and only re-raise when the path still is not a directory.
    try:
        os.makedirs(cachepath)
    except OSError:
        if not os.path.isdir(cachepath):
            raise

    all_pdfs = set(
        os.path.splitext(os.path.basename(self.pdf_index[entry['pmid']]))[0]
        for entry in self.index_data)

    if refresh:
        todo = list(all_pdfs)
    else:
        # os.path.join keeps the glob inside the cache directory even when
        # cachepath was given without a trailing separator.
        already_done = set(
            os.path.splitext(os.path.basename(filename))[0]
            for filename in glob(os.path.join(cachepath, "*.txt")))
        todo = list(all_pdfs - already_done)

    if not todo:
        # Parenthesised form prints the same text whether print is a
        # statement or a function.
        print("cache up to date")
        return

    pb = ProgressBar(len(todo), timer=True)
    for pdf_filename in todo:
        pb.tap()
        # NOTE(review): the PDF is re-located under PDF_PATH by base name
        # rather than via the path stored in self.pdf_index; PDF_PATH is
        # defined outside this block and presumably ends with a path
        # separator -- confirm.
        pm = PdfReader(PDF_PATH + pdf_filename + '.pdf')
        text = pm.get_text()
        with open(os.path.join(cachepath, pdf_filename + '.txt'), 'wb') as f:
            f.write(text)
def second_view(self, study, cachepath="data/cache/"):
    """Return the full text of the PDF associated with *study*.

    Overrides code which gets the PubMed abstract and instead returns the
    full text of an associated PDF.

    The PDF path is taken from ``self.pdf_index[study['pmid']]``. The text
    is read from ``<cachepath>/<pdf base name>.txt`` when that file can be
    opened; otherwise it is extracted from the PDF with ``PdfReader``.

    Args:
        study: Mapping with a ``'pmid'`` key present in ``self.pdf_index``.
        cachepath: Directory holding cached ``.txt`` files. A trailing
            path separator is optional.

    Returns:
        dict with keys ``"text"`` (the cached file's raw contents, or the
        value of ``PdfReader.get_text()``) and ``"pmid"``.

    Raises:
        KeyError: if ``study`` has no ``'pmid'`` or the pmid is not in
            ``self.pdf_index``.
    """
    pmid = study['pmid']
    pdf_path = self.pdf_index[pmid]
    cache_file = os.path.join(
        cachepath,
        os.path.splitext(os.path.basename(pdf_path))[0] + '.txt')

    try:
        # try to read first as plain text from the cache if it exists
        with open(cache_file, 'rb') as f:
            text = f.read()
    except (IOError, OSError):
        # Only a missing/unreadable cache file triggers the fallback; other
        # errors (and KeyboardInterrupt) are no longer silently swallowed.
        # Otherwise run the PDF through PdfReader (pdftotext, per the
        # original comment).
        text = PdfReader(pdf_path).get_text()

    return {"text": text, "pmid": pmid}