Exemple #1
0
    def cache_pdfs(self, cachepath="data/cache/", refresh=False):
        """Extract the text of every indexed PDF into a plain-text cache.

        One ``<name>.txt`` file is written into *cachepath* per PDF, where
        ``<name>`` is the PDF's base filename without its extension.

        Args:
            cachepath: Directory holding the cached text files; it is
                created if it does not exist. A trailing path separator
                is optional.
            refresh: When false, PDFs that already have a ``.txt`` file in
                *cachepath* are skipped; when true, every PDF is redone.
        """
        if not os.path.exists(cachepath):
            os.makedirs(cachepath)

        # Extension-less base names of the PDFs referenced by the index.
        all_pdfs = set(
            os.path.splitext(os.path.basename(self.pdf_index[entry['pmid']]))[0]
            for entry in self.index_data)

        if refresh:
            todo = list(all_pdfs)
        else:
            # os.path.join (rather than string concatenation) so the glob
            # still matches when cachepath has no trailing separator.
            already_done = set(
                os.path.splitext(os.path.basename(filename))[0]
                for filename in glob(os.path.join(cachepath, "*.txt")))
            todo = list(all_pdfs - already_done)

        if not todo:
            # The parenthesised single-argument form prints identically
            # under Python 2 and also parses under Python 3.
            print("cache up to date")
            return

        pb = ProgressBar(len(todo), timer=True)

        for pdf_filename in todo:
            pb.tap()

            # PDF_PATH is defined outside this method; it is kept as a plain
            # prefix because its exact form is not visible from here.
            pm = PdfReader(PDF_PATH + pdf_filename + '.pdf')
            text = pm.get_text()

            with open(os.path.join(cachepath, pdf_filename + '.txt'), 'wb') as f:
                f.write(text)
Exemple #2
0
    def cache_pdfs(self, cachepath="data/cache/", refresh=False):
        """Populate the plain-text cache with the text of each indexed PDF.

        For every PDF referenced by the index, a ``<name>.txt`` file is
        written into *cachepath*, ``<name>`` being the PDF's base filename
        with the extension removed.

        Args:
            cachepath: Cache directory (created when missing). It may be
                given with or without a trailing path separator.
            refresh: If true, regenerate every cache file; otherwise only
                PDFs lacking a ``.txt`` file in *cachepath* are processed.
        """
        if not os.path.exists(cachepath):
            os.makedirs(cachepath)

        # Base names (no directory, no extension) of all indexed PDFs.
        all_pdfs = set(
            os.path.splitext(os.path.basename(self.pdf_index[entry['pmid']]))[0]
            for entry in self.index_data)

        if not refresh:
            # Joining with os.path.join keeps the pattern inside the cache
            # directory even when cachepath lacks a trailing separator.
            pattern = os.path.join(cachepath, "*.txt")
            already_done = set(
                os.path.splitext(os.path.basename(filename))[0]
                for filename in glob(pattern))
            todo = list(all_pdfs - already_done)
        else:
            todo = list(all_pdfs)

        if not todo:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("cache up to date")
            return

        # Only reached with work to do, so the progress bar is always bound
        # before the loop uses it.
        pb = ProgressBar(len(todo), timer=True)

        for pdf_filename in todo:
            pb.tap()

            # PDF_PATH comes from outside this method and is used as a raw
            # prefix, exactly as before.
            pm = PdfReader(PDF_PATH + pdf_filename + '.pdf')
            text = pm.get_text()

            target = os.path.join(cachepath, pdf_filename + '.txt')
            with open(target, 'wb') as f:
                f.write(text)
Exemple #3
0
    def second_view(self, study, cachepath="data/cache/"):
        """Return the full text of the PDF associated with *study*.

        Overrides the code which gets the PubMed abstract and instead
        returns the full text of an associated PDF. The cached plain-text
        copy in *cachepath* is used when it can be read; otherwise the PDF
        itself is run through ``PdfReader``.

        Args:
            study: Mapping with a ``'pmid'`` key that is present in
                ``self.pdf_index``.
            cachepath: Directory holding cached ``<name>.txt`` files. A
                trailing path separator is optional.

        Returns:
            A dict with keys ``"text"`` (the document text) and ``"pmid"``.

        Raises:
            KeyError: If ``study['pmid']`` is not in ``self.pdf_index``.
        """
        pmid = study['pmid']
        pdf_path = self.pdf_index[pmid]
        cache_name = os.path.splitext(os.path.basename(pdf_path))[0] + '.txt'

        try:
            # try to read first as plain text from the cache if exists
            with open(os.path.join(cachepath, cache_name), 'rb') as f:
                text = f.read()
        except (IOError, OSError):
            # Only a missing/unreadable cache file triggers the fallback;
            # unrelated errors (and KeyboardInterrupt) are no longer hidden.
            # otherwise run through pdftotext
            text = PdfReader(pdf_path).get_text()

        return {"text": text, "pmid": pmid}