Exemple #1
0
    def cache_pdfs(self, cachepath="data/cache/", refresh=False):
        """Extract the text of every indexed PDF into a plain-text cache.

        For each entry in ``self.index_data`` the path stored under the
        entry's ``'pmid'`` in ``self.pdf_index`` is reduced to its base name
        without extension. That stem names both the source file
        (``PDF_PATH + stem + '.pdf'``) and the cache file
        (``<cachepath>/<stem>.txt``).

        Args:
            cachepath: Directory holding the cached ``.txt`` files; it is
                created if it does not exist. A trailing path separator is
                optional.
            refresh: When true, re-extract every PDF. Otherwise only PDFs
                without a ``.txt`` file in ``cachepath`` are processed.
        """
        if not os.path.exists(cachepath):
            os.makedirs(cachepath)

        all_pdfs = set(
            os.path.splitext(os.path.basename(self.pdf_index[entry['pmid']]))[0]
            for entry in self.index_data)

        if refresh:
            todo = list(all_pdfs)
        else:
            # os.path.join keeps the glob pattern inside the cache directory
            # even when cachepath is given without a trailing separator.
            already_done = set(
                os.path.splitext(os.path.basename(filename))[0]
                for filename in glob(os.path.join(cachepath, "*.txt")))
            todo = list(all_pdfs - already_done)

        if not todo:
            print("cache up to date")
            return

        pb = ProgressBar(len(todo), timer=True)

        for pdf_filename in todo:
            pb.tap()

            pm = PdfReader(PDF_PATH + pdf_filename + '.pdf')
            text = pm.get_text()

            # NOTE(review): binary mode assumes get_text() returns a byte
            # string -- confirm against PdfReader.
            with open(os.path.join(cachepath, pdf_filename + '.txt'), 'wb') as f:
                f.write(text)
Exemple #2
0
    def cache_pdfs(self, cachepath="data/cache/", refresh=False):
        """Write a ``.txt`` copy of each indexed PDF's text into ``cachepath``.

        The set of wanted documents is derived from ``self.index_data``: each
        entry's ``'pmid'`` is looked up in ``self.pdf_index`` and the result
        is stripped to its file stem. Unless ``refresh`` is set, stems that
        already have a ``.txt`` file in ``cachepath`` are skipped.

        Args:
            cachepath: Cache directory (created when absent). It may be given
                with or without a trailing path separator.
            refresh: Re-extract everything instead of only the missing files.
        """
        def stem(path):
            # file name without its directory and extension
            return os.path.splitext(os.path.basename(path))[0]

        if not os.path.exists(cachepath):
            os.makedirs(cachepath)

        pending = set(stem(self.pdf_index[entry['pmid']])
                      for entry in self.index_data)

        if not refresh:
            # Joining (rather than concatenating) keeps the pattern inside
            # the cache directory when cachepath lacks a trailing separator.
            cached = glob(os.path.join(cachepath, "*.txt"))
            pending -= set(stem(filename) for filename in cached)

        todo = list(pending)

        if not todo:
            print("cache up to date")
            return

        pb = ProgressBar(len(todo), timer=True)

        for pdf_filename in todo:
            pb.tap()

            text = PdfReader(PDF_PATH + pdf_filename + '.pdf').get_text()

            # NOTE(review): 'wb' assumes get_text() yields a byte string --
            # confirm against PdfReader.
            target = os.path.join(cachepath, pdf_filename + '.txt')
            with open(target, 'wb') as f:
                f.write(text)
Exemple #3
0
    def second_view(self, study, cachepath="data/cache/"):
        """Return the full text of the PDF associated with a study.

        Overrides the code which gets the PubMed abstract. The plain-text
        copy in ``cachepath`` is used when it can be read; otherwise the PDF
        registered in ``self.pdf_index`` is run through ``PdfReader``.

        Args:
            study: Mapping with a ``'pmid'`` key; that value must also be a
                key of ``self.pdf_index``.
            cachepath: Directory of cached ``.txt`` files. A trailing path
                separator is optional.

        Returns:
            A dict with the document text under ``"text"`` and the study's
            id under ``"pmid"``.

        Raises:
            KeyError: If ``study`` has no ``'pmid'`` or the pmid is not in
                ``self.pdf_index``.
        """
        pmid = study['pmid']
        pdf_path = self.pdf_index[pmid]
        cache_file = os.path.join(
            cachepath,
            os.path.splitext(os.path.basename(pdf_path))[0] + '.txt')

        try:
            # try to read first as plain text from the cache if it exists
            with open(cache_file, 'rb') as f:
                text = f.read()
        except (IOError, OSError):
            # Only a missing or unreadable cache file triggers the fallback;
            # other errors (and KeyboardInterrupt) are no longer swallowed.
            text = PdfReader(pdf_path).get_text()

        return {"text": text, "pmid": pmid}
def doc_demo(models, testfile="testdata/demo.pdf", test_mode=False):
    """Print a per-domain risk-of-bias prediction for a single PDF.

    The PDF text is extracted, passed through ``unidecode``, flattened onto
    one line and split into sentences. For every domain the sentence model
    picks the sentences it predicts as 1; the document model then classifies
    the full text combined with those sentences, and the resulting label is
    printed (plain and via ``color.printout``).

    Args:
        models: Sequences unpacked into ``zip`` next to ``CORE_DOMAINS``;
            each zipped row must yield
            ``(doc_model, doc_vec, sent_model, sent_vec)``.
        testfile: Path of the PDF to analyse.
        test_mode: When true, only the first domain is processed.
    """
    import color

    print("Document demo: " + testfile)
    print("=" * 40)
    print("")

    # extract, transliterate, and collapse newlines into spaces
    flat_text = unidecode(PdfReader(testfile).get_text()).replace('\n', ' ')

    # tokenize into sentences
    sentences = sent_tokenizer.tokenize(flat_text)

    # only parse first domain in test mode
    if test_mode:
        n_domains = 1
    else:
        n_domains = len(CORE_DOMAINS)

    labels = {1: "Low", -1: "Unknown or high"}
    rule = "-" * 30

    rows = zip(CORE_DOMAINS[:n_domains], *models)
    for domain, doc_model, doc_vec, sent_model, sent_vec in rows:

        # PART ONE - get the predicted sentences with risk of bias information

        # vectorize the sentences and get a 1 / -1 prediction for each
        sent_preds = sent_model.predict(sent_vec.transform(sentences))

        # keep the sentences predicted 1, joined into a single string
        summary_text = " ".join(
            sentence
            for sentence, flag in zip(sentences, sent_preds)
            if flag == 1)

        # PART TWO - integrate summarized and full text, then predict the
        # document class

        doc_vec.builder_clear()
        doc_vec.builder_add_docs([flat_text])
        doc_vec.builder_add_docs([summary_text], prefix="high-prob-sent-")

        doc_pred = doc_model.predict(doc_vec.builder_transform())[0]

        print(rule)
        print(domain)

        label = labels[doc_pred]
        print(label)

        # labels only ever yields "Low" or "Unknown or high"
        text_color = color.GREEN if label == "Low" else color.YELLOW
        color.printout(label, text_color)

        print(rule)