Example #1
def _get_sentence_level_X_y(test_domain=CORE_DOMAINS[0]):
    # Build sentence-level features (X) and labels (y) for one quality
    # domain: every sentence in each study PDF is an example, labelled 1
    # if it is the best match for the Cochrane quality quote, else 0.

    q = QualityQuoteReader(quotes_only=False)
    y = []
    X_words = []

    study_indices = []

    study_sent_indices = []  # list of (start, end) indices corresponding to each study
    sent_index_counter = 0

    for i, study in enumerate(q):

        if i > 200:
            print "WARNING RETURNING SMALL SUBSET OF DATA!"
            break

        study_indices.append(i)

        # fast forward to the matching domain
        for domain in study.cochrane["QUALITY"]:
            if domain["DOMAIN"] == test_domain:
                break
        else:
            # no matching domain found for this study; fall through with
            # the last domain seen rather than skipping the study
            pass

        quote = None
        has_quote = False
        try:
            quote = QUALITY_QUOTE_REGEX.search(domain["DESCRIPTION"]).group(1)
            has_quote = True
        except (AttributeError, KeyError):
            # no quote could be extracted from this description; leave
            # has_quote False and carry on rather than raising
            pass

        pdf_sents = sent_tokenizer.tokenize(study.studypdf)

        if has_quote:
            quote_words = word_tokenizer.tokenize(quote)
            quote_sent_bow = set((word.lower() for word in quote_words))
            rankings = []

            for pdf_i, pdf_sent in enumerate(pdf_sents):
                pdf_words = word_tokenizer.tokenize(pdf_sent)
                pdf_sent_bow = set((word.lower() for word in pdf_words))

                if not pdf_sent_bow or not quote_sent_bow:
                    prop_quote_in_sent = 0
                else:
                    prop_quote_in_sent = 100 * (
                        1 - (float(len(quote_sent_bow - pdf_sent_bow)) / float(len(quote_sent_bow)))
                    )

                rankings.append((prop_quote_in_sent, pdf_i))

            rankings.sort(key=lambda x: x[0], reverse=True)
            best_match_index = rankings[0][1]

        y_study = np.zeros(len(pdf_sents))  # all zeros when we don't have a quote
        if has_quote:
            y_study[best_match_index] = 1
        X_words.extend(pdf_sents)

        sent_end_index = sent_index_counter + len(pdf_sents)
        study_sent_indices.append((sent_index_counter, sent_end_index))
        sent_index_counter = sent_end_index
        y.extend(y_study)

    print len(X_words)
    print X_words[0]

    print "fitting vectorizer"
    vectorizer = CountVectorizer(max_features=10000)
    X = vectorizer.fit_transform(X_words)
    print "done!"
    y = np.array(y)

    return X, y, X_words, vectorizer, study_sent_indices, study_indices
Example #2
def word_sent_tokenize(raw_text):
    return [word_tokenizer.tokenize(sent) for sent in sent_tokenizer.tokenize(raw_text)]
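
The module-level sent_tokenizer and word_tokenizer are not shown anywhere in this listing. A runnable sketch of the same function, assuming NLTK's Punkt sentence tokenizer and Treebank word tokenizer as stand-ins (an assumption, not confirmed by the source):

import nltk  # the Punkt model must be installed: nltk.download('punkt')

sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
word_tokenizer = nltk.tokenize.TreebankWordTokenizer()

def word_sent_tokenize(raw_text):
    # one list of word tokens per sentence
    return [word_tokenizer.tokenize(sent)
            for sent in sent_tokenizer.tokenize(raw_text)]

print(word_sent_tokenize("Allocation was concealed. Assessors were blinded."))
# -> [['Allocation', 'was', 'concealed', '.'],
#     ['Assessors', 'were', 'blinded', '.']]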
Example #3
def _simple_BoW(study):
    return [s for s in word_tokenizer.tokenize(study.studypdf) if s not in string.punctuation]
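
One detail worth noting: the filter uses Python's substring test, and string.punctuation contains only single characters, so in practice it drops one-character punctuation tokens while keeping multi-character ones such as "--". A self-contained sketch, with str.split standing in for the unshown word_tokenizer and simple_bow as a hypothetical name:

import string

def simple_bow(text, tokenize=str.split):
    # drop tokens found in string.punctuation; a substring test, so in
    # practice this removes single punctuation marks
    return [t for t in tokenize(text) if t not in string.punctuation]

print(simple_bow("Blinding : not stated ."))
# -> ['Blinding', 'not', 'stated']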