def _get_sentence_level_X_y(test_domain=CORE_DOMAINS[0]): # sample_negative_examples = n: for low rate of positive examples; random sample # of n negative examples if > n negative examples in article; if n=0 then all examples # used q = QualityQuoteReader(quotes_only=False) y = [] X_words = [] study_indices = [] study_sent_indices = [] # list of (start, end) indices corresponding to each study sent_index_counter = 0 domains = q.domains() counter = 0 for i, study in enumerate(q): if i > 200: # pdb.set_trace() print "WARNING RETURNING SMALL SUBSET OF DATA!" break study_indices.append(i) # fast forward to the matching domain for domain in study.cochrane["QUALITY"]: if domain["DOMAIN"] == test_domain: break else: # pdb.set_trace() # if no matching domain continue to the next study # study_sent_indices.append(()) # continue pass quote = None has_quote = False try: quote = QUALITY_QUOTE_REGEX.search(domain["DESCRIPTION"]).group(1) has_quote = True except: ## formerly this was freaking out, instead, let's just pass pass # print "Unable to extract quote:" # print domain["DESCRIPTION"] # raise pdf_sents = sent_tokenizer.tokenize(study.studypdf) if has_quote: quote_words = word_tokenizer.tokenize(quote) quote_sent_bow = set((word.lower() for word in quote_words)) rankings = [] for pdf_i, pdf_sent in enumerate(pdf_sents): pdf_words = word_tokenizer.tokenize(pdf_sent) pdf_sent_bow = set((word.lower() for word in pdf_words)) if not pdf_sent_bow or not quote_sent_bow: prop_quote_in_sent = 0 else: prop_quote_in_sent = 100 * ( 1 - (float(len(quote_sent_bow - pdf_sent_bow)) / float(len(quote_sent_bow))) ) # print "%.0f" % (prop_quote_in_sent,) rankings.append((prop_quote_in_sent, pdf_i)) rankings.sort(key=lambda x: x[0], reverse=True) best_match_index = rankings[0][1] y_study = np.zeros(len(pdf_sents)) # all zeros when we don't have a quote if has_quote: y_study[best_match_index] = 1 X_words.extend(pdf_sents) sent_end_index = sent_index_counter + len(pdf_sents) 
study_sent_indices.append((sent_index_counter, sent_end_index)) sent_index_counter = sent_end_index y.extend(y_study) print len(X_words) print X_words[0] print "fitting vectorizer" vectorizer = CountVectorizer(max_features=10000) X = vectorizer.fit_transform(X_words) print "done!" y = np.array(y) return X, y, X_words, vectorizer, study_sent_indices, study_indices print "Finished! %d studies included domain %s" % (counter, test_domain)
def word_sent_tokenize(raw_text):
    """Split raw_text into sentences, then tokenize each sentence into words.

    Returns a list of word-token lists, one per sentence.
    """
    tokenized_sentences = []
    for sentence in sent_tokenizer.tokenize(raw_text):
        tokenized_sentences.append(word_tokenizer.tokenize(sentence))
    return tokenized_sentences
def _simple_BoW(study):
    """Return a bag-of-words token list for the study's PDF text.

    Tokenizes study.studypdf and drops tokens that are punctuation
    characters.
    """
    bag = []
    for token in word_tokenizer.tokenize(study.studypdf):
        if token not in string.punctuation:
            bag.append(token)
    return bag
def _get_sentence_level_X_y(test_domain=CORE_DOMAINS[0]):
    """Build a sentence-level dataset for quality-domain quote matching.

    NOTE(review): this is an exact duplicate of the _get_sentence_level_X_y
    defined earlier in this file; being the later definition, THIS copy is
    the one that takes effect at import time.  One of the two should be
    deleted.

    For each study, the PDF is split into sentences; when the Cochrane
    quality description for `test_domain` contains a quote, the PDF
    sentence with the highest bag-of-words overlap with the quote is
    labelled 1, all others 0.

    Returns (X, y, X_words, vectorizer, study_sent_indices, study_indices).
    """
    # sample_negative_examples = n: for low rate of positive examples;
    # random sample of n negative examples if > n negative examples in
    # article; if n=0 then all examples used
    q = QualityQuoteReader(quotes_only=False)
    y = []
    X_words = []             # every PDF sentence, across all studies
    study_indices = []
    study_sent_indices = []  # list of (start, end) indices corresponding to each study
    sent_index_counter = 0
    domains = q.domains()    # NOTE(review): result unused
    counter = 0              # NOTE(review): never incremented; only read by the unreachable print below
    for i, study in enumerate(q):
        if i > 200:
            # pdb.set_trace()
            # deliberately capped for development runs
            print "WARNING RETURNING SMALL SUBSET OF DATA!"
            break
        study_indices.append(i)
        # fast forward to the matching domain
        for domain in study.cochrane["QUALITY"]:
            if domain["DOMAIN"] == test_domain:
                break
        else:
            # pdb.set_trace()
            # if no matching domain continue to the next study
            # study_sent_indices.append(())
            # continue
            # NOTE(review): with `pass`, `domain` stays bound to the LAST
            # entry and processing continues with it — confirm intent.
            pass
        quote = None
        has_quote = False
        try:
            quote = QUALITY_QUOTE_REGEX.search(domain["DESCRIPTION"]).group(1)
            has_quote = True
        except:
            ## formerly this was freaking out, instead, let's just pass
            # NOTE(review): bare except also hides errors other than the
            # expected "no regex match" AttributeError.
            pass
            # print "Unable to extract quote:"
            # print domain["DESCRIPTION"]
            # raise
        pdf_sents = sent_tokenizer.tokenize(study.studypdf)
        if has_quote:
            quote_words = word_tokenizer.tokenize(quote)
            quote_sent_bow = set((word.lower() for word in quote_words))
            rankings = []
            for pdf_i, pdf_sent in enumerate(pdf_sents):
                pdf_words = word_tokenizer.tokenize(pdf_sent)
                pdf_sent_bow = set((word.lower() for word in pdf_words))
                if not pdf_sent_bow or not quote_sent_bow:
                    prop_quote_in_sent = 0
                else:
                    # percentage of the quote's words present in this sentence
                    prop_quote_in_sent = 100* (1 - (float(len(quote_sent_bow-pdf_sent_bow))/float(len(quote_sent_bow))))
                # print "%.0f" % (prop_quote_in_sent,)
                rankings.append((prop_quote_in_sent, pdf_i))
            # stable descending sort: ties resolve to the earliest sentence
            rankings.sort(key=lambda x: x[0], reverse=True)
            best_match_index = rankings[0][1]
        y_study = np.zeros(len(pdf_sents))  # all zeros when we don't have a quote
        if has_quote:
            y_study[best_match_index] = 1
        X_words.extend(pdf_sents)
        sent_end_index = sent_index_counter + len(pdf_sents)
        study_sent_indices.append((sent_index_counter, sent_end_index))
        sent_index_counter = sent_end_index
        y.extend(y_study)
    print len(X_words)
    print X_words[0]
    print "fitting vectorizer"
    vectorizer = CountVectorizer(max_features=10000)
    X = vectorizer.fit_transform(X_words)
    print "done!"
    y = np.array(y)
    return X, y, X_words, vectorizer, study_sent_indices, study_indices
    # NOTE(review): unreachable — placed after the return
    print "Finished! %d studies included domain %s" % (counter, test_domain)