def make_concepts_baseline(id, path, sents, query):
    """
    only use first sentences
    TODO: choose best of first 3
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0

    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set([])
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) > 0:
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents
        if sent.order == 0:
            max_order = 0

        skip = False
        if sent.length <= 5:
            skip = True
        if sent.tok in seen_sents:
            skip = True
        #if sent.length < 20: skip = True
        if sent.order > max_order or max_order > 0:
            skip = True
            max_order = 0

        if skip:
            max_order += 1
            continue

        #print sent.order, max_order, sent.doc, sent
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        #if count < 3: continue
        if util.is_just_stopwords(concept.split('_')):
            continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
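# The query preprocessing above (util.tokenize -> util.remove_stopwords ->
# util.porter_stem_sent) relies on project-local helpers that are not shown here.
# A rough, self-contained sketch of equivalent preprocessing using plain NLTK,
# assuming those helpers roughly lowercase, drop English stopwords, and
# Porter-stem (an assumption for illustration, not the project's actual code):
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def query_word_set_sketch(query):
    stemmer = PorterStemmer()
    stops = set(stopwords.words('english'))
    tokens = [w.lower() for w in nltk.word_tokenize(query)]
    # keep the unique stems of non-stopword tokens, mirroring query_words above
    return set(stemmer.stem(w) for w in tokens if w not in stops)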
def classify_query(query_words):
    '''
    input: `query_words`: a list of strings corresponding to a question
    output: a Question object
    '''
    question_word = query_words[0]
    query_no_stops = ut.remove_stopwords(query_words)
    found_np = False
    focus = []
    nouns = set(['NN', 'NNS', 'NNP', 'NNPS'])
    # collect the first contiguous run of nouns after the question word as the focus
    for word, tag in nltk.pos_tag(query_words)[1:]:
        if tag in nouns:
            found_np = True
        if found_np and tag in nouns:
            focus.append((word, tag))
        if tag not in nouns and found_np:
            break
    return Question(question_word.lower(), focus, query_no_stops)
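# A minimal usage sketch for classify_query (hypothetical driver code; Question
# and ut.remove_stopwords are defined elsewhere in this module):
#
#   query_words = nltk.word_tokenize('Who directed the best drama series?')
#   question = classify_query(query_words)
#   # question holds the lowercased wh-word, the first run of nouns after it
#   # as the focus (e.g. [('drama', 'NN'), ('series', 'NN')]), and the
#   # stopword-filtered query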
def make_concepts_exp(id, path, sents, query):
    """
    experimental: score sentences by query overlap, then weight concepts by the
    normalized values of the sentences that contain them
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')):
                continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)
        if iter >= 1:
            break

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept

    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5:
            skip = True
        if sent in seen_sents:
            skip = True
        if sent.order > 0:
            skip = True
        else:
            seen_sents.add(sent)

        if skip:
            sent.concepts = set()
        else:
            sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
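# prob_util.Counter, prob_util.normalize, and prob_util.entropy are project
# helpers not shown here. A hypothetical sketch of the latter two, assuming
# normalize rescales counts to a probability distribution and entropy is the
# Shannon entropy of that distribution (illustration only, not the project's code):
import math

def normalize_sketch(counts):
    total = float(sum(counts.values()))
    # rescale so the values sum to 1
    return dict((k, v / total) for k, v in counts.items())

def entropy_sketch(dist):
    # Shannon entropy in bits over a normalized distribution
    return -sum(p * math.log(p, 2) for p in dist.values() if p > 0)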
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    build the concept set from the original sentences, then map the surviving
    concepts onto the compressed candidate sentences; sets A and B use
    different weighting and thresholds
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)

    ## different processing for set A and set B
    if '-B' in id:
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else:
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:

        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            for concept in concepts:
                if sent.order == 0:
                    all_concepts[concept].add('first' + sent.doc)
                else:
                    all_concepts[concept].add(sent.doc)

        ## ignore some sents
        skip = False
        #if not sent.new_par: skip = True
        #if sent.length <= 20: skip = True
        if sent.tok in seen_sents:
            skip = True
        #if sent.ignore: skip = True
        if skip:
            continue

        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        firsts = len([1 for d in docs if 'first' in d])
        count = count + (first_weight * firsts)
        if count < count_thresh:
            continue
        if util.is_just_stopwords(concept.split('_')):
            continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    for sent in compressed_sents:
        sent.concepts = set([])
        if sent.unresolved:
            continue
        if sent.length < 10:
            continue
        if re.match('^["(].*[")]$', sent.orig):
            continue  # skip sentences fully wrapped in quotes or parentheses
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(compressed_sents, final_concepts, path + id)
def synset_review(review):
    review = unicodedata.normalize('NFKD', review).encode('ascii', 'ignore')
    review = remove_stopwords(remove_punctuation(review.lower()))
    words = review.split()
    return ' '.join([' '.join(synset_word(word)) for word in words])
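# synset_word is defined elsewhere; it presumably expands a word into related
# WordNet lemma names. A hypothetical sketch of such a helper using NLTK 3's
# WordNet interface (an illustration only, not the project's implementation):
from nltk.corpus import wordnet

def synset_word_sketch(word):
    names = set([word])
    for synset in wordnet.synsets(word):
        # collect the lemma names of every synset the word belongs to
        names.update(synset.lemma_names())
    return sorted(names)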
AWARD_NAMES_MOTION_PICTURE = [
    'Best Motion Picture – Drama',
    'Best Motion Picture – Musical or Comedy',
    'Best Director',
    'Best Actor – Motion Picture Drama',
    'Best Actor – Motion Picture Musical or Comedy',
    'Best Actress – Motion Picture Drama',
    'Best Actress – Motion Picture Musical or Comedy',
    'Best Supporting Actor – Motion Picture',
    'Best Supporting Actress – Motion Picture',
    'Best Screenplay',
    'Best Original Score',
    'Best Original Song',
    'Best Foreign Language Film',
    'Best Animated Feature Film',
    'Cecil DeMille Award for Lifetime Achievement in Motion Pictures'
]

AWARD_NAMES_TELEVISION = [
    'Best Drama Series',
    'Best Comedy Series',
    'Best Actor in a Television Drama Series',
    'Best Actor in a Television Comedy Series',
    'Best Actress in a Television Drama Series',
    'Best Actress in a Television Comedy Series',
    'Best Limited Series or Motion Picture made for Television',
    'Best Actor in a Limited Series or Motion Picture made for Television',
    'Best Actress in a Limited Series or Motion Picture made for Television',
    'Best Supporting Actor in a Series, Limited Series or Motion Picture made for Television',
    'Best Supporting Actress in a Series, Limited Series or Motion Picture made for Television'
]

ALL_AWARDS = AWARD_NAMES_MOTION_PICTURE + AWARD_NAMES_TELEVISION
ALL_AWARDS_LOWER = [award_name.lower().split() for award_name in ALL_AWARDS]
ALL_AWARDS_LOWER_FILTERED = [
    remove_stopwords(award_name, STOP_WORDS) for award_name in ALL_AWARDS_LOWER
]
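# remove_stopwords and STOP_WORDS are defined elsewhere in this project. Judging
# from the call above, the helper takes a list of lowercased tokens plus a
# stopword collection and returns the filtered token list; a hypothetical
# sketch of that signature (illustration only):
#
#   def remove_stopwords(words, stop_words):
#       return [word for word in words if word not in stop_words]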
def get_terms(self):
    """ Return a list of normalized terms extracted from the description """
    terms = util.remove_punctuations(self.desc)
    terms = [i for i in terms.split()[4:] if i != '']
    return [util.normalize_token(i) for i in util.remove_stopwords(terms) if not i.isdigit()]
def get_terms(self):
    """
    Return list of normalized terms extracted from title and abstract field
    """
    # self.fields['title'] = util.remove_punctuations(self.fields['title'])
    terms = util.remove_punctuations(self.fields["title"]).split() + self.abstract
    normalized_terms = [util.normalize_token(term) for term in util.remove_stopwords(terms)]
    return normalized_terms