# Example 1
def make_concepts_baseline(id, path, sents, query):
    """
    Map sentences to 2-gram concepts for the baseline system and build the
    ILP output.

    only use first sentences
    TODO: choose best of first 3

    A concept is credited to a document whenever a sentence containing it
    shares at least one non-stopword with the processed query words; a
    sentence with order 0 adds a second, separate credit (doc + 'first').
    A concept's final value is the number of distinct credits it collected.
    Concepts for which util.is_just_stopwords() is true are dropped.

    id, path -- concatenated as path+id and handed to create_ilp_output
    sents    -- sentence objects; reads .tok, .tok2, .doc, .order and
                .length, and overwrites .concepts on every one of them
    query    -- raw query text

    Returns the result of create_ilp_output(sents, final_concepts, path+id).
    """
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0
    for sent in sents:
        ## default: a skipped sentence contributes no concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if query_overlap:
            ## NOTE(review): the body of this loop was missing in the source
            ## (a syntax error); restored by analogy with
            ## make_concepts_compress2 -- confirm.
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            ## first sentences count a second time under a distinct key
            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents
        if sent.order == 0: max_order = 0
        skip = sent.length <= 5 or sent.tok in seen_sents
        if sent.order > max_order or max_order > 0:
            skip = True
            max_order = 0
        if skip:
            max_order += 1
            ## NOTE(review): the source computed `skip` but never acted on it
            ## and never filled `seen_sents`, so every sentence kept its
            ## concepts; `continue` + `seen_sents.add` restore the documented
            ## "only use first sentences" behaviour -- confirm.
            continue
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = len(docs)
    final_concept_set = set(final_concepts)

    ## sentences may only keep concepts that made it into the final set
    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)
    return create_ilp_output(sents, final_concepts, path+id)
# Example 2
	def classify_query(query_words):
		"""
		input: `query_words`: a list of strings corresponding to a question
		output: a Question object

		The Question is built from the lower-cased first word, the focus
		words, and the query with stopwords removed.  The focus is the first
		contiguous run of noun-tagged words (NN, NNS, NNP, NNPS) found after
		the first word.

		Raises IndexError when `query_words` is empty.
		"""
		question_word = query_words[0]
		query_no_stops = ut.remove_stopwords(query_words)
		nouns = set(['NN', 'NNS', 'NNP', 'NNPS'])
		focus = []
		# NOTE(review): the bodies of the two noun-phrase branches were missing
		# in the source (a syntax error); "append the noun" and "stop at the
		# first non-noun after the run started" are reconstructed -- confirm.
		for word, tag in nltk.pos_tag(query_words)[1:]:
			if tag in nouns:
				focus.append(word)
			elif focus:
				# the noun run has ended
				break
		return Question(question_word.lower(), focus, query_no_stops)
def make_concepts_exp(id, path, sents, query):
    """
    Experimental concept weighting: score sentences by query overlap, then
    give each 2-gram concept the normalized sum of the scores of the
    sentences that contain it.

    id, path -- concatenated as path+id and handed to create_ilp_output
    sents    -- sentence objects; reads .tok2, .order and .length, and
                overwrites .concepts on every one of them
    query    -- raw query text

    Writes one progress line per iteration to sys.stderr.
    Returns the result of create_ilp_output(sents, final_concepts, path+id).
    """
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = len(query_overlap)
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            ## NOTE(review): the source filtered concepts here but never
            ## stored the survivors, leaving every sent.concepts empty;
            ## the add() is reconstructed -- confirm.
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    ## with max_iters = 1 the loop only reports the entropies and stops;
    ## the re-estimation below runs only if the limit is raised
    max_iters = 1
    iteration = 0
    while True:
        iteration += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        sys.stderr.write('iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]\n' % (iteration, se, ce))
        if iteration >= max_iters: break

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    ## create final concept set
    final_concepts = {}
    for concept in concept_vals.sortedKeys():
        final_concepts[concept] = concept_vals[concept]
    final_concept_set = set(final_concepts)

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = sent.length <= 5 or sent in seen_sents
        if sent.order > 0:
            skip = True
        else:
            ## order-0 sentences are recorded even when skipped for length
            seen_sents.add(sent)
        if skip:
            sent.concepts = set()
        else:
            sent.concepts = sent.concepts.intersection(final_concept_set)
    return create_ilp_output(sents, final_concepts, path+id)
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    Build document-frequency weighted 2-gram concepts from the original
    sentences and map them onto the compressed sentences.

    A concept is credited once per document (and once more, under a
    'first'-prefixed key, for sentences with order 0) when its sentence
    shares at least `query_thresh` non-stopwords with the processed query.
    Its value is the number of credits plus `first_weight` times the number
    of 'first' credits; concepts below `count_thresh` or made only of
    stopwords are dropped.

    id               -- problem id; ids containing '-B' use different
                        thresholds; also concatenated as path+id
    path             -- prefix of the location given to create_ilp_output
    sents            -- original sentence objects; reads .tok, .tok2, .doc
                        and .order, overwrites .concepts
    query            -- raw query text
    compressed_sents -- compressed sentence objects; reads .unresolved,
                        .length, .orig and .tok2, overwrites .concepts

    Returns the result of
    create_ilp_output(compressed_sents, final_concepts, path+id).
    """
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)

    ## different processing for set A and set B
    ## NOTE(review): the source had no `else:` here, so the set-B values were
    ## always overwritten by the second group; the branch is restored to match
    ## the comment above -- confirm.
    if '-B' in id:
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else:
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:
        ## default: a skipped sentence contributes no concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            doc_key = 'first' + sent.doc if sent.order == 0 else sent.doc
            for concept in concepts:
                all_concepts[concept].add(doc_key)

        ## ignore duplicate sents
        if sent.tok in seen_sents: continue
        ## NOTE(review): the source never filled `seen_sents`, so the
        ## duplicate check above could not fire; the add() is reconstructed
        ## -- confirm.
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        ## substring test: any doc key containing 'first' counts as a first
        firsts = sum(1 for d in docs if 'first' in d)
        count = len(docs) + (first_weight * firsts)
        if count < count_thresh: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts)

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)

    for sent in compressed_sents:
        sent.concepts = set()
        if sent.unresolved: continue
        if sent.length < 10: continue
        ## NOTE(review): the source set an unused `skip = True` here, which
        ## had no effect; skipping sentences fully wrapped in quotes or
        ## parentheses is the presumed intent -- confirm.
        if re.match(r'^["(].*[")]$', sent.orig): continue
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)
    return create_ilp_output(compressed_sents, final_concepts, path+id)
def synset_review(review):
	"""Replace every word of a review with the terms synset_word() gives for it.

	The review is NFKD-normalized and encoded to ASCII (characters that
	cannot be encoded are dropped), lower-cased, and passed through
	remove_punctuation() and remove_stopwords() before being split on
	whitespace.  The terms returned for each word are joined with single
	spaces, and so are the per-word results.
	"""
	ascii_review = unicodedata.normalize('NFKD', review).encode('ascii', 'ignore')
	cleaned = remove_stopwords(remove_punctuation(ascii_review.lower()))
	expanded = []
	for word in cleaned.split():
		expanded.append(' '.join(synset_word(word)))
	return ' '.join(expanded)
    # NOTE(review): the statement that opens this list (an assignment target
    # and its `[`) and the closing `]` are not visible in this excerpt, so
    # these lines do not parse as they stand.  The names below all mention
    # motion pictures; presumably this is the film half of ALL_AWARDS.
    'Best Motion Picture – Drama', 'Best Motion Picture – Musical or Comedy',
    'Best Director', 'Best Actor – Motion Picture Drama',
    'Best Actor – Motion Picture Musical or Comedy',
    'Best Actress – Motion Picture Drama',
    'Best Actress – Motion Picture Musical or Comedy',
    'Best Supporting Actor – Motion Picture',
    'Best Supporting Actress – Motion Picture', 'Best Screenplay',
    'Best Original Score', 'Best Original Song', 'Best Foreign Language Film',
    'Best Animated Feature Film',
    'Cecil DeMille Award for Lifetime Achievement in Motion Pictures'

    # NOTE(review): second group (series / television names).  There is no
    # comma after the last entry above, so the two groups were presumably
    # separate list literals -- confirm before merging them.
    'Best Drama Series', 'Best Comedy Series',
    'Best Actor in a Television Drama Series',
    'Best Actor in a Television Comedy Series',
    'Best Actress in a Television Drama Series',
    'Best Actress in a Television Comedy Series',
    'Best Limited Series or Motion Picture made for Television',
    'Best Actor in a Limited Series or Motion Picture made for Television',
    'Best Actress in a Limited Series or Motion Picture made for Television',
    'Best Supporting Actor in a Series, Limited Series or Motion Picture made for Television',
    # NOTE(review): the entry below lacks the comma after "Series" that the
    # Supporting Actor entry has -- confirm which spelling is intended.
    'Best Supporting Actress in a Series Limited Series or Motion Picture made for Television'

# Every award name lower-cased and split on whitespace into a list of words.
# ALL_AWARDS itself is not defined anywhere in this excerpt.
ALL_AWARDS_LOWER = [award_name.lower().split() for award_name in ALL_AWARDS]
    # NOTE(review): only the inner expression of the next statement survives;
    # its assignment target and surrounding brackets are missing.  It applies
    # remove_stopwords(award_name, STOP_WORDS) to each entry of
    # ALL_AWARDS_LOWER.
    remove_stopwords(award_name, STOP_WORDS) for award_name in ALL_AWARDS_LOWER
 def get_terms(self):
     """ Return the list of normalized terms taken from self.desc.

     Punctuation is removed from self.desc with util.remove_punctuations(),
     the text is split on whitespace and the first four tokens are discarded
     (NOTE(review): presumably a fixed header in the description -- confirm).
     Stopwords and purely numeric tokens are dropped, and each remaining
     token is passed through util.normalize_token().
     """
     terms = util.remove_punctuations(self.desc)
     # Truthiness replaces the original `i is not ''`, which compared object
     # identity with a string literal instead of comparing values.
     terms = [i for i in terms.split()[4:] if i]
     return [util.normalize_token(i) for i in util.remove_stopwords(terms) if not i.isdigit()]
def synset_review(review):
    """Expand a review into the space-joined synset_word() terms of its words.

    Steps: NFKD-normalize and encode to ASCII (dropping characters that
    cannot be encoded), lower-case, strip punctuation and stopwords via
    remove_punctuation() / remove_stopwords(), split on whitespace, then
    join the terms synset_word() returns for each word.
    """
    normalized = unicodedata.normalize('NFKD', review)
    ascii_text = normalized.encode('ascii', 'ignore')
    stripped = remove_punctuation(ascii_text.lower())
    tokens = remove_stopwords(stripped).split()
    per_word = (' '.join(synset_word(token)) for token in tokens)
    return ' '.join(per_word)
 def get_terms(self):
     """ Return list of normalized terms extracted from title and abstract field

     The title (self.fields["title"]) has its punctuation removed and is
     split on whitespace; self.abstract is appended to those tokens as-is,
     so it must already be a list.  Stopwords are removed from the combined
     list and each remaining term goes through util.normalize_token().
     """
     title_terms = util.remove_punctuations(self.fields["title"]).split()
     candidate_terms = title_terms + self.abstract
     normalized_terms = []
     for term in util.remove_stopwords(candidate_terms):
         normalized_terms.append(util.normalize_token(term))
     return normalized_terms