def make_concepts_baseline(id, path, sents, query):
    """
    only use first sentences
    TODO: choose best of first 3
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0
    for sent in sents:
        
        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) > 0:
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents: only first sentences survive (short or duplicate ones are dropped outright)
        if sent.order == 0: max_order = 0
        skip = False
        if sent.length <= 5: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.length < 20: skip = True
        if sent.order > max_order or max_order > 0: 
            skip = True
            max_order = 0
        
        if skip: 
            max_order += 1
            continue
        
        #print sent.order, max_order, sent.doc, sent
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        #if count < 3: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)
        
    return create_ilp_output(sents, final_concepts, path+id)
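## A minimal sketch of how make_concepts_baseline might be driven, assuming a
## sentence object exposing the attributes the function reads (doc, order,
## tok, tok2, length, concepts) and that the surrounding module's util,
## fix_text, and create_ilp_output helpers are importable. The real pipeline's
## sentence class presumably carries more state.
class StubSentence(object):
    def __init__(self, doc, order, text):
        self.doc = doc                    ## source document id
        self.order = order                ## sentence position within the document
        self.tok = text.lower()           ## tokenized text, used as the dedup key
        self.tok2 = text.lower()          ## normalized text, mined for bigram concepts
        self.length = len(text.split())   ## token count
        self.concepts = set()

sents = [StubSentence('doc1', 0, 'the quick brown fox jumps over the lazy dog'),
         StubSentence('doc2', 0, 'a quick brown fox evades the hounds once more')]
make_concepts_baseline('topic-A', '/tmp/out/', sents, 'quick brown fox')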
Example #2
	def classify_query(query_words):
		'''
		input: `query_words`: a list of strings corresponding to a question
		output: a Question object
		'''
		question_word = query_words[0]
		query_no_stops = ut.remove_stopwords(query_words)
		found_np = False
		focus = []
		nouns = set(['NN','NNS','NNP','NNPS'])
		for word,tag in nltk.pos_tag(query_words)[1:]:
			if tag in nouns:
				found_np = True
			if found_np and tag in nouns:
				focus.append((word,tag))
			if tag not in nouns and found_np:
				break
		return Question(question_word.lower(), focus, query_no_stops)
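## A quick illustration of the intended behavior, assuming ut.remove_stopwords
## accepts a word list and Question stores (question word, focus, filtered
## words) in that order:
query = ['who', 'directed', 'the', 'best', 'drama', 'series']
q = classify_query(query)
## q.question_word -> 'who'
## q.focus         -> first run of consecutive nouns after the question word,
##                    e.g. [('drama', 'NN'), ('series', 'NN')]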
def make_concepts_exp(id, path, sents, query):
    """
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]            
    concept_vals = prob_util.normalize(concept_vals)
    
    num_iters = 0
    while True:
        num_iters += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(num_iters, se, ce)
        ## capped at a single pass; raise this bound to keep re-estimating
        if num_iters >= 1: break
        
        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)
        
        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]            
        concept_vals = prob_util.normalize(concept_vals)
    
    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept
        
    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)
        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)        
        
    return create_ilp_output(sents, final_concepts, path+id)
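## make_concepts_exp leans on a prob_util module that is not shown here. A
## minimal sketch of the assumed interface (a Counter with sortedKeys, plus
## normalize and entropy); the real module presumably differs in detail.
import math

class Counter(dict):
    """Dict with default 0.0 values, sortable by descending value."""
    def __missing__(self, key):
        return 0.0
    def sortedKeys(self):
        return sorted(self, key=self.get, reverse=True)

def normalize(counter):
    """Return a new Counter whose values sum to 1."""
    total = float(sum(counter.values()))
    normed = Counter()
    for key, val in counter.items():
        normed[key] = val / total if total > 0 else 0.0
    return normed

def entropy(counter):
    """Shannon entropy (in bits) of a normalized distribution."""
    return -sum(p * math.log(p, 2) for p in counter.values() if p > 0)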
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    ## different processing for set A and set B
    if '-B' in id: 
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else: 
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:
        
        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            for concept in concepts:
                if sent.order == 0: all_concepts[concept].add('first' + sent.doc)
                else: all_concepts[concept].add(sent.doc)

        ## ignore some sents
        skip = False
        #if not sent.new_par: skip = True
        #if sent.length <= 20: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.ignore: skip = True
        if skip: continue
        
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        firsts = len([1 for d in docs if 'first' in d])
        count = count + (first_weight * firsts)
        if count < count_thresh: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)
        
    for sent in compressed_sents:
        sent.concepts = set()
        if sent.unresolved: continue
        if sent.length < 10: continue
        if re.match('^["(].*[")]$', sent.orig): continue
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)
        
    return create_ilp_output(compressed_sents, final_concepts, path+id)
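## The regex filter above drops compressed sentences that are entirely quoted
## or parenthesized, for example:
import re
re.match('^["(].*[")]$', '"A fully quoted sentence."')  ## matches -> dropped
re.match('^["(].*[")]$', 'He said "part of it".')       ## no match -> kept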
def synset_review(review):
	review = unicodedata.normalize('NFKD', review).encode('ascii','ignore')
	review = remove_stopwords(remove_punctuation(review.lower()))
	words = review.split()
	return ' '.join([' '.join(synset_word(word)) for word in words])
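## synset_word is not defined in this snippet; a plausible stand-in built on
## NLTK's WordNet interface (an assumption, not the original helper):
from nltk.corpus import wordnet

def synset_word(word):
    """Return the word plus lemma names drawn from its WordNet synsets."""
    lemmas = set([word])
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            lemmas.add(lemma.name().replace('_', ' '))
    return sorted(lemmas)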
Example #6
AWARD_NAMES_MOTION_PICTURE = [
    'Best Motion Picture – Drama', 'Best Motion Picture – Musical or Comedy',
    'Best Director', 'Best Actor – Motion Picture Drama',
    'Best Actor – Motion Picture Musical or Comedy',
    'Best Actress – Motion Picture Drama',
    'Best Actress – Motion Picture Musical or Comedy',
    'Best Supporting Actor – Motion Picture',
    'Best Supporting Actress – Motion Picture', 'Best Screenplay',
    'Best Original Score', 'Best Original Song', 'Best Foreign Language Film',
    'Best Animated Feature Film',
    'Cecil DeMille Award for Lifetime Achievement in Motion Pictures'
]

AWARD_NAMES_TELEVISION = [
    'Best Drama Series', 'Best Comedy Series',
    'Best Actor in a Television Drama Series',
    'Best Actor in a Television Comedy Series',
    'Best Actress in a Television Drama Series',
    'Best Actress in a Television Comedy Series',
    'Best Limited Series or Motion Picture made for Television',
    'Best Actor in a Limited Series or Motion Picture made for Television',
    'Best Actress in a Limited Series or Motion Picture made for Television',
    'Best Supporting Actor in a Series, Limited Series or Motion Picture made for Television',
    'Best Supporting Actress in a Series Limited Series or Motion Picture made for Television'
]

ALL_AWARDS = AWARD_NAMES_MOTION_PICTURE + AWARD_NAMES_TELEVISION
ALL_AWARDS_LOWER = [award_name.lower().split() for award_name in ALL_AWARDS]
ALL_AWARDS_LOWER_FILTERED = [
    remove_stopwords(award_name, STOP_WORDS) for award_name in ALL_AWARDS_LOWER
]
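## These filtered word lists are presumably used to match free text against
## award names; a minimal overlap-based matcher under that assumption
## (min_overlap is a hypothetical tuning knob):
def best_award_match(text_words, min_overlap=2):
    text_set = set(w.lower() for w in text_words)
    best, best_score = None, 0
    for award, name_words in zip(ALL_AWARDS, ALL_AWARDS_LOWER_FILTERED):
        score = len(text_set.intersection(name_words))
        if score > best_score:
            best, best_score = award, score
    return best if best_score >= min_overlap else None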
Example #7
def get_terms(self):
    """ Return the description's normalized terms (stopwords and digits removed) """
    terms = util.remove_punctuations(self.desc)
    terms = [i for i in terms.split()[4:] if i != '']
    return [util.normalize_token(i) for i in util.remove_stopwords(terms) if not i.isdigit()]
Example #12
def get_terms(self):
    """ Return a list of normalized terms extracted from the title and abstract fields """
    # self.fields['title'] = util.remove_punctuations(self.fields['title'])
    terms = util.remove_punctuations(self.fields["title"]).split() + self.abstract
    normalized_terms = [util.normalize_token(term) for term in util.remove_stopwords(terms)]
    return normalized_terms
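## A hypothetical record driving get_terms; the fields/abstract layout is
## inferred from the attribute accesses above, and util's helpers are assumed
## importable:
class Record(object):
    get_terms = get_terms  ## attach the function above as a method
    def __init__(self, title, abstract_terms):
        self.fields = {"title": title}
        self.abstract = abstract_terms  ## already-tokenized abstract terms

doc = Record("Stopword Removal for Retrieval", ["indexing", "the", "corpus"])
print(doc.get_terms())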