def make_concepts_exp(id, path, sents, query):
    """
    Weight sentences by stemmed-word overlap with the query, collect bigram
    concepts per sentence, weight concepts by the values of the sentences
    containing them, then prune short/duplicate/non-lead sentences and write
    the ILP formulation via create_ilp_output.
    """

    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    iter = 0
    while True:
        iter += 1

        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)

        ## currently stops after a single pass
        if iter >= 1: break

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept

    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        ## drop very short sentences, duplicates, and non-lead sentences (order > 0)
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)

        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path+id)
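
## The functions in this module all rely on the same mutual-reinforcement idea:
## sentence weights induce concept/unit weights and vice versa, with each side
## renormalized into a probability distribution. The sketch below is a minimal,
## self-contained illustration of that loop using plain dicts; the helper name
## _toy_sent_concept_iteration and its arguments are hypothetical, and it only
## approximates the behavior of the project-local prob_util Counter/normalize.
def _toy_sent_concept_iteration(sent_concepts, sent_prior, n_iters=1):
    """sent_concepts: {sent_id: set(concepts)}; sent_prior: {sent_id: weight}."""
    def norm(d):
        total = float(sum(d.values())) or 1.0
        return dict((k, v / total) for k, v in d.items())

    sent_vals = norm(sent_prior)
    concept_vals = {}
    for _ in range(n_iters):
        ## concepts inherit the weight of the sentences that contain them
        concept_vals = {}
        for sent, concepts in sent_concepts.items():
            for c in concepts:
                concept_vals[c] = concept_vals.get(c, 0.0) + sent_vals.get(sent, 0.0)
        concept_vals = norm(concept_vals)
        ## sentences inherit the weight of the concepts they contain
        sent_vals = {}
        for sent, concepts in sent_concepts.items():
            sent_vals[sent] = sum(concept_vals[c] for c in concepts)
        sent_vals = norm(sent_vals)
    return sent_vals, concept_vals
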
def query_expand(docs, unit_selector, query):
    """
    Iteratively reweight sentences and their units (as produced by
    unit_selector), starting from query similarity, until the distributions
    stop changing. Returns (unit_values, sent_values) as Counters.
    """

    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)

    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        ## fall back to a uniform weight if similarity cannot be computed
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    previous_entropy_sent = 0
    previous_entropy_unit = 0
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=20)

        ## check for convergence: stop once entropy no longer decreases
        ## (note: the previous entropies start at 0, so this fires on the first pass)
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        if entropy_sent >= previous_entropy_sent or entropy_unit >= previous_entropy_unit: break
        previous_entropy_sent = entropy_sent
        previous_entropy_unit = entropy_unit

        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f] unit entropy [%1.4f] sent dist [%1.6f]\n' %(iter, entropy_sent, entropy_unit, dist))
        #if iter == 2: break
        if dist < 0.0001:
            sys.stderr.write('----------------------------')
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
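
## query_expand is parameterized by unit_selector, a callable applied to each
## sentence's stemmed text that returns the "units" to be reweighted. The real
## selectors live elsewhere in this codebase; the function below is only a
## sketch of the assumed interface (word bigrams joined with '_'), with a tiny
## inline stopword list standing in for text.text_processor.
def _example_bigram_selector(stemmed_sent):
    """Return word-bigram units like 'oil_spill' from a stemmed sentence string."""
    stop = set(['the', 'a', 'an', 'of', 'in', 'on', 'and', 'to', 'is'])
    words = stemmed_sent.split()
    units = []
    for w1, w2 in zip(words, words[1:]):
        ## drop bigrams made entirely of stopwords, mirroring the filtering above
        if w1 in stop and w2 in stop: continue
        units.append('%s_%s' % (w1, w2))
    return units

## hypothetical call, assuming docs and query have already been loaded by the pipeline:
# unit_values, sent_values = query_expand(docs, _example_bigram_selector, query)
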
def map_iterative_sents(docs, unit_selector, query):
    """
    Like query_expand, but starts from uniform sentence priors (after dropping
    sentences with no query overlap) and alternates sentence/unit reweighting.
    Returns (unit_values, sent_values) as Counters.
    """

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            ## skip sentences with no query overlap
            if query: sim = sent.sim_basic(query)
            else: sim = 1
            if sim <= 0: continue
            sents.append(sent)

    ## initialize uniform sentence priors
    sent_values = prob_util.Counter()
    for sent in sents:
        sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=3)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        #print '%d sent entropy [%1.4f] unit entropy [%1.4f] sent dist [%1.6f]' %(iter, entropy_sent, entropy_unit, dist)

        ## currently capped at two passes
        if iter == 2: break
        if dist < 0.0001:
            #print '----------------------------'
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
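
## Both iterative functions above monitor convergence with prob_util.entropy and
## prob_util.klDistance. The helpers below are a self-contained sketch of what
## such measures typically compute (Shannon entropy and KL divergence over dict
## distributions); the project's own implementations may differ in smoothing or
## zero handling, so treat these only as an approximation.
import math

def _toy_entropy(dist):
    """H(p) = -sum p(x) * log2 p(x), ignoring zero-probability entries."""
    return -sum(p * math.log(p, 2) for p in dist.values() if p > 0)

def _toy_kl_distance(p, q, epsilon=1e-10):
    """KL(p || q) with a small epsilon guarding against zeros in q."""
    return sum(pv * math.log(pv / max(q.get(k, 0.0), epsilon), 2)
               for k, pv in p.items() if pv > 0)
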
def query_expand(docs, unit_selector, query):
    """
    Variant of query_expand that keeps the original query-similarity values
    around for optional interpolation and backs off to the previous iteration's
    distributions once entropy stops decreasing.
    """

    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)

    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        ## fall back to a uniform weight if similarity cannot be computed
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()
    original_sent_values = sent_values.copy()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    prev_unit_entropy = 0
    prev_sent_entropy = 0
    prev_unit_values = {}
    prev_sent_values = {}
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        ## interpolate with original sent weights (assignment currently disabled)
        sent_prior = 0.1
        for sent in sent_values:
            new_value = (sent_prior * original_sent_values[sent]) + ((1-sent_prior) * sent_values[sent])
            #sent_values[sent] = new_value

        #prob_util.Counter(unit_values).displaySorted(N=100)
        #prob_util.Counter(sent_values).displaySorted(N=20)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f] unit entropy [%1.4f] sent dist [%1.6f]\n' %(iter, entropy_sent, entropy_unit, dist))

        ## currently capped at two passes
        if iter == 2: break

        ## back off to the previous values once entropy stops decreasing
        ## (note: the previous entropies start at 0, so this fires on the first pass)
        if (entropy_unit >= prev_unit_entropy) and (entropy_sent >= prev_sent_entropy):
            unit_values = prev_unit_values
            sent_values = prev_sent_values
            break
        prev_unit_entropy = entropy_unit
        prev_sent_entropy = entropy_sent
        prev_unit_values = unit_values
        prev_sent_values = sent_values

        if dist < 0.0001: break

    #prob_util.Counter(unit_values).displaySorted(N=10)
    #prob_util.Counter(sent_values).displaySorted(N=20)

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
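
## The commented-out interpolation step above mixes the iterated sentence
## distribution back toward the original query-similarity distribution using a
## fixed prior weight. Below is a minimal self-contained sketch of that mixing
## on plain dicts; _interpolate_with_prior is a hypothetical name, and the 0.1
## default simply mirrors the sent_prior constant used above.
def _interpolate_with_prior(iterated, original, prior=0.1):
    """Return prior * original + (1 - prior) * iterated for every key."""
    mixed = {}
    for key, value in iterated.items():
        mixed[key] = prior * original.get(key, 0.0) + (1 - prior) * value
    return mixed
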