def concept_compare(mapper, gold_mapper):
    """
    compare mapper's concepts to the gold concepts
    """
    ## get concepts for the gold mapper (mapper should already be done)
    gold_mapper.map_concepts()
    gold_mapper.choose_sents()
    gold_mapper.format_output()

    for update_index in [0]:
        print 'update [%d]' %update_index

        gold_sorted_keys = prob_util.Counter(gold_mapper.concept_weight_sets[update_index]).sortedKeys()
        for concept in gold_sorted_keys:
            gold_weight = gold_mapper.concept_weight_sets[update_index][concept]
            try: heuristic_weight = mapper.concept_weight_sets[update_index][concept]
            except: heuristic_weight = 0
            print 'my[%1.2f] gold[%1.2f] [%s]' %(heuristic_weight, gold_weight, ' '.join(concept))

        heur_sorted_keys = prob_util.Counter(mapper.concept_weight_sets[update_index]).sortedKeys()
        for concept in heur_sorted_keys:
            if concept in gold_sorted_keys: continue
            heuristic_weight = mapper.concept_weight_sets[update_index][concept]
            print 'my[%1.2f] gold[%1.2f] [%s]' %(heuristic_weight, 0, ' '.join(concept))

        print '----------------------------'
def get_full_concepts(docs, query):
    """
    collect ngram concepts that overlap between at least two sentences,
    weighted by count and ngram length
    """
    ## get sentence set
    sents = []
    used_sents = set()
    for doc in docs:
        for sent in doc.sentences:
            ## ignore duplicate sentences
            sent_stemmed_str = ' '.join(sent.stemmed)
            if sent_stemmed_str in used_sents: continue
            used_sents.add(sent_stemmed_str)
            sents.append(sent)

    ngrams = prob_util.Counter()
    for i in range(len(sents) - 1):
        for j in range(i + 1, len(sents)):
            matches = get_overlaps(sents[i], sents[j])
            for match in matches:
                if text.text_processor.is_just_stopwords(match): continue
                ngrams[match] += 1
                #ngrams[match] += (10 ** (len(match)-1)) / 1000.0

    for ngram, count in ngrams.items():
        if count <= 1: ngrams.pop(ngram)
        else: ngrams[ngram] = (1.0 / 10000) * count * (10**len(ngram) - 1)

    ngrams.displaySorted(N=40)
    return ngrams
def make_query(problem):
    """
    build a list of query terms from the top unigram concepts found by the heuristic mapper
    """
    mapper = concept_mapper.HeuristicMapper(problem, 'n1')
    mapper.map_concepts()
    concepts = prob_util.Counter(mapper.concepts).sortedKeys()
    concepts = [c[0] for c in concepts]
    return concepts
def map_concepts(self):
    """
    weight concepts by classifying boostexter-format features;
    prune to the top positively scored concepts
    """
    #do_train = True
    do_train = False

    ## get features in boostexter format
    lines, concepts, concept_freq = setup_features(self.problem, self.unit_selector, train=do_train)

    ## write to file
    filename = '../train/%s.data' % self.problem.id
    fh = open(filename, 'w')
    fh.write('\n'.join(lines) + '\n')
    fh.close()
    if do_train: return

    ## classify
    model_stem = '../train/all'
    cmd = '%s -S %s -C < %s' % (BOOSTING_LEARNER, model_stem, filename)
    results = os.popen(cmd).readlines()

    concept_weights = prob_util.Counter()
    all_concept_weights = {}
    for i in range(len(results)):
        score = float(results[i].split()[-1])
        concept_weights[concepts[i]] += score
        if not concepts[i] in all_concept_weights: all_concept_weights[concepts[i]] = []
        all_concept_weights[concepts[i]].append(score)
    #concept_weights.displaySorted(N=1000)

    ## pruning
    final_concept_weights = {}
    count = 0
    for key in concept_weights.sortedKeys()[:300]:
        count += 1
        value = concept_weights[key]
        if value <= 0: break
        final_concept_weights[key] = value
        mean_value = sum(all_concept_weights[key]) / len(all_concept_weights[key])
        final_concept_weights[key] = mean_value * concept_freq[key]
        if count <= 10: print key, mean_value * concept_freq[key]

    print 'concepts used: %d' % count
    self.concept_sets = [final_concept_weights]
def map_iterative_docs(docs, unit_selector, query):

    ## initialize uniform doc priors
    doc_values = prob_util.Counter()
    for doc in docs:
        doc_values[doc.docid] = 1
    doc_values = doc_values.makeProbDist()

    ## get units in each doc
    doc_units = {}
    used_sents = set()
    for doc in docs:
        doc_units[doc.docid] = prob_util.Counter()
        for sent in doc.sentences:
            if query: sim = sent.sim_basic(query)
            else: sim = 1
            if sim <= 0: continue
            units = unit_selector(sent.stemmed)
            for unit in units:
                if text.text_processor.is_just_stopwords(unit): continue
                doc_units[doc.docid][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_doc_values = doc_values.copy()

        ## get unit values from doc values
        unit_values = prob_util.Counter()
        for doc in doc_units:
            for unit in doc_units[doc]:
                unit_values[unit] += doc_values[doc]
        unit_values = unit_values.makeProbDist()

        ## get doc values from unit values
        doc_values = prob_util.Counter()
        for doc in doc_units:
            for unit in doc_units[doc]:
                doc_values[doc] += unit_values[unit] / len(doc_units[doc])
                #print '%d, %s %1.4f %d' %(iter, unit, unit_values[unit], len(doc_units[doc]))
        doc_values = doc_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(doc_values).displaySorted(N=10)

        ## check for convergence
        if iter == 1: break
        dist = prob_util.euclidianDistance(prev_doc_values, doc_values)
        print 'dist [%1.6f]' % dist
        if dist < 0.0001: break

    #sys.exit()
    return prob_util.Counter(unit_values), prob_util.Counter(doc_values)
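## Hedged illustration (not part of the pipeline): the doc<->unit loop above is a
## power-iteration-style reweighting. The sketch below reproduces one round of it with
## plain dicts instead of prob_util.Counter so the two normalization steps are easy to
## inspect; it assumes each toy doc lists its distinct units. All names here
## (_toy_iterative_round, toy_doc_units) are made up for the example.
def _toy_iterative_round(toy_doc_units):
    ## uniform doc priors
    doc_values = dict((d, 1.0 / len(toy_doc_units)) for d in toy_doc_units)
    ## unit values: sum of the priors of the docs containing each unit
    unit_values = {}
    for d, units in toy_doc_units.items():
        for u in units:
            unit_values[u] = unit_values.get(u, 0.0) + doc_values[d]
    z = sum(unit_values.values())
    unit_values = dict((u, v / z) for u, v in unit_values.items())
    ## doc values: average unit value of the units in each doc, renormalized
    doc_values = {}
    for d, units in toy_doc_units.items():
        doc_values[d] = sum(unit_values[u] for u in units) / len(units)
    z = sum(doc_values.values())
    return unit_values, dict((d, v / z) for d, v in doc_values.items())

#print _toy_iterative_round({'d1': ['election', 'vote'], 'd2': ['vote', 'turnout']})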
def prep_docs(path, out_path):
    files = os.popen('ls %s*.sent' %path).read().splitlines()

    ## on the first pass, create a vocab mapping
    vocab = set()
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()
        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            vocab.update(concepts)

    fh = open(out_path+'vocab', 'w')
    vocab = zip(vocab, range(len(vocab)))
    for concept, count in vocab:
        fh.write('%s %d\n' %(concept, count))
    fh.close()
    vocab = dict(vocab)

    ## on the second pass, output one doc per line
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()
        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            for concept in concepts:
                doc[concept] += 1

        ## doc output
        output = '%d %s' %(len(doc), ' '.join(['%s:%d' %(vocab[t],c) for t,c in doc.items()]))
        print output
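## A hypothetical example of one line printed by prep_docs, assuming the vocab file maps
## 'elect' -> 12 and 'vote' -> 40: a doc whose first 20 sentences contain 'elect' in one
## sentence and 'vote' in two sentences would print the sparse line
##
##   2 12:1 40:2
##
## i.e. the number of distinct concepts followed by vocab-id:count pairs.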
def map_concepts(self):
    """
    weight concepts by running query_expand over the problem's new documents,
    keeping a capped subset of the top-scoring concepts
    """
    ## get document statistics
    concept_sets = []
    sent_count = 0
    used_sents = set()
    for doc_set in [self.problem.new_docs]:
        concept_set = prob_util.Counter()
        concept_set, doc_values = query_expand(doc_set, self.unit_selector, self.problem.query)
        concept_sets.append(concept_set)

    ## apply a few transformations
    max_concepts = 60
    max_concept_sum = 0.5
    self.concept_sets = []
    for update_index in range(len(concept_sets)):
        final_concept_set = {}
        num_used_concepts = 0
        concept_sum = 0
        for concept in concept_sets[update_index].sortedKeys():
            score = concept_sets[update_index][concept]

            ## don't include more than max_concepts
            if num_used_concepts >= max_concepts: break
            if concept_sum >= max_concept_sum:
                print 'concepts used: %d' %num_used_concepts
                break

            remove = False

            ## downweight concepts appearing in previous sets
            #for prev_index in range(update_index):
            #    if concept in concept_sets[prev_index]: score = 0.5*score

            ## add to final concept set
            if not remove:
                final_concept_set[concept] = score
                num_used_concepts += 1
                concept_sum += score
                #print count, concept

        self.concept_sets.append(final_concept_set)
def make_concepts_exp(id, path, sents, query):
    """
    experimental concept weighting: score sentences by query overlap, propagate
    the scores to bigram concepts, and write ILP output
    """
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]
    concept_vals = prob_util.normalize(concept_vals)

    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)
        if iter >= 1: break

        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)

        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]
        concept_vals = prob_util.normalize(concept_vals)

    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept

    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)

        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
def run_standard(options, max_sents=10000):

    ## create output directory
    try: os.popen('rm -rf %s' % options.output)
    except: pass
    try: os.popen('mkdir -p %s' % options.output)
    except:
        sys.stderr.write('Error: could not create output directory [%s]\n' % options.output)
        sys.exit()

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' % options.task)
    sys.stderr.write('length limit [%d]\n' % task.length_limit)
    sys.stderr.write('writing output to [%s]\n' % options.output)
    map_times, run_times = {}, {}

    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if not '-A' in problem.id: continue
            sys.stderr.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))
            #mapper = concept_mapper.HeuristicMapper(problem, "n2")
            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            concept_weights = mapper.concept_weights
            #print concept_weight
            #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0])
            program = framework.build_alternative_program(problem, concept_weights, length=task.length_limit, sentences=mapper.relevant_sents, longuest_candidate_only=False)

            # run the program and get the output
            program.debug = 0
            program.run()
            #selection = framework.get_program_result(program)
            selection = []
            for variable in program.output:
                if re.match(r'^s\d+$', variable) and program.output[variable] == 1:
                    selection.append(program.binary[variable])
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)
            #summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']: output_id = problem.id[:5] + problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            output_file.write(summary)
            output_file.close()

    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500: continue
            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                if used_sent_count < max_sents: sentence.used = True
                else: sentence.used = False
            problem.query.set_text(problem.query.original)
            sys.stdout.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs])))

            # compute idf values
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = {}
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    for word in sentence.no_stop_freq:
                        if word not in seen_words: seen_words[word] = 1
                for word in seen_words:
                    if word not in word_idf: word_idf[word] = 1
                    else: word_idf[word] += 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]

            # compare sentences to centroid and derive McDonald's relevance score
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join([sentence.original for sentence in doc.sentences if sentence.used])
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used: continue
                    sentence.compute_norm()
                    sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + 1 / (sentence.order + 1)
                    #sentence.rel_score = sentence.sim_cosine(centroid, word_idf) + sentence.sim_cosine(problem.query, word_idf)
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1

            # apply cutoff
            sentences.sort(lambda x, y: 1 if x.rel_score < y.rel_score else -1)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]

            # construct ILP
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append("%+g s%d" % (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append("%+g s%d" % (sentence.length, sentence.index))
                for peer in sentences:
                    if sentence == peer: continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        objective.append("%+g s%d_%d" % (-score, sentence.index, peer.index))
                        program.binary["s%d_%d" % (sentence.index, peer.index)] = [sentence, peer]
                        program.constraints["c1_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, sentence.index)
                        program.constraints["c2_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d_%d - s%d <= 0" % (sentence.index, peer.index, peer.index)
                        program.constraints["c3_%d_%d" % (sentence.index, peer.index)] = \
                            "s%d + s%d - s%d_%d <= 1" % (sentence.index, peer.index, sentence.index, peer.index)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = " ".join(length_constraint) + " <= %g" % task.length_limit

            run_times[problem.id] = time.time()
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]

            selection = []
            score = 0
            # get solution and check consistency
            for variable in program.binary:
                if variable in program.output and program.output[variable] == 1:
                    if type(program.binary[variable]) == type(sentences[0]):
                        selection.append(program.binary[variable])
                        score += program.binary[variable].rel_score
                        for peer in program.output:
                            if program.output[peer] == 0 or peer == variable or type(program.binary[peer]) != type(sentences[0]):
                                continue
                            if program.binary[variable].sim_cosine(program.binary[peer], word_idf) == 0:
                                continue
                            quadratic = "s%d_%d" % (program.binary[variable].index, program.binary[peer].index)
                            if quadratic not in program.output or program.output[quadratic] != 1:
                                print "WARNING: %s selected but %s not selected" % (variable, quadratic)
                    else:
                        score -= program.binary[variable][0].sim_cosine(program.binary[variable][1], word_idf)
                        if program.output["s%d" % program.binary[variable][0].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (variable, program.binary[variable][0].index)
                        if program.output["s%d" % program.binary[variable][1].index] != 1:
                            print "WARNING: %s selected while s%d not selected" % (variable, program.binary[variable][1].index)
            #if math.fabs(program.result["score"] - score) > .1:
            #    print "WARNING: difference between score = %g and expected = %g" % (program.result["score"], score)

            selection = ordering.by_date(selection)
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            output_file = open("%s/%s" % (options.output, new_id), "w")
            for sentence in selection:
                output_file.write(sentence.original + "\n")
            output_file.close()

    else:
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            #if num_problem_sentences < 300: continue
            if not '-A' in problem.id: continue

            if options.ir:
                #docs = [doc for doc, val in problem.ir_docs]
                #for doc in docs: doc.get_sentences()
                num_overlap = len(set([d.id for d in problem.ir_docs]).intersection(set([d.id for d in problem.new_docs])))
                print '%s overlap: %d' % (problem.id, num_overlap)
                info_fh.write('%s overlap [%d]\n' % (problem.id, num_overlap))

            sys.stderr.write('problem [%s] input sentences [%d]' % (problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)

            ## select a concept mapper
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(problem, options.units)

            ## timing test
            mapper.max_sents = max_sents

            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success: sys.exit()

            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success: sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]

            ## testing
            #fh = open('concept_matrix', 'w')
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
                #fh.write(''.join(['%d, ' %concept for concept in sent[:-1]]))
                #fh.write('%d\n' %sent[-1])
            hist[0] += (num_problem_sentences - len(mapper.relevant_sent_concepts))
            #hist.displaySorted(N=100)
            #sys.exit()
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying
            output_id = problem.id
            if options.task in ['u09', 'u08']: output_id = problem.id[:5] + problem.id[6:]
            output_file = open('%s/%s' % (options.output, output_id), 'w')
            word_count = 0
            for sentence in selection:
                output_file.write(sentence.original + '\n')
                word_count += len(sentence.original.split())
            output_file.close()

            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n' % (word_count, curr_time))
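## Hedged aside on the McDonald-style ILP built in the options.mcd branch above: the pairwise
## variable s_i_j stands in for the product s_i * s_j, and the c1/c2/c3 constraints are the
## standard linearization of that product. The toy snippet below (the helper name and the
## indices 0 and 1 are made up) just prints the three constraints so the pattern is visible.
def _show_pairwise_constraints(i, j):
    print "s%d_%d - s%d <= 0" % (i, j, i)            # c1: the pair var can be 1 only if s_i is 1
    print "s%d_%d - s%d <= 0" % (i, j, j)            # c2: the pair var can be 1 only if s_j is 1
    print "s%d + s%d - s%d_%d <= 1" % (i, j, i, j)   # c3: if both s_i and s_j are 1, the pair var must be 1

#_show_pairwise_constraints(0, 1)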
def setup_features(problem, unit_selector, train=True):

    ## for training, get gold concepts
    gold_concepts = prob_util.Counter()
    if train:
        for annotator in problem.annotators:
            annotator_concepts = {}
            for sent in problem.training[annotator]:
                sentence = text.Sentence(sent)
                units = unit_selector(sentence.stemmed)
                for unit in units:
                    if unit not in annotator_concepts: annotator_concepts[unit] = 0
                    annotator_concepts[unit] += 1
            for concept in annotator_concepts:
                gold_concepts[concept] += 1

    ## get all sentences and unit frequencies
    sents = []
    doc_freq = prob_util.Counter()
    sent_freq = prob_util.Counter()
    raw_freq = prob_util.Counter()
    for doc in problem.new_docs:
        #if doc.doctype != 'NEWS STORY': continue
        doc_counts = prob_util.Counter()
        for sent in doc.sentences:
            sent_counts = prob_util.Counter()
            sents.append(sent)
            for unit in unit_selector(sent.stemmed):
                doc_counts[unit] += 1
                sent_counts[unit] += 1
            for unit in sent_counts:
                sent_freq[unit] += 1
        for unit in doc_counts:
            doc_freq[unit] += 1
            raw_freq[unit] += doc_counts[unit]

    ## get features for each concept unit
    lines = []
    concepts = []
    title = text.Sentence(problem.title)
    narr = text.Sentence(problem.narr)

    for sent in sents:

        ## sentence features
        sentence_sim = sent.sim_basic(problem.query)
        sentence_order = sent.order
        sentence_source = sent.source
        sentence_length = sent.length

        units = unit_selector(sent.stemmed)
        for unit in units:

            ## concept features
            stopword_ratio = 1 - (1.0*len(text.text_processor.remove_stopwords(unit)) / len(unit))
            doc_ratio = 1.0 * doc_freq[unit] / len(problem.new_docs)
            sent_ratio = 1.0 * sent_freq[unit] / len(sents)
            ngram = ' '.join(unit)
            sunit = text.Sentence(ngram)
            title_sim = sunit.sim_basic(title)
            narr_sim = sunit.sim_basic(narr)

            ## output format (boostexter)
            line = '%s, %1.2f, %1.2f, %1.2f, ' %(ngram, doc_ratio, sent_ratio, stopword_ratio)
            line += '%1.2f, %d, %s, %d, ' %(sentence_sim, sentence_order, sentence_source, sentence_length)
            line += '%1.2f, %1.2f, ' %(title_sim, narr_sim)
            if train: line += '%s' %int(gold_concepts[unit]>0)
            else: line += '0'
            line += '.'

            if stopword_ratio == 1: continue
            lines.append(line)
            concepts.append(unit)
            for rep in range(int(gold_concepts[unit]-1)):
                if train:
                    lines.append(line)
                    concepts.append(unit)

    return lines, concepts, doc_freq
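## Hedged note on the boostexter line built above: the comma-separated fields are, in order,
## ngram, doc_ratio, sent_ratio, stopword_ratio, sentence_sim, sentence_order, sentence_source,
## sentence_length, title_sim, narr_sim, label, terminated by '.'. A made-up example line:
##
##   oil price, 0.40, 0.12, 0.00, 0.25, 3, XIE, 21, 0.00, 0.33, 1.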
def query_expand(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        #if doc.doctype != 'NEWS STORY': continue
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            sents.append(sent)

    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()
    original_sent_values = sent_values.copy()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    prev_unit_entropy = 0
    prev_sent_entropy = 0
    prev_unit_values = {}
    prev_sent_values = {}
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get new unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        ## interpolate with original sent weights
        sent_prior = 0.1
        for sent in sent_values:
            new_value = (sent_prior * original_sent_values[sent]) + ((1-sent_prior) * sent_values[sent])
            #sent_values[sent] = new_value

        #prob_util.Counter(unit_values).displaySorted(N=100)
        #prob_util.Counter(sent_values).displaySorted(N=20)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        sys.stderr.write('%d sent entropy [%1.4f] unit entropy [%1.4f] sent dist [%1.6f]\n' %(iter, entropy_sent, entropy_unit, dist))
        if iter == 2: break
        if (entropy_unit >= prev_unit_entropy) and (entropy_sent >= prev_sent_entropy):
            unit_values = prev_unit_values
            sent_values = prev_sent_values
            break
        prev_unit_entropy = entropy_unit
        prev_sent_entropy = entropy_sent
        prev_unit_values = unit_values
        prev_sent_values = sent_values
        if dist < 0.0001: break

    #prob_util.Counter(unit_values).displaySorted(N=10)
    #prob_util.Counter(sent_values).displaySorted(N=20)
    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
def map_iterative_sents(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            ## skip short sentences
            #if sent.length <= 5: continue
            ## skip sentences with no query overlap
            if query: sim = sent.sim_basic(query)
            else: sim = 1
            if sim <= 0: continue
            sents.append(sent)

    ## initialize uniform sentence priors
    sent_values = prob_util.Counter()
    for sent in sents:
        sent_values[sent.original] = 1
    sent_values = sent_values.makeProbDist()

    ## get units in each sent
    sent_units = {}
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1

    ## repeat until convergence
    for iter in range(1, 51):
        prev_sent_values = sent_values.copy()

        ## get unit values from sent values
        unit_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                unit_values[unit] += sent_values[sent]
        unit_values = unit_values.makeProbDist()

        ## get sent values from unit values
        sent_values = prob_util.Counter()
        for sent in sent_units:
            for unit in sent_units[sent]:
                sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
        sent_values = sent_values.makeProbDist()

        #prob_util.Counter(unit_values).displaySorted(N=5)
        #prob_util.Counter(sent_values).displaySorted(N=3)

        ## check for convergence
        entropy_sent = prob_util.entropy(sent_values)
        entropy_unit = prob_util.entropy(unit_values)
        dist = prob_util.klDistance(prev_sent_values, sent_values)
        #print '%d sent entropy [%1.4f] unit entropy [%1.4f] sent dist [%1.6f]' %(iter, entropy_sent, entropy_unit, dist)
        if iter == 2: break
        if dist < 0.0001:
            #print '----------------------------'
            break

    return prob_util.Counter(unit_values), prob_util.Counter(sent_values)
def map_concepts(self):
    """
    map ngram concepts to weights using document frequency
    (or raw counts when the doc set is small), with min-count and stopword pruning
    """
    min_count = 3
    use_log_weights = False

    ## get document statistics
    concept_sets = []
    sent_count = 0
    used_sents = set()
    for doc_set in [self.problem.new_docs]:
        concept_set = prob_util.Counter()
        for doc in doc_set:
            #if doc.doctype != 'NEWS STORY': continue
            doc_concepts = {}
            for sent in doc.sentences:
                sent_count += 1

                ## ignore short sentences
                if sent.length < self.min_sent_length: continue

                ## ignore duplicate sentences
                sent_stemmed_str = ' '.join(sent.stemmed)
                if sent_stemmed_str in used_sents: continue
                used_sents.add(sent_stemmed_str)

                ## don't consider sentences with no query overlap
                if self.problem.query: sim = sent.sim_basic(self.problem.query)
                else: sim = 1
                if sim <= 0: continue

                ## TODO: using sent.stemmed -- could make this more general
                units = self.unit_selector(sent.stemmed)
                for unit in units:
                    if not unit in doc_concepts: doc_concepts[unit] = 0
                    doc_concepts[unit] += 1  # simple count

            use_doc_freq = len(doc_set) > min_count
            for concept, count in doc_concepts.items():
                if not concept in concept_set: concept_set[concept] = 0
                if use_doc_freq: concept_set[concept] += 1      # doc frequency
                else: concept_set[concept] += count             # raw frequency

        concept_sets.append(concept_set)

    ## apply a few transformations
    self.concept_sets = []
    for update_index in range(len(concept_sets)):
        final_concept_set = {}
        num_used_concepts = 0
        for concept in concept_sets[update_index].sortedKeys():
            count = concept_sets[update_index][concept]
            remove = False

            ## remove low frequency concepts
            if count < min_count: remove = True

            ## remove stop word concepts (word ngrams only!)
            if self.unit_name[0] in ['n', 's']:
                if text.text_processor.is_just_stopwords(concept): remove = True

            ## use log weights
            if use_log_weights: score = math.log(count, 2)
            else: score = count

            ## add to final concept set
            if not remove:
                final_concept_set[concept] = score
                num_used_concepts += 1

        self.concept_sets.append(final_concept_set)
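## Worked example of the weighting above (numbers are hypothetical): with a typical multi-doc
## input, use_doc_freq is True, so a bigram that survives the length/duplicate/query filters
## and occurs in 4 of 10 input docs gets score 4; with min_count = 3, any concept seen in
## fewer than 3 docs is dropped, and stopword-only ngrams are dropped regardless of count.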
def get_values(docs, unit_selector, query):

    ## get sentence set
    sents = []
    for doc in docs:
        for sent in doc.sentences:
            sents.append(sent)

    ## initialize sentences with query similarity
    sent_values = prob_util.Counter()
    for sent in sents:
        try: sent_values[sent.original] = sent.sim_basic(query)
        except: sent_values[sent.original] = 1
    #sent_values = sent_values.makeProbDist()
    original_sent_values = sent_values.copy()

    ## get units in each sent and co-occurrences of units
    sent_units = {}
    co_units = prob_util.CondCounter()
    for sent in sents:
        sent_units[sent.original] = prob_util.Counter()
        units = unit_selector(sent.stemmed)
        for unit in units:
            if text.text_processor.is_just_stopwords(unit): continue
            sent_units[sent.original][unit] += 1
            for co_unit in units:
                if unit == co_unit: continue
                co_units[unit][co_unit] += 1

    ## get new unit values from sent values
    unit_values = prob_util.Counter()
    for sent in sent_units:
        for unit in sent_units[sent]:
            #unit_values[unit] += sent_values[sent]
            unit_values[unit] += 1

    ## greedy procedure for removing co-occurrence values
    curr_unit_values = unit_values.copy()
    new_unit_values = prob_util.Counter()
    while True:
        best_unit = curr_unit_values.sortedKeys()[0]
        new_unit_values[best_unit] = curr_unit_values[best_unit]
        print best_unit, new_unit_values[best_unit]
        curr_unit_values.pop(best_unit)
        for unit in curr_unit_values:
            new_val = curr_unit_values[unit] - co_units[best_unit][unit]
            if new_val > 1: curr_unit_values[unit] = new_val
        if max(curr_unit_values.values()) < 2: break
        if len(new_unit_values) >= 65: break

    unit_values = new_unit_values
    print '--------------', len(unit_values)
    return unit_values, sent_values

    ## note: everything below the return is unreachable (kept from the original)
    ## get sent values from unit values
    sent_values = prob_util.Counter()
    for sent in sent_units:
        for unit in sent_units[sent]:
            sent_values[sent] += unit_values[unit] #/ len(sent_units[sent])
    sent_values = sent_values.makeProbDist()
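## Hypothetical trace of the greedy loop above: if 'oil price' (count 7) is selected first and
## co-occurs 3 times with 'crude oil' (count 6), the running count of 'crude oil' drops to 3;
## when the discount would push a count to 1 or below, the old value is kept, so heavily
## co-occurring concepts are suppressed rather than driven to zero. The loop stops once all
## remaining counts fall below 2 or 65 concepts have been selected.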