#if problem.id != 'D0704': continue sys.stderr.write( "%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs]))) acronyms = framework.removeAcronymsFromProblem(problem) mapper = concept_mapper.HeuristicMapperExp(problem, "n2", None) mapper.map_concepts() mapper.choose_sents() concept_weight = mapper.concept_weight_sets[0] #print concept_weight.keys() #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0]) program = framework.build_alternative_program( problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0], longuest_candidate_only=False, providedAcronyms=acronyms) # run the program and get the output program.debug = 0 program.run() #selection = framework.get_program_result(program) selection = [] for variable in program.output: if re.match(r'^s\d+$', variable) and program.output[variable] == 1: selection.append(program.binary[variable]) if len(selection) == 0: sys.stderr.write( 'ERROR: empty summary, check the output of the solver\n')
for sentence in problem.get_new_sentences(): sentence.set_text(sentence.original) # sentence.stemmed = sentence.no_stop problem.query.set_text(problem.query.original) #problem.query.stemmed = problem.query.no_stop #if problem.id != 'D0704': continue sys.stderr.write("%s %d\n" % (problem.id, sum([len(doc.sentences) for doc in problem.new_docs]))) acronyms = framework.removeAcronymsFromProblem(problem) mapper = concept_mapper.HeuristicMapperExp(problem, "n2", None) mapper.map_concepts() mapper.choose_sents() concept_weight = mapper.concept_weight_sets[0] #print concept_weight.keys() #program = framework.build_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0]) program = framework.build_alternative_program(problem, concept_weight, length=task.length_limit, sentences=mapper.relevant_sent_sets[0], longuest_candidate_only=False, providedAcronyms=acronyms) # run the program and get the output program.debug = 0 program.run() #selection = framework.get_program_result(program) selection = [] for variable in program.output: if re.match(r'^s\d+$', variable) and program.output[variable] == 1: selection.append(program.binary[variable]) if len(selection) == 0: sys.stderr.write('ERROR: empty summary, check the output of the solver\n') sys.exit(1) #selection = ordering.by_date(selection) selection = ordering.by_dendrogram(selection, concept_weight, problem) summary = "\n".join(sentence.original for sentence in selection) summary = compression.addAcronymDefinitionsToSummary(summary, program.acronyms)
def run_standard(options, max_sents=10000):
    """Write one summary file per problem of the module-level ``task``.

    The output directory ``options.output`` is emptied and recreated, then one
    of three pipelines is run, selected by the option flags:

    * ``options.compress`` -- an ILP built by
      ``framework.build_alternative_program`` from a ``CheatingMapper``.
    * ``options.mcd`` -- an ILP built inline whose objective is the sum of
      per-sentence relevance scores minus pairwise cosine similarities,
      subject to the task length limit.
    * otherwise -- the concept mapper's own ``run`` method, with the mapper
      class chosen by ``options.cheat``.

    Args:
        options: parsed options object. ``output``, ``task``, ``compress`` and
            ``mcd`` are always read; ``cutoff`` is read in the ``mcd`` branch;
            ``ir``, ``cheat`` and ``units`` are read in the default branch.
        max_sents: sentence cap. In the ``mcd`` branch it limits which
            sentences are marked ``used``; in the default branch it is
            assigned to ``mapper.max_sents``.

    Raises:
        SystemExit: if the output directory cannot be created, or (default
            branch) if the mapper reports failure.
    """
    # Function-scope imports keep this block self-contained: the module's own
    # import section is not guaranteed to provide these two names.
    import errno
    import shutil

    ## create output directory
    # The original shelled out with os.popen('rm -rf ...') / ('mkdir -p ...'),
    # which starts the commands without waiting for them: the removal could
    # race with the creation and with the first summary being written, and a
    # failure was never noticed.  Do both steps synchronously instead.
    out_dir = options.output
    try:
        if os.path.islink(out_dir) or os.path.isfile(out_dir):
            os.remove(out_dir)
        elif os.path.isdir(out_dir):
            shutil.rmtree(out_dir)
    except OSError as err:
        # Clearing stays best-effort, as before, but is no longer silent.
        sys.stderr.write(
            'Warning: could not clear output directory [%s]: %s\n'
            % (out_dir, err))
    try:
        os.makedirs(out_dir)
    except OSError as err:
        # Like `mkdir -p`, an already existing directory is not an error.
        if err.errno != errno.EEXIST or not os.path.isdir(out_dir):
            sys.stderr.write(
                'Error: could not create output directory [%s]\n' % out_dir)
            sys.exit(1)

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' % options.task)
    sys.stderr.write('length limit [%d]\n' % task.length_limit)
    sys.stderr.write('writing output to [%s]\n' % options.output)

    map_times, run_times = {}, {}

    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if '-A' not in problem.id:
                continue
            sys.stderr.write(
                "%s %d\n" % (problem.id,
                             sum(len(doc.sentences)
                                 for doc in problem.new_docs)))

            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            program = framework.build_alternative_program(
                problem,
                mapper.concept_weights,
                length=task.length_limit,
                sentences=mapper.relevant_sents,
                longuest_candidate_only=False)

            # run the program and get the output
            program.debug = 0
            program.run()

            # Variables named s<N> that the solver set to 1 are the selected
            # candidates.
            selection = [
                program.binary[variable] for variable in program.output
                if re.match(r'^s\d+$', variable)
                and program.output[variable] == 1
            ]
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)

            ## TAC id convention is annoying: drop the character at index 5
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            with open('%s/%s' % (options.output, output_id), 'w') as output_file:
                output_file.write(summary)

    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500:
                continue

            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                # NOTE(review): the counter is incremented before this test, so
                # at most max_sents - 1 sentences are marked used -- confirm
                # whether `<=` was intended.  Behaviour kept as is.
                sentence.used = used_sent_count < max_sents
            problem.query.set_text(problem.query.original)
            sys.stdout.write(
                "%s %d\n" % (problem.id,
                             sum(len(doc.sentences)
                                 for doc in problem.new_docs)))

            # compute idf values: 1 / (number of documents containing the word)
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = set()
                for sentence in doc.sentences:
                    if sentence.used:
                        seen_words.update(sentence.no_stop_freq)
                for word in seen_words:
                    word_idf[word] = word_idf.get(word, 0) + 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]

            # compare sentences to centroid and derive McDonald's relevance score
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join(
                    sentence.original for sentence in doc.sentences
                    if sentence.used)
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used:
                        continue
                    sentence.compute_norm()
                    # 1.0 forces true division.  Under Python 2 the original
                    # `1 / (order + 1)` truncated to 0 for every integer
                    # order >= 1, discarding the position bonus.
                    # (assumes sentence.order is a numeric position -- TODO
                    # confirm)
                    sentence.rel_score = (
                        sentence.sim_cosine(centroid, word_idf)
                        + 1.0 / (sentence.order + 1))
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1

            # apply cutoff: keep the highest-scoring sentences.  A key-based
            # sort replaces the cmp lambda, which never returned 0 for ties
            # and is not accepted by Python 3.
            sentences.sort(key=lambda sent: sent.rel_score, reverse=True)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]

            # construct ILP
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append(
                    "%+g s%d" % (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append(
                    "%+g s%d" % (sentence.length, sentence.index))
                for peer in sentences:
                    if sentence == peer:
                        continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        pair = (sentence.index, peer.index)
                        objective.append("%+g s%d_%d" % ((-score,) + pair))
                        program.binary["s%d_%d" % pair] = [sentence, peer]
                        # Linearisation of the product s_i * s_j: the pair
                        # variable is 1 exactly when both sentences are.
                        program.constraints["c1_%d_%d" % pair] = \
                            "s%d_%d - s%d <= 0" % (pair + (sentence.index,))
                        program.constraints["c2_%d_%d" % pair] = \
                            "s%d_%d - s%d <= 0" % (pair + (peer.index,))
                        program.constraints["c3_%d_%d" % pair] = \
                            "s%d + s%d - s%d_%d <= 1" % (pair + pair)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = (
                " ".join(length_constraint) + " <= %g" % task.length_limit)

            run_times[problem.id] = time.time()
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]

            selection = []
            score = 0
            # get solution and check consistency
            for variable in program.binary:
                if (variable not in program.output
                        or program.output[variable] != 1):
                    continue
                entry = program.binary[variable]
                # Entries are either a sentence object or a [sentence, peer]
                # list; the exact-type comparison of the original is kept.
                if type(entry) == type(sentences[0]):
                    selection.append(entry)
                    score += entry.rel_score
                    for peer in program.output:
                        if (program.output[peer] == 0 or peer == variable
                                or type(program.binary[peer])
                                != type(sentences[0])):
                            continue
                        other = program.binary[peer]
                        if entry.sim_cosine(other, word_idf) == 0:
                            continue
                        quadratic = "s%d_%d" % (entry.index, other.index)
                        if (quadratic not in program.output
                                or program.output[quadratic] != 1):
                            print("WARNING: %s selected but %s not selected"
                                  % (variable, quadratic))
                else:
                    first, second = entry[0], entry[1]
                    score -= first.sim_cosine(second, word_idf)
                    if program.output["s%d" % first.index] != 1:
                        print("WARNING: %s selected while s%d not selected"
                              % (variable, first.index))
                    if program.output["s%d" % second.index] != 1:
                        print("WARNING: %s selected while s%d not selected"
                              % (variable, second.index))

            selection = ordering.by_date(selection)
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            with open("%s/%s" % (options.output, new_id), "w") as output_file:
                for sentence in selection:
                    output_file.write(sentence.original + "\n")

    else:
        # hist / input_sents are diagnostics only; nothing below reports them.
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if '-A' not in problem.id:
                continue

            if options.ir:
                ir_ids = set(d.id for d in problem.ir_docs)
                new_ids = set(d.id for d in problem.new_docs)
                num_overlap = len(ir_ids.intersection(new_ids))
                print('%s overlap: %d' % (problem.id, num_overlap))
                # NOTE(review): info_fh is not bound anywhere in this
                # function; presumably a module-level file handle -- confirm,
                # otherwise this line raises NameError.
                info_fh.write('%s overlap [%d]\n' % (problem.id, num_overlap))

            # No trailing newline: the line is completed by the word-count
            # message written after the summary.
            sys.stderr.write('problem [%s] input sentences [%d]'
                             % (problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)

            ## select a concept mapper
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(
                    problem, options.units)

            ## timing test
            mapper.max_sents = max_sents

            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success:
                sys.exit()

            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success:
                sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]

            ## testing: histogram of concepts per relevant sentence
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
            hist[0] += (num_problem_sentences
                        - len(mapper.relevant_sent_concepts))
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying: drop the character at index 5
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            word_count = 0
            with open('%s/%s' % (options.output, output_id), 'w') as output_file:
                for sentence in selection:
                    output_file.write(sentence.original + '\n')
                    word_count += len(sentence.original.split())
            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n'
                             % (word_count, curr_time))
def run_standard(options, max_sents=10000):
    """Write one summary file per problem of the module-level ``task``.

    The output directory ``options.output`` is emptied and recreated, then one
    of three pipelines is run, selected by the option flags:

    * ``options.compress`` -- an ILP built by
      ``framework.build_alternative_program`` from a ``CheatingMapper``.
    * ``options.mcd`` -- an ILP built inline whose objective is the sum of
      per-sentence relevance scores minus pairwise cosine similarities,
      subject to the task length limit.
    * otherwise -- the concept mapper's own ``run`` method, with the mapper
      class chosen by ``options.cheat``.

    Args:
        options: parsed options object. ``output``, ``task``, ``compress`` and
            ``mcd`` are always read; ``cutoff`` is read in the ``mcd`` branch;
            ``ir``, ``cheat`` and ``units`` are read in the default branch.
        max_sents: sentence cap. In the ``mcd`` branch it limits which
            sentences are marked ``used``; in the default branch it is
            assigned to ``mapper.max_sents``.

    Raises:
        SystemExit: if the output directory cannot be created, or (default
            branch) if the mapper reports failure.
    """
    # Function-scope imports keep this block self-contained: the module's own
    # import section is not guaranteed to provide these two names.
    import errno
    import shutil

    ## create output directory
    # The original shelled out with os.popen('rm -rf ...') / ('mkdir -p ...'),
    # which starts the commands without waiting for them: the removal could
    # race with the creation and with the first summary being written, and a
    # failure was never noticed.  Do both steps synchronously instead.
    out_dir = options.output
    try:
        if os.path.islink(out_dir) or os.path.isfile(out_dir):
            os.remove(out_dir)
        elif os.path.isdir(out_dir):
            shutil.rmtree(out_dir)
    except OSError as err:
        # Clearing stays best-effort, as before, but is no longer silent.
        sys.stderr.write(
            'Warning: could not clear output directory [%s]: %s\n'
            % (out_dir, err))
    try:
        os.makedirs(out_dir)
    except OSError as err:
        # Like `mkdir -p`, an already existing directory is not an error.
        if err.errno != errno.EEXIST or not os.path.isdir(out_dir):
            sys.stderr.write(
                'Error: could not create output directory [%s]\n' % out_dir)
            sys.exit(1)

    ## summarize!
    sys.stderr.write('generating summaries for task [%s]\n' % options.task)
    sys.stderr.write('length limit [%d]\n' % task.length_limit)
    sys.stderr.write('writing output to [%s]\n' % options.output)

    map_times, run_times = {}, {}

    ## sentence compression
    if options.compress:
        for problem in task.problems:
            if '-A' not in problem.id:
                continue
            sys.stderr.write(
                "%s %d\n" % (problem.id,
                             sum(len(doc.sentences)
                                 for doc in problem.new_docs)))

            mapper = concept_mapper.CheatingMapper(problem, "n2")
            mapper.map_concepts()
            mapper.choose_sents()
            program = framework.build_alternative_program(
                problem,
                mapper.concept_weights,
                length=task.length_limit,
                sentences=mapper.relevant_sents,
                longuest_candidate_only=False)

            # run the program and get the output
            program.debug = 0
            program.run()

            # Variables named s<N> that the solver set to 1 are the selected
            # candidates.
            selection = [
                program.binary[variable] for variable in program.output
                if re.match(r'^s\d+$', variable)
                and program.output[variable] == 1
            ]
            selection = ordering.by_date(selection)
            summary = "\n".join(sentence.original for sentence in selection)

            ## TAC id convention is annoying: drop the character at index 5
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            with open('%s/%s' % (options.output, output_id), 'w') as output_file:
                output_file.write(summary)

    elif options.mcd:
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if num_problem_sentences < 500:
                continue

            used_sent_count = 0
            for sentence in problem.get_new_sentences():
                used_sent_count += 1
                sentence.set_text(sentence.original)
                # NOTE(review): the counter is incremented before this test, so
                # at most max_sents - 1 sentences are marked used -- confirm
                # whether `<=` was intended.  Behaviour kept as is.
                sentence.used = used_sent_count < max_sents
            problem.query.set_text(problem.query.original)
            sys.stdout.write(
                "%s %d\n" % (problem.id,
                             sum(len(doc.sentences)
                                 for doc in problem.new_docs)))

            # compute idf values: 1 / (number of documents containing the word)
            word_idf = {}
            for doc in problem.new_docs:
                seen_words = set()
                for sentence in doc.sentences:
                    if sentence.used:
                        seen_words.update(sentence.no_stop_freq)
                for word in seen_words:
                    word_idf[word] = word_idf.get(word, 0) + 1
            for word in word_idf:
                word_idf[word] = 1.0 / word_idf[word]

            # compare sentences to centroid and derive McDonald's relevance score
            sentences = []
            index = 0
            for doc in problem.new_docs:
                doc_text = " ".join(
                    sentence.original for sentence in doc.sentences
                    if sentence.used)
                centroid = text.Sentence(doc_text)
                centroid.compute_norm()
                problem.query.compute_norm()
                for sentence in doc.sentences:
                    if not sentence.used:
                        continue
                    sentence.compute_norm()
                    # 1.0 forces true division.  Under Python 2 the original
                    # `1 / (order + 1)` truncated to 0 for every integer
                    # order >= 1, discarding the position bonus.
                    # (assumes sentence.order is a numeric position -- TODO
                    # confirm)
                    sentence.rel_score = (
                        sentence.sim_cosine(centroid, word_idf)
                        + 1.0 / (sentence.order + 1))
                    sentences.append(sentence)
                    sentence.index = index
                    index += 1

            # apply cutoff: keep the highest-scoring sentences.  A key-based
            # sort replaces the cmp lambda, which never returned 0 for ties
            # and is not accepted by Python 3.
            sentences.sort(key=lambda sent: sent.rel_score, reverse=True)
            if options.cutoff > 0 and len(sentences) > options.cutoff:
                sentences = sentences[0:options.cutoff]

            # construct ILP
            program = ilp.IntegerLinearProgram(debug=0)
            objective = []
            length_constraint = []
            for sentence in sentences:
                objective.append(
                    "%+g s%d" % (sentence.rel_score, sentence.index))
                program.binary["s%d" % sentence.index] = sentence
                length_constraint.append(
                    "%+g s%d" % (sentence.length, sentence.index))
                for peer in sentences:
                    if sentence == peer:
                        continue
                    score = sentence.sim_cosine(peer, word_idf)
                    if score > 0:
                        pair = (sentence.index, peer.index)
                        objective.append("%+g s%d_%d" % ((-score,) + pair))
                        program.binary["s%d_%d" % pair] = [sentence, peer]
                        # Linearisation of the product s_i * s_j: the pair
                        # variable is 1 exactly when both sentences are.
                        program.constraints["c1_%d_%d" % pair] = \
                            "s%d_%d - s%d <= 0" % (pair + (sentence.index,))
                        program.constraints["c2_%d_%d" % pair] = \
                            "s%d_%d - s%d <= 0" % (pair + (peer.index,))
                        program.constraints["c3_%d_%d" % pair] = \
                            "s%d + s%d - s%d_%d <= 1" % (pair + pair)
            program.objective["score"] = " ".join(objective)
            program.constraints["length"] = (
                " ".join(length_constraint) + " <= %g" % task.length_limit)

            run_times[problem.id] = time.time()
            program.run()
            run_times[problem.id] = time.time() - run_times[problem.id]

            selection = []
            score = 0
            # get solution and check consistency
            for variable in program.binary:
                if (variable not in program.output
                        or program.output[variable] != 1):
                    continue
                entry = program.binary[variable]
                # Entries are either a sentence object or a [sentence, peer]
                # list; the exact-type comparison of the original is kept.
                if type(entry) == type(sentences[0]):
                    selection.append(entry)
                    score += entry.rel_score
                    for peer in program.output:
                        if (program.output[peer] == 0 or peer == variable
                                or type(program.binary[peer])
                                != type(sentences[0])):
                            continue
                        other = program.binary[peer]
                        if entry.sim_cosine(other, word_idf) == 0:
                            continue
                        quadratic = "s%d_%d" % (entry.index, other.index)
                        if (quadratic not in program.output
                                or program.output[quadratic] != 1):
                            print("WARNING: %s selected but %s not selected"
                                  % (variable, quadratic))
                else:
                    first, second = entry[0], entry[1]
                    score -= first.sim_cosine(second, word_idf)
                    if program.output["s%d" % first.index] != 1:
                        print("WARNING: %s selected while s%d not selected"
                              % (variable, first.index))
                    if program.output["s%d" % second.index] != 1:
                        print("WARNING: %s selected while s%d not selected"
                              % (variable, second.index))

            selection = ordering.by_date(selection)
            new_id = re.sub(r'.-(.)$', r'-\1', problem.id)
            with open("%s/%s" % (options.output, new_id), "w") as output_file:
                for sentence in selection:
                    output_file.write(sentence.original + "\n")

    else:
        # hist / input_sents are diagnostics only; nothing below reports them.
        hist = prob_util.Counter()
        input_sents = []
        for problem in task.problems:
            num_problem_sentences = len(problem.get_new_sentences())
            if '-A' not in problem.id:
                continue

            if options.ir:
                ir_ids = set(d.id for d in problem.ir_docs)
                new_ids = set(d.id for d in problem.new_docs)
                num_overlap = len(ir_ids.intersection(new_ids))
                print('%s overlap: %d' % (problem.id, num_overlap))
                # NOTE(review): info_fh is not bound anywhere in this
                # function; presumably a module-level file handle -- confirm,
                # otherwise this line raises NameError.
                info_fh.write('%s overlap [%d]\n' % (problem.id, num_overlap))

            # No trailing newline: the line is completed by the word-count
            # message written after the summary.
            sys.stderr.write('problem [%s] input sentences [%d]'
                             % (problem.id, num_problem_sentences))
            input_sents.append(num_problem_sentences)

            ## select a concept mapper
            map_times[problem.id] = time.time()
            if options.cheat:
                mapper = concept_mapper.CheatingMapper(problem, options.units)
            else:
                mapper = concept_mapper.HeuristicMapperExp(
                    problem, options.units)

            ## timing test
            mapper.max_sents = max_sents

            ## map input concepts to weights
            success = mapper.map_concepts()
            if not success:
                sys.exit()

            ## choose a subset of the input sentences based on the mapping
            success = mapper.choose_sents()
            if not success:
                sys.exit()
            map_times[problem.id] = time.time() - map_times[problem.id]

            ## testing: histogram of concepts per relevant sentence
            for sent in mapper.relevant_sent_concepts:
                hist[len(sent)] += 1
            hist[0] += (num_problem_sentences
                        - len(mapper.relevant_sent_concepts))
            ## end testing

            ## setup and run the ILP
            run_times[problem.id] = time.time()
            selection = mapper.run(task.length_limit)
            selection = ordering.by_date(selection)
            run_times[problem.id] = time.time() - run_times[problem.id]

            ## TAC id convention is annoying: drop the character at index 5
            output_id = problem.id
            if options.task in ['u09', 'u08']:
                output_id = problem.id[:5] + problem.id[6:]
            word_count = 0
            with open('%s/%s' % (options.output, output_id), 'w') as output_file:
                for sentence in selection:
                    output_file.write(sentence.original + '\n')
                    word_count += len(sentence.original.split())
            curr_time = map_times[problem.id] + run_times[problem.id]
            sys.stderr.write(' word count [%d] time [%1.2fs]\n'
                             % (word_count, curr_time))