import re
import sys

# compression, text, util and ilp are project-local modules; they are
# imported here because the functions below depend on them.
import compression
import ilp
import text
import util


def removeAcronymsFromProblem(problem):
    acronymMapping = compression.generateAcronymMapping(
        problem.get_new_and_old_sentences())
    # this will modify the sentences, but keep parse trees
    compression.replaceAcronyms(problem.get_new_sentences(), acronymMapping)
    return acronymMapping

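# Illustrative note (data shape inferred from the loops below, example value
# made up): the mapping returned by compression.generateAcronymMapping() is
# keyed by the expanded definition, with the acronym as the value, e.g.
#
#   {"World Health Organization": "WHO"}
#
# which is why the ILP code below iterates "for definition, acronym in
# acronymMapping.items()" and charges len(definition.strip().split()) words
# for each acronym used in the summary. replaceAcronyms() rewrites the
# sentences in place; the attached parse trees are kept, per the comment above.
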
# NOTE: this function is redefined below; in Python the later definition wins,
# so this earlier version is effectively shadowed and kept for reference.
def build_alternative_program(problem, concept_weight, length=100,
                              sentences=None, longuest_candidate_only=False):
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in sentences:
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping(
        [s.compression_node for s in sentences])
    #print "generating acronyms"
    acronymMapping = compression.generateAcronymMapping(
        problem.get_new_sentences())
    print problem.id, acronymMapping

    compressed_sentences = []
    seen_sentences = {}
    group_id = 0
    for sentence in sentences:
        subsentences = sentence.compression_node.getNodesByFilter(
            compression.TreebankNode.isSubsentence)
        candidates = {}
        for node in subsentences:
            candidates.update(node.getCandidates(mapping=nounPhraseMapping))
        if longuest_candidate_only:
            # keep only the longest compression candidate
            max_length = 0
            argmax = None
            for candidate in candidates:
                if len(candidate) > max_length:
                    max_length = len(candidate)
                    argmax = candidate
            if argmax is not None:
                candidates = [argmax]
        for candidate in candidates:
            new_sentence = text.Sentence(compression.postProcess(candidate),
                                         sentence.order, sentence.source,
                                         sentence.date)
            if new_sentence.length <= 5:
                continue  # skip short guys
            new_sentence.group_id = group_id
            compressed_sentences.append(new_sentence)
            seen_sentences[new_sentence.original] = 1
        group_id += 1

    compression.replaceAcronyms(compressed_sentences, acronymMapping)

    log_file = open("%s.log" % problem.id, "w")
    for sentence in compressed_sentences:
        # log each candidate with the group it belongs to
        log_file.write("%d %s\n" % (sentence.group_id, str(sentence)))
    log_file.close()

    # generate ids for acronyms
    acronym_id = {}
    acronym_length = {}
    for definition, acronym in acronymMapping.items():
        if acronym not in acronym_id:
            acronym_id[acronym] = len(acronym_id)
            acronym_length[acronym] = len(definition.strip().split())

    # get concepts
    relevant_sentences = []
    sentence_concepts = []
    groups = {}
    used_concepts = set()
    acronym_index = {}
    sent_index = 0
    for sentence in compressed_sentences:
        units = util.get_ngrams(sentence.stemmed, n=2, bounds=False)
        overlapping = set([u for u in units if u in concept_weight])
        if len(overlapping) == 0:
            continue
        relevant_sentences.append(sentence)
        sentence_concepts.append(overlapping)
        used_concepts.update(overlapping)
        if sentence.group_id not in groups:
            groups[sentence.group_id] = []
        groups[sentence.group_id].append(sent_index)
        # generate an acronym index
        for acronym in acronym_id:
            if re.search(r'\b' + acronym + r'\b', sentence.original):
                if acronym not in acronym_index:
                    acronym_index[acronym] = []
                acronym_index[acronym].append(sent_index)
        sent_index += 1

    # build inverted index
    filtered_concepts = {}
    concept_index = {}
    index = 0
    for concept in used_concepts:
        concept_index[concept] = index
        filtered_concepts[concept] = concept_weight[concept]
        index += 1
    relevant_sent_concepts = [[concept_index[c] for c in cs]
                              for cs in sentence_concepts]
    concept_weights = filtered_concepts
    curr_concept_sents = {}
    for sent_index in range(len(relevant_sentences)):
        concepts = relevant_sent_concepts[sent_index]
        for concept in concepts:
            if concept not in curr_concept_sents:
                curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    # generate the actual ILP
    program = ilp.IntegerLinearProgram()
    program.objective["score"] = ' + '.join(
        ['%f c%d' % (concept_weight[concept], concept_index[concept])
         for concept in concept_index])
    s1 = ' + '.join(['%d s%d' % (relevant_sentences[sent_index].length, sent_index)
                     for sent_index in range(len(relevant_sentences))])
    # add enough space to fit the definition of each acronym employed in the summary
    s_acronyms = ' + '.join(['%d a%d' % (acronym_length[acronym], acronym_id[acronym])
                             for acronym in acronym_id])
    if s_acronyms != "":
        s_acronyms = " + " + s_acronyms
    s2 = ' <= %s\n' % length
    program.constraints["length"] = s1 + s_acronyms + s2

    for concept, index in concept_index.items():
        ## at least one sentence containing a selected bigram must be selected
        s1 = ' + '.join(['s%d' % sent_index
                         for sent_index in curr_concept_sents[index]])
        s2 = ' - c%d >= 0' % index
        program.constraints["presence_%d" % index] = s1 + s2
        ## if a bigram is not selected then all sentences containing it are deselected
        s1 = ' + '.join(['s%d' % sent_index
                         for sent_index in curr_concept_sents[index]])
        s2 = ' - %d c%d <= 0' % (len(curr_concept_sents[index]), index)
        program.constraints["absence_%d" % index] = s1 + s2

    # constraints so that acronyms get selected along with sentences they belong to
    for acronym, index in acronym_index.items():
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - a%d >= 0' % acronym_id[acronym]
        program.constraints["acronym_presence_%d" % acronym_id[acronym]] = s1 + s2
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - %d a%d <= 0' % (len(index), acronym_id[acronym])
        program.constraints["acronym_absence_%d" % acronym_id[acronym]] = s1 + s2

    # add sentence compression groups
    for group in groups:
        program.constraints["group_%d" % group] = " + ".join(
            ["s%d" % sent_index for sent_index in groups[group]]) + " <= 1"

    for sent_index in range(len(relevant_sentences)):
        program.binary["s%d" % sent_index] = relevant_sentences[sent_index]
    for concept, index in concept_index.items():
        program.binary["c%d" % index] = 1
    for acronym, a_id in acronym_id.items():
        program.binary["a%d" % a_id] = 1

    sys.stderr.write("compression candidates: %d, original: %d\n"
                     % (len(relevant_sentences), len(sentences)))
    program.acronyms = acronymMapping
    return program

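# For reference, the program built above has the following shape on a toy
# instance: two candidate sentences (lengths 12 and 9 words) compressed from
# the same source sentence, a concept c0 shared by both, a concept c1 in s1
# only, and an acronym a0 appearing in s0 whose definition is 3 words long
# (all numbers illustrative):
#
#   score:    2.500000 c0 + 1.000000 c1      (maximized)
#   length:   12 s0 + 9 s1 + 3 a0 <= 100
#   presence_0: s0 + s1 - c0 >= 0            (c0 credited => a sentence carrying it is chosen)
#   absence_0:  s0 + s1 - 2 c0 <= 0          (a carrying sentence chosen => c0 must be on)
#   acronym_presence_0: s0 - a0 >= 0         (definition space reserved => a sentence uses it)
#   acronym_absence_0:  s0 - 1 a0 <= 0       (a sentence uses it => space is reserved)
#   group_0:  s0 + s1 <= 1                   (at most one compression per source sentence)
#
# with every s*, c* and a* variable declared binary.
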
def build_alternative_program(problem, concept_weight, length=100,
                              sentences=None, longuest_candidate_only=False,
                              providedAcronyms=None):
    if not sentences:
        sentences = problem.get_new_sentences()

    for sentence in problem.get_new_and_old_sentences():
        if not hasattr(sentence, "compression_node"):
            sentence.compression_node = compression.TreebankNode(sentence.parsed)

    nounPhraseMapping = compression.generateNounPhraseMapping(
        [s.compression_node for s in problem.get_new_and_old_sentences()])
    #print "generating acronyms"
    if providedAcronyms:
        acronymMapping = providedAcronyms
    else:
        acronymMapping = compression.generateAcronymMapping(
            problem.get_new_and_old_sentences())
    print problem.id, acronymMapping

    compressed_sentences = []
    seen_sentences = {}
    group_id = 0
    for sentence in sentences:
        subsentences = sentence.compression_node.getNodesByFilter(
            compression.TreebankNode.isSubsentence)
        candidates = {}
        for node in subsentences:
            candidates.update(node.getCandidates(beam=100,
                                                 mapping=nounPhraseMapping,
                                                 use_mandatory_removals=True))
        if longuest_candidate_only:
            # keep only the longest compression candidate
            max_length = 0
            argmax = None
            for candidate in candidates:
                if len(candidate) > max_length:
                    max_length = len(candidate)
                    argmax = candidate
            if argmax is not None:
                candidates = [argmax]
        for candidate in candidates:
            new_sentence = text.Sentence(compression.postProcess(candidate),
                                         sentence.order, sentence.source,
                                         sentence.date)
            if new_sentence.length <= 5:
                continue  # skip short guys
            new_sentence.group_id = group_id
            compressed_sentences.append(new_sentence)
            seen_sentences[new_sentence.original] = 1
        group_id += 1

    compression.replaceAcronyms(compressed_sentences, acronymMapping)

    #log_file = open("%s.log" % problem.id, "w")
    #for sentence in compressed_sentences:
    #    log_file.write("%d %s\n" % (sentence.group_id, str(sentence)))
    #log_file.close()

    # generate ids for acronyms
    acronym_id = {}
    acronym_length = {}
    for definition, acronym in acronymMapping.items():
        if acronym not in acronym_id:
            acronym_id[acronym] = len(acronym_id)
            acronym_length[acronym] = len(definition.strip().split())

    # get concepts
    relevant_sentences = []
    sentence_concepts = []
    groups = {}
    used_concepts = set()
    acronym_index = {}
    sent_index = 0
    for sentence in compressed_sentences:
        units = util.get_ngrams(sentence.stemmed, n=2, bounds=False)
        overlapping = set([u for u in units if u in concept_weight])
        if len(overlapping) == 0:
            continue  # get rid of sentences that do not overlap with concepts
        relevant_sentences.append(sentence)
        sentence_concepts.append(overlapping)
        used_concepts.update(overlapping)
        if sentence.group_id not in groups:
            groups[sentence.group_id] = []
        groups[sentence.group_id].append(sent_index)
        # generate an acronym index
        for acronym in acronym_id:
            if re.search(r'\b' + acronym + r'\b', sentence.original):
                if acronym not in acronym_index:
                    acronym_index[acronym] = []
                acronym_index[acronym].append(sent_index)
        sent_index += 1

    # build inverted index
    filtered_concepts = {}
    concept_index = {}
    index = 0
    for concept in used_concepts:
        concept_index[concept] = index
        filtered_concepts[concept] = concept_weight[concept]
        index += 1
    relevant_sent_concepts = [[concept_index[c] for c in cs]
                              for cs in sentence_concepts]
    concept_weights = filtered_concepts
    curr_concept_sents = {}
    for sent_index in range(len(relevant_sentences)):
        concepts = relevant_sent_concepts[sent_index]
        for concept in concepts:
            if concept not in curr_concept_sents:
                curr_concept_sents[concept] = []
            curr_concept_sents[concept].append(sent_index)

    # generate the actual ILP
    program = ilp.IntegerLinearProgram()
    program.objective["score"] = ' + '.join(
        ['%f c%d' % (concept_weight[concept], concept_index[concept])
         for concept in concept_index])
    s1 = ' + '.join(['%d s%d' % (relevant_sentences[sent_index].length, sent_index)
                     for sent_index in range(len(relevant_sentences))])
    # add enough space to fit the definition of each acronym employed in the summary
    s_acronyms = ' + '.join(['%d a%d' % (acronym_length[acronym], acronym_id[acronym])
                             for acronym in acronym_id])
    if s_acronyms != "":
        s_acronyms = " + " + s_acronyms
    s2 = ' <= %s\n' % length
    program.constraints["length"] = s1 + s_acronyms + s2

    for concept, index in concept_index.items():
        ## at least one sentence containing a selected bigram must be selected
        s1 = ' + '.join(['s%d' % sent_index
                         for sent_index in curr_concept_sents[index]])
        s2 = ' - c%d >= 0' % index
        program.constraints["presence_%d" % index] = s1 + s2
        ## if a bigram is not selected then all sentences containing it are deselected
        ## this constraint is disabled since it is not necessary when all
        ## sentences contain at least one concept; it might also be the reason
        ## for singular matrices that crash the solver
        #s1 = ' + '.join(['s%d' % sent_index for sent_index in curr_concept_sents[index]])
        #s2 = ' - %d c%d <= 0' % (len(curr_concept_sents[index]), index)
        #program.constraints["absence_%d" % index] = s1 + s2

    # constraints so that acronyms get selected along with sentences they belong to
    for acronym, index in acronym_index.items():
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - a%d >= 0' % acronym_id[acronym]
        program.constraints["acronym_presence_%d" % acronym_id[acronym]] = s1 + s2
        s1 = ' + '.join(['s%d' % sent_index for sent_index in index])
        s2 = ' - %d a%d <= 0' % (len(index), acronym_id[acronym])
        program.constraints["acronym_absence_%d" % acronym_id[acronym]] = s1 + s2

    # add sentence compression groups (skipped for singleton groups)
    for group in groups:
        if len(groups[group]) > 1:
            program.constraints["group_%d" % group] = " + ".join(
                ["s%d" % sent_index for sent_index in groups[group]]) + " <= 1"

    for sent_index in range(len(relevant_sentences)):
        program.binary["s%d" % sent_index] = relevant_sentences[sent_index]
    for concept, index in concept_index.items():
        program.binary["c%d" % index] = 1
    for acronym, a_id in acronym_id.items():
        program.binary["a%d" % a_id] = 1

    sys.stderr.write("compression candidates: %d, original: %d\n"
                     % (len(relevant_sentences), len(sentences)))
    program.acronyms = acronymMapping
    return program

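# This second definition supersedes the earlier one: it builds compression
# nodes and noun-phrase/acronym mappings over all (new and old) sentences,
# compresses with a beam and mandatory removals, drops the redundant absence
# constraints, and only emits a group constraint when a group actually has
# more than one candidate.
#
# Minimal usage sketch (assumptions flagged: `problem` is a task object
# exposing id, get_new_sentences() and get_new_and_old_sentences();
# `bigram_counts` is a hypothetical precomputed frequency dict whose keys
# match whatever util.get_ngrams() returns; how the returned
# IntegerLinearProgram is solved depends on the ilp module and is not
# shown here):
#
#   concept_weight = dict((bigram, count) for bigram, count
#                         in bigram_counts.items() if count >= 3)
#   program = build_alternative_program(problem, concept_weight, length=100)
#   # solve `program` with the ilp module's solver, then read the selected
#   # candidates back from program.binary["s%d" % i] for each s-variable
#   # set to 1 in the solution.
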