def extract_ngrams2(self, concept_type='ngrams', n=2):
    """Extract the n-grams of words from the input sentences.

    Args:
        concept_type (str): either 'ngrams' or 'phrase', defaults to 'ngrams'
        n (int): the number of words per n-gram, defaults to 2
    """
    for i, sentence in enumerate(self.sentences):
        untokenized_concepts = []
        if concept_type == 'ngrams':
            ngrams = extract_ngrams2([sentence.untokenized_form],
                                     self.stemmer, self.LANGUAGE, n)
            pruned_list = prune_ngrams(ngrams, self.stoplist, n)
        elif concept_type == 'phrase':
            pruned_list = self.sentences[i].phrases
        else:
            raise ValueError("unknown concept_type: '%s'" % concept_type)

        # recover the surface form of every stemmed concept
        for concept in pruned_list:
            words = unstem_ngram(concept, sentence)
            untokenized_concepts.append(" ".join(words))

        self.sentences[i].concepts = pruned_list
        self.sentences[i].untokenized_concepts = untokenized_concepts
        if len(self.sentences[i].concepts) != len(self.sentences[i].untokenized_concepts):
            raise ValueError(
                "unexpected length difference between concepts and untokenized_concepts")
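# The unstem_ngram helper used above is not defined in this file. Below is a
# minimal sketch of what it plausibly does: align a stemmed concept with the
# sentence's stemmed tokens and return the corresponding surface words. The
# attribute names (sentence.stems, sentence.tokens) are assumptions for
# illustration, not the project's actual API.
def unstem_ngram_sketch(concept, sentence):
    """Return the surface words that correspond to a stemmed n-gram concept."""
    stems = concept.split(' ')
    n = len(stems)
    # slide an n-token window over the sentence's stems; on the first match,
    # return the aligned surface tokens
    for start in range(len(sentence.stems) - n + 1):
        if sentence.stems[start:start + n] == stems:
            return sentence.tokens[start:start + n]
    # fall back to the stems themselves if no alignment is found
    return stems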
def solve_ilp(self, N):
    # build the A matrix: A[i][j] is 1 if the j-th gram appears in the i-th sentence
    A = np.zeros((len(self.sentences_idx), len(self.ref_ngrams_idx)))
    for i in self.sentences_idx:
        sent = self.sentences[i].untokenized_form
        sngrams = list(extract_ngrams2([sent], self.stemmer, self.LANGUAGE, N))
        for j in self.ref_ngrams_idx:
            if self.ref_ngrams[j] in sngrams:
                A[i][j] = 1

    # ILP variables: x_i is 1 if sentence i is selected,
    # z_j is 1 if gram j appears in the created summary
    x = pulp.LpVariable.dicts('sentences', self.sentences_idx,
                              lowBound=0, upBound=1, cat=pulp.LpInteger)
    z = pulp.LpVariable.dicts('grams', self.ref_ngrams_idx,
                              lowBound=0, upBound=1, cat=pulp.LpInteger)

    # ILP objective: maximize coverage of grams from the reference summaries
    prob = pulp.LpProblem("ExtractiveUpperBound", pulp.LpMaximize)
    prob += pulp.lpSum(z[j] for j in self.ref_ngrams_idx)

    # ILP constraints: length constraint and consistency constraints
    # (z_j may only be 1 if gram j appears in the created summary)
    prob += pulp.lpSum(x[i] * self.sentences[i].length
                       for i in self.sentences_idx) <= self.sum_length
    for j in self.ref_ngrams_idx:
        prob += pulp.lpSum(A[i][j] * x[i] for i in self.sentences_idx) >= z[j]

    # solve the ILP and collect the indices of the selected sentences
    prob.solve(pulp.GLPK(msg=0))
    return [idx for idx in self.sentences_idx if x[idx].value() == 1.0]
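# The ILP above is a standard maximum-coverage formulation. The toy,
# self-contained sketch below reproduces it on hand-made data so the
# constraints are easy to check by hand; all names and numbers here are
# illustrative and not part of the project code.
def _demo_coverage_ilp():
    import pulp

    # 3 sentences with word lengths, 4 reference grams;
    # A[i][j] = 1 if sentence i contains gram j
    lengths = [5, 4, 6]
    A = [[1, 1, 0, 0],
         [0, 1, 1, 0],
         [0, 0, 1, 1]]
    budget = 11  # maximum summary length in words

    x = pulp.LpVariable.dicts('x', range(3), cat=pulp.LpBinary)
    z = pulp.LpVariable.dicts('z', range(4), cat=pulp.LpBinary)

    prob = pulp.LpProblem('ToyUpperBound', pulp.LpMaximize)
    prob += pulp.lpSum(z[j] for j in range(4))  # cover as many grams as possible
    prob += pulp.lpSum(lengths[i] * x[i] for i in range(3)) <= budget
    for j in range(4):
        # z_j may only be 1 if some selected sentence contains gram j
        prob += pulp.lpSum(A[i][j] * x[i] for i in range(3)) >= z[j]

    # the default CBC solver ships with pulp; GLPK is not required here
    prob.solve(pulp.PULP_CBC_CMD(msg=0))
    # optimum: sentences 0 and 2 (length 11), covering all four grams
    return [i for i in range(3) if x[i].value() == 1.0]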
def extract_ngrams2(self, concept_type='ngrams', n=2):
    """Extract the n-grams of words from the input sentences.

    Args:
        concept_type (str): either 'ngrams' or 'phrase', defaults to 'ngrams'
        n (int): the number of words per n-gram, defaults to 2
    """
    for i, sentence in enumerate(self.sentences):
        if concept_type == 'ngrams':
            ngrams = extract_ngrams2([sentence.untokenized_form],
                                     self.stemmer, self.LANGUAGE, n)
            pruned_list = prune_ngrams(ngrams, self.stoplist, n)
        elif concept_type == 'phrase':
            pruned_list = self.sentences[i].phrases
        self.sentences[i].concepts = pruned_list
def __init__(self, rouge, models, parse_info, language, stemmer,
             summary_length=100, N=2, stopwords=None, ub_score=None,
             ub_summary=None, summarizer=None, parser_type=None):
    self.rouge = rouge
    self.models = models
    self.language = language
    self.stopwords = stopwords or set()
    self.summary_length = summary_length
    # sets of ngrams/phrases that occur in the reference summaries
    # (for the feedback oracle to peek at)
    self.ref_ngrams = set()
    self.ref_phrases = set()
    self.__ub_summary__ = ub_summary or []
    self.__ub_score__ = ub_score or (0.0, 0.0, 0.0)

    # this only deals with the reference summaries
    parse_info = parse_info or []
    for model_name, model in models:
        y = set(extract_ngrams2(model, stemmer, language, N))
        self.ref_ngrams = self.ref_ngrams.union(y)

    if parser_type == PARSE_TYPE_PARSE:
        for _, parse_sents in parse_info[1]:
            for parse_sent in parse_sents:
                _, phrases = get_parse_info(parse_sent, stemmer, language, stopwords)
                y = set(prune_phrases(phrases, stopwords, stemmer, language))
                self.ref_phrases = self.ref_phrases.union(y)

    if summarizer is not None:
        if parser_type is None or parser_type == PARSE_TYPE_NGRAMS:
            concept_match = [key for key in summarizer.weights
                             if key in self.ref_ngrams]
            log.debug('Total uniq ref concepts (ngr): %s' % len(self.ref_ngrams))
        elif parser_type == PARSE_TYPE_PARSE:
            concept_match = [key for key in summarizer.weights
                             if key in self.ref_phrases]
            log.debug('Total uniq ref concepts (phr): %s' % len(self.ref_phrases))
        else:
            raise ValueError("parser_type '%s' is invalid, should be %s, %s or %s"
                             % (parser_type, None, PARSE_TYPE_NGRAMS, PARSE_TYPE_PARSE))
        log.debug('UB Accept concepts: %s' % len(concept_match))
def get_ref_ngrams(self, N):
    # collect the n-grams of every reference summary into self.ref_ngrams
    for _, summary in self.models:
        self.ref_ngrams.extend(extract_ngrams2(summary, self.stemmer,
                                               self.LANGUAGE, N))
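# The module-level extract_ngrams2 helper is used throughout but not defined
# in this file. Below is a minimal sketch under the assumption that it
# tokenizes, lowercases, stems, and joins n consecutive word stems per gram
# (NLTK-based); the real helper may differ in tokenization and pruning details.
def extract_ngrams2_sketch(sentences, stemmer, language, n=2):
    from nltk.tokenize import word_tokenize

    ngrams = []
    for sentence in sentences:
        stems = [stemmer.stem(tok.lower())
                 for tok in word_tokenize(sentence, language=language)]
        # every window of n consecutive stems becomes one concept string
        for k in range(len(stems) - n + 1):
            ngrams.append(' '.join(stems[k:k + n]))
    return ngrams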
def __call__(self, docs, models, summary_length, oracle_type, ub_score,
             ub_summary, parser_type=None, parse_info=[],
             max_iteration_count=11, weights_override={},
             clear_before_override=None, propagation=False):
    """Start the simulated feedback loop for a single cluster of documents
    against a list of models, i.e. the models are united and the feedback
    loop is simulated towards them.

    :param docs:
    :param models:
    :param summary_length:
    :param oracle_type:
    :param ub_score:
    :param ub_summary:
    :param parser_type:
    :param parse_info:
    :param max_iteration_count: int: maximum number of iterations to run
    :param weights_override: dict: (concept -> double) dictionary containing the override weights for propagation
    """
    self.models = models
    self.summary_length = summary_length
    self.ub_score = ub_score
    self.parse_type = parser_type
    self.cluster_size = len(docs)
    self.MAX_WEIGHT = len(docs)

    for model_name, model in models:
        y = set(extract_ngrams2(model, self.stemmer, self.language, self.N))
        self.ref_ngrams = self.ref_ngrams.union(y)

    if parser_type == PARSE_TYPE_PARSE:
        for _, parse_sents in parse_info[1]:
            for parse_sent in parse_sents:
                _, phrases = get_parse_info(parse_sent, self.stemmer,
                                            self.language, self.stoplist)
                y = set(prune_phrases(phrases, self.stoplist, self.stemmer,
                                      self.language))
                self.ref_phrases = self.ref_phrases.union(y)

    self.summarizer.sentences = self.SumeWrap.load_sume_sentences(
        docs, parser_type, parse_info)
    parse_info = []

    # extract concepts: parse-tree phrases or bigrams
    if self.parse_type == PARSE_TYPE_PARSE:
        print('Get concept types Phrases')
        self.summarizer.extract_ngrams2(concept_type='phrase')
    if self.parse_type is None:
        print('Get concept types ngrams')
        self.summarizer.extract_ngrams2(concept_type='ngrams')

    # compute document frequency as concept weights
    self.summarizer.compute_document_frequency()
    # compute word frequency
    self.summarizer.compute_word_frequency()

    old_sentences = self.summarizer.sentences
    self.summarizer.prune_sentences(remove_citations=True,
                                    remove_redundancy=True, imp_list=[])

    # of all concepts in the pruned sentences, keep only those that also appear elsewhere
    retained_concepts = [concept
                         for s in self.summarizer.sentences
                         for concept in s.concepts]

    print('Total concepts before sentence pruning: ', len(self.summarizer.weights))
    for sentence in set(old_sentences).difference(self.summarizer.sentences):
        for concept in sentence.concepts:
            if concept not in retained_concepts and concept in self.summarizer.weights:
                del self.summarizer.weights[concept]
    print('Total concepts found: ', len(self.summarizer.weights))

    if self.parse_type is None:
        concept_match = [key for key in self.summarizer.weights
                         if key in self.ref_ngrams]
        print('Total ref concepts: ', len(self.ref_ngrams))
    elif self.parse_type == PARSE_TYPE_PARSE:
        concept_match = [key for key in self.summarizer.weights
                         if key in self.ref_phrases]
        print('Total ref concepts: ', len(self.ref_phrases))
    print('UB Accept concepts: ', len(concept_match))

    if oracle_type.startswith(ORACLE_TYPE_ACTIVE_LEARNING):
        self.get_feature_vector()
        self.data = np.array(self.fvector)
        model = svm.SVC(kernel='linear', C=1.0, probability=True,
                        class_weight='balanced')

    self.initial_weights = self.summarizer.weights
    self.__apply_initial_weights_override__(weights_override,
                                            clear_before_override)

    # create the co-occurrence graph (currently disabled)
    # self.graph.clear()
    # self.graph.add_sentences(self.summarizer.sentences)
    # dump_dir = tempfile.mkdtemp(dir=self.debug_dump_target_dir)

    print('Summarizing %s sentences down to %s words'
          % (len(self.summarizer.sentences), self.summary_length))

    # core algorithm for the feedback calculation (as in the paper)
    flag = 0

    # get_details is the personalized-summary function which gets updated
    # weights in every iteration, starting with the Boudin weights
    # (except when weights_override is given).

    # initial iteration
    summary, self.score, subset = self.get_details(1, summary_length, oracle_type)
    self.prev_score = (0.0, 0.0, 0.0)
    prev_summary = ''
    for iteration in range(2, max_iteration_count):
        self.dump_current_weight_map(self.debug_dump_target_dir,
                                     max_iteration_count)
        # depending on the oracle_type, an intermediate summary is generated.
        # This intermediate summary satisfies other optimization criteria, so
        # that the amount/probability of useful feedback is maximized.
        if iteration > 2:
            subset = self.__generate_optimal_feedback_summary__(
                flag, oracle_type, summary_length)
        print('Summary Subset:', subset)

        # acquire feedback and record it using the flight_recorder
        new_accepts, new_rejects, new_implicits = self.get_feedback(subset)
        self.flight_recorder.record(new_accepts, new_rejects, new_implicits)

        # update the summarizer weights for the next iteration
        self.recalculate_weights(oracle_type, propagation)

        summary, self.score, _ = self.get_details(iteration, summary_length,
                                                  oracle_type)

        if oracle_type.startswith(ORACLE_TYPE_ACTIVE_LEARNING):
            self.uncertainity, self.labels = self.get_uncertainity_labels(model)

        if self.check_break_condition(iteration, prev_summary, summary,
                                      ub_summary, self.prev_score):
            break
        self.prev_score = self.score
        prev_summary = summary
    return summary
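# Distilled control flow of the loop above, with the project-specific calls
# replaced by hypothetical stand-ins (get_summary, get_feedback,
# update_weights, converged), for readers who want the shape of the algorithm
# without the bookkeeping. This is a sketch, not the project's API.
def _feedback_loop_sketch(get_summary, get_feedback, update_weights,
                          converged, max_iterations=11):
    summary, prev_summary = get_summary(), ''
    for iteration in range(2, max_iterations):
        accepts, rejects, implicits = get_feedback(summary)  # oracle feedback
        update_weights(accepts, rejects, implicits)          # reweight concepts
        summary = get_summary()                              # re-optimize summary
        if converged(prev_summary, summary):
            break
        prev_summary = summary
    return summary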
def plot_ngrams():
    data_path = "%s/data" % path.dirname(path.dirname(path.abspath(__file__)))
    summary_len, data_set, language = get_args()
    stemmer = SnowballStemmer(language)
    reader = CorpusReader(data_path)
    data = reader.get_data(data_set, summary_len)

    for topic, docs, models in data:
        print(topic)
        summarizer = ExtractiveUpperbound(language)
        ub_summary = summarizer(docs, models, summary_len, ngram_type=2)

        summarizer = SumeWrap(language)
        summarizer.s.sentences = summarizer.load_sume_sentences(docs)
        summarizer.s.extract_ngrams2()
        summarizer.s.compute_document_frequency()
        sorted_list = get_sorted(summarizer.s.weights)

        ngrams_ub = extract_ngrams2(ub_summary, stemmer, language)
        ngrams_models = [extract_ngrams2(model, stemmer, language)
                         for _, model in models]

        # bigrams shared by at least two reference summaries
        inter_ngrams = []
        for i in range(len(ngrams_models)):
            for j in range(i + 1, len(ngrams_models)):
                inter_ngrams.extend(
                    set(ngrams_models[i]).intersection(set(ngrams_models[j])))

        # bigrams unique to each reference summary
        final_ngrams_models = [list(set(ngrams) - set(inter_ngrams))
                               for ngrams in ngrams_models]

        # union of all reference bigrams as the last series
        all_ngrams = []
        for ngrams in ngrams_models:
            all_ngrams.extend(ngrams)
        all_ngrams = list(set(all_ngrams))
        final_ngrams_models.append(all_ngrams)

        # count, per prefix of the sorted source bigrams, how many of each
        # series are matched; stack the per-user counts for plotting
        x = [0]
        y = [[0] for _ in range(len(final_ngrams_models))]
        for i in range(50, len(sorted_list), 10):
            docs_ngrams = sorted_list[:i]
            x.append(i)
            prev_y = 0
            for index in range(len(final_ngrams_models)):
                val = len(ngrams_match(docs_ngrams, final_ngrams_models[index]))
                if index == len(final_ngrams_models) - 1:
                    y[index].append(val)
                else:
                    y[index].append(prev_y + val)
                prev_y += val

        plt.fill_between(x, [0] * len(y[0]), y[0], facecolor='green', interpolate=True)
        plt.plot(x, y[0], 'g', label='Unique bigrams by User 1')
        plt.fill_between(x, y[0], y[1], facecolor='blue', interpolate=True)
        plt.plot(x, y[1], 'b', label='Unique bigrams by User 2')
        plt.fill_between(x, y[1], y[2], facecolor='red', interpolate=True)
        plt.plot(x, y[2], 'r', label='Unique bigrams by User 3')
        plt.fill_between(x, y[2], y[3], facecolor='yellow', interpolate=True)
        plt.plot(x, y[3], 'y', label='Unique bigrams by User 4')
        plt.fill_between(x, y[3], y[4], facecolor='black', interpolate=True)
        plt.plot(x, y[4], 'k', label='Overlapping bigrams between at least two Users')
        plt.legend(loc="upper left", fontsize=10)
        plt.xlabel("No. of sorted bigrams in the source documents", fontsize=20)
        plt.ylabel("Overlapping w.r.t. reference summaries", fontsize=20)
        plt.yscale("linear", linewidth=1)
        plt.grid(True)
        plt.show()