def extract_ngrams2(self, concept_type='ngrams', n=2):
    """Extract the ngrams of words from the input sentences.

    Args:
        concept_type (str): the type of concepts to extract, either
            'ngrams' or 'phrase', defaults to 'ngrams'.
        n (int): the number of words for ngrams, defaults to 2.
    """
    for i, sentence in enumerate(self.sentences):
        untokenized_concepts = []
        if concept_type == 'ngrams':
            ngrams = extract_ngrams2([sentence.untokenized_form],
                                     self.stemmer, self.LANGUAGE, n)
            pruned_list = prune_ngrams(ngrams, self.stoplist, n)
        elif concept_type == 'phrase':
            pruned_list = self.sentences[i].phrases

        # map each (stemmed) concept back to its surface form
        for concept in pruned_list:
            wrds = unstem_ngram(concept, sentence)
            untokenized_concepts.append(" ".join(wrds))

        self.sentences[i].concepts = pruned_list
        self.sentences[i].untokenized_concepts = untokenized_concepts

        if len(self.sentences[i].concepts) != len(
                self.sentences[i].untokenized_concepts):
            raise RuntimeError("unexpected length difference between "
                               "concepts and untokenized_concepts")
def prune_concepts(self, method="threshold", value=3, rejected_list=[]): """Prune the concepts for efficient summarization. Args: method (str): the method for pruning concepts that can be whether by using a minimal value for concept scores (threshold) or using the top-N highest scoring concepts (top-n), defaults to threshold. value (int): the value used for pruning concepts, defaults to 3. """ if method == 'stopwords': concepts = self.weights.keys() for concept in concepts: pruned_list = prune_ngrams(concept, self.stoplist, 1) if not pruned_list: #print concept, self.weights[concept] del self.weights[concept] if method == "list": concepts = self.weights.keys() for concept in concepts: if concept in rejected_list: #print concept, self.weights[concept] del self.weights[concept] # 'threshold' pruning method if method == "threshold": # iterates over the concept weights concepts = self.weights.keys() for concept in concepts: if self.weights[concept] < value: del self.weights[concept] # 'top-n' pruning method elif method == "top-n": # sort concepts by scores sorted_concepts = sorted(self.weights, key=lambda x: self.weights[x], reverse=True) # iterates over the concept weights concepts = self.weights.keys() for concept in concepts: if concept not in sorted_concepts[:value]: del self.weights[concept] # iterates over the sentences for i in range(len(self.sentences)): # current sentence concepts concepts = self.sentences[i].concepts # prune concepts self.sentences[i].concepts = [ c for c in concepts if c in self.weights ]
def __call__(self, docs, models, length, ngram_type=2):
    """Run the summarization pipeline and return the summary text."""
    self.sum_length = int(length)
    self.load_data(docs, models)

    # build and prune the reference ngrams used by the oracle
    self.get_ref_ngrams(ngram_type)
    self.ref_ngrams = prune_ngrams(self.ref_ngrams, self.stoplist, ngram_type)

    # self.prune_sentences(remove_citations=True, remove_redundancy=True)
    self.sentences_idx = list(range(len(self.sentences)))
    self.ref_ngrams_idx = list(range(len(self.ref_ngrams)))

    # solve the ILP and assemble the summary from the selected sentences
    summary_idx = self.solve_ilp(ngram_type)
    summary_txt = self.get_summary_text(summary_idx)

    return summary_txt
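# Hedged usage sketch for the __call__ pipeline above. Only the call
# signature (docs, models, length, ngram_type) comes from this file; the
# class name `SummarizerClass` and the input variables are placeholders
# that do not appear in the source.
#
#     summarizer = SummarizerClass(...)   # construction not shown here
#     summary_text = summarizer(docs=input_docs,
#                               models=reference_summaries,
#                               length=100, ngram_type=2)
#     print(summary_text)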
def extract_ngrams2(self, concept_type='ngrams', n=2):
    """Extract the ngrams of words from the input sentences.

    Args:
        concept_type (str): the type of concepts to extract, either
            'ngrams' or 'phrase', defaults to 'ngrams'.
        n (int): the number of words for ngrams, defaults to 2.
    """
    for i, sentence in enumerate(self.sentences):
        if concept_type == 'ngrams':
            ngrams = extract_ngrams2([sentence.untokenized_form],
                                     self.stemmer, self.LANGUAGE, n)
            pruned_list = prune_ngrams(ngrams, self.stoplist, n)
        elif concept_type == 'phrase':
            pruned_list = self.sentences[i].phrases
        self.sentences[i].concepts = pruned_list
def recommend_highest_weight(self, samples, limit=1, prune=True):
    """Recommend the highest-weighted concepts among the given samples,
    skipping concepts that already received feedback."""
    weights = dict(self.graph.get_weights())
    ranked = sorted(weights, key=weights.get, reverse=True)

    # drop concepts that were already accepted or (implicitly) rejected
    feedback = self.flight_recorder.union()
    ranked = [concept for concept in ranked
              if concept not in feedback.reject
              and concept not in feedback.accept
              and concept not in feedback.implicit_reject]

    # optionally drop concepts consisting only of stopwords
    candidates = prune_ngrams(ranked, self.stoplist, self.N) if prune else ranked

    # keep only concepts that actually occur in the candidate samples
    result = [concept for concept in candidates if concept in samples]
    return result[:limit]
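# Standalone sketch of the recommendation logic above: rank concepts by
# weight, drop everything that already received feedback, and return the
# highest-ranked concepts that occur in the candidate samples. All names
# and data here are illustrative, not part of the source.
def _recommend_example(weights, prior_feedback, samples, limit=1):
    ranked = sorted(weights, key=weights.get, reverse=True)
    candidates = [c for c in ranked if c not in prior_feedback]
    return [c for c in candidates if c in samples][:limit]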
def get_feedback(self, subset, recommender=None):
    """Generate feedback for the subset sentences by peeking into the
    reference summary.

    :param subset: the indices of the sentences to get feedback for.
    :param recommender: strategy for selecting which of the sampled
        concepts are sent to the oracle; by default all of them are used.
    """
    # currently unused (all writing occurrences are commented out)
    new_implicit_rejects = set()

    summary = [self.summarizer.sentences[j].untokenized_form for j in subset]
    # print('Feedback-optimal summary:', summary)

    # collect the candidate concepts and the matching reference concepts
    if self.parse_type == 'parse':
        print('feedback on phrases')
        summary_phrases = [self.summarizer.sentences[j].phrases
                           for j in subset]
        samples = list(itertools.chain(*summary_phrases))
        references = self.ref_phrases
    elif self.parse_type is None:
        print('feedback on ngrams')
        summary_concepts = [self.summarizer.sentences[j].concepts
                            for j in subset]
        samples = list(itertools.chain(*summary_concepts))
        references = self.ref_ngrams

    # from all samples, use a sub-set chosen by the recommender strategy
    if recommender is None:
        use_samples = samples
    elif recommender == RECOMMENDER_METHOD_SAMPLING:
        use_samples = random.sample(
            samples, self.allowed_number_of_feedback_per_iteration)
    elif recommender == RECOMMENDER_METHOD_HIGHEST_WEIGHT:
        use_samples = self.recommend_highest_weight(
            samples, self.allowed_number_of_feedback_per_iteration)

    # split the sampled concepts into accepts and rejects, keeping only
    # those that were not recorded in earlier iterations
    new_rejects = list(
        self.Oracle.reject_concepts(use_samples, references)
        - self.flight_recorder.union().reject)
    new_accepts = list(
        self.Oracle.accept_concepts(use_samples, references)
        - self.flight_recorder.union().accept)

    new_rejects = prune_ngrams(new_rejects, self.stoplist, self.N)
    new_accepts = prune_ngrams(new_accepts, self.stoplist, self.N)

    return (new_accepts, new_rejects, new_implicit_rejects)
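# Standalone sketch of the oracle feedback split performed above, under the
# assumption (suggested by the docstring) that concepts found in the
# reference are accepted and the rest rejected, minus anything recorded in
# earlier iterations. All names here are illustrative.
def _feedback_split_example(samples, reference, prior_accepts, prior_rejects):
    accepts = {c for c in samples if c in reference} - set(prior_accepts)
    rejects = {c for c in samples if c not in reference} - set(prior_rejects)
    return accepts, rejects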