# Imports needed by the excerpts below (module paths for the
# project-local WordTokenizer, SentTokenizer, Summarizer base class,
# logger, and hash_obj are assumed to come from the surrounding package).
from collections import defaultdict
from copy import deepcopy
from heapq import heappush, heappushpop
import itertools
import math
import os
import pickle
import random
from random import randint

import numpy as np
from numpy import linalg as LA
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer


def summarize(self, extracted_refs, facet_results, max_length=250):
    '''
    Summarizes the extracted references based on the facet results

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference
            Look at data/task1b_results1.json
        max_length(int) -- maximum length of the summary
    '''
    summaries = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for t in extracted_refs:
        topic = t[0]['topic']
        citance = t[0]['citance_number']
        if isinstance(t[0]['sentence'][0], list):
            logger.warn('Unexpected, should check')
        facet = facet_results[topic.upper()][str(citance)]['SVM_LABEL']
        summaries[topic.upper()][citance][facet].append(t[0]['sentence'])

    # Rank the sentences of each (topic, citance, facet) group with LexRank.
    summarizer = LexRankSummarizer(Stemmer('english'))
    final_summ = defaultdict(lambda: defaultdict(dict))
    counts = defaultdict(lambda: defaultdict(dict))
    for t in summaries:
        for c in summaries[t]:
            for facet in summaries[t][c]:
                summs = list(
                    itertools.chain.from_iterable(summaries[t][c][facet]))
                parser = PlaintextParser.from_string(
                    ' '.join(summs), Tokenizer('english'))
                summ = summarizer(parser.document, max_length)
                final_summ[t][c][facet] = [unicode(sent) for sent in summ]
                counts[t][c][facet] = len(final_summ[t][c][facet])

    # Round-robin over citances and facets, taking the i-th ranked
    # sentence of each group until the word budget is reached.
    summ = defaultdict(list)
    tokzer = WordTokenizer(stem=False)
    for k in final_summ:
        i = 0
        while tokzer.count_words(summ[k]) < max_length:
            added = False
            for c in final_summ[k]:
                for f in final_summ[k][c]:
                    if len(final_summ[k][c][f]) > i and\
                            tokzer.count_words(summ[k]) < max_length:
                        summ[k].append(final_summ[k][c][f][i])
                        added = True
            i += 1
            if not added:
                break  # all groups exhausted; avoid an infinite loop
    return summ
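# For reference, the sumy calls above follow the pattern below. Note that
# the summarizer's second argument is a sentence count, so passing the
# word budget `max_length` requests up to that many *sentences*; the
# word-count loop afterwards does the actual length control. A minimal
# standalone sketch with an invented input string:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lex_rank import LexRankSummarizer

text = 'First sentence. Second sentence. Third sentence.'  # invented input
parser = PlaintextParser.from_string(text, Tokenizer('english'))
lex_rank = LexRankSummarizer(Stemmer('english'))
for sentence in lex_rank(parser.document, 2):  # top two sentences
    print sentence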
class Summarizer(Summarizer):
    '''
    Random selection of the sentences based on citations
    '''

    def __init__(self):
        self.w_t = WordTokenizer(stem=False)

    def summarize(self, citations, max_length=250):
        '''
        Randomly select from citations

        Args:
            citations(list) A list of strings
            max_length(int) maximum length of the summary in words

        Returns
        -------
        List
            A list of ranked strings for the final summary
        '''
        final_sum = deepcopy(citations)
        # Drop random sentences until the summary fits the word budget.
        while self.w_t.count_words(final_sum) > max_length:
            i = random.randint(0, len(final_sum) - 1)
            del final_sum[i]
        return final_sum
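# Usage sketch for the random baseline above (the citance strings are
# invented; assumes the project-local WordTokenizer used by the class):
citations = [
    'The method of Smith et al. improves parsing accuracy.',
    'Their parser was later extended to handle dependencies.',
    'Results on the treebank show a two percent absolute gain.',
]
baseline = Summarizer()
print baseline.summarize(citations, max_length=10)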
def summarize(self, extracted_refs, facet_results, max_length=250):
    '''
    Summarizes the extracted references based on the facet results
    Chooses from facets naively

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference
            Look at data/task1b_results1.json
        max_length(int) -- maximum length of the summary
    '''
    summaries = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        topic = t[0]['topic']
        citance = t[0]['citance_number']
        if isinstance(t[0]['sentence'][0], list):
            logger.warn('Unexpected, should check')
        facet = facet_results[topic.upper()][str(citance)]['SVM_LABEL']
        summaries[topic.upper()][facet].append(t[0]['sentence'])

    final_summ = defaultdict(lambda: defaultdict(dict))
    counts = defaultdict(lambda: defaultdict(dict))
    sent_tok = SentTokenizer(offsets=False)
    for t in summaries:
        for facet in summaries[t]:
            sents = []
            for e in summaries[t][facet]:
                sents.extend(sent_tok(e))
            final_summ[t][facet] = sents
            counts[t][facet] = len(final_summ[t][facet])

    # Take sentences facet by facet, in order, until the budget is met.
    summ = defaultdict(list)
    tokzer = WordTokenizer(stem=False)
    for k in final_summ:
        for f in final_summ[k]:
            for i in range(len(final_summ[k][f])):
                if tokzer.count_words(summ[k]) < max_length:
                    summ[k].append(final_summ[k][f][i])
    return summ
class Summarizer(Summarizer):
    '''
    classdocs
    '''

    def __init__(self, args, opts):
        '''
        Constructor
        '''
        self.s_t = SentTokenizer(offsets=False)
        self.w_t = WordTokenizer(stem=False)

    def summarize(self, extracted_refs, facet_results, max_length=250):
        '''
        Summarizes the extracted references based on the facet results

        Collects the citation texts per topic, then randomly removes
        sentences until the summary fits the length limit
        (facet_results is accepted for interface compatibility but unused)

        Args:
            extracted_refs(list) -- results of the method.run (e.g. simple.py)
            facet_results(dict) -- facets for each extracted reference
                Look at data/task1b_results1.json
            max_length(int) -- maximum length of the summary
        '''
        summaries = defaultdict(list)
        for t in extracted_refs:
            topic = t[0]['topic']
            if isinstance(t[0]['sentence'][0], list):
                logger.warn('Unexpected, should check')
            summaries[topic.upper()].append(t[0]['citation_text'])

        for t in summaries:
            while self.w_t.count_words(summaries[t]) > max_length:
                i = randint(0, len(summaries[t]) - 1)
                summaries[t].pop(i)
        return summaries
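# Input shapes implied by the lookups above (all values are invented for
# illustration; real data lives in data/task1b_results1.json):
example_refs = [
    [{'topic': 'D1404_TRAIN',
      'citance_number': 3,
      'sentence': ['A reference sentence matched for this citance.'],
      'citation_text': 'As shown in prior work, the model gains 2 BLEU.'}],
]
example_facets = {
    'D1404_TRAIN': {'3': {'SVM_LABEL': 'Method_Citation'}},
}
summarizer = Summarizer(args=None, opts=None)
print summarizer.summarize(example_refs, example_facets, max_length=50)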
class MMR(Summarizer):
    '''
    classdocs
    '''

    def __init__(self, lmbda=0.3):
        '''
        Initializes MMR

        Args:
            lmbda(float) the lambda param for MMR
        '''
        self.lmbda = lmbda
        self.w_t = WordTokenizer(stem=False)

    def summarize(self, doc, max_length=10):
        '''
        Summarizes a document or list of docs
        MMR(S) = lambda*Sim(S,D) - (1-lambda)*Sim(S,Summary)

        Args:
            doc: (list) | (str)
            max_length: The maximum length of the desired summary

        Returns
        -------
        list
        '''
        if isinstance(doc, str):
            # a single string: tokenize it into sentences
            s_t = SentTokenizer()
            docs = s_t(doc, offsets=False)
            docs += [doc]  # dummy sentence, the whole document
        else:
            # already a list of sentences, no need to tokenize
            docs = doc + [' '.join(doc)]
        tokzr = self.get_tokenizer('regex', True)
        vectorizer = TfidfVectorizer(min_df=1,
                                     max_df=0.95,  # drop near-ubiquitous terms
                                     tokenizer=tokzr,
                                     stop_words=stopwords.words('english'))
        vectors = vectorizer.fit_transform(docs).toarray()
        doc_texts = {i: v for i, v in enumerate(docs)}
        doc_dict = {i: v for i, v in enumerate(vectors)}

        summ_scores = []  # min-heap of (mmr_score, sentence_id) tuples
        for i, s in doc_texts.iteritems():
            # iterate through sentences to select them for the summary
            if i == len(docs) - 1:
                continue  # skip the dummy whole-document entry
            if len(summ_scores) > 0:
                # current summary as one string
                summ_v = ' '.join([doc_texts[e[1]] for e in summ_scores])
            else:
                summ_v = ''
            if summ_v != '':
                summ_v = vectorizer.transform([summ_v]).toarray()[0]  # tf-idf
            score = self._mmr(vectorizer.transform([s]).toarray()[0],
                              doc_dict[len(doc_dict) - 1],
                              summ_v,
                              self.lmbda,
                              self.cossim)
            if len(summ_scores) < max_length / 30 + 3:
                heappush(summ_scores, (score, i))
            else:
                # evict the current lowest-scoring sentence
                heappushpop(summ_scores, (score, i))

        final_sum = []
        # highest MMR scores first
        for s in sorted(summ_scores, reverse=True):
            if self.w_t.count_words(final_sum) < max_length:
                final_sum.append(doc_texts[s[1]])
        if self.w_t.count_words(final_sum) > max_length:
            tmp = final_sum.pop()
            if self.w_t.count_words(final_sum) == 0:
                final_sum.append(tmp)
        return final_sum

    def _mmr(self, s, D, Summ, lmbda, sim):
        '''
        s: Sentence for evaluation
        D: The whole document
        Summ: The summary
        lmbda: Lambda parameter
        sim: The similarity function

        Returns
        ------
        float
        '''
        if isinstance(Summ, str) and Summ == '':
            # nothing summarized yet: only document relevance counts
            return lmbda * sim(s, D)
        return lmbda * sim(s, D) - (1 - lmbda) * sim(s, Summ)
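# Usage sketch for the MMR summarizer above (invented sentences; assumes
# the get_tokenizer/cossim helpers inherited from the Summarizer base
# class). The second, near-duplicate sentence is penalized by the
# (1 - lambda) * Sim(S, Summary) redundancy term:
mmr = MMR(lmbda=0.5)
sents = [
    'Graph based ranking selects salient sentences for the summary.',
    'Salient sentences for the summary are picked by graph based ranking.',
    'The corpus contains thirty annotated topics.',
]
print mmr.summarize(sents, max_length=20)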
def pick_from_cluster(self, cluster, max_length=250, weighted=False,
                      mode=None, lmb=0.3):
    '''
    Picks sentences from a cluster of sentences

    Args:
        cluster(dict) -- A dictionary whose keys are cluster ids and
            values are lists of sentences belonging to that cluster
        max_length(int) -- maximum length of the summary (in words)
        weighted(bool) -- Weighted based on the number of sentences
            in each cluster
    '''
    word_tokenize = WordTokenizer(stem=False)
    final_sum = []
    if not mode:
        if weighted:
            # Weight each cluster by its number of sentences.
            counts = {}
            idx = {}
            for k in cluster:
                counts[k] = len(cluster[k])
                idx[k] = 0
            num_sents = max_length / float(50) + 2
            while len(final_sum) < num_sents and\
                    word_tokenize.count_words(final_sum) < max_length:
                if all(idx[k] >= len(cluster[k]) for k in cluster):
                    break  # every cluster is exhausted
                weighted_choice = [(k, v) for k, v in counts.iteritems()]
                avg_cnt = np.mean(counts.values())

                def cnvrt(cnt, idx, avg_cnt):
                    # Smooth tiny untouched clusters toward the average.
                    if cnt < 2 and idx == 0:
                        return (cnt + avg_cnt) / 2
                    else:
                        return cnt

                # Sample a cluster id proportionally to its weight.
                population = [
                    val for val, cnt in weighted_choice
                    for _ in range(cnvrt(cnt, idx[val], int(avg_cnt)))]
                to_pick = random.choice(population)
                if (idx[to_pick] < len(cluster[to_pick])) and\
                        (word_tokenize.count_words(final_sum) < max_length):
                    final_sum.append(cluster[to_pick][idx[to_pick]])
                idx[to_pick] += 1
        else:
            # Unweighted: round-robin over clusters, position by position.
            idx = 0
            end = False
            while word_tokenize.count_words(final_sum) < max_length\
                    and not end:
                for k in cluster:
                    if (idx < len(cluster[k])) and\
                            (word_tokenize.count_words(final_sum) <
                             max_length):
                        final_sum.append(cluster[k][idx])
                idx += 1
                if idx > 10:
                    end = True
    elif mode == 'mmr':
        def summarize1(doc, max_length=10):
            '''
            Summarizes a document or list of docs
            MMR(S) = lambda*Sim(S,D) - (1-lambda)*Sim(S,Summary)

            Args:
                doc: (list) | (str)
                max_length: The maximum length of the desired summary

            Returns
            -------
            list
            '''
            if isinstance(doc, str):
                # a single string: tokenize it into sentences
                s_t = SentTokenizer()
                docs = s_t(doc, offsets=False)
                docs += [doc]  # dummy sentence, the whole document
            else:
                # already a list of sentences, no need to tokenize
                docs = doc + [' '.join(doc)]
            tokzr = self.get_tokenizer('regex', True)
            vectorizer = TfidfVectorizer(
                min_df=1, max_df=0.95, tokenizer=tokzr,
                stop_words=stopwords.words('english'))
            vectors = vectorizer.fit_transform(docs).toarray()
            doc_texts = {i: v for i, v in enumerate(docs)}
            doc_dict = {i: v for i, v in enumerate(vectors)}
            summ_scores = []  # min-heap of (mmr_score, sentence_id)
            for i, s in doc_texts.iteritems():
                if i == len(docs) - 1:
                    continue  # skip the dummy whole-document entry
                if len(summ_scores) > 0:
                    # current summary as one string
                    summ_v = ' '.join([doc_texts[e[1]]
                                       for e in summ_scores])
                else:
                    summ_v = ''
                if summ_v != '':
                    summ_v = vectorizer.transform(
                        [summ_v]).toarray()[0]  # to tf-idf
                score = _mmr(vectorizer.transform([s]).toarray()[0],
                             doc_dict[len(doc_dict) - 1],
                             summ_v, lmb, self.cossim)
                if len(summ_scores) < max_length / 30 + 3:
                    heappush(summ_scores, (score, i))
                else:
                    # evict the current lowest-scoring sentence
                    heappushpop(summ_scores, (score, i))
            final_sum = []
            # highest MMR scores first
            for s in sorted(summ_scores, reverse=True):
                if word_tokenize.count_words(final_sum) < max_length:
                    final_sum.append(doc_texts[s[1]])
            if word_tokenize.count_words(final_sum) > max_length:
                tmp = final_sum.pop()
                if word_tokenize.count_words(final_sum) == 0:
                    final_sum.append(tmp)
            return final_sum

        def _mmr(s, D, Summ, lmbda, sim):
            '''
            s: Sentence for evaluation
            D: The whole document
            Summ: The summary
            lmbda: Lambda parameter
            sim: The similarity function

            Returns
            ------
            float
            '''
            if isinstance(Summ, str) and Summ == '':
                # nothing summarized yet: only document relevance counts
                return lmbda * sim(s, D)
            return lmbda * sim(s, D) - (1 - lmbda) * sim(s, Summ)

        # Interleave up to three sentences per cluster, then rank with MMR.
        vals = []
        idx = 0
        l = sum([len(e) for e in cluster.values()])
        while (len(vals) < len(cluster.keys()) * 3) and idx < l:
            for e in cluster.values():
                if idx < len(e):
                    vals.append(e[idx])
            idx += 1
        final_sum = summarize1(vals, max_length)
    elif mode == 'knapsack':
        pass  # not implemented; see the sketch after this class
    return final_sum

def cossim(self, a, b):
    return np.inner(a, b) / (LA.norm(a) * LA.norm(b))

def modified_cosine(self, sentence1, sentence2, tf1, tf2, idf_metrics):
    common_words = frozenset(sentence1) & frozenset(sentence2)
    numerator = 0.0
    for term in common_words:
        numerator += tf1[term] * tf2[term] * idf_metrics[term] ** 2
    denominator1 = sum((tf1[t] * idf_metrics[t]) ** 2 for t in sentence1)
    denominator2 = sum((tf2[t] * idf_metrics[t]) ** 2 for t in sentence2)
    if denominator1 > 0 and denominator2 > 0:
        return numerator / (
            math.sqrt(denominator1) * math.sqrt(denominator2))
    else:
        return 0.0

def get_tokenizer(self, alg='regex', stem=False):
    '''
    Returns a word tokenizer

    Args:
        alg(str) -- tokenization algorithm, valid options: regex, word
        stem(bool)
    '''
    return WordTokenizer(alg=alg, stem=stem)

def tokenize1(self, text, alg='regex', stem=False):
    '''
    Tokenizes a string

    Args:
        text(str)
        alg(str) -- tokenization algorithm, valid options: regex, word
        stem(bool)
    '''
    w_t = WordTokenizer(alg=alg, stem=stem)
    return w_t(text)

def dump_data(self, data, **kwargs):
    if not hasattr(self, 'dumped'):
        self.dumped = {}
    for k, val in kwargs.iteritems():
        # The dump count keeps cache file names unique per call.
        fn = '%s_%s_%s.pickle' % (k, hash_obj(val), len(self.dumped))
        cache_path = os.path.join(self.cachedir, fn)
        if not os.path.exists(cache_path):
            with open(cache_path, 'wb') as f:
                try:
                    pickle.dump(val, f)
                except Exception, e:
                    print e
        self.dumped[k] = cache_path
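# The 'knapsack' mode above is left unimplemented. One standard way to
# fill it, sketched here purely as an illustration (not the author's
# method), is 0/1 knapsack selection: each sentence costs its word count,
# carries a salience score, and total score is maximized within the word
# budget. The `scores` argument is hypothetical; any per-sentence
# salience measure (e.g. an MMR or LexRank score) could be plugged in.
def knapsack_select(sentences, scores, max_words):
    '''Maximize total score subject to a total word-count budget.'''
    weights = [len(s.split()) for s in sentences]
    n = len(sentences)
    best = [[0.0] * (max_words + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        for w in range(max_words + 1):
            best[i][w] = best[i - 1][w]  # skip sentence i-1
            if weights[i - 1] <= w:
                cand = best[i - 1][w - weights[i - 1]] + scores[i - 1]
                if cand > best[i][w]:
                    best[i][w] = cand  # take sentence i-1
    # Backtrack to recover which sentences were selected.
    chosen, w = [], max_words
    for i in range(n, 0, -1):
        if best[i][w] != best[i - 1][w]:
            chosen.append(sentences[i - 1])
            w -= weights[i - 1]
    return list(reversed(chosen))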
class Summarizer(Summarizer):
    '''
    classdocs
    '''

    def __init__(self, args, opts):
        '''
        Constructor
        '''
        self.s_t = SentTokenizer(offsets=False)
        self.w_t = WordTokenizer(stem=False)

    def summarize(self, extracted_refs, facet_results, max_length=250):
        '''
        Summarizes the extracted references based on the facet results

        Uses TextRank to choose the most salient sentences from the
        citation texts in each facet

        Args:
            extracted_refs(list) -- results of the method.run (e.g. simple.py)
            facet_results(dict) -- facets for each extracted reference
                Look at data/task1b_results1.json
            max_length(int) -- maximum length of the summary
        '''
        summaries = defaultdict(lambda: defaultdict(list))
        for t in extracted_refs:
            topic = t[0]['topic']
            citance = t[0]['citance_number']
            if isinstance(t[0]['sentence'][0], list):
                logger.warn('Unexpected, should check')
            facet = facet_results[topic.upper()][str(citance)]['SVM_LABEL']
            summaries[topic.upper()][facet].append([t[0]['citation_text']])

        summarizer = TextRankSummarizer(Stemmer('english'))
        final_summ = defaultdict(lambda: defaultdict(dict))
        ret_summ = defaultdict(list)
        counts = defaultdict(lambda: defaultdict(dict))
        for t in summaries:
            for facet in summaries[t]:
                if len(summaries[t][facet]) > 1:
                    summs = list(
                        itertools.chain.from_iterable(summaries[t][facet]))
                    parser = PlaintextParser.from_string(
                        ' '.join(summs), Tokenizer('english'))
                    summ = summarizer(parser.document, max_length)
                    final_summ[t][facet] = [unicode(sent) for sent in summ]
                    counts[t][facet] = len(final_summ[t][facet])
                else:
                    # single citation text: just sentence-tokenize it
                    final_summ[t][facet] = self.s_t(
                        summaries[t][facet][0][0])

            # Round-robin over facets, taking the i-th ranked sentence
            # from each, until the word budget is reached.
            i = 0
            while self.w_t.count_words(ret_summ[t]) < max_length:
                added = False
                for fct in final_summ[t]:
                    if i < len(final_summ[t][fct]):
                        ret_summ[t].append(final_summ[t][fct][i])
                        added = True
                i += 1
                if not added:
                    break  # all facets exhausted; avoid an infinite loop
            while self.w_t.count_words(ret_summ[t]) > max_length:
                ret_summ[t].pop()

        return ret_summ