d * sum((rank[i] * link) for i, link in enumerate(row))) for row in matrix] return rank def has_converged(x, y, epsilon=EPSILON): """Are all the elements in x are within epsilon of their y's?""" for a, b in itertools.izip(x, y): if abs(a - b) > epsilon: return False return True def gen_lexrank_summary(orig_sents, max_words): tok_sents = [tokenize.word_tokenize(orig_sent) for orig_sent in orig_sents] adj_matrix = normalize_matrix(sim_adj_matrix(tok_sents)) rank = pagerank(adj_matrix) return gen_summary_from_rankings(rank, tok_sents, orig_sents, max_words) ############################################################################### if __name__ == '__main__': # Gen summaries # gen_summaries('lexrank', gen_lexrank_summary, 10) # sums = [(i, models) for i, _, models, _ in get_collections(False)][10:] gen_config('lexrank', 'rouge/lexrank-config.xml', 'lexrank')#, sums)
# TODO: Remove funcwords, etc? feat_space = sorted(set().union(*tok_sents)) vects = [binary_vectorize(feat_space, tok_sent) for tok_sent in tok_sents] return gen_summary_from_rankings(centrality(vects), tok_sents, orig_sents, max_words)


def gen_summary_from_rankings(score, tok_sents, orig_sents, max_words):
    """Pick the highest-scoring sentences until the word budget is reached.

    The (score, tokens, sentence) triples are sorted in descending order as
    whole tuples, so equal scores are ordered by their token lists and then
    by the original sentence.

    Args:
        score: One score per sentence.
        tok_sents: The tokenized sentences, parallel to score.
        orig_sents: The original sentences, parallel to score.
        max_words: Stop selecting once this many tokens have been collected.

    Returns:
        The selected original sentences, in descending rank order.
    """
    candidates = list(zip(score, tok_sents, orig_sents))
    candidates.sort(reverse=True)

    chosen, chosen_tokens = [], []
    words_used = 0
    for _, tokens, sentence in candidates:
        # The budget is checked before each candidate, so the last sentence
        # added may push the total past max_words.
        if words_used >= max_words:
            break
        if not is_valid_sent_len(tokens):
            continue
        if is_repeat(tokens, chosen_tokens):
            continue
        chosen.append(sentence)
        chosen_tokens.append(tokens)
        words_used += len(tokens)
    return chosen


if __name__ == '__main__':
    # Gen summaries
    # gen_summaries('centrality-binary', gen_centrality_summary, 44, 50)
    # gen_config('centrality-binary', 'rouge/centrality-binary-config.xml',
    #            'centrality-binary')
    gen_config('centrality', 'rouge/centrality-config.xml', 'centrality')
new_rank = [0.0] * n while not has_converged(rank, new_rank): rank = new_rank new_rank = [(((1.0 - d) / n) + d * sum((rank[i] * link) for i, link in enumerate(row))) for row in matrix] return rank def has_converged(x, y, epsilon=EPSILON): """Are all the elements in x are within epsilon of their y's?""" for a, b in itertools.izip(x, y): if abs(a - b) > epsilon: return False return True def gen_lexrank_summary(orig_sents, max_words): tok_sents = [tokenize.word_tokenize(orig_sent) for orig_sent in orig_sents] adj_matrix = normalize_matrix(sim_adj_matrix(tok_sents)) rank = pagerank(adj_matrix) return gen_summary_from_rankings(rank, tok_sents, orig_sents, max_words) ############################################################################### if __name__ == '__main__': # Gen summaries # gen_summaries('lexrank', gen_lexrank_summary, 10) # sums = [(i, models) for i, _, models, _ in get_collections(False)][10:] gen_config('lexrank', 'rouge/lexrank-config.xml', 'lexrank') #, sums)
the summary.""" tok_sents = [tokenize.word_tokenize(orig_sent) for orig_sent in orig_sents] # TODO: Remove funcwords, etc? feat_space = sorted(set().union(*tok_sents)) vects = [binary_vectorize(feat_space, tok_sent) for tok_sent in tok_sents] return gen_summary_from_rankings(centrality(vects), tok_sents, orig_sents, max_words) def gen_summary_from_rankings(score, tok_sents, orig_sents, max_words): ranked_sents = sorted(zip(score, tok_sents, orig_sents), reverse=True) summary, tok_summary = [], [] word_count = 0 for score, tok_sent, orig_sent in ranked_sents: if word_count >= max_words: break if is_valid_sent_len(tok_sent) and not is_repeat(tok_sent, tok_summary): summary.append(orig_sent) tok_summary.append(tok_sent) word_count += len(tok_sent) return summary if __name__ == "__main__": # Gen summaries # gen_summaries('centrality-binary', gen_centrality_summary, 44, 50) # gen_config('centrality-binary', 'rouge/centrality-binary-config.xml', # 'centrality-binary') gen_config("centrality", "rouge/centrality-config.xml", "centrality")