Example #1
0
                     d * sum((rank[i] * link) for i, link in enumerate(row)))
                    for row in matrix]
    return rank


def has_converged(x, y, epsilon=EPSILON):
    """Return True if every element of x is within epsilon of its
    counterpart in y.

    Pairing stops at the shorter of the two sequences, matching the
    previous izip-based behavior.
    """
    # zip works on both Python 2 and 3; itertools.izip is Python-2-only
    # and is gone in Python 3.  For these small rank vectors the
    # materialized Py2 zip list is harmless.
    return all(abs(a - b) <= epsilon for a, b in zip(x, y))


def gen_lexrank_summary(orig_sents, max_words):
    """Summarize orig_sents with LexRank, capped at max_words words."""
    # Tokenize once; the token lists feed both the similarity matrix and
    # the length/repeat filtering done while assembling the summary.
    tokenized = [tokenize.word_tokenize(sent) for sent in orig_sents]
    similarity = sim_adj_matrix(tokenized)
    scores = pagerank(normalize_matrix(similarity))
    return gen_summary_from_rankings(scores, tokenized, orig_sents, max_words)



###############################################################################
if __name__ == '__main__':
    # Entry point: write the evaluation config (presumably for ROUGE, given
    # the 'rouge/' output path — confirm against gen_config) for the
    # 'lexrank' system.  The commented-out calls below were earlier steps
    # that generated the summaries themselves.
#    gen_summaries('lexrank', gen_lexrank_summary, 10)
#    sums = [(i, models) for i, _, models, _ in get_collections(False)][10:]
    gen_config('lexrank', 'rouge/lexrank-config.xml',
               'lexrank')#, sums)

Example #2
0
    # TODO: Remove funcwords, etc?
    feat_space = sorted(set().union(*tok_sents))
    vects = [binary_vectorize(feat_space, tok_sent) for tok_sent in tok_sents]
    return gen_summary_from_rankings(centrality(vects), tok_sents, orig_sents,
                                     max_words)


def gen_summary_from_rankings(score, tok_sents, orig_sents, max_words):
    """Greedily pick the highest-ranked sentences until the word budget
    (max_words) is reached.

    Sentences are visited in descending (score, tokens, text) order;
    ones failing the length filter or repeating an already-chosen
    sentence are skipped.  Returns the chosen original sentences.
    """
    by_rank = sorted(zip(score, tok_sents, orig_sents), reverse=True)
    chosen, chosen_toks = [], []
    words_used = 0

    for _, toks, sent in by_rank:
        if words_used >= max_words:
            break
        # Guard clauses instead of a compound condition.
        if not is_valid_sent_len(toks):
            continue
        if is_repeat(toks, chosen_toks):
            continue
        chosen.append(sent)
        chosen_toks.append(toks)
        words_used += len(toks)

    return chosen


if __name__ == '__main__':
    # Entry point: write the evaluation config (presumably for ROUGE, given
    # the 'rouge/' output path — confirm against gen_config) for the
    # 'centrality' system.  The commented-out calls below were earlier runs
    # for the binary-feature variant.
    #    gen_summaries('centrality-binary', gen_centrality_summary, 44, 50)
    #    gen_config('centrality-binary', 'rouge/centrality-binary-config.xml',
    #               'centrality-binary')
    gen_config('centrality', 'rouge/centrality-config.xml', 'centrality')
Example #3
0
    new_rank = [0.0] * n
    while not has_converged(rank, new_rank):
        rank = new_rank
        new_rank = [(((1.0 - d) / n) + d * sum((rank[i] * link)
                                               for i, link in enumerate(row)))
                    for row in matrix]
    return rank


def has_converged(x, y, epsilon=EPSILON):
    """Return True if every element of x is within epsilon of its
    counterpart in y.

    Pairing stops at the shorter of the two sequences, matching the
    previous izip-based behavior.
    """
    # zip works on both Python 2 and 3; itertools.izip is Python-2-only
    # and is gone in Python 3.  For these small rank vectors the
    # materialized Py2 zip list is harmless.
    return all(abs(a - b) <= epsilon for a, b in zip(x, y))


def gen_lexrank_summary(orig_sents, max_words):
    """Summarize orig_sents with LexRank, capped at max_words words."""
    # Tokenize once; the token lists feed both the similarity matrix and
    # the length/repeat filtering done while assembling the summary.
    tokenized = [tokenize.word_tokenize(sent) for sent in orig_sents]
    similarity = sim_adj_matrix(tokenized)
    scores = pagerank(normalize_matrix(similarity))
    return gen_summary_from_rankings(scores, tokenized, orig_sents, max_words)


###############################################################################
if __name__ == '__main__':
    # Entry point: write the evaluation config (presumably for ROUGE, given
    # the 'rouge/' output path — confirm against gen_config) for the
    # 'lexrank' system.  The commented-out calls below were earlier steps
    # that generated the summaries themselves.
    #    gen_summaries('lexrank', gen_lexrank_summary, 10)
    #    sums = [(i, models) for i, _, models, _ in get_collections(False)][10:]
    gen_config('lexrank', 'rouge/lexrank-config.xml', 'lexrank')  #, sums)
Example #4
0
    the summary."""
    tok_sents = [tokenize.word_tokenize(orig_sent) for orig_sent in orig_sents]
    # TODO: Remove funcwords, etc?
    feat_space = sorted(set().union(*tok_sents))
    vects = [binary_vectorize(feat_space, tok_sent) for tok_sent in tok_sents]
    return gen_summary_from_rankings(centrality(vects), tok_sents, orig_sents, max_words)


def gen_summary_from_rankings(score, tok_sents, orig_sents, max_words):
    """Greedily pick the highest-ranked sentences until the word budget
    (max_words) is reached.

    Sentences are visited in descending (score, tokens, text) order;
    ones failing the length filter or repeating an already-chosen
    sentence are skipped.  Returns the chosen original sentences.
    """
    by_rank = sorted(zip(score, tok_sents, orig_sents), reverse=True)
    chosen, chosen_toks = [], []
    words_used = 0

    for _, toks, sent in by_rank:
        if words_used >= max_words:
            break
        # Guard clauses instead of a compound condition.
        if not is_valid_sent_len(toks):
            continue
        if is_repeat(toks, chosen_toks):
            continue
        chosen.append(sent)
        chosen_toks.append(toks)
        words_used += len(toks)

    return chosen


if __name__ == "__main__":
    # Entry point: write the evaluation config (presumably for ROUGE, given
    # the 'rouge/' output path — confirm against gen_config) for the
    # 'centrality' system.  The commented-out calls below were earlier runs
    # for the binary-feature variant.
    #    gen_summaries('centrality-binary', gen_centrality_summary, 44, 50)
    #    gen_config('centrality-binary', 'rouge/centrality-binary-config.xml',
    #               'centrality-binary')
    gen_config("centrality", "rouge/centrality-config.xml", "centrality")