# Example 1
def get_best_ROUGE_L_for_each_abs_sent(art_oovs, all_original_abstracts_sents,
                                       vocab, enc_tokens):
    """Return a binary vector over source sentences, marking the best match
    (by ROUGE-L recall) for each abstract sentence.

    Args:
        art_oovs: article OOV words, forwarded to the tokenizer helper.
        all_original_abstracts_sents: the human abstracts; exactly one
            abstract is expected (cnn/dm has one abstract per article).
        vocab: vocabulary used for token-id conversion and similarity.
        enc_tokens: list of tokenized source sentences.

    Returns:
        A float numpy array of shape [len(enc_tokens)] with 1.0 at the
        indices of the selected source sentences, 0.0 elsewhere.

    Raises:
        Exception: if the number of human summaries is not exactly 1.
    """
    human_tokens = get_tokens_for_human_summaries(
        art_oovs, all_original_abstracts_sents, vocab,
        split_sents=True)  # list (of human summaries) of list of token ids
    # BUGFIX: was `> 1`, which let an empty list through and crashed on the
    # [0] index below; the message already states exactly 1 entry is required.
    if len(human_tokens) != 1:
        raise Exception(
            'human_tokens (len %d) should have 1 entry, because cnn/dm has one abstract per article.'
            % len(human_tokens))
    human_tokens = human_tokens[0]
    metric = 'recall'
    # Rows index source sentences, columns index abstract sentences.
    similarity_matrix = util.rouge_l_similarity_matrix(enc_tokens,
                                                       human_tokens,
                                                       vocab,
                                                       metric=metric)
    best_indices = []
    used = set()  # O(1) membership test instead of scanning best_indices
    for col_idx in range(similarity_matrix.shape[1]):
        col = similarity_matrix[:, col_idx]
        sorted_indices = np.argsort(col)[::-1]  # best-first
        idx = 0
        # Skip source sentences already claimed by an earlier abstract sentence.
        while sorted_indices[idx] in used:
            idx += 1
            if idx >= len(
                    sorted_indices
            ):  # If all sentences have been used then just take the best one
                idx = 0
                break
        best_idx = sorted_indices[idx]
        best_indices.append(best_idx)
        used.add(best_idx)
    binary_y = np.zeros([len(enc_tokens)], dtype=float)
    binary_y[best_indices] = 1
    return binary_y
def cluster_similar_source_sents(article_sent_tokens, similar_source_indices,
                                 vocab, threshold):
    """Greedily cluster the selected source sentences by ROUGE-L F1 similarity.

    Each index in `similar_source_indices` joins the first existing group
    containing a member whose similarity is >= `threshold`; otherwise it
    starts a new group.

    Args:
        article_sent_tokens: all tokenized article sentences.
        similar_source_indices: indices of the sentences to cluster.
        vocab: vocabulary forwarded to the similarity helper.
        threshold: minimum ROUGE-L F1 for two sentences to share a group.

    Returns:
        A list of groups, each a list of sentence indices.
    """
    # BUGFIX: an empty index list used to crash on similar_source_indices[0].
    if not similar_source_indices:
        return []
    chosen_set = set(similar_source_indices)  # O(1) membership below
    chosen_article_sents = [
        sent for i, sent in enumerate(article_sent_tokens)
        if i in chosen_set
    ]
    temp_similarity_matrix = util.rouge_l_similarity_matrix(
        chosen_article_sents, chosen_article_sents, vocab, 'f1')
    # Scatter the compact matrix back to full article-sentence coordinates.
    similarity_matrix = np.zeros(
        [len(article_sent_tokens),
         len(article_sent_tokens)], dtype=float)
    for row_idx in range(len(temp_similarity_matrix)):
        for col_idx in range(len(temp_similarity_matrix)):
            similarity_matrix[
                similar_source_indices[row_idx],
                similar_source_indices[col_idx]] = temp_similarity_matrix[
                    row_idx, col_idx]

    groups = [[similar_source_indices[0]]]
    for sent_idx in similar_source_indices[1:]:
        found_group = False
        for group in groups:
            for group_member in group:
                similarity = similarity_matrix[sent_idx, group_member]
                if similarity >= threshold:
                    found_group = True
                    group.append(sent_idx)
                    break
            if found_group:
                break
        if not found_group:
            groups.append([sent_idx])
    return groups
# Example 3
def get_sent_similarities(summ_sent,
                          article_sent_tokens,
                          vocab,
                          only_rouge_l=False,
                          remove_stop_words=True):
    """Score every article sentence against one summary sentence.

    Computes ROUGE-L recall for each article sentence; unless
    `only_rouge_l` is set, averages it with ROUGE-1 (stop words removed
    when `remove_stop_words`) and ROUGE-2 (stop words kept) recall.

    Returns a 1-D numpy array with one score per article sentence.
    """
    rouge_l_matrix = util.rouge_l_similarity_matrix(
        article_sent_tokens, [summ_sent], vocab, 'recall')
    # Drop the single summary-sentence column axis.
    scores = np.squeeze(rouge_l_matrix, 1)

    if only_rouge_l:
        return scores

    rouge_1_matrix = util.rouge_1_similarity_matrix(
        article_sent_tokens, [summ_sent], vocab, 'recall', remove_stop_words)
    rouge_2_matrix = util.rouge_2_similarity_matrix(
        article_sent_tokens, [summ_sent], vocab, 'recall', False)
    rouge_1 = np.squeeze(rouge_1_matrix, 1)
    rouge_2 = np.squeeze(rouge_2_matrix, 1)
    # Uniform average of the three ROUGE variants.
    return (scores + rouge_1 + rouge_2) / 3.0
def get_sent_similarities(summ_sent, article_sent_tokens, vocab):
    """Score every article sentence against one summary sentence.

    Averages ROUGE-L, ROUGE-1 (stop words removed), and ROUGE-2
    (stop words kept) recall scores.

    Returns a 1-D numpy array with one score per article sentence.
    """
    # BUGFIX: squeeze axis 1 only (the single summary-sentence column), like
    # the rouge_1/rouge_2 lines below. Axis-less squeeze collapsed BOTH axes
    # when the article had one sentence, yielding a 0-d scalar instead of a
    # length-1 array.
    rouge_l = np.squeeze(util.rouge_l_similarity_matrix(article_sent_tokens, [summ_sent], vocab, 'recall'), 1)
    rouge_1 = np.squeeze(util.rouge_1_similarity_matrix(article_sent_tokens, [summ_sent], vocab, 'recall', True), 1)
    rouge_2 = np.squeeze(util.rouge_2_similarity_matrix(article_sent_tokens, [summ_sent], vocab, 'recall', False), 1)
    similarities = (rouge_l + rouge_1 + rouge_2) / 3.0
    return similarities