import numpy as np

import util


def get_best_ROUGE_L_for_each_abs_sent(art_oovs, all_original_abstracts_sents,
                                        vocab, enc_tokens):
    """Greedily picks, for each abstract sentence, the not-yet-used article
    sentence with the highest ROUGE-L recall, and returns a binary label
    vector over the article sentences."""
    human_tokens = get_tokens_for_human_summaries(
        art_oovs, all_original_abstracts_sents, vocab,
        split_sents=True)  # list (one entry per reference summary) of lists of token ids
    if len(human_tokens) > 1:
        raise Exception(
            'human_tokens (len %d) should have 1 entry, because cnn/dm has one abstract per article.'
            % len(human_tokens))
    human_tokens = human_tokens[0]
    metric = 'recall'
    similarity_matrix = util.rouge_l_similarity_matrix(
        enc_tokens, human_tokens, vocab, metric=metric)
    best_indices = []
    for col_idx in range(similarity_matrix.shape[1]):
        col = similarity_matrix[:, col_idx]
        sorted_indices = np.argsort(col)[::-1]
        # Walk down the ranking until we reach an article sentence that has
        # not already been chosen for an earlier abstract sentence.
        idx = 0
        while sorted_indices[idx] in best_indices:
            idx += 1
            if idx >= len(sorted_indices):
                # All article sentences have been used; fall back to the best one.
                idx = 0
                break
        best_idx = sorted_indices[idx]
        best_indices.append(best_idx)
    binary_y = np.zeros([len(enc_tokens)], dtype=float)
    binary_y[best_indices] = 1
    return binary_y
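
# For intuition, a minimal sketch of the greedy selection above on a toy
# similarity matrix (hypothetical values; rows are article sentences, columns
# are abstract sentences; no dependency on `util` or `vocab`):
def _demo_greedy_best_indices():
    sim = np.array([[0.9, 0.8],
                    [0.8, 0.7],
                    [0.2, 0.6]])
    best_indices = []
    for col_idx in range(sim.shape[1]):
        sorted_indices = np.argsort(sim[:, col_idx])[::-1]
        idx = 0
        while sorted_indices[idx] in best_indices:
            idx += 1
            if idx >= len(sorted_indices):
                idx = 0
                break
        best_indices.append(sorted_indices[idx])
    # Sentence 0 wins column 0, so column 1 falls to its runner-up, sentence 1.
    return best_indices  # -> [0, 1]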
def cluster_similar_source_sents(article_sent_tokens, similar_source_indices,
                                 vocab, threshold):
    """Groups the selected source sentences, placing each sentence into the
    first existing group containing a member whose ROUGE-L F1 similarity to
    it is at least `threshold` (single-link clustering)."""
    chosen_article_sents = [
        sent for i, sent in enumerate(article_sent_tokens)
        if i in similar_source_indices
    ]
    temp_similarity_matrix = util.rouge_l_similarity_matrix(
        chosen_article_sents, chosen_article_sents, vocab, 'f1')
    # Scatter the pairwise similarities back into a full-size matrix indexed
    # by the original article-sentence positions.
    similarity_matrix = np.zeros(
        [len(article_sent_tokens), len(article_sent_tokens)], dtype=float)
    for row_idx in range(len(temp_similarity_matrix)):
        for col_idx in range(len(temp_similarity_matrix)):
            similarity_matrix[similar_source_indices[row_idx],
                              similar_source_indices[col_idx]] = (
                                  temp_similarity_matrix[row_idx, col_idx])
    groups = [[similar_source_indices[0]]]
    for sent_idx in similar_source_indices[1:]:
        found_group = False
        for group in groups:
            for group_member in group:
                similarity = similarity_matrix[sent_idx, group_member]
                if similarity >= threshold:
                    found_group = True
                    group.append(sent_idx)
                    break
            if found_group:
                break
        if not found_group:
            groups.append([sent_idx])
    return groups
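
# A minimal sketch of the single-link grouping rule above, run on a toy
# similarity matrix (hypothetical values; the real similarities come from
# util.rouge_l_similarity_matrix):
def _demo_grouping(threshold=0.5):
    sim = np.array([[1.0, 0.8, 0.1],
                    [0.8, 1.0, 0.2],
                    [0.1, 0.2, 1.0]])
    indices = [0, 1, 2]
    groups = [[indices[0]]]
    for sent_idx in indices[1:]:
        placed = False
        for group in groups:
            if any(sim[sent_idx, member] >= threshold for member in group):
                group.append(sent_idx)
                placed = True
                break
        if not placed:
            groups.append([sent_idx])
    # Sentences 0 and 1 cluster together; sentence 2 stands alone.
    return groups  # -> [[0, 1], [2]]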
def get_sent_similarities(summ_sent, article_sent_tokens, vocab,
                          only_rouge_l=False, remove_stop_words=True):
    """Scores each article sentence against one summary sentence: ROUGE-L
    recall alone, or the mean of ROUGE-L, ROUGE-1, and ROUGE-2 recall."""
    similarity_matrix = util.rouge_l_similarity_matrix(
        article_sent_tokens, [summ_sent], vocab, 'recall')
    similarities = np.squeeze(similarity_matrix, 1)
    if not only_rouge_l:
        rouge_l = similarities
        rouge_1 = np.squeeze(
            util.rouge_1_similarity_matrix(article_sent_tokens, [summ_sent],
                                           vocab, 'recall', remove_stop_words),
            1)
        rouge_2 = np.squeeze(
            util.rouge_2_similarity_matrix(article_sent_tokens, [summ_sent],
                                           vocab, 'recall', False), 1)
        similarities = (rouge_l + rouge_1 + rouge_2) / 3.0
    return similarities
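
# For intuition, a toy stand-in for what one entry of a recall-oriented
# similarity matrix measures (an illustrative assumption, not the actual
# implementation of util.rouge_1_similarity_matrix): clipped unigram overlap
# divided by reference length.
def _toy_rouge_1_recall(candidate_tokens, reference_tokens):
    from collections import Counter
    cand = Counter(candidate_tokens)
    ref = Counter(reference_tokens)
    overlap = sum(min(cand[tok], count) for tok, count in ref.items())
    return overlap / max(len(reference_tokens), 1)

# e.g. _toy_rouge_1_recall(['the', 'cat', 'sat'], ['the', 'cat']) == 1.0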