def compute_rouge_abstract_score(sentence, abstract):
    """
    Score a single sentence against a paper's abstract using ROUGE.

    :param sentence: the sentence to score, given as a list of words.
    :param abstract: the paper's abstract, given as a list of strings
                     (one string per abstract sentence).
    :return: the ROUGE score of the sentence with respect to the abstract.
    """
    scorer = Rouge()
    candidate = " ".join(sentence)
    return scorer.calc_score([candidate], abstract)
# Sanity-check the inputs: both lists must be non-empty, and each item must be
# a tuple whose first element is a list of sentence strings.
assert len(golds) > 0
assert len(generateds) > 0
assert type(golds[0]) is tuple
assert type(generateds[0]) is tuple
assert type(golds[0][0][0]) is str
assert type(generateds[0][0][0]) is str

r = Rouge()

# Pair each gold summary with its generated counterpart and attach the average
# per-sentence ROUGE score of the generated sentences against the gold summary.
summaries_and_scores = []
for gold_item, generated_item in zip(golds, generateds):
    gold_sents = gold_item[0]
    generated_sents = generated_item[0]
    sent_scores = [r.calc_score([sent], gold_sents) for sent in generated_sents]
    # An empty generated summary gets a score of zero rather than dividing by zero.
    avg_score = sum(sent_scores) / len(sent_scores) if sent_scores else 0
    summaries_and_scores.append((gold_sents, generated_sents, avg_score, gold_item[1]))

# Order from the lowest-scoring summary to the highest-scoring one.
summaries_and_scores = sorted(summaries_and_scores, key=itemgetter(2))

for item in summaries_and_scores:
    print("\n")
    print("SCORE: ", item[2])
    print()
# Score every section of this paper against the paper's own abstract and
# accumulate the per-section averages into `rouge_by_section`.
abstract = paper["ABSTRACT"]
# Rouge expects each reference sentence as one space-joined string.
abstract_join = [" ".join(x) for x in abstract]
# NOTE(review): `sentences` appears unused in this span — verify against the
# surrounding code before removing it.
sentences = []

# Iterate over the whole paper, one section at a time.
# FIX: .items() replaces the Python 2-only .iteritems(); iteration behaviour
# is identical and the code now also runs on Python 3.
for section, sents in paper.items():
    section_avg_score = 0
    i = 0
    # Average the per-sentence ROUGE scores within this section.
    for sentence in sents:
        r_score = r.calc_score([" ".join(sentence)], abstract_join)
        section_avg_score += r_score
        i += 1
    if i > 0:
        section_avg_score /= i
    else:
        # Empty section: record a score of zero.
        section_avg_score = 0
    # NOTE(review): the ABSTRACT section itself is not skipped here, so its
    # near-perfect self-score is recorded too — confirm this is intended.
    rouge_by_section[section].append(section_avg_score)

# Periodically checkpoint the accumulated scores to disk.
if count % 1000 == 0:
    print("\nWriting data...")
    write_dir = BASE_DIR + "/Data/Generated_Data/Rouge_By_Section_Abstract/"
    with open(write_dir + "rouge_by_section_abstract_list.pkl", "wb") as f:
        pickle.dump(rouge_by_section, f)
def prepare_data(self):
    """
    Puts the data in a form suitable for the Word2Vec classifier - it changes
    each sentence into the average of its constituent word vectors.

    Reads every ``*.txt`` paper in ``PAPER_SOURCE``, scores each non-abstract
    sentence against the paper's highlights with ROUGE, labels the
    ``self.num_summary`` best-scoring sentences as positive (1) and an equal
    number of the worst-scoring sentences as negative (0).

    :return: a list of per-paper dicts, each holding the filename, title,
             gold highlights, abstract, and the shuffled labelled sentences
             (data is balanced: equal positive and negative counts per paper).
    """
    # Count of how many papers have been processed.
    count = 0
    # Per-paper dictionaries of labelled sentences.
    data = []
    r = Rouge()

    # Iterate over every file in the paper directory,
    # ignoring files which are not papers, e.g. hidden files.
    for filename in os.listdir(PAPER_SOURCE):
        if filename.endswith(".txt"):
            # Display a loading bar of progress.
            useful_functions.loading_bar(self.loading_section_size, count, self.number_of_papers)
            count += 1

            # Opens the paper as a dictionary with keys corresponding to section
            # titles and values corresponding to the text in that section, given
            # as a list of sentences, each a list of words.
            paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

            # The highlights are the summarisation target; Rouge expects each
            # reference sentence as one space-joined string.
            highlights = paper["HIGHLIGHTS"]
            highlights_join = [" ".join(x) for x in highlights]
            abstract = paper["ABSTRACT"]

            # Score every sentence in the paper (except the abstract) against
            # the highlights.
            # FIX: .items() replaces the Python 2-only .iteritems(); iteration
            # behaviour is identical and the code now also runs on Python 3.
            sentences = []
            for section, sents in paper.items():
                for sentence in sents:
                    # We don't want to calculate ROUGE for the abstract.
                    # NOTE(review): the HIGHLIGHTS section is NOT excluded, so
                    # highlight sentences score against themselves and will
                    # dominate the positives — confirm whether that is intended.
                    if section != "ABSTRACT":
                        r_score = r.calc_score([" ".join(sentence)], highlights_join)
                        sentences.append((sentence, r_score, section))

            # Sort sentences best-score-first, dropping the score.
            sentences = [(x, section) for x, score, section in reversed(
                sorted(sentences, key=itemgetter(1)))]

            # Top num_summary sentences are positives; the rest are candidates
            # for negatives.
            sents_pos = sentences[0:self.num_summary]
            sents_neg = sentences[self.num_summary:]

            # Skip papers too short to supply a balanced negative set.
            if len(sents_neg) < len(sents_pos):
                continue

            # Label positives 1; take the worst-scoring sentences as negatives,
            # labelled 0, matching the positive count to keep the data balanced.
            sents_pos = [(x[0], x[1], y) for x, y in zip(sents_pos, [1] * len(sents_pos))]
            sents_neg = [x for x in reversed(sents_neg)][:len(sents_pos)]
            sents_neg = [(x[0], x[1], y) for x, y in zip(sents_neg, [0] * len(sents_neg))]
            sents_class = sents_pos + sents_neg
            random.shuffle(sents_class)

            # Each item in the sentence list has form (sentence, section, classification).
            paper = {
                "filename": filename,
                "title": paper["MAIN-TITLE"],
                "gold": paper["HIGHLIGHTS"],
                "abstract": abstract,
                "sentences": sents_class,
                "description": "All text data is given in the form of a list of words."
            }
            data.append(paper)

            # Periodically checkpoint the accumulated data to disk.
            if count % 1000 == 0:
                print("\nWriting data...")
                write_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/"
                with open(write_dir + "data.pkl", "wb") as f:
                    pickle.dump(data, f)

    print("Done")
    return data
# Score every section of this paper against the paper's highlights and
# accumulate the per-section averages into `rouge_by_section`.
# Rouge expects each reference sentence as one space-joined string.
highlights_join = [" ".join(x) for x in highlights]
# NOTE(review): `abstract` and `sentences` appear unused in this span —
# verify against the surrounding code before removing them.
abstract = paper["ABSTRACT"]
sentences = []

# Iterate over the whole paper, one section at a time.
# FIX: .items() replaces the Python 2-only .iteritems(); iteration behaviour
# is identical and the code now also runs on Python 3.
for section, sents in paper.items():
    section_avg_score = 0
    i = 0
    # Average the per-sentence ROUGE scores within this section.
    for sentence in sents:
        r_score = r.calc_score([" ".join(sentence)], highlights_join)
        section_avg_score += r_score
        i += 1
    if i > 0:
        section_avg_score /= i
    else:
        # Empty section: record a score of zero.
        section_avg_score = 0
    # NOTE(review): the HIGHLIGHTS section is not skipped here, so its
    # near-perfect self-score is recorded too — confirm this is intended.
    rouge_by_section[section].append(section_avg_score)

# Periodically checkpoint the accumulated scores to disk.
if count % 1000 == 0:
    print("\nWriting data...")
    write_dir = BASE_DIR + "/Data/Generated_Data/Rouge_By_Section/"
    with open(write_dir + "rouge_by_section_list.pkl", "wb") as f:
        pickle.dump(rouge_by_section, f)