def prepare_for_summarisation(self, filename, visualise=False):
    """
    Prepares a paper to be summarised by the Word2Vec method.
    :param filename: the filename of the paper to summarise
    :param visualise: true if visualising
    :return: the paper in a form suitable to be summarised with the trained models.
    """
    sentences = self.paper2orderedlist(filename)

    # The final form will be an ordered list of tuples, where each tuple has the form
    # (sentence_text, sentence_vector, abstract_vector, features).
    final_form = []

    raw_paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)
    abstract = raw_paper["ABSTRACT"]
    abs_vector = self.abstract2vector(abstract)
    prev_section = ""

    # Compute and cache the bag of words for this paper if it hasn't been computed already.
    if filename not in self.paper_bags_of_words:
        paper_str = useful_functions.read_in_paper(filename)
        paper_str = " ".join([val for _, val in paper_str.iteritems()]).lower()
        paper_bag_of_words = useful_functions.calculate_bag_of_words(paper_str)
        self.paper_bags_of_words[filename] = paper_bag_of_words

    # Cache the keyphrases for this paper if they haven't been extracted already.
    if filename not in self.keyphrases:
        self.keyphrases[filename] = raw_paper["KEYPHRASES"]

    for sentence, section in sentences:

        sentence_vector = useful_functions.sentence2vec(sentence, self.word2vec)

        features = self.calculate_features(sentence,
                                           self.paper_bags_of_words[filename],
                                           self.keyphrases[filename],
                                           [" ".join(x) for x in abstract],
                                           " ".join(raw_paper["MAIN-TITLE"][0]),
                                           section,
                                           shorter=True)

        if not visualise:
            final_form.append((sentence, sentence_vector, abs_vector, features))
        else:
            # When visualising, insert a zero-vector marker tuple at the start of each new section.
            if prev_section != section:
                print("----> Adding section: ", section)
                final_form.append(([section],
                                   np.zeros_like(sentence_vector),
                                   np.zeros_like(sentence_vector),
                                   np.zeros_like(features)))
                prev_section = section
            final_form.append((sentence, sentence_vector, abs_vector, features))

    return final_form
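# A minimal usage sketch for prepare_for_summarisation, kept as comments. The summariser class name
# below is hypothetical (not confirmed by this repository); the example filename is one mentioned in
# the comments of the main script further down.
#
#   summariser = Word2VecSummariser()                                   # hypothetical class name
#   prepared = summariser.prepare_for_summarisation("S0168874X14001395.txt")
#   for sentence, sentence_vec, abstract_vec, features in prepared:
#       # each item is a (sentence_text, sentence_vector, abstract_vector, features) tuple
#       print(" ".join(sentence), features)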
def prepare_paper(self, filename):
    """
    Prepares the paper for summarisation.
    :return: the paper in a form suitable for summarisation.
    """
    paper = useful_functions.read_in_paper(filename, sentences_as_lists=True, preserve_order=True)
    return paper
def paper2orderedlist(self, filename):
    """
    Performs the first task necessary to summarise a paper: turning it into an ordered list of sentences
    which doesn't include the highlights or abstract section of the paper (as these are already summaries).
    :param filename: the filename of the paper to summarise.
    :return: the paper as an ordered list of sentences, not including the abstract or highlights.
    """
    paper = useful_functions.read_in_paper(filename, sentences_as_lists=True, preserve_order=True)

    # We don't want to make any predictions for the Abstract or Highlights as these are already summaries.
    sections_to_predict_for = []
    for section, text in paper.iteritems():
        if section != "ABSTRACT" and section != "HIGHLIGHTS":
            sections_to_predict_for.append(text)

    # Sorts the sections according to the order in which they appear in the paper.
    sorted_sections_to_predict_for = sorted(sections_to_predict_for, key=itemgetter(1))

    # Creates an ordered list of the sentences in the paper.
    sentence_list = []
    for section_sentences, section_position_in_paper in sorted_sections_to_predict_for:
        for sentence in section_sentences:
            sentence_list.append(sentence)

    return sentence_list
def dump_keyphrases_pkl(out_file):
    """
    Extracts the author-defined keyphrases from every paper and pickles them as a dictionary
    mapping filename to keyphrases.
    :param out_file: the path to write the pickle to.
    """
    files = os.listdir(BASE_DIR + "Data/Papers/Full/Papers_With_Section_Titles/")
    keyphrases = {}
    for file in files:
        paper = useful_functions.read_in_paper(file, sentences_as_lists=False)
        keyphrases[file] = paper["KEYPHRASES"]
    write_pkl(keyphrases, out_file)
def dump_bow_pkl(out_file):
    """
    Calculates the bag-of-words representation of every paper and pickles the result as a dictionary
    mapping filename to bag of words.
    :param out_file: the path to write the pickle to.
    """
    files = os.listdir(BASE_DIR + "Data/Papers/Full/Papers_With_Section_Titles/")
    bow = {}
    for file in files:
        paper = useful_functions.read_in_paper(file, sentences_as_lists=False)
        paper_string = " ".join(v for k, v in paper.iteritems())
        bow[file] = useful_functions.calculate_bag_of_words(paper_string)
    write_pkl(bow, out_file)
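# A minimal sketch of how the two dump helpers above might be invoked and how their caches could be read
# back. It assumes write_pkl simply pickles the dictionary it is given and that the output paths below are
# free to choose; neither detail is confirmed by this file.
import pickle

dump_keyphrases_pkl(BASE_DIR + "Data/Cache/keyphrases.pkl")           # assumed output path
dump_bow_pkl(BASE_DIR + "Data/Cache/paper_bags_of_words.pkl")         # assumed output path

with open(BASE_DIR + "Data/Cache/paper_bags_of_words.pkl", "rb") as f:
    paper_bags_of_words = pickle.load(f)                              # {filename: bag of words}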
if __name__ == "__main__":
    # Paper One:   S0168874X14001395.txt
    # Paper Two:   S0141938215300044.txt
    # Paper Three: S0142694X15000423.txt
    summ = AbstractRougeSummariser()
    # summ.summarise("S0142694X15000423.txt")

    count = 0
    for filename in os.listdir(PAPER_SOURCE):

        if count > 150:
            break

        if filename.endswith(".txt"):

            # We need to write the highlights as a gold summary with the same name as the generated summary.
            highlights = useful_functions.read_in_paper(filename, True)["HIGHLIGHTS"]
            useful_functions.write_gold(SUMMARY_WRITE_LOC, highlights, filename)

            # Display a loading bar of progress
            useful_functions.loading_bar(LOADING_SECTION_SIZE, count, NUMBER_OF_PAPERS)

            # Generate and write a summary
            summ.summarise(filename)

            count += 1
        tf_idfs.append(word_tf_idf)

    # Pair each word in the sentence with its TF-IDF score.
    return [x for x in zip(sentence, tf_idfs)]


with open(BASE_DIR + "/Visualisations/base_html.txt", "rb") as f:
    html = f.readlines()

html.append("<body>\n")
html.append("<div class=\"container\">\n")
html.append("<div id=\"title\" class=\"text\">")

filename = "S0920548915000744.txt"
paper = useful_functions.read_in_paper(filename, sentences_as_lists=True, preserve_order=True)

html.append("<h1>" + " ".join(paper["MAIN-TITLE"][0][0]) + "</h1>")
html.append("</div>")
html.append("<div id=\"gold\" class=\"text\">")
html.append("<h2>Human Written Summary</h2>")
html.append("<hr>")
html.append("<br>")
html.append("<p>")

highlights = paper["HIGHLIGHTS"]

print("Reading stuff...")

bag_of_words = defaultdict(float)
for key, val in paper.iteritems():
    sents = val[0]
def process_paper(filename):
    """
    The concurrent function which processes each paper into its training data format.
    :param filename: the filename of the paper to process.
    :return: none, but writes the preprocessed file to "Data/Training_Data/".
    """
    # print("--> Started processing ", filename)

    # Start time
    start_time = time.time()

    # Read in the paper
    paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

    # Extract the gold summary
    gold = paper["HIGHLIGHTS"]
    gold_string_list = [" ".join(x) for x in gold]

    # Extract the title
    title = paper["MAIN-TITLE"][0]
    title_string = " ".join(title)

    # Extract the abstract
    abstract = paper["ABSTRACT"]
    abstract_string_list = [" ".join(x) for x in abstract]

    # Extract the keyphrases
    try:
        keyphrases = paper["KEYPHRASES"][0]
    except IndexError:
        keyphrases = []

    # Turn the paper into a single string and calculate its bag of words
    paper_string = " ".join([" ".join(x) for key, val in paper.iteritems() for x in val])
    bag_of_words = useful_functions.calculate_bag_of_words(paper_string)

    # Get the paper as a list of sentences, associating each sentence with its section name - this will be used
    # by the oracle to find the best summary sentences.
    paper_sentences = [(" ".join(x), key) for key, val in paper.iteritems() for x in val if key != "ABSTRACT"]

    # Create a list of sentences, their ROUGE-L scores with the Highlights and the section they occur in
    # (as a string)
    sents_scores_secs = []
    for sentence, section in paper_sentences:
        # The candidate sentence needs to be passed to calc_score as a single-item list
        r_score = rouge.calc_score([sentence], gold_string_list)
        sents_scores_secs.append((sentence.split(" "), r_score, section))

    # Sort the sentences, scores and sections into descending order of ROUGE score
    sents_scores_secs = sorted(sents_scores_secs, key=itemgetter(1), reverse=True)

    pos_sents_scores_secs = sents_scores_secs[:num_summary]
    neg_sents_scores_secs = sents_scores_secs[num_summary:]

    if len(neg_sents_scores_secs) < len(pos_sents_scores_secs):
        print("{}**** NOT A SUFFICIENT AMOUNT OF DATA IN PAPER {}, IGNORING PAPER ****{}".format(
            Color.RED, filename, Color.END))
        return

    # Positive sentences
    positive_sents_secs_class = [(sent, sec, 1) for sent, _, sec in pos_sents_scores_secs]

    # Negative sentences
    # Take the sentences not used as positives, reverse the list so the worst-scoring sentences come first,
    # then take an equal number to the positives so the data stays balanced.
    neg_sents_scores_secs = [x for x in reversed(neg_sents_scores_secs)][:len(positive_sents_secs_class)]
    negative_sents_secs_class = [(sent, sec, 0) for sent, _, sec in neg_sents_scores_secs]

    # Don't create data from this paper if it has fewer than 40 sentences - i.e. there would be more positive
    # than negative data. The data needs to be balanced.
    # if len(positive_sents_secs_class) != len(negative_sents_secs_class):
    #     print("{}**** NOT A SUFFICIENT AMOUNT OF DATA IN PAPER {}, IGNORING PAPER ****{}".format(
    #         Color.RED, filename, Color.END))
    #     return

    # Concatenate the positive and negative sentences into a single data item and shuffle it
    data = positive_sents_secs_class + negative_sents_secs_class
    random.shuffle(data)

    # Average the word vectors of each sentence and convert the result to a list for JSON serialisation
    sentvecs_secs_class = [(useful_functions.sentence2vec(sent).tolist(), sec, y) for sent, sec, y in data]

    # Calculate features for each sentence
    features = [useful_functions.calculate_features(sent,
                                                    bag_of_words,
                                                    document_wordcount,
                                                    keyphrases,
                                                    abstract_string_list,
                                                    title_string,
                                                    sec)
                for sent, sec, y in data]

    # Calculate the abstract vector
    abs_vector = useful_functions.abstract2vector(abstract_string_list).tolist()

    # Description of the data
    description_text = "All text is of the form of a list of lists, where each sentence is a list of words. The" \
                       " sentences are of the form [(sentence (as a list of words), section in paper," \
                       " classification)]. The sentence vectors are of a similar form, except the sentence text is" \
                       " replaced with the vector representation of the sentence. The features are of the form " \
                       "[(AbstractROUGE, TF-IDF, Document_TF-IDF, keyphrase_score, title_score, numeric_score," \
                       " sentence_length, section)]. The dimensions of each sentence vector are [1x100]. The " \
                       "abstract vector is a single [1x100] vector also."

    # The data item that will be written for this paper
    data_item = {
        "filename": filename,
        "gold": gold,
        "title": paper["MAIN-TITLE"],
        "abstract": abstract,
        "abstract_vec": abs_vector,
        "sentences": data,
        "sentence_vecs": sentvecs_secs_class,
        "sentence_features": features,
        "description": description_text
    }

    # Write the data out
    with open(TRAINING_DATA_WRITE_LOC + filename.replace(".txt", "") + ".json", "wb") as f:
        json.dump(data_item, f)

    print("--> Finished processing {}, took {} seconds, data length: {}.".format(
        filename, (time.time() - start_time), len(data)))
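# A minimal sketch of reading back one of the training-data files written by process_paper. The example
# filename is one mentioned in the comments of the main script above; everything else relies only on the
# keys that process_paper writes into data_item.
import json

with open(TRAINING_DATA_WRITE_LOC + "S0168874X14001395.json") as f:
    data_item = json.load(f)

print(data_item["filename"], len(data_item["sentences"]), "labelled sentences")
for (sentence, section, label), feats in zip(data_item["sentences"], data_item["sentence_features"]):
    # sentence is a list of words; label is 1 for summary-worthy sentences and 0 otherwise
    pass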
def prepare_data(self):
    """
    Puts the data in a form suitable for the Word2Vec classifier - it changes each sentence into the average
    of its constituent word vectors.
    :return: all sentences as vectors and their classification (data is balanced).
    """
    # Count of how many papers have been processed.
    count = 0

    # Sentences as vectors with their classification
    data = []

    # Count of positive data
    pos_count = 0

    # Count of negative data
    neg_count = 0

    r = Rouge()

    # Iterate over every file in the paper directory
    for filename in os.listdir(PAPER_SOURCE):

        # Ignores files which are not papers, e.g. hidden files
        if filename.endswith(".txt"):

            # Display a loading bar of progress
            useful_functions.loading_bar(self.loading_section_size, count, self.number_of_papers)
            count += 1

            # Opens the paper as a dictionary, with keys corresponding to section titles and values corresponding
            # to the text in that section. The text is given as a list of lists, each list being a list of words
            # corresponding to a sentence.
            paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

            # Get the highlights of the paper
            highlights = paper["HIGHLIGHTS"]
            highlights_join = [" ".join(x) for x in highlights]
            abstract = paper["ABSTRACT"]

            sentences = []

            # Iterate over the whole paper
            for section, sents in paper.iteritems():

                # Iterate over each sentence in the section
                for sentence in sents:

                    # We don't want to calculate ROUGE for the abstract
                    if section != "ABSTRACT":
                        # Calculate the ROUGE score and add it to the list
                        r_score = r.calc_score([" ".join(sentence)], highlights_join)
                        sentences.append((sentence, r_score, section))

            # Sort the sentences into descending order of ROUGE score
            sentences = [(x, section) for x, score, section in reversed(
                sorted(sentences, key=itemgetter(1)))]

            sents_pos = sentences[0:self.num_summary]
            sents_neg = sentences[self.num_summary:]

            if len(sents_neg) < len(sents_pos):
                continue

            sents_pos = [(x[0], x[1], y) for x, y in zip(sents_pos, [1] * len(sents_pos))]
            sents_neg = [x for x in reversed(sents_neg)][:len(sents_pos)]
            sents_neg = [(x[0], x[1], y) for x, y in zip(sents_neg, [0] * len(sents_neg))]

            sents_class = sents_pos + sents_neg
            random.shuffle(sents_class)

            # Each item in the sentence list has the form [(sentence, section, classification)]
            paper = {
                "filename": filename,
                "title": paper["MAIN-TITLE"],
                "gold": paper["HIGHLIGHTS"],
                "abstract": abstract,
                "sentences": sents_class,
                "description": "All text data is given in the form of a list of words."
            }

            data.append(paper)

            if count % 1000 == 0:
                print("\nWriting data...")
                write_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/"
                with open(write_dir + "data.pkl", "wb") as f:
                    pickle.dump(data, f)
                print("Done")

    return data
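# A minimal sketch of reading back the intermediate pickle that prepare_data writes every 1000 papers.
# The path is the write_dir used in prepare_data; the keys are the ones it stores for each paper.
import pickle

read_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/"
with open(read_dir + "data.pkl", "rb") as f:
    papers = pickle.load(f)

for paper in papers:
    # Each entry holds the labelled sentences as (sentence, section, classification) tuples.
    for sentence, section, label in paper["sentences"]:
        pass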