                                     preserve_order=True)  # end of summarise(); the call is truncated in this excerpt
        return paper


if __name__ == "__main__":
    # Paper One: S0168874X14001395.txt
    # Paper Two: S0141938215300044.txt
    # Paper Three: S0142694X15000423.txt
    summ = AbstractRougeSummariser()
    # summ.summarise("S0142694X15000423.txt")

    count = 0
    for filename in os.listdir(PAPER_SOURCE):

        # Stop after 150 papers have been seen
        if count > 150:
            break

        # count > 0 means the very first directory entry is skipped
        if filename.endswith(".txt") and count > 0:

            # We need to write the highlights as a gold summary with the same name as the generated summary.
            highlights = useful_functions.read_in_paper(filename, True)["HIGHLIGHTS"]
            useful_functions.write_gold(SUMMARY_WRITE_LOC, highlights, filename)

            # Display a loading bar of progress
            useful_functions.loading_bar(LOADING_SECTION_SIZE, count, NUMBER_OF_PAPERS)

            # Generate and write a summary
            summ.summarise(filename)

        # Incremented for every file, not just processed ones, so the loop still terminates when entries are skipped
        count += 1
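
# --- Sketch, not from the original file -----------------------------------
# The truncated call above ends with `preserve_order=True`, which suggests the
# summariser selects the highest-scoring sentences but emits them in their
# original document order. A minimal, assumed version of that selection step
# (the function name and tuple layout are illustrative only):
def top_k_in_document_order(scored_sentences, k):
    """scored_sentences: list of (position, sentence, score) tuples.

    Returns the k highest-scoring sentences, sorted back into the order in
    which they appeared in the paper.
    """
    top = sorted(scored_sentences, key=lambda item: item[2], reverse=True)[:k]
    return [sentence for _, sentence, _ in sorted(top, key=lambda item: item[0])]
# ---------------------------------------------------------------------------
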
def prepare_data(self):
    """
    Puts the data in a form suitable for the Word2Vec classifier. Each sentence is scored with ROUGE against the
    paper's highlights; the top-scoring sentences are labelled 1 (summary-worthy) and an equal number of the
    lowest-scoring sentences are labelled 0, so the data is balanced. Sentences are returned as lists of words;
    averaging each sentence into a single word vector happens downstream in the classifier.
    :return: a list of paper dictionaries, each holding its labelled sentences.
    """
    # Count of how many papers have been processed.
    count = 0

    # The processed papers with their labelled sentences
    data = []

    # Count of positive data
    pos_count = 0

    # Count of negative data
    neg_count = 0

    r = Rouge()

    # Iterate over every file in the paper directory
    for filename in os.listdir(PAPER_SOURCE):

        # Ignores files which are not papers e.g. hidden files
        if filename.endswith(".txt"):

            # Display a loading bar of progress
            useful_functions.loading_bar(self.loading_section_size, count, self.number_of_papers)
            count += 1

            # Opens the paper as a dictionary, with keys corresponding to section titles and values corresponding
            # to the text in that section. The text is given as a list of lists, each list being a list of words
            # corresponding to a sentence.
            paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

            # Get the highlights of the paper
            highlights = paper["HIGHLIGHTS"]
            highlights_join = [" ".join(x) for x in highlights]
            abstract = paper["ABSTRACT"]
            sentences = []

            # Iterate over the whole paper
            for section, sents in paper.items():

                # Iterate over each sentence in the section
                for sentence in sents:

                    # We don't want to calculate ROUGE for the abstract
                    if section != "ABSTRACT":
                        # Calculate the ROUGE score and add it to the list
                        r_score = r.calc_score([" ".join(sentence)], highlights_join)
                        sentences.append((sentence, r_score, section))

            # Sort the sentences by descending ROUGE score, dropping the score itself
            sentences = [(x, section) for x, score, section in
                         reversed(sorted(sentences, key=itemgetter(1)))]

            # The top-scoring sentences are the positive examples; the rest are candidates for negatives
            sents_pos = sentences[0:self.num_summary]
            sents_neg = sentences[self.num_summary:]

            # Skip papers too short to supply as many negative examples as positive ones
            if len(sents_neg) < len(sents_pos):
                continue

            # Label the positives 1; take an equal number of the lowest-scoring sentences as negatives, labelled 0
            sents_pos = [(x[0], x[1], y) for x, y in zip(sents_pos, [1] * len(sents_pos))]
            sents_neg = [x for x in reversed(sents_neg)][:len(sents_pos)]
            sents_neg = [(x[0], x[1], y) for x, y in zip(sents_neg, [0] * len(sents_neg))]
            sents_class = sents_pos + sents_neg
            random.shuffle(sents_class)

            # Each item in the sentence list has the form (sentence, section, classification)
            paper = {
                "filename": filename,
                "title": paper["MAIN-TITLE"],
                "gold": paper["HIGHLIGHTS"],
                "abstract": abstract,
                "sentences": sents_class,
                "description": "All text data is given in the form of a list of words."
            }

            data.append(paper)

            # Periodically checkpoint the accumulated data to disk
            if count % 1000 == 0:
                print("\nWriting data...")
                write_dir = BASE_DIR + "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/"
                with open(write_dir + "data.pkl", "wb") as f:
                    pickle.dump(data, f)
                print("Done")

    return data
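
# --- Sketch, not from the original file -----------------------------------
# The docstring above says the classifier consumes each sentence as the
# average of its constituent word vectors, but that averaging step is not
# shown in this excerpt. A minimal version might look like the helper below;
# `model` (a gensim-style word-vector lookup) and the default vector size are
# assumptions for illustration only.
import numpy as np

def average_word_vectors(sentence, model, vector_size=100):
    """Return the mean of the word vectors for `sentence` (a list of words).

    Words missing from `model` are skipped; an all-zero vector is returned
    when no word in the sentence is known to the model.
    """
    vectors = [model[word] for word in sentence if word in model]
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

# Example usage: turn one paper dict produced by prepare_data() into
# (vector, label) training pairs.
# pairs = [(average_word_vectors(sent, model), label)
#          for sent, section, label in paper_dict["sentences"]]
# ---------------------------------------------------------------------------
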