# Module-level dependencies assumed by the functions below. The import path of the
# project-local useful_functions module depends on the repository layout; rouge,
# num_summary, Color, TRAINING_DATA_WRITE_LOC and document_wordcount are assumed
# to be defined at module level elsewhere in the project.
import json
import random
import time
from operator import itemgetter

import numpy as np

import useful_functions


def abstract2vector(self, abstract):
    """
    Changes the abstract into a single averaged vector.
    :param abstract: the abstract to turn into a vector
    :return: a single vector representing the abstract
    """
    abstract_vecs = [useful_functions.sentence2vec(x) for x in abstract]
    avg = np.mean(abstract_vecs, axis=0)
    return avg
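
# Example usage of abstract2vector - a minimal sketch with hypothetical data. The
# instance name `summariser` and the token lists are illustrative. The abstract is
# expected as a list of sentences, each a list of word tokens, matching the
# sentences_as_lists=True format of useful_functions.read_in_paper; the result is a
# single averaged sentence vector ([1x100], per the data description below).
#
#     abstract = [["we", "propose", "a", "summarisation", "model"],
#                 ["it", "outperforms", "the", "baseline"]]
#     abs_vec = summariser.abstract2vector(abstract)
#     assert abs_vec.shape == (100,)
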
def prepare_for_summarisation(self, filename, visualise=False):
    """
    Prepares a paper to be summarised by the Word2Vec method.
    :param filename: the filename of the paper to summarise
    :param visualise: true if visualising
    :return: the paper in a form suitable to be summarised with the trained models.
    """
    sentences = self.paper2orderedlist(filename)

    # The final form will be an ordered list of tuples, where each tuple has the form
    # (sentence_text, sentence_vector, abstract_vector, features).
    final_form = []

    raw_paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)
    abstract = raw_paper["ABSTRACT"]
    abs_vector = self.abstract2vector(abstract)
    prev_section = ""

    # Compute and cache the bag-of-words representation of the paper if it has not
    # been seen before.
    if filename not in self.paper_bags_of_words:
        paper_str = useful_functions.read_in_paper(filename)
        paper_str = " ".join([val for _, val in paper_str.items()]).lower()
        paper_bag_of_words = useful_functions.calculate_bag_of_words(paper_str)
        self.paper_bags_of_words[filename] = paper_bag_of_words

    # Cache the keyphrases of the paper if they have not been seen before.
    if filename not in self.keyphrases:
        self.keyphrases[filename] = raw_paper["KEYPHRASES"]

    for sentence, section in sentences:
        sentence_vector = useful_functions.sentence2vec(sentence, self.word2vec)
        features = self.calculate_features(sentence,
                                           self.paper_bags_of_words[filename],
                                           self.keyphrases[filename],
                                           [" ".join(x) for x in abstract],
                                           " ".join(raw_paper["MAIN-TITLE"][0]),
                                           section,
                                           shorter=True)

        if not visualise:
            final_form.append((sentence, sentence_vector, abs_vector, features))
        else:
            # Insert an all-zero marker tuple at each section boundary so the
            # visualisation can show where a new section begins.
            if prev_section != section:
                print("----> Adding section: ", section)
                final_form.append(([section],
                                   np.zeros_like(sentence_vector),
                                   np.zeros_like(sentence_vector),
                                   np.zeros_like(features)))
                prev_section = section
            final_form.append((sentence, sentence_vector, abs_vector, features))

    return final_form
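
# A minimal usage sketch for prepare_for_summarisation, assuming a hypothetical
# summariser instance and paper filename. Each returned item is a
# (sentence, sentence_vector, abstract_vector, features) tuple; with visualise=True,
# extra all-zero marker tuples are interleaved at section boundaries.
#
#     items = summariser.prepare_for_summarisation("example_paper.txt")
#     for sentence, sent_vec, abs_vec, feats in items:
#         pass  # feed (sent_vec, abs_vec, feats) to the trained model
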
def process_paper(filename):
    """
    The concurrent function which processes each paper into its training data format.
    :param filename: the filename of the paper to process.
    :return: none, but writes the preprocessed file to "Data/Training_Data/"
    """
    # Start time
    start_time = time.time()

    # Read in the paper
    paper = useful_functions.read_in_paper(filename, sentences_as_lists=True)

    # Extract the gold summary
    gold = paper["HIGHLIGHTS"]
    gold_string_list = [" ".join(x) for x in gold]

    # Extract the title
    title = paper["MAIN-TITLE"][0]
    title_string = " ".join(title)

    # Extract the abstract
    abstract = paper["ABSTRACT"]
    abstract_string_list = [" ".join(x) for x in abstract]

    # Extract the keyphrases
    try:
        keyphrases = paper["KEYPHRASES"][0]
    except IndexError:
        keyphrases = []

    # Turn the paper into a single string and calculate its bag-of-words representation
    paper_string = " ".join([" ".join(x) for key, val in paper.items() for x in val])
    bag_of_words = useful_functions.calculate_bag_of_words(paper_string)

    # Get the paper as a list of sentences, associating each sentence with its section
    # name - this will be used by the oracle to find the best summary sentences.
    paper_sentences = [(" ".join(x), key) for key, val in paper.items() for x in val
                       if key != "ABSTRACT"]

    # Create a list of sentences, their ROUGE-L scores against the highlights and the
    # section they occur in (as a string).
    sents_scores_secs = []
    for sentence, section in paper_sentences:
        # The candidate sentence is wrapped in a list because rouge.calc_score
        # expects a list of candidate sentences.
        r_score = rouge.calc_score([sentence], gold_string_list)
        sents_scores_secs.append((sentence.split(" "), r_score, section))

    # Sort the sentences, scores and sections into descending order of score
    sents_scores_secs = sorted(sents_scores_secs, key=itemgetter(1), reverse=True)

    pos_sents_scores_secs = sents_scores_secs[:num_summary]
    neg_sents_scores_secs = sents_scores_secs[num_summary:]

    # Ignore the paper if it cannot supply at least as many negative sentences as
    # positive ones - the training data needs to be balanced.
    if len(neg_sents_scores_secs) < len(pos_sents_scores_secs):
        print("{}**** NOT A SUFFICIENT AMOUNT OF DATA IN PAPER {}, IGNORING PAPER ****{}".format(
            Color.RED, filename, Color.END))
        return

    # Positive sentences
    positive_sents_secs_class = [(sent, sec, 1) for sent, _, sec in pos_sents_scores_secs]

    # Negative sentences: take the sentences not used as positive, reverse the list so
    # the worst-scoring sentences come first, then take a number equal to the positives.
    neg_sents_scores_secs = [x for x in reversed(neg_sents_scores_secs)][:len(positive_sents_secs_class)]
    negative_sents_secs_class = [(sent, sec, 0) for sent, _, sec in neg_sents_scores_secs]

    # Don't create data from this paper if it has fewer than 40 sentences - i.e. there
    # would be more positive than negative data. The data needs to be balanced.
    # if len(positive_sents_secs_class) != len(negative_sents_secs_class):
    #     print("{}**** NOT A SUFFICIENT AMOUNT OF DATA IN PAPER {}, IGNORING PAPER ****{}".format(
    #         Color.RED, filename, Color.END))
    #     return

    # Concatenate the positive and negative sentences into a single data item and shuffle
    data = positive_sents_secs_class + negative_sents_secs_class
    random.shuffle(data)

    # Average the word vectors of each sentence and convert to a list for JSON serialisation
    sentvecs_secs_class = [(useful_functions.sentence2vec(sent).tolist(), sec, y)
                           for sent, sec, y in data]

    # Calculate features for each sentence. document_wordcount is assumed to be a
    # module-level word-count table shared by the worker processes.
    features = [useful_functions.calculate_features(sent,
                                                    bag_of_words,
                                                    document_wordcount,
                                                    keyphrases,
                                                    abstract_string_list,
                                                    title_string,
                                                    sec)
                for sent, sec, y in data]

    # Calculate the abstract vector
    abs_vector = useful_functions.abstract2vector(abstract_string_list).tolist()

    # Description of the data
    description_text = "All text is of the form of a list of lists, where each sentence is a list of words. The" \
                       " sentences are of the form [(sentence (as a list of words), section in paper," \
                       " classification)]. The sentence vectors are of a similar form, except the sentence text is" \
                       " replaced with the vector representation of the sentence. The features are of the form " \
                       "[(AbstractROUGE, TF-IDF, Document_TF-IDF, keyphrase_score, title_score, numeric_score," \
                       " sentence_length, section)]. The dimensions of each sentence vector are [1x100]. The " \
                       "abstract vector is a single [1x100] vector also."

    # The data item that will be written for this paper
    data_item = {
        "filename": filename,
        "gold": gold,
        "title": paper["MAIN-TITLE"],
        "abstract": abstract,
        "abstract_vec": abs_vector,
        "sentences": data,
        "sentence_vecs": sentvecs_secs_class,
        "sentence_features": features,
        "description": description_text
    }

    # Write the data out, replacing the .txt extension with .json. Note that
    # filename.strip(".txt") would remove individual characters from both ends
    # rather than the suffix, so the extension is sliced off instead.
    out_name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
    with open(TRAINING_DATA_WRITE_LOC + out_name + ".json", "w") as f:
        json.dump(data_item, f)

    print("--> Finished processing {}, took {} seconds, data length: {}.".format(
        filename, (time.time() - start_time), len(data)))
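
# A minimal driver sketch for running process_paper concurrently, since its docstring
# describes it as "the concurrent function". The source directory, the .txt filter and
# the default pool size are illustrative assumptions, not part of the original module.
if __name__ == "__main__":
    import os
    from multiprocessing import Pool

    paper_dir = "Data/Papers/"  # assumed location of the raw paper .txt files
    filenames = [f for f in os.listdir(paper_dir) if f.endswith(".txt")]

    # Each worker writes its own JSON file, so no results need to be collected.
    with Pool() as pool:
        pool.map(process_paper, filenames)
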
def process_item(self, item):
    """
    Processes a single data item into its full training form.

    A data item is a dictionary of the form:

        data = {
            "filename"
            "gold"
            "title"
            "abstract"
            "sentences"
            "description"
        }

    :param item: the data item to process
    :return: the processed data item
    """
    t = time.time()

    # Get the bag-of-words representation for this paper
    bag_of_words = self.paper_bags_of_words[item["filename"]]

    # Get the keyphrases of this paper
    keyphrases = self.keyphrases[item["filename"]]

    # Get the abstract of this paper as a list of strings
    abstract = [" ".join(x) for x in item["abstract"]]

    # Get the title of this paper
    title = item["title"][0]

    # Get a vector representation of the abstract
    abs_vector = self.abstract2vector(abstract)

    # Get vector representations of each of the sentences
    sentence_vectors = [(useful_functions.sentence2vec(x), section, y)
                        for x, section, y in item["sentences"]]

    # Get feature representations of each of the sentences
    features = [self.calculate_features(x, bag_of_words, keyphrases, abstract, title, section, True)
                for x, section, y in item["sentences"]]

    description_text = "All text is of the form of a list of lists, where each sentence is a list of words. The" \
                       " sentences are of the form [(sentence (as a list of words), section in paper," \
                       " classification)]. The sentence vectors are of a similar form, except the sentence text is" \
                       " replaced with the vector representation of the sentence. The features are of the form " \
                       "[(AbstractROUGE, TF-IDF, Document_TF-IDF, keyphrase_score, title_score, numeric_score," \
                       " sentence_length, section)]. The dimensions of each sentence vector are [1x100]. The " \
                       "abstract vector is a single [1x100] vector also."

    new_data = {
        "filename": item["filename"],
        "gold": item["gold"],
        "title": item["title"],
        "abstract": item["abstract"],
        "abstract_vec": abs_vector,
        "sentences": item["sentences"],
        "sentence_vecs": sentence_vectors,
        "sentence_features": features,
        "description": description_text
    }

    print("Done, process took ", time.time() - t, " seconds, time since start is ",
          (time.time() - self.start_time) / 60, " minutes")

    return new_data
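
# A minimal usage sketch for process_item, assuming a hypothetical preprocessor
# instance whose paper_bags_of_words and keyphrases caches already contain an entry
# for the item's filename; the file path is illustrative.
#
#     with open("Data/Training_Data/example_paper.json") as f:
#         item = json.load(f)
#     processed = preprocessor.process_item(item)
#     print(len(processed["sentence_vecs"]), "sentence vectors")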