def __init__(self):
    """
    Double Dataset based Summariser.
    """
    self.summary_length = 10

    # Dictionary which contains bag of words representations for every paper
    print("Reading bags of words...")
    t = time.time()
    self.paper_bags_of_words = useful_functions.load_pickled_object(PAPER_BAG_OF_WORDS_LOC)
    print("Done, took ", time.time() - t, " seconds.")

    # Dictionary holding the keyphrases for each paper
    print("Reading keyphrases...")
    t = time.time()
    self.keyphrases = useful_functions.load_pickled_object(KEYPHRASES_LOC)
    print("Done, took ", time.time() - t, " seconds.")

    # Dictionary which contains the counts of the number of different papers that a word occurs in
    print("Reading global count...")
    t = time.time()
    self.global_paper_count = useful_functions.load_pickled_object(GLOBAL_COUNT_LOC)
    print("Done, took ", time.time() - t, " seconds.")

    # The word2vec model, used to represent the first word and the first and second words of the sentence
    print("Loading word2vec...")
    t = time.time()
    self.word2vec_model = useful_functions.load_word2vec()
    self.vocab = set(self.word2vec_model.index2word)
    print("Done, took ", time.time() - t, " seconds.")
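# The sketch below shows a minimal, assumed implementation of the
# `useful_functions.load_pickled_object` helper used throughout these snippets.
# The real helper lives in `useful_functions` and may differ (e.g. compressed
# pickles or Python-2/3 encoding handling); this is only a plausible reading.
import pickle

def load_pickled_object(path):
    """Load and return a single pickled object from `path` (assumed plain pickle)."""
    with open(path, "rb") as f:
        return pickle.load(f)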
def __init__(self):
    """
    Preprocesses data into a form suitable for use with networks that use the
    abstract vector as part of their input for classification.
    """
    # The number of summary sentences to extract from the paper as training data
    self.num_summary = 20

    # The number of papers to process and the loading section size, which is used to
    # print a loading bar as the papers are processed
    self.number_of_papers = len(
        [name for name in os.listdir(PAPER_SOURCE) if name.endswith(".txt")])
    self.loading_section_size = self.number_of_papers / 30

    # Number of classes to classify into
    self.num_classes = 2

    # A thread pool for parallel processing of data
    self.pool2 = ThreadPool(4)

    # Load the word2vec model
    print("Reading word2vec...")
    t = time.time()
    self.word2vec = useful_functions.load_word2vec()
    self.vocab = set(self.word2vec.index2word)
    print("Done, took ", time.time() - t, " seconds.")

    # Dictionary which contains bag of words representations for every paper
    print("Reading bags of words...")
    t = time.time()
    self.paper_bags_of_words = useful_functions.load_pickled_object(PAPER_BAG_OF_WORDS_LOC)
    print("Done, took ", time.time() - t, " seconds.")

    # Dictionary holding the keyphrases for each paper
    print("Reading keyphrases...")
    t = time.time()
    self.keyphrases = useful_functions.load_pickled_object(KEYPHRASES_LOC)
    print("Done, took ", time.time() - t, " seconds.")

    # Dictionary which contains the counts of the number of different papers that a word occurs in
    print("Reading global count...")
    t = time.time()
    self.global_paper_count = useful_functions.load_pickled_object(GLOBAL_COUNT_LOC)
    print("Done, took ", time.time() - t, " seconds.")

    # Running start time to measure running time
    self.start_time = time.time()
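# A minimal sketch of how `loading_section_size` could drive the loading bar the
# comment above mentions. The method name and exact rendering are assumptions;
# the class's actual progress reporting may differ.
def print_progress(self, papers_done):
    """Print a crude 30-section progress bar, one '#' per completed section."""
    section = max(self.loading_section_size, 1)  # guard against < 30 papers
    done = int(papers_done / section)
    print("[" + "#" * done + "-" * (30 - done) + "]")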
def get_data():
    """
    Loads the pickled training data and flattens it into (sentence, features, label)
    tuples, truncating sentences to at most MAX_SENT_LEN words.
    """
    print("Loading Data...")
    t = time.time()
    data = useful_functions.load_pickled_object(DATA_DIR)

    sentences_class = []
    for item in data:
        sentences = item["sentences"]
        features = item["sentence_features"]

        for sentence, feat in zip(sentences, features):
            sent = sentence[0]
            sec = sentence[1]  # the section the sentence came from, unused here
            y = sentence[2]
            sentences_class.append((sent, feat, y))

    data = sentences_class
    print("Done, took ", time.time() - t, " seconds")

    print("Processing Data...")

    # Truncate over-long sentences to MAX_SENT_LEN words rather than discarding them
    new_data = []
    for sent, feat, y in data:
        if len(sent) > MAX_SENT_LEN:
            new_sent = sent[0:MAX_SENT_LEN]
        else:
            new_sent = sent
        new_data.append((new_sent, feat, y))
    print("Done")

    return new_data
def get_data():
    """
    Loads the data from the data directory given above and puts it into the form
    required by the summarisers. In this summariser the data we require is: the raw
    sentences, the abstract and the features.
    :return: The data, truncating sentences longer than the maximum length.
    """
    print("Loading Data...")
    t = time.time()

    # The data is a pickled object
    data = useful_functions.load_pickled_object(DATA_DIR)

    # Data list
    sents_absvec_feats_class = []

    for item in data:
        sentences = item["sentences"]
        abstract_vec = item["abstract_vec"]
        features = item["sentence_features"]

        for sentence, feat in zip(sentences, features):
            sent = sentence[0]
            sec = sentence[1]  # the section the sentence came from, unused here
            y = sentence[2]
            sents_absvec_feats_class.append((sent, abstract_vec, feat, y))

    data = sents_absvec_feats_class
    print("Done, took ", time.time() - t, " seconds")

    print("Processing Data...")

    new_data = []
    for sent, abs_vec, feat, y in data:
        # Drop the first element of the feature tuple; only features 1-7 are used
        new_feat = (feat[1], feat[2], feat[3], feat[4], feat[5], feat[6], feat[7])

        # Truncate sentences longer than MAX_SENT_LEN
        if len(sent) > MAX_SENT_LEN:
            new_sent = sent[0:MAX_SENT_LEN]
        else:
            new_sent = sent

        new_data.append((new_sent, abs_vec, new_feat, y))

    return new_data
def extra_processing(self):
    """
    Re-processes the generated sentence/summary data into the AbstractNet input
    format, in parallel on the thread pool, and pickles the result.
    """
    data_dir = BASE_DIR + \
        "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/data.pkl"
    write_dir = BASE_DIR + \
        "/Data/Generated_Data/Sentences_And_SummaryBool/Abstract_Neg/AbstractNet/abstractnet_data.pkl"

    print("----> Reading data...")
    t = time.time()
    data = useful_functions.load_pickled_object(data_dir)
    print("----> Done, took ", time.time() - t, " seconds")

    print("----> Beginning processing...")
    t = time.time()
    self.start_time = t

    # Process each item in parallel on the thread pool created in __init__
    new_data = self.pool2.map(self.process_item, data)
    # new_data = np.concatenate(new_data, axis=0)

    print("----> Done, took ", (time.time() - t) / 60, " minutes")

    useful_functions.pickle_list(new_data, write_dir)
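# A sketch of what `process_item` plausibly does here, based on how the AbstractNet
# training code below consumes the data (sentence vector ++ abstract vector ++
# features, paired with the label). The key names and the exact vector construction
# are assumptions, not the repository's confirmed implementation; numpy is assumed
# imported as np, as elsewhere in these snippets.
def process_item(self, item):
    """Turn one paper's entry into (input_vector, label) pairs for AbstractNet."""
    pairs = []
    for sent_vec, feat in zip(item["sentence_vecs"], item["sentence_features"]):
        vec = np.concatenate((sent_vec[0], item["abstract_vec"], feat))
        pairs.append((vec, sent_vec[2]))
    return pairs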
def get_data():
    """
    Loads the pickled data and returns the raw sentences together with one-hot
    encoded labels.
    """
    print("Loading Data...")
    t = time.time()
    data = useful_functions.load_pickled_object(DATA_DIR)

    sents = []
    labs = []
    for item in data:
        sentences = item["sentences"]
        for sent, sec, y in sentences:
            sents.append(sent)
            labs.append(num2onehot(y, NUM_CLASSES))
    print("Done, took ", time.time() - t, " seconds")

    data = {
        "sentences": sents,
        "labels": labs
    }

    return data
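# `num2onehot` is used above but not defined in this excerpt; the sketch below is
# an assumed minimal implementation (integer class index -> one-hot numpy vector).
import numpy as np

def num2onehot(num, num_classes):
    """Return a one-hot vector of length `num_classes` with a 1 at index `num`."""
    onehot = np.zeros(num_classes)
    onehot[num] = 1
    return onehot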
correct_answers = graph_outputs["correct_answers"]

# Accuracy operation
accuracy = graph_outputs["accuracy"]

with tf.Session() as sess:

    # Initialise all variables
    sess.run(tf.global_variables_initializer())

    # Saving object
    saver = tf.train.Saver()

    print("Loading Data...")
    t = time.time()
    data = useful_functions.load_pickled_object(DATA_DIR)

    # Concatenate each sentence vector with the abstract vector and the handcrafted
    # features to form the network input
    sentvec_abstractvec_features_class = []
    for item in data:
        sentence_vecs = item["sentence_vecs"]
        abstract_vec = item["abstract_vec"]
        features = item["sentence_features"]

        for sent, feat in zip(sentence_vecs, features):
            vec = np.concatenate((sent[0], abstract_vec, feat))
            sentvec_abstractvec_features_class.append((vec, sent[2]))

    data = sentvec_abstractvec_features_class
    print("Done, took ", time.time() - t, " seconds")

    # Hold out one third of the data for testing; integer division avoids the
    # zero-length test set that int(len(data) * (1/3)) would give under Python 2
    test_len = len(data) // 3
    test_data = data[0:test_len]
    train_data = data[test_len:]
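# A minimal batching sketch for the train/test split above. The training loop that
# consumes these batches falls outside this excerpt, so the function name and the
# batch layout are assumptions; numpy is assumed imported as np, as above.
def batch_iter(data, batch_size):
    """Yield successive (inputs, labels) batches as numpy arrays."""
    for i in range(0, len(data), batch_size):
        chunk = data[i:i + batch_size]
        xs = np.array([x for x, _ in chunk])
        ys = np.array([y for _, y in chunk])
        yield xs, ys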
html.append("</div>") html.append("<div id=\"gold\" class=\"text\">") html.append("<h2>Human Written Summary</h2>") html.append("<hr>") html.append("<br>") html.append("<p>") highlights = paper["HIGHLIGHTS"] print("Reading stuff...") bag_of_words = defaultdict(float) for key, val in paper.iteritems(): sents = val[0] for sent in sents: for word in sent: bag_of_words[word] += 1.0 global_paper_count = useful_functions.load_pickled_object(GLOBAL_COUNT_LOC) print("Done") sents_and_scores = [] for sentence in highlights[0]: sents_and_scores.append( calculate_tf_idf(sentence, global_paper_count, bag_of_words)) max_tf_idf = -1 for sentence in sents_and_scores: for word, score in sentence: if score > max_tf_idf: max_tf_idf = score highlights_and_scores = [] for sentence in sents_and_scores: