def get_data():
    """
    Loads the preprocessed dataset and flattens it into per-sentence tuples.

    :return: A list of (sentence, features, label) tuples, with each sentence
             truncated to at most MAX_SENT_LEN tokens.
    """
    print("Loading Data...")
    t = time.time()

    # NOTE(review): load_cspubsumext() appears to return a list of per-paper
    # dicts with "sentences" and "sentence_features" entries -- confirm.
    data = useful_functions.load_cspubsumext()

    sentences_class = []
    for item in data:
        sentences = item["sentences"]
        features = item["sentence_features"]
        for sentence, feat in zip(sentences, features):
            # Each sentence entry is (tokens, section, label); the section
            # is not used by this summariser.
            sent = sentence[0]
            y = sentence[2]
            sentences_class.append((sent, feat, y))
    data = sentences_class
    print("Done, took ", time.time() - t, " seconds")

    print("Processing Data...")
    # Slicing already yields the whole sequence when it is shorter than
    # MAX_SENT_LEN, so no explicit length check is needed.
    new_data = [(sent[:MAX_SENT_LEN], feat, y) for sent, feat, y in data]
    print("Done")

    return new_data
def get_data():
    """
    Loads the data from the data directory given above and puts it into the
    form required by the summarisers. In this summariser the data we require
    is: the raw sentences, the abstract and the features.

    :return: The data, but discarding the sentences longer than the maximum
             length; each element is (sentence, abstract_vec, features, label).
    """
    print("Loading Data...")
    t = time.time()

    # The data is a pickled object
    data = useful_functions.load_cspubsumext()

    # Flattened per-sentence data list
    sents_absvec_feats_class = []
    for item in data:
        sentences = item["sentences"]
        abstract_vec = item["abstract_vec"]
        features = item["sentence_features"]
        for sentence, feat in zip(sentences, features):
            # Each sentence entry is (tokens, section, label); the section
            # is not used by this summariser.
            sent = sentence[0]
            y = sentence[2]
            sents_absvec_feats_class.append((sent, abstract_vec, feat, y))
    data = sents_absvec_feats_class
    print("Done, took ", time.time() - t, " seconds")

    print("Processing Data...")
    new_data = []
    for sent, abs_vec, feat, y in data:
        # Drop feat[0] and keep features 1..7 as a tuple (equivalent to the
        # explicit (feat[1], ..., feat[7]) construction).
        new_feat = tuple(feat[1:8])
        # Slicing already yields the whole sequence when it is shorter than
        # MAX_SENT_LEN, so no explicit length check is needed.
        new_data.append((sent[:MAX_SENT_LEN], abs_vec, new_feat, y))

    return new_data
def get_data():
    """
    Loads the data from the data directory given above and puts it into the
    form required by the summarisers. In this summariser the data we require
    is: the raw sentences, the abstract and the features.

    :return: The data, but discarding the sentences longer than the maximum
             length; each element is (sentence, abstract_vec, features, label).
    """
    print("Loading Data...")
    t = time.time()

    # The data is a pickled object
    data = useful_functions.load_cspubsumext()

    # Flattened per-sentence data list
    sents_absvec_feats_class = []
    for item in data:
        sentences = item["sentences"]
        abstract_vec = item["abstract_vec"]
        features = item["sentence_features"]
        for sentence, feat in zip(sentences, features):
            # Each sentence entry is (tokens, section, label); the section
            # is not used by this summariser.
            sent = sentence[0]
            y = sentence[2]
            # Truncate here so no separate post-processing pass is needed.
            # BUG FIX: the slice previously used the misspelled name
            # MAX_SEN_LEN, which is defined nowhere else in this file.
            sents_absvec_feats_class.append(
                (sent[:MAX_SENT_LEN], abstract_vec, feat, y))
    data = sents_absvec_feats_class
    print("Done, took ", time.time() - t, " seconds")

    return data
def get_data():
    """
    Loads the preprocessed dataset and returns the raw sentences together
    with one-hot encoded class labels.

    :return: A dict with keys "sentences" (list of raw sentences) and
             "labels" (list of one-hot label vectors).
    """
    print("Loading Data...")
    started = time.time()

    papers = useful_functions.load_cspubsumext()

    raw_sentences = []
    onehot_labels = []
    for paper in papers:
        # Each sentence entry is (tokens, section, label); section unused.
        for sentence, _section, label in paper["sentences"]:
            raw_sentences.append(sentence)
            onehot_labels.append(num2onehot(label, NUM_CLASSES))

    print("Done, took ", time.time() - started, " seconds")

    return {"sentences": raw_sentences, "labels": onehot_labels}
# Labels are one-hot, so argmax recovers the true class index; accuracy is
# the mean over the batch of exact matches with the predicted indices.
correct_answers = tf.argmax(labels, axis=1)
accuracy = tf.reduce_mean(
    tf.cast(tf.equal(pred_answers, correct_answers), tf.float32))

with tf.Session() as sess:

    # Initialise all variables
    sess.run(tf.global_variables_initializer())

    # Saving object
    saver = tf.train.Saver()

    # Data is 3D - of form [sentence_vector, classification, abstract_vector]. We will ignore the abstract vector here.
    print("Loading Data...")
    t = time.time()
    # NOTE(review): load_cspubsumext() appears to return per-paper dicts with
    # "sentence_vecs" and "abstract_vec" entries -- confirm against loader.
    data = useful_functions.load_cspubsumext()
    sentence_class_abstract = []
    for item in data:
        sentence_vecs = item["sentence_vecs"]
        abstract_vec = item["abstract_vec"]
        # Each entry is (sentence_vector, section, label); section is dropped.
        for sent, sec, y in sentence_vecs:
            sentence_class_abstract.append((sent, y, abstract_vec))
    data = sentence_class_abstract
    print("Done, took ", time.time() - t, " seconds")

    # Hold out the first third of the data for testing; the remainder is
    # used for training. No shuffling happens here, so any ordering in the
    # loaded data carries through to the split.
    test_len = int(len(data) * (1 / 3))
    test_data = data[0:test_len]
    train_data = data[test_len:]

    print("Length of Training Data: ", len(train_data))
    print("Length of Testing Data: ", len(test_data))