def finalize_data(self, mode, u_dataset, babi_id):
    """Build (q_vector, a_vector, label, tfidf, word_cnt) tuples for a split."""
    final_data = []
    if u_dataset == 'wikiqa':
        print('> Getting dataset: {} {}'.format(u_dataset, mode))
        dataset = utils.get_wikiqa_for_abcnn(mode)
    else:
        print('> Getting dataset: {} {} {}'.format(u_dataset, mode, babi_id))
        dataset = utils.get_babi_for_abcnn(babi_id, mode)
    glove = utils.load_glove(200)
    print('> Vectorizing the questions and answers')
    for data in tqdm(dataset, total=len(dataset), ncols=75, unit='Pairs'):
        q, a, label, tfidf, word_cnt = data
        q_vector, a_vector = self.qa_vectorize(q, a, glove)
        final_data.append((q_vector, a_vector, label, tfidf, word_cnt))
    return final_data
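# Usage sketch (assumption: `finalize_data` is a method of the ABCNN wrapper
# class defined in this repo; the class name `ABCNN` here is illustrative).
# Each returned entry is a (q_vector, a_vector, label, tfidf, word_cnt) tuple.
if __name__ == '__main__':
    model = ABCNN()  # illustrative constructor
    train_data = model.finalize_data(mode='train', u_dataset='babi', babi_id='1')
    print('{} QA pairs vectorized'.format(len(train_data)))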
def test_sick_preprocess():
    from Helpers.preprocess import SICK
    from Helpers import utils
    import spacy

    nlp = spacy.load('en')
    sick = SICK.get_data()
    glove = utils.load_glove(200)
    data = sick[0]
    # The raw header token must not leak into the sentence text.
    assert 'sentence_A' not in data['A']
    dtree_entry, dtne_entry = SICK.get_input_tree_single(data, nlp, glove)
    for entry in [dtree_entry, dtne_entry]:
        for x in ['A', 'B', 'score']:
            assert x in entry
            if x != 'score':
                for y in ['word_vectors', 'parent_indices', 'is_leaf',
                          'dep_tags', 'text']:
                    assert y in entry[x]
    assert 'ent_type' in dtne_entry['A']
    assert 'ent_type' in dtne_entry['B']
def ans_select(self, question, ans_list):
    ans_sents = []
    tfidf, word_cnt = self.extract_features(question, ans_list)
    _, _, output_layer_test, _ = self.model()
    saver = tf.train.Saver()
    with tf.Session() as sessn:
        filename, _, _ = self.model_state_loader()
        try:
            print(filename)
            saver.restore(sessn, filename)
            print(' > Model state restored from @ ' + filename)
        except Exception as e:
            print(e)
            print(' > No saved state found. Exiting')
            sessn.close()
            sys.exit()
        glove = utils.load_glove(200)
        for i, ans in enumerate(ans_list):
            q_vector, a_vector = self.qa_vectorize(question, ans, glove)
            input_dict = {
                self.q: q_vector,
                self.a: a_vector,
                self.label: None,
                self.word_cnt: word_cnt[i],
                self.tfidf: tfidf[i],
            }
            pred = sessn.run(output_layer_test, feed_dict=input_dict)
            ans_sents.append((ans, pred))
    # Sort candidate answers by predicted score, highest first.
    ans_sents = sorted(ans_sents, key=operator.itemgetter(1), reverse=True)
    return ans_sents
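# Usage sketch (assumption: a trained model state exists on disk for
# `model_state_loader` to find; the class name and strings are illustrative).
if __name__ == '__main__':
    model = ABCNN()  # illustrative constructor
    ranked = model.ans_select(
        'Where is the football?',
        ['John moved to the hallway.',
         'John dropped the football in the garden.'])
    for ans, score in ranked:
        print(score, ans)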
def vis_tokenize(context, question):
    glove = utils.load_glove(dim=200)
    para_list = []
    # Paragraphs arrive separated by the literal two-character sequence '\n'.
    paras = [para for para in context.split('\\n') if para != '']
    for para in paras:
        sent_list = []
        for sent in sent_tokenize(para):
            temp = {}
            temp['words'] = word_tokenize(sent)
            temp['vectors'] = [
                np.array(glove[word.lower()]) for word in temp['words']
            ]
            sent_list.append(temp)
        para_list.append(sent_list)
    q_dict = {}
    q_dict['words'] = word_tokenize(question)
    q_dict['vectors'] = [
        np.array(glove[word.lower()]) for word in q_dict['words']
    ]
    return para_list, q_dict
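# Usage sketch showing the returned structure (strings are illustrative):
# para_list[i][j] is {'words': [...], 'vectors': [...]} for sentence j of
# paragraph i, and q_dict holds the same keys for the question.
if __name__ == '__main__':
    para_list, q_dict = vis_tokenize(
        'John went to the garden.\\nMary stayed home.', 'Who went to the garden?')
    print(len(para_list), 'paragraphs,', q_dict['words'])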
def main():
    glove = utils.load_glove()
    vector = []
    # query = sys.argv[1]
    # file_name = sys.argv[2]
    file_name = "../data/corpus/cricket.txt"
    # query = "what is the role of bat in cricket"
    query = "what does the batsman do with a ball"
    with open(file_name, 'r') as f:
        doc = list(filter(('\n').__ne__, f.readlines()))
    tfidf_measure = np.array(tf_idf(doc, query))
    # Keep the three paragraphs with the highest tf-idf scores.
    top_indices = tfidf_measure.argsort()[-3:][::-1]
    for index in top_indices:
        para = doc[index]
        para_word_vec = get_word_vecs(para, glove)
        measure = centroid(para_word_vec)
        vector.append((para, measure))
    query_measure = centroid(get_word_vecs(query, glove))
    print("\n" + query)
    print("\n" + get_most_relevant(vector, query_measure))
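# The helpers `get_word_vecs`, `centroid`, and `get_most_relevant` are not
# shown here. A minimal sketch of their assumed behavior, inferred from this
# call site (here `get_most_relevant` must return the single best paragraph
# string; `_retrieve_info` below appears to use a variant that returns the
# full ranked list). These are sketches, not the repo's actual code:
import numpy as np

def get_word_vecs_sketch(text, glove):
    # GloVe vector for every word the embedding table knows about.
    return [np.array(glove[w.lower()]) for w in text.split() if w.lower() in glove]

def centroid_sketch(word_vecs):
    # Summarize a text as the mean of its word vectors.
    return np.mean(word_vecs, axis=0)

def get_most_relevant_sketch(scored_paras, query_centroid):
    # Rank (para, centroid) pairs by cosine similarity to the query centroid.
    def cos(u, v):
        return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
    return max(scored_paras, key=lambda pc: cos(pc[1], query_centroid))[0]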
def test_ans_select(self):
    babi = utils.get_babi_raw_for_abcnn(babi_id='1', mode='test')
    babi = utils.process_babi_for_abcnn(babi)
    shuffle(babi)
    babi = babi[:100]
    instances, correct_op = len(babi), 0
    _, _, output_layer_test, _ = self.model()
    with tf.Session() as sess:
        filename, _, _ = self.model_state_loader()
        try:
            saver = tf.train.Saver()
            print(filename)
            saver.restore(sess, filename)
            print(' > Model state restored from @ ' + filename)
        except Exception as e:
            print(e)
            print(' > No saved state found. Exiting')
            sess.close()
            sys.exit()
        glove = utils.load_glove(200)
        for sample in tqdm(babi, total=len(babi), ncols=75, unit='Sample '):
            line_numbers, context, question, _, support = sample
            ans_sents = []
            tfidf, word_cnt = self.extract_features(question, context)
            for i, ans in enumerate(context):
                q_vector, a_vector = self.qa_vectorize(question, ans, glove)
                input_dict = {
                    self.q: q_vector,
                    self.a: a_vector,
                    self.label: None,
                    self.word_cnt: word_cnt[i],
                    self.tfidf: tfidf[i],
                }
                pred = sess.run(output_layer_test, feed_dict=input_dict)
                ans_sents.append((ans, pred))
            # The highest-scoring sentence is the predicted supporting fact.
            ans_sent, _ = max(ans_sents, key=operator.itemgetter(1))
            pred_labl = line_numbers[context.index(ans_sent)]
            ans_sents = sorted(ans_sents, key=operator.itemgetter(1),
                               reverse=True)
            all_labels = [line_numbers[context.index(item[0])]
                          for item in ans_sents]
            with open('context_accuracy_abcnn.txt', 'a') as f:
                res = ('Correct Label: {}\tPredicted Label: {}\t'
                       'Sorted Labels: {}\n').format(support, pred_labl,
                                                     all_labels)
                f.write(res)
            if pred_labl == support:
                correct_op += 1
    accuracy = correct_op / instances
    print('Accuracy: {0:.2f}'.format(accuracy))
def read_dataset():
    global glove
    global dep_tags
    global nlp
    nlp = spacy.load('en')
    # Pre-processing before training the SentEmbd model: convert each word of
    # the SICK sentence pairs to its vector representation.
    training_set = os.path.join(BASE, 'data', 'SICK.txt')
    with open(training_set, 'r') as file1:
        raw_dataset = file1.read().split('\n')
    dataset = []
    training_dataset = []
    sim_dataset = []
    relatedness_scores = []
    depTags_training_dataset = []
    depTags_sim_dataset = []
    # Drop the header row and the trailing empty line.
    raw_dataset = raw_dataset[1:-1]
    for item in raw_dataset:
        temp = item.split('\t')
        temp2 = temp[4]
        temp = temp[1:3]
        temp.append(temp2.strip())
        dataset.append(temp)
    glove = utils.load_glove()
    dep_tags = utils.load_dep_tags()
    for item in dataset:
        vectorized_sent1, dep_tags_1 = utils.get_sent_details(
            item[0].strip(), glove, dep_tags, nlp)
        vectorized_sent2, dep_tags_2 = utils.get_sent_details(
            item[1].strip(), glove, dep_tags, nlp)
        training_dataset.append(vectorized_sent1)
        depTags_training_dataset.append(dep_tags_1)
        sim_dataset.append(vectorized_sent2)
        depTags_sim_dataset.append(dep_tags_2)
        relatedness_scores.append(float(item[2]))
    return (dataset, training_dataset, sim_dataset, relatedness_scores,
            depTags_training_dataset, depTags_sim_dataset)
def _retrieve_info(doc, query):
    glove = utils.load_glove(dim=200)
    vector = []
    ir_dict = {}
    doc = [x for x in doc.split('\n') if x != '']
    print(doc)
    tfidf_measure = np.array(tf_idf(doc, query)[0])
    print(tfidf_measure)
    # Keep the ten paragraphs with the highest tf-idf scores.
    top_indices = tfidf_measure.argsort()[-10:][::-1]
    for index in top_indices:
        para = doc[index]
        print(para)
        print('-' * 10)
        para_word_vec = get_word_vecs(para, glove)
        p_centr = centroid(para_word_vec)
        p_tfidf = tfidf_measure[index]
        vector.append((para, p_centr, p_tfidf))
    query_measure = centroid(get_word_vecs(query, glove))
    # Ranked paras: (para, centroid, tfidf, cosine_sim)
    top_para_list = get_most_relevant(vector, query_measure)
    top_para_dict = []
    for para in top_para_list:
        entry = {}
        entry['para'] = para[0]
        entry['centroid'] = para[1]
        entry['tf_idf'] = para[2]
        entry['cosine_sim'] = para[3]
        top_para_dict.append(entry)
    ir_dict['question'] = query
    ir_dict['q_centroid'] = query_measure
    ir_dict['top_paras'] = top_para_dict
    return ir_dict
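# Usage sketch (assumption: the document is a newline-separated string; the
# sample text here is illustrative).
if __name__ == '__main__':
    sample_doc = ('Cricket is played with a bat and a ball.\n'
                  'The batsman strikes the ball bowled at the wicket.')
    ir = _retrieve_info(sample_doc, 'what does the batsman do with a ball')
    for entry in ir['top_paras']:
        print(entry['cosine_sim'], entry['para'])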
def main():
    start = time.time()
    query = sys.argv[1]
    glove = utils.load_glove()
    corpus_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'data.bak', 'corpus', 'babi1.txt')
    quest = utils.init_babi_deploy(corpus_path, query)
    dmn = dmn_basic.DMN_basic(
        babi_train_raw=quest,
        babi_test_raw=[],
        word2vec=glove,
        word_vector_size=50,
        dim=40,
        mode='deploy',
        answer_module='feedforward',
        input_mask_mode='sentence',
        memory_hops=5,
        l2=0,
        normalize_attention=False,
        answer_vec='index',
        debug=False)
    dmn.load_state(
        'states/dmn_basic/dmn_basic.mh5.n40.bs10.babi1.epoch2.test1.20454.state')
    prediction = dmn.step_deploy()
    prediction = prediction[0][0]
    # Print candidate answers in descending order of predicted probability.
    for ind in prediction.argsort()[::-1]:
        if ind < dmn.answer_size:
            print(dmn.ivocab[ind], prediction[ind])
    print('Time taken:', time.time() - start)
def retrieve_info(doc, query):
    glove = utils.load_glove()
    vector = []
    tfidf_measure = np.array(tf_idf(doc, query)[0])
    # Keep the ten paragraphs with the highest tf-idf scores.
    top_indices = tfidf_measure.argsort()[-10:][::-1]
    for index in top_indices:
        para = doc[index]
        para_word_vec = get_word_vecs(para, glove)
        measure = centroid(para_word_vec)
        vector.append((para, measure))
    query_measure = centroid(get_word_vecs(query, glove))
    return get_most_relevant(vector, query_measure)
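# Usage sketch (assumption: `doc` is already a list of paragraph strings,
# unlike `_retrieve_info` above, which splits a raw string itself).
if __name__ == '__main__':
    paras = ['Cricket is played with a bat and a ball.',
             'The batsman strikes the ball bowled at the wicket.']
    print(retrieve_info(paras, 'what does the batsman do with a ball'))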
assert args.word_vector_size in [50, 100, 200, 300]

network_name = args.prefix + '%s.mh%d.n%d.bs%d%s%s%s.babi%s' % (
    args.network,
    args.memory_hops,
    args.dim,
    args.batch_size,
    '.na' if args.normalize_attention else '',
    '.bn' if args.batch_norm else '',
    ('.d' + str(args.dropout)) if args.dropout > 0 else '',
    args.babi_id)

if args.mode != 'deploy':
    babi_train_raw, babi_test_raw = utils.get_babi_raw(args.babi_id,
                                                       args.babi_test_id)

word2vec = utils.load_glove(args.word_vector_size)
args_dict = dict(args._get_kwargs())

if args.mode != 'deploy':
    args_dict['babi_train_raw'] = babi_train_raw
    args_dict['babi_test_raw'] = babi_test_raw
    args_dict['babi_deploy_raw'] = None
else:
    raw_task = utils.init_babi_deploy(
        '/home/mit/Desktop/EruditeX/data/corpus/babi.txt', args.query)
    args_dict['babi_train_raw'] = None
    args_dict['babi_test_raw'] = None
    args_dict['babi_deploy_raw'] = raw_task
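# Quick sanity check of the name format (argument values assumed from the
# state-file name used elsewhere in this repo, 'dmn_basic.mh5.n40.bs10.babi1.*';
# prefix empty and all optional flags off):
_example_name = '' + '%s.mh%d.n%d.bs%d%s%s%s.babi%s' % (
    'dmn_basic', 5, 40, 10, '', '', '', '1')
assert _example_name == 'dmn_basic.mh5.n40.bs10.babi1'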