def interactive_mode(self):
    """Console REPL for the dialog model.

    Reads lowercased user utterances from stdin, predicts a candidate
    response, prints it, and stores both sides of the turn (tagged with
    speaker markers ``$u``/``$r`` and a ``#<turn>`` marker) in the fact
    memory. ``exit`` quits; ``restart`` clears the memory.
    """
    memory = []
    turn = 1
    while True:
        user_line = input("==> ").strip().lower()
        if user_line == "exit":
            return
        if user_line == "restart":
            memory, turn = [], 1
            print("Restarting dialog...\n")
            continue

        user_tokens = tokenize(user_line)
        # Vectorize the (memory, utterance) pair and predict a response.
        f, q, a = vectorize_data([(memory, user_tokens, -1)], self.word_idx,
                                 self.sentence_size, self.batch_size,
                                 self.memory_size)
        predicted = self.idx_to_candidates[self.model.predict(f, q)[0]]
        print(predicted)

        bot_tokens = tokenize(predicted)
        # Speaker + turn-count temporal encoding, then grow the memory.
        user_tokens += ["$u", "#" + str(turn)]
        bot_tokens += ["$r", "#" + str(turn)]
        memory.append(user_tokens)
        memory.append(bot_tokens)
        turn += 1
def interactive(self): context = [['male', 'young', '$r', '#0']] # context = [] u = None r = None nid = 1 while True: line = input('--> ').strip().lower() if line == 'exit': break if line == 'restart': context = [['female', 'young', '$r', '#0']] # context = [] nid = 1 print("clear memory") continue u = tokenize(line) data = [(context, u, -1)] s, q, a = vectorize_data(data, self.word_idx, self.sentence_size, self.batch_size, self.n_cand, self.memory_size) preds = self.model.predict(s, q) r = self.indx2candid[preds[0]] print(r) r = tokenize(r) u.append('$u') u.append('#' + str(nid)) r.append('$r') r.append('#' + str(nid)) context.append(u) context.append(r) nid += 1
def interactive(self):
    """Console REPL for the dialog model.

    Reads user utterances from stdin, predicts a response with the trained
    model, prints it, and appends both sides of the exchange (tagged with
    speaker markers ``$u``/``$r`` and a ``#<turn>`` marker) to the dialog
    memory. Type 'exit' to quit, 'restart' to clear the memory.
    """
    context = []
    u = None
    r = None
    nid = 1  # 1-based turn counter
    while True:
        # FIX: raw_input() exists only in Python 2; the sibling interactive
        # loops in this codebase use the Python 3 input() API, so do the
        # same here.
        line = input('--> ').strip().lower()
        if line == 'exit':
            break
        if line == 'restart':
            context = []
            nid = 1
            print("clear memory")
            continue
        u = tokenize(line)
        data = [(context, u, -1)]
        s, q, a = vectorize_data(
            data, self.word_idx, self.sentence_size, self.batch_size,
            self.n_cand, self.memory_size)
        preds = self.model.predict(s, q)
        r = self.indx2candid[preds[0]]
        print(r)
        r = tokenize(r)
        # Tag both sides with speaker and turn markers, then store them.
        u.append('$u')
        u.append('#' + str(nid))
        r.append('$r')
        r.append('#' + str(nid))
        context.append(u)
        context.append(r)
        nid += 1
def interactive(self):
    """Console REPL for the PyTorch memory-network dialog model.

    Vectorizes the running dialog history plus the new utterance, wraps the
    arrays as torch Variables, predicts a candidate response, prints it, and
    appends both tagged sides of the turn to the history.
    'exit' quits; 'restart' clears the history.
    """
    history = []
    turn = 1
    while True:
        line = input('--> ').strip().lower()
        if line == 'exit':
            break
        if line == 'restart':
            history = []
            turn = 1
            print("clear memory")
            continue

        query = tokenize(line)
        s, q, a = vectorize_data([(history, query, -1)], self.word_idx,
                                 self.sentence_size, self.batch_size,
                                 self.n_cand, self.memory_size)
        story_var = Variable(torch.from_numpy(np.stack(s)))
        query_var = Variable(torch.from_numpy(np.stack(q)))
        # Built for parity with the training path; predict() does not take it.
        answer_var = Variable(torch.from_numpy(np.stack(a)))

        preds = list(self.model.predict(story_var, query_var).data.numpy().tolist())
        reply = self.indx2candid[preds[0]]
        print(reply)

        reply_tokens = tokenize(reply)
        # Speaker + turn markers, then grow the dialog history.
        query += ['$u', '#' + str(turn)]
        reply_tokens += ['$r', '#' + str(turn)]
        history.append(query)
        history.append(reply_tokens)
        turn += 1
def main(config):
    """Evaluate on ``config.file`` and print real vs. predicted answers.

    ``results`` is whatever ``Eval.run_txt`` returns; from the indexing
    below it appears to be (story start indices, raw lines, predicted ids,
    gold ids, accuracy, id->answer map) — TODO confirm against Eval.run_txt.
    """
    model = Eval(config)
    results = model.run_txt(config.file)
    print("Test accuracy: ", results[4])
    print("")
    for i in range(len(results[0])):
        counter = 0
        # Scan forward from the i-th story's start line until the line whose
        # first token is 21 (presumably the question line of a 21-line
        # bAbI-style story — verify against the data format).
        while True:
            line = results[1][results[0][i] + counter]
            if len(line) > 0:
                # print(line)
                # print("")
                counter += 1
                if int(tokenize(line)[0]) == 21:
                    idx = get_ans_index(line, results[5][str(results[2][i])])
                    break
            # NOTE(review): when an empty line is encountered, counter is
            # not advanced, so this loop would re-read the same line
            # forever — presumably the data contains no empty lines in a
            # story block; confirm.
        print("Real answer: ", results[5][str(results[3][i])],
              " Predicted answer: ", results[5][str(results[2][i])])
        print("Index of ans: ", idx)
        print("")
        # key = input("Press enter to continue...")
        # if key == 'q':
        #     break
    print("")
    print("Thanks!! :)")
def interactive(self):
    """Console REPL for the match-feature MemN2N dialog model.

    Vectorizes the running dialog history plus the new utterance with the
    word2vec-based pipeline, optionally builds KB match features, predicts a
    candidate response, prints it, and appends both tagged sides of the turn
    to the history. 'exit' quits; 'restart' clears the history.
    """
    history = []
    turn = 1
    while True:
        line = input('--> ').strip().lower()
        if line == 'exit':
            break
        if line == 'restart':
            history = []
            turn = 1
            print("clear memory")
            continue

        query = tokenize(line)
        # Need to take care of candidate sentence size > sentence size, here
        # and in main: whichever of candidate_size / candidate_sentence_size
        # is higher should be allowed.
        s, q, a = vectorize_data_match(
            [(history, query, -1)], self.word2vec, self.max_sentence_size,
            self.batch_size, self.n_cand, self.memory_size, self.vocab,
            self.ivocab, self.embedding_size, uncertain_word=True,
            uncertain=self.uncertain_word_index)

        match_feats = None
        if FLAGS.match:
            match_feats = create_match_features([(s, q, a)],
                                                self.indx2candid, self.kb)
        preds = self.model.predict(
            s, q, get_temporal_encoding(s, random_time=0.0), False,
            match_feats)
        reply = self.indx2candid[preds[0]]
        print(reply)

        reply_tokens = tokenize(reply)
        # Speaker + turn markers, then grow the dialog history.
        query += ['$u', '#' + str(turn)]
        reply_tokens += ['$r', '#' + str(turn)]
        history.append(query)
        history.append(reply_tokens)
        turn += 1
def set_lines_pos(self, lines):
    """Store ``lines`` and index where each dialog/story begins.

    A line whose first token is the integer 1 marks the start of a new
    example; ``self.pos`` collects the indices of those lines and
    ``self.len_dataset`` is the number of examples found.
    """
    self.lines = lines
    # Idiom fix: enumerate() instead of a hand-rolled counter.
    pos = []
    for i, line in enumerate(lines):
        tokens = tokenize(line)
        if tokens and int(tokens[0]) == 1:
            pos.append(i)
    self.pos = pos
    self.len_dataset = len(self.pos)
def encode(tokenizer: BertTokenizer, sentence: str, max_len: int = 50) -> np.ndarray:
    """Encode one sentence into an array of BERT input ids.

    :param tokenizer: BERT tokenizer used to convert the text to ids.
    :param sentence: raw input sentence.
    :param max_len: maximum sequence length passed to ``tokenize``
        (default 50, the value previously hard-coded).
    :return: 1-D array of token ids.

    Fixes: the previous return annotation claimed a Tuple of two arrays but
    a single array is returned, and the docstring documented nonexistent
    ``text``/``labels`` parameters.
    """
    return np.array(tokenize(tokenizer, sentence, max_len)['input_ids'])
def interactive(model, indx2candid, cands_tensor, word_idx, sentence_size,
                memory_size, cuda=False):
    """Console REPL driving a torch dialog model.

    Vectorizes the running dialog history plus the new utterance, moves the
    tensors to GPU when ``cuda`` is set, scores the candidate tensor, prints
    the predicted response, and appends both tagged sides of the turn to the
    history. 'exit' quits; 'restart' clears the history.
    """
    history = []
    turn = 1
    while True:
        line = input('--> ').strip().lower()
        if line == 'exit':
            break
        if line == 'restart':
            history = []
            turn = 1
            print("clear memory")
            continue

        query = tokenize(line)
        s, q, a, entity_dict = vectorize_data([(history, query, -1)],
                                              word_idx, sentence_size,
                                              memory_size)
        memory = V(torch.from_numpy(np.stack(s)))
        utter = V(torch.from_numpy(np.stack(q)))
        if cuda:
            memory = transfer_to_gpu(memory)
            utter = transfer_to_gpu(utter)

        context_, cand_ = model(utter, memory, cands_tensor)
        preds = model.predict(context_, cand_)
        reply = indx2candid[preds.data[0]]
        print(reply)

        reply_tokens = tokenize(reply)
        # Speaker + turn markers, then grow the dialog history.
        query += ['$u', '#' + str(turn)]
        reply_tokens += ['$r', '#' + str(turn)]
        history.append(query)
        history.append(reply_tokens)
        turn += 1
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False,
             memory_size=50, random_state=None, batch_size=32,
             learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0,
             evaluation_interval=10, hops=3, epochs=200, embedding_size=100):
    """Build vocab, vectorize the bAbI dialog data, and construct the
    TF1 match-feature MemN2N model plus its session/saver/summary writer.

    :param data_dir: directory holding the bAbI dialog task files.
    :param model_dir: directory for checkpoints.
    :param task_id: bAbI dialog task number.
    :param OOV: use the out-of-vocabulary test split when True.
    Remaining parameters are standard training hyperparameters.
    """
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    # self.isTrain=isTrain
    self.isInteractive = isInteractive
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    self.vocab = {}
    self.ivocab = {}
    self.word2vec = {}
    self.word2vec_init = True
    if self.word2vec_init:
        # assert config.embed_size == 100
        self.word2vec = load_glove(self.embedding_size)
    # Register the end-of-sentence token in the vocab.
    process_word(word="<eos>", word2vec=self.word2vec, vocab=self.vocab,
                 ivocab=self.ivocab, word_vector_size=self.embedding_size,
                 to_return="index")
    # Define uncertain or unknown word index and vec for use later for
    # training out-of-context data (the literal is a deliberate nonsense
    # word so it maps to an "unknown" embedding).
    self.uncertain_word_index = process_word(
        word="sdfsssdf", word2vec=self.word2vec, vocab=self.vocab,
        ivocab=self.ivocab, word_vector_size=self.embedding_size,
        to_return="index")
    candidates, self.candid2indx = load_candidates(self.data_dir,
                                                   self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates)
    self.set_max_sentence_length()
    # self.candidates_vec=vectorize_candidates_sparse(candidates,self.word_idx)
    # NOTE(review): the train call omits uncertain_word=True while the val
    # call passes it — confirm this asymmetry is intentional.
    self.trainS, self.trainQ, self.trainA = vectorize_data_match(
        self.trainData, self.word2vec, self.max_sentence_size,
        self.batch_size, self.n_cand, self.memory_size, self.vocab,
        self.ivocab, self.embedding_size,
        uncertain=self.uncertain_word_index)
    self.valS, self.valQ, self.valA = vectorize_data_match(
        self.valData, self.word2vec, self.max_sentence_size,
        self.batch_size, self.n_cand, self.memory_size, self.vocab,
        self.ivocab, self.embedding_size, uncertain_word=True,
        uncertain=self.uncertain_word_index)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word2vec, self.candidate_sentence_size, self.vocab,
        self.ivocab, self.embedding_size)
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                       epsilon=self.epsilon)
    self.sess = tf.Session()
    # Set max sentence vector size
    # NOTE(review): build_vocab was already called above with the same
    # arguments — this second call looks redundant; verify before removing.
    self.build_vocab(data, candidates)
    # Build an n-hot (vocab_size x n_candidates) matrix marking which vocab
    # words appear in each candidate answer.
    answer_n_hot = np.zeros((self.vocab_size, len(self.candid2indx)))
    for ans_it in range(len(self.indx2candid)):
        ans = self.indx2candid[ans_it]
        n_hot = np.zeros((self.vocab_size, ))
        for w in tokenize(ans):
            assert w in self.word_idx
            n_hot[self.word_idx[w]] = 1
        answer_n_hot[:, ans_it] = n_hot
    # Need to understand more about sentence size. Model failing because
    # sentence size > candidate_sentence_size? Answers longer than queries?
    self.model = MemN2NDialogHybridMatch(
        self.batch_size, self.vocab_size, self.max_sentence_size,
        self.memory_size, self.embedding_size, answer_n_hot,
        match=FLAGS.match, session=self.sess, hops=self.hops,
        max_grad_norm=self.max_grad_norm, optimizer=optimizer,
        task_id=self.task_id)
    # self.model = MemN2NDialogHybrid(self.batch_size, self.vocab_size, self.n_cand, self.max_sentence_size, self.embedding_size, self.candidates_vec, session=self.sess,
    #                                 hops=self.hops, max_grad_norm=self.max_grad_norm, optimizer=optimizer, task_id=task_id)
    self.saver = tf.train.Saver(max_to_keep=50)
    self.summary_writer = tf.summary.FileWriter(
        self.model.root_dir, self.model.graph_output.graph)
    self.kb = parse_kb(FLAGS.kb_file)
response.append("$r") # Add utterance/response encoding utterance.append("#" + str(turn_count)) response.append("#" + str(turn_count)) # Update facts memory facts.append(utterance) facts.append(response) turn_count += 1 if __name__ == "__main__": candidates = [] candidates_to_idx = {} with open('dialog-babi/dialog-babi-candidates.txt') as f: for i, line in enumerate(f): candidates_to_idx[line.strip().split(' ', 1)[1]] = i line = tokenize(line.strip())[1:] candidates.append(line) train_data = [] with open('dialog-babi/dialog-babi-task5-full-dialogs-trn.txt') as f: train_data = parse_dialogs_per_response(f.readlines(), candidates_to_idx) test_data = [] with open('dialog-babi/dialog-babi-task5-full-dialogs-tst.txt') as f: test_data = parse_dialogs_per_response(f.readlines(), candidates_to_idx) val_data = [] with open('dialog-babi/dialog-babi-task5-full-dialogs-dev.txt') as f: val_data = parse_dialogs_per_response(f.readlines(), candidates_to_idx) chatbot = ChatBotWrapper(train_data, test_data, val_data,
# Parameters params = { 'batch_size': 256, 'n_classes': 2, 'max_len': 200, 'n_words': 20000, 'shuffle': True } learning_rate = 0.5 df_train = pd.read_csv(DATA_TRAIN, names=['label', 'text']) df_test = pd.read_csv(DATA_TEST, names=['label', 'text']) tokenizer = tokenize(df_train['text'].values, n_words=params['n_words']) train_x, valid_x, train_y, valid_y = \ model_selection.train_test_split(df_train['text'].values, df_train['label'].values) # Generators training_generator = DataGenerator(train_x, train_y, tokenizer, **params) valid_generator = DataGenerator(valid_x, valid_y, tokenizer, **params) #test_generator = DataGenerator(df_test, **params) # load the pre-trained word-embedding vectors embeddings_index = {} for i, line in enumerate(open('lmdata/wiki-news-300d-1M.vec', encoding='utf8')): if i % 10000 == 0: print(i)
#word_idx, _ = data_utils.build_vocab(essay_list, vocab_limit) sent_size_list = list(map(len, [essay for essay in essay_list])) max_sent_size = max(sent_size_list) mean_sent_size = int(np.mean(list(map(len, essay_list)))) memory = [] memory_score = [] if mem_file: print('MEM_FILE:', mem_file) file_memory = 'mems/{}.txt'.format(essay_set_id) logger.info('Reading memory from file {}...'.format(file_memory)) logger.info('Reading memory from {}!!!!'.format(file_memory)) with open(file_memory, 'r') as fm: memory_list = [data_utils.tokenize(m) for m in fm] num_mem = len(memory_list) mem_size_list = [len(m) for m in memory_list] max_mem_size = max(mem_size_list) for m in memory_list: logger.info('MEMORY: {}'.format(m)) if max_sent_size < max_mem_size: max_sent_size = max_mem_size memory = data_utils.vectorize_data(memory_list, word_idx, max_sent_size) memory_score = [0] * num_mem logger.info('max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size)) with open(out_dir+'/params', 'a') as f: f.write('max sentence size: {} \nmean sentence size: {}\n'.format(max_sent_size, mean_sent_size)) logger.info('The length of score range is {}'.format(len(score_range)))
import joblib
import numpy as np
import pickle
from gensim.models import Word2Vec, Doc2Vec
from data_utils import load_text_data, tokenize
from embedding_utils import (vectorize_matrix_with_word2vec,
                             vectorize_matrix_with_doc2vec)
from features import mk_feature

if __name__ == "__main__":
    # load and preprocess
    indices, documents = load_text_data('data/rating_eval.txt')
    document_tokens = [tokenize(document) for document in documents]

    # load vectorizer and embed
    # count vectorizer: re-join tokens so the fitted CountVectorizer can
    # apply its own tokenization consistently.
    count_v = joblib.load('model/count_vectorizer.pickle')
    count_v_test = count_v.transform(
        [' '.join(tokens) for tokens in document_tokens]).toarray()
    additional_feature = mk_feature(documents, document_tokens)

    # word2vec: document embeddings concatenated with hand-crafted features.
    w2v_model = Word2Vec.load('model/model_w2v_128.model')
    w2v_test = np.concatenate(
        (vectorize_matrix_with_word2vec(document_tokens, model=w2v_model),
         mk_feature(documents, document_tokens)), axis=1)

    # doc2vec
    # NOTE(review): d2v_model is loaded but not used in this chunk —
    # presumably consumed further down the file.
    d2v_model = Doc2Vec.load('model/doc2vec.model')
def __getitem__(self, index: int) -> Any:
    """Return the token-id tensor for the corpus entry at ``index``."""
    encoded = tokenize(self.tokenizer, self.corpus[index], self.max_len)
    return T.LongTensor(encoded['input_ids'])