def init(data_dir, task_id, OOV=False):
    # load candidates
    candidates, candid2indx = load_candidates(data_dir, task_id)
    n_cand = len(candidates)
    print("Candidate Size", n_cand)
    indx2candid = dict((candid2indx[key], key) for key in candid2indx)
    # load task data
    train_data, test_data, val_data = load_dialog_task(
        data_dir, task_id, candid2indx, OOV)
    data = train_data + test_data + val_data
    # build parameters
    word_idx, sentence_size, candidate_sentence_size, memory_size, vocab_size = \
        build_vocab(data, candidates)
    # Variable(torch.from_numpy(candidates_vec)).view(len(candidates), sentence_size)
    candidates_vec = vectorize_candidates(
        candidates, word_idx, candidate_sentence_size)
    return (candid2indx, indx2candid, candidates_vec, word_idx,
            sentence_size, candidate_sentence_size, memory_size,
            vocab_size, train_data, test_data, val_data)
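
# A minimal usage sketch for init() above. The data directory is hypothetical;
# init() returns an 11-tuple, unpacked here in declaration order.
(candid2indx, indx2candid, candidates_vec, word_idx,
 sentence_size, candidate_sentence_size, memory_size,
 vocab_size, train_data, test_data, val_data) = init(
    data_dir='data/dialog-bAbI-tasks/',  # hypothetical path
    task_id=1,
    OOV=False)
print('vocab size:', vocab_size, '| memory size:', memory_size)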
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False,
             memory_size=50, random_state=None, batch_size=32,
             learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0,
             evaluation_interval=10, hops=3, epochs=200, embedding_size=20,
             intro_times=20):
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    # self.isTrain=isTrain
    self.isInteractive = isInteractive
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    self.intro_times = intro_times
    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates)
    # build the set of words seen in the training/validation data and count
    # how many words in the full data fall outside it
    self.train_val_wordset = self.words_set(self.valData + self.trainData)
    all_wordset = self.words_set(data)
    no_oov_word = len(self.train_val_wordset)
    with_oov_word = len(all_wordset)
    print('oov words', with_oov_word - no_oov_word)
    # new_words = [w for w in all_wordset if w not in self.train_val_wordset]
    # print('These words are new:', [self.idx_word[w] for w in new_words])
    # self.candidates_vec = vectorize_candidates_sparse(candidates, self.word_idx)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word_idx, self.candidate_sentence_size)
    optimizer = tf.train.AdamOptimizer(
        learning_rate=self.learning_rate, epsilon=self.epsilon)
    self.sess = tf.Session()
    self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand,
                              self.sentence_size, self.embedding_size,
                              self.candidates_vec, session=self.sess,
                              hops=self.hops, max_grad_norm=self.max_grad_norm,
                              optimizer=optimizer, task_id=task_id,
                              introspection_times=self.intro_times)
    self.saver = tf.train.Saver(max_to_keep=1)
    self.summary_writer = tf.summary.FileWriter(
        self.model.root_dir, self.model.graph_output.graph)
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False,
             memory_size=50, random_state=None, batch_size=32,
             learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0,
             evaluation_interval=10, hops=3, epochs=200, embedding_size=20):
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    self.isInteractive = isInteractive
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates)
    # self.candidates_vec = vectorize_candidates_sparse(candidates, self.word_idx)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word_idx, self.candidate_sentence_size)
    self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand,
                              self.sentence_size, self.embedding_size,
                              self.candidates_vec, hops=self.hops,
                              max_grad_norm=self.max_grad_norm, task_id=task_id)
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False,
             memory_size=50, random_state=None, batch_size=32,
             learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0,
             evaluation_interval=10, hops=3, epochs=200, embedding_size=20):
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    # self.isTrain=isTrain
    self.isInteractive = isInteractive
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates)
    # self.candidates_vec = vectorize_candidates_sparse(candidates, self.word_idx)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word_idx, self.candidate_sentence_size)
    optimizer = tf.train.AdamOptimizer(
        learning_rate=self.learning_rate, epsilon=self.epsilon)
    self.sess = tf.Session()
    self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand,
                              self.sentence_size, self.embedding_size,
                              self.candidates_vec, session=self.sess,
                              hops=self.hops, max_grad_norm=self.max_grad_norm,
                              optimizer=optimizer, task_id=task_id)
    self.saver = tf.train.Saver(max_to_keep=50)
    self.summary_writer = tf.summary.FileWriter(
        self.model.root_dir, self.model.graph_output.graph)
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False,
             memory_size=250, random_state=None, batch_size=32,
             learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0,
             evaluation_interval=10, hops=3, epochs=200, embedding_size=20,
             save_vocab=False, load_vocab=False):
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    # self.isTrain=isTrain
    self.isInteractive = isInteractive
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    self.save_vocab = save_vocab
    self.load_vocab = load_vocab
    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates, self.save_vocab, self.load_vocab)
    # self.candidates_vec = vectorize_candidates_sparse(candidates, self.word_idx)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word_idx, self.candidate_sentence_size)
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                       epsilon=self.epsilon)
    self.sess = tf.Session()
    self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand,
                              self.sentence_size, self.embedding_size,
                              self.candidates_vec, session=self.sess,
                              hops=self.hops, max_grad_norm=self.max_grad_norm,
                              optimizer=optimizer, task_id=task_id)
    self.saver = tf.train.Saver(max_to_keep=50)
    # self.summary_writer = tf.train.SummaryWriter(self.model.root_dir, self.model.graph_output.graph)
    self.summary_writer = tf.summary.FileWriter(
        self.model.root_dir, self.model.graph_output.graph)
def __init__(self, data_dir, model_dir, task_id, OOV=False, memory_size=250,
             random_state=None, batch_size=32, learning_rate=0.001,
             epsilon=1e-8, max_grad_norm=40.0, evaluation_interval=10, hops=3,
             epochs=10, embedding_size=20, save_vocab=False, load_vocab=False):
    """Creates a wrapper for training and testing a chatbot model.

    Args:
        data_dir: Directory containing personalized dialog tasks.
        model_dir: Directory containing memn2n model checkpoints.
        task_id: Personalized dialog task id, 1 <= id <= 5.
        OOV: If `True`, use the OOV test set. Defaults to `False`.
        memory_size: The max size of the memory. Defaults to `250`.
        random_state: Random state to set the graph-level random seed.
            Defaults to `None`.
        batch_size: Size of the batch for training. Defaults to `32`.
        learning_rate: Learning rate for the Adam optimizer. Defaults to
            `0.001`.
        epsilon: Epsilon value for the Adam optimizer. Defaults to `1e-8`.
        max_grad_norm: Maximum L2 norm clipping value. Defaults to `40.0`.
        evaluation_interval: Evaluate and print results every x epochs.
            Defaults to `10`.
        hops: The number of hops over memory for responding. A hop consists
            of reading and addressing a memory slot. Defaults to `3`.
        epochs: Number of training epochs. Defaults to `10`.
        embedding_size: The size of the word embedding. Defaults to `20`.
        save_vocab: If `True`, save the vocabulary file. Defaults to `False`.
        load_vocab: If `True`, load the vocabulary from file. Defaults to
            `False`.
    """
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    self.save_vocab = save_vocab
    self.load_vocab = load_vocab
    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    # print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # Task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates, self.save_vocab, self.load_vocab)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word_idx, self.candidate_sentence_size)
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                       epsilon=self.epsilon)
    self.sess = tf.Session()
    self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand,
                              self.sentence_size, self.embedding_size,
                              self.candidates_vec, session=self.sess,
                              hops=self.hops, max_grad_norm=self.max_grad_norm,
                              optimizer=optimizer, task_id=task_id)
    self.saver = tf.train.Saver(max_to_keep=50)
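
# A minimal instantiation sketch for the wrapper documented above. The class
# name and both paths are hypothetical; the remaining arguments use their
# documented defaults.
chatbot = ChatBotWrapper(  # hypothetical name for the class defining __init__ above
    data_dir='data/personalized-dialog-dataset/full/',  # hypothetical path
    model_dir='model/',                                 # hypothetical path
    task_id=1)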
def __init__(self, data_dir, model_dir, task_id, source, resFlag,
             wrong_conversations, error, acc_each_epoch, acc_ten_epoch,
             conv_wrong_right, epochs, OOV=False, memory_size=50,
             random_state=None, batch_size=32, learning_rate=0.001,
             epsilon=1e-8, max_grad_norm=40.0, evaluation_interval=10, hops=3,
             embedding_size=20):
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    self.source = source
    self.resFlag = resFlag
    self.wrong_conversations = wrong_conversations
    self.error = error
    self.acc_each_epoch = acc_each_epoch
    self.acc_ten_epoch = acc_ten_epoch
    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # create train, test and validation data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates)
    self.test_acc_list = []
    self.candidates_vec = vectorize_candidates(
        candidates, self.word_idx, self.candidate_sentence_size)
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                       epsilon=self.epsilon)
    self.sess = tf.Session()
    self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand,
                              self.sentence_size, self.embedding_size,
                              self.candidates_vec, session=self.sess,
                              hops=self.hops, max_grad_norm=self.max_grad_norm,
                              optimizer=optimizer, task_id=task_id,
                              source=self.source, resFlag=self.resFlag,
                              oov=self.OOV)
    self.saver = tf.train.Saver(max_to_keep=50)
    self.summary_writer = tf.summary.FileWriter(
        self.model.root_dir, self.model.graph_output.graph)
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False,
             memory_size=50, random_state=None, batch_size=32,
             learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0,
             evaluation_interval=10, hops=3, epochs=200, embedding_size=100):
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    # self.isTrain=isTrain
    self.isInteractive = isInteractive
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    self.vocab = {}
    self.ivocab = {}
    self.word2vec = {}
    self.word2vec_init = True
    if self.word2vec_init:
        # assert config.embed_size == 100
        self.word2vec = load_glove(self.embedding_size)
    process_word(word="<eos>", word2vec=self.word2vec, vocab=self.vocab,
                 ivocab=self.ivocab, word_vector_size=self.embedding_size,
                 to_return="index")
    # Define an uncertain/unknown word index and vector for use later when
    # training on out-of-context data
    self.uncertain_word_index = process_word(
        word="sdfsssdf", word2vec=self.word2vec, vocab=self.vocab,
        ivocab=self.ivocab, word_vector_size=self.embedding_size,
        to_return="index")
    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates)
    # Set max sentence vector size
    self.set_max_sentence_length()
    # self.candidates_vec = vectorize_candidates_sparse(candidates, self.word_idx)
    self.trainS, self.trainQ, self.trainA = vectorize_data_match(
        self.trainData, self.word2vec, self.max_sentence_size, self.batch_size,
        self.n_cand, self.memory_size, self.vocab, self.ivocab,
        self.embedding_size, uncertain=self.uncertain_word_index)
    self.valS, self.valQ, self.valA = vectorize_data_match(
        self.valData, self.word2vec, self.max_sentence_size, self.batch_size,
        self.n_cand, self.memory_size, self.vocab, self.ivocab,
        self.embedding_size, uncertain_word=True,
        uncertain=self.uncertain_word_index)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word2vec, self.candidate_sentence_size, self.vocab,
        self.ivocab, self.embedding_size)
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                       epsilon=self.epsilon)
    self.sess = tf.Session()
    # Build an n-hot answer matrix: one vocab-sized column per candidate,
    # with 1s at the indices of the words in that candidate answer
    answer_n_hot = np.zeros((self.vocab_size, len(self.candid2indx)))
    for ans_it in range(len(self.indx2candid)):
        ans = self.indx2candid[ans_it]
        n_hot = np.zeros((self.vocab_size,))
        for w in tokenize(ans):
            assert w in self.word_idx
            n_hot[self.word_idx[w]] = 1
        answer_n_hot[:, ans_it] = n_hot
    # Need to understand more about sentence size. Model failing because
    # sentence size > candidate_sentence_size? Answers longer than queries?
    self.model = MemN2NDialogHybridMatch(self.batch_size, self.vocab_size,
                                         self.max_sentence_size,
                                         self.memory_size, self.embedding_size,
                                         answer_n_hot, match=FLAGS.match,
                                         session=self.sess, hops=self.hops,
                                         max_grad_norm=self.max_grad_norm,
                                         optimizer=optimizer,
                                         task_id=self.task_id)
    # self.model = MemN2NDialogHybrid(self.batch_size, self.vocab_size,
    #                                 self.n_cand, self.max_sentence_size,
    #                                 self.embedding_size, self.candidates_vec,
    #                                 session=self.sess, hops=self.hops,
    #                                 max_grad_norm=self.max_grad_norm,
    #                                 optimizer=optimizer, task_id=task_id)
    self.saver = tf.train.Saver(max_to_keep=50)
    self.summary_writer = tf.summary.FileWriter(
        self.model.root_dir, self.model.graph_output.graph)
    self.kb = parse_kb(FLAGS.kb_file)
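
# A small self-contained sketch of the n-hot answer encoding built above:
# each candidate answer becomes a vocab-sized column with 1s at the indices
# of its words. Toy vocabulary and candidates; numpy only, split() standing
# in for tokenize().
import numpy as np

word_idx = {'api_call': 1, 'french': 2, 'paris': 3}  # toy vocab (0 reserved)
candidates = ['api_call french paris', 'paris']      # toy candidate answers
vocab_size = len(word_idx) + 1
answer_n_hot = np.zeros((vocab_size, len(candidates)))
for ans_it, ans in enumerate(candidates):
    for w in ans.split():
        answer_n_hot[word_idx[w], ans_it] = 1
print(answer_n_hot)  # shape (vocab_size, n_candidates)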
def main(args):
    # parse args
    args = parse_args(args)
    # prepare data
    if args['prep_data']:
        print('\n>> Preparing Data\n')
        prepare_data(args)
        sys.exit()
    # else: read data and metadata from pickled files
    with open(P_DATA_DIR + 'metadata.pkl', 'rb') as f:
        metadata = pkl.load(f)
    with open(P_DATA_DIR + 'data.pkl', 'rb') as f:
        data_ = pkl.load(f)
    # read content of data and metadata
    candidates = data_['candidates']
    candid2idx, idx2candid = metadata['candid2idx'], metadata['idx2candid']
    # get train/test/val data
    train, test, val = data_['train'], data_['test'], data_['val']
    # gather more information from metadata
    sentence_size = metadata['sentence_size']
    w2idx = metadata['w2idx']
    idx2w = metadata['idx2w']  # is a list
    memory_size = metadata['memory_size']
    vocab_size = metadata['vocab_size']
    n_cand = metadata['n_cand']
    candidate_sentence_size = metadata['candidate_sentence_size']
    # embeddings = metadata['embeddings']
    # vectorize candidates
    candidates_vec = data_utils.vectorize_candidates(
        candidates, w2idx, candidate_sentence_size)
    print('---- memory config ----')
    print('embedding size:', EMBEDDING_SIZE)
    print('batch_size:', BATCH_SIZE)
    print('memory_size:', memory_size)
    print('vocab_size:', vocab_size)
    print('candidate_size:', n_cand)
    print('candidate_sentence_size:', candidate_sentence_size)
    print('hops:', HOPS)
    print('---- end ----')
    # create model
    model = memn2n.MemN2NDialog(batch_size=BATCH_SIZE,
                                vocab_size=vocab_size,
                                candidates_size=n_cand,
                                sentence_size=sentence_size,
                                embedding_size=EMBEDDING_SIZE,
                                candidates_vec=candidates_vec,
                                hops=HOPS)
    # alternative: memn2n2.MemN2NDialog(..., embeddings=embeddings, ...)
    # also takes pretrained embeddings
    # gather data in batches
    train, val, test, batches = data_utils.get_batches(
        train, val, test, metadata, batch_size=BATCH_SIZE)
    if args['train']:
        # training starts here
        epochs = args['epochs']
        eval_interval = args['eval_interval']
        # restore from checkpoint
        _check_restore_parameters(model.get_sess(), model.saver, CKPT_DIR)
        # training and evaluation loop
        print('\n>> Training started!\n')
        # write log to file
        log_handle = open(dir_path + '/../../logs/' + args['log_file'], 'w')
        cost_total = 0.
        best_cost = 100
        # best_validation_accuracy = 0.
        lowest_val_acc = 0.8
        total_begin = time.perf_counter()  # time.clock() was removed in Python 3.8
        begin = time.perf_counter()
        for i in range(epochs + 1):
            for start, end in batches:
                s = train['s'][start:end]
                q = train['q'][start:end]
                a = train['a'][start:end]
                if config.MULTILABEL >= 1:
                    # convert label lists to multi-hot rows
                    one_hot = np.zeros((end - start, n_cand))
                    for aa in range(end - start):
                        for index in a[aa]:
                            one_hot[aa][index] = 1
                    a = one_hot
                cost_total += model.batch_fit(s, q, a)
            if config.MULTILABEL >= 1:
                if i % 1 == 0 and i:
                    print('stage...', i, cost_total)
                    if cost_total < best_cost:
                        print('saving model...', i, '++',
                              str(best_cost) + '-->' + str(cost_total))
                        best_cost = cost_total
                        model.saver.save(model.get_sess(),
                                         CKPT_DIR + '/memn2n_model.ckpt',
                                         global_step=i)
            else:
                if i % 1 == 0 and i:
                    print('stage...', i)
                if i % eval_interval == 0 and i:
                    train_preds = batch_predict(model, train['s'], train['q'],
                                                len(train['s']),
                                                batch_size=BATCH_SIZE)
                    # print every training example the model got wrong
                    for error in range(len(train['q'])):
                        if train_preds[error] != train['a'][error]:
                            print_out = recover(error, train['s'], train['q'],
                                                train_preds[error],
                                                train['a'][error],
                                                idx2w, idx2candid)
                            print(print_out)
                    val_preds = batch_predict(model, val['s'], val['q'],
                                              len(val['s']),
                                              batch_size=BATCH_SIZE)
                    train_acc = metrics.accuracy_score(
                        np.array(train_preds), train['a'])
                    val_acc = metrics.accuracy_score(val_preds, val['a'])
                    end = time.perf_counter()
                    print('Epoch[{}] : <ACCURACY>\n\ttraining : {}\n\t'
                          'validation : {}\n\tcurrent_best_accuracy: {}'.format(
                              i, train_acc, val_acc, lowest_val_acc))
                    print('time:{}'.format(end - begin))
                    # log_handle.write('{} {} {} {}\n'.format(
                    #     i, train_acc, val_acc,
                    #     cost_total / (eval_interval * len(batches))))
                    cost_total = 0.  # reset accumulated cost
                    begin = end
                    # save the best model to disk
                    # if val_acc > best_validation_accuracy:
                    #     best_validation_accuracy = val_acc
                    if train_acc > lowest_val_acc:
                        print('saving model...', train_acc, lowest_val_acc)
                        lowest_val_acc = train_acc
                        model.saver.save(model.get_sess(),
                                         CKPT_DIR + '/memn2n_model.ckpt',
                                         global_step=i)
        total_end = time.perf_counter()
        print('Total time: {} minutes.'.format((total_end - total_begin) / 60))
        log_handle.close()
    else:
        # inference: restore checkpoint and run an interactive session
        # ckpt = tf.train.get_checkpoint_state(CKPT_DIR)
        # if ckpt and ckpt.model_checkpoint_path:
        #     print('\n>> restoring checkpoint from', ckpt.model_checkpoint_path)
        #     model.saver.restore(model.get_sess(), ckpt.model_checkpoint_path)
        # isess = InteractiveSession(model, idx2candid, w2idx, n_cand, memory_size)
        # if args['infer']:
        #     query = ''
        #     while query != 'exit':
        #         query = input('>> ')
        #         print('>> ' + isess.reply(query))
        # elif args['ui']:
        #     return isess
        pass
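
# A standalone sketch of the multi-label one-hot conversion used in the
# training loop above: each example's list of candidate indices becomes a
# multi-hot row. Toy label lists; numpy only.
import numpy as np

n_cand = 5
labels = [[0, 3], [2]]          # toy: candidate indices per example
one_hot = np.zeros((len(labels), n_cand))
for row, idxs in enumerate(labels):
    one_hot[row, idxs] = 1      # fancy indexing sets all listed indices at once
print(one_hot)
# [[1. 0. 0. 1. 0.]
#  [0. 0. 1. 0. 0.]]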
def __init__(self, data_dir, task_id, OOV=False, memory_size=50, train=0,
             batch_size=32, nn=False):
    self.data_dir = data_dir
    self.task_id = task_id
    self.OOV = OOV
    self.memory_size = memory_size
    self.train = train
    self.batch_size = batch_size
    self.nn = nn
    candidates, self.candid2indx = load_candidates(self.data_dir, self.task_id)
    self.n_cand = len(candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    self.build_vocab(data, candidates)
    self.candidates_vec = vectorize_candidates(
        candidates, self.word_idx, self.candidate_sentence_size)
    self.params = {
        'n_cand': self.n_cand,
        'indx2candid': self.indx2candid,
        'candid2indx': self.candid2indx,
        'candidates_vec': self.candidates_vec,
        'word_idx': self.word_idx,
        'sentence_size': self.sentence_size,
        'candidate_sentence_size': self.candidate_sentence_size,
        'vocab_size': self.vocab_size
    }
    # select the split to vectorize: 0 = train, 1 = validation, 2 = test
    if self.train == 0:
        split = self.trainData
    elif self.train == 1:
        split = self.valData
    elif self.train == 2:
        split = self.testData
    if self.nn:
        self.S, self.Q, self.A = vectorize_data(
            split, self.word_idx, self.sentence_size, self.batch_size,
            self.n_cand, self.memory_size, nn=self.nn)
    else:
        self.S, self.Q, self.A = vectorize_data(
            split, self.word_idx, self.sentence_size, self.batch_size,
            self.n_cand, self.memory_size)
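
# A minimal usage sketch for the loader above. The class name and data path
# are hypothetical; train selects the split to vectorize (0 = train,
# 1 = validation, 2 = test).
loader = DialogLoader(  # hypothetical name for the class defining __init__ above
    data_dir='data/dialog-bAbI-tasks/',  # hypothetical path
    task_id=3, train=2, nn=False)
print(len(loader.S), 'test stories vectorized')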
                    sentence_size) + 5  # add some space for testing data
memory_size = min(FLAGS.memory_size, max_story_size)
# vectorize data
trainS, trainQ, trainA, trainID = utils.vectorize_data(
    train, word_idx, sentence_size, FLAGS.batch_size, memory_size, cand_idx)
valS, valQ, valA, valID = utils.vectorize_data(
    val, word_idx, sentence_size, FLAGS.batch_size, memory_size, cand_idx)
testS, testQ, testA, testID = utils.vectorize_data(
    test, word_idx, sentence_size, FLAGS.batch_size, memory_size, cand_idx)
C, cand_idx, idx_cand = utils.vectorize_candidates(
    cand_idx, idx_cand, word_idx, sentence_size)
# params
n_train = np.array(trainS).shape[0]
n_test = np.array(testS).shape[0]
n_val = np.array(valS).shape[0]
tf.set_random_seed(FLAGS.random_state)
batch_size = FLAGS.batch_size
batches = zip(range(0, n_train - batch_size, batch_size),
              range(batch_size, n_train, batch_size))
batches = [(start, end) for start, end in batches]
print("input data example: ", train[5]['utter_list'][0])
print("overall bot utterance candidates: ", len(cand_idx))
print('vocab_size', vocab_size)
print("Longest sentence length", sentence_size)
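
# A standalone sketch of the (start, end) batch-boundary pattern above:
# zipping two offset ranges yields half-open index windows over the data.
n_train, batch_size = 100, 32
batches = list(zip(range(0, n_train - batch_size, batch_size),
                   range(batch_size, n_train, batch_size)))
print(batches)  # [(0, 32), (32, 64), (64, 96)] -- the tail (< 32 examples) is dropped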
def __init__(self, data_dir, model_dir, task_id, isInteractive=True, OOV=False,
             memory_size=250, random_state=None, batch_size=32,
             learning_rate=0.001, epsilon=1e-8, max_grad_norm=40.0,
             evaluation_interval=10, hops=3, epochs=200, embedding_size=20,
             alpha=0.5, save_vocab=None, load_vocab=None, verbose=False,
             load_profiles=None, save_profiles=None):
    self.data_dir = data_dir
    self.task_id = task_id
    self.model_dir = model_dir
    # self.isTrain=isTrain
    self.isInteractive = isInteractive
    self.OOV = OOV
    self.memory_size = memory_size
    self.random_state = random_state
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epsilon = epsilon
    self.max_grad_norm = max_grad_norm
    self.evaluation_interval = evaluation_interval
    self.hops = hops
    self.epochs = epochs
    self.embedding_size = embedding_size
    self.save_vocab = save_vocab
    self.load_vocab = load_vocab
    self.verbose = verbose
    self.alpha = alpha
    # Loading possible answers
    self.candidates, self.candid2indx = load_candidates(
        self.data_dir, self.task_id)
    self.n_cand = len(self.candidates)
    print("Candidate Size", self.n_cand)
    self.indx2candid = dict(
        (self.candid2indx[key], key) for key in self.candid2indx)
    # task data
    self.trainData, self.testData, self.valData = load_dialog_task(
        self.data_dir, self.task_id, self.candid2indx, self.OOV)
    data = self.trainData + self.testData + self.valData
    # Find profile types
    if load_profiles:
        with open(load_profiles, 'rb') as f:
            self._profiles_mapping = pickle.load(f)
    else:
        self._profiles_mapping = generate_profile_encoding(self.trainData)
    if save_profiles:
        with open(save_profiles, 'wb') as f:
            pickle.dump(self._profiles_mapping, f)
    profiles_idx_set = set(self._profiles_mapping.values())
    print("Profiles:", self._profiles_mapping)
    # Vocabulary
    self.build_vocab(data, self.candidates, self.save_vocab, self.load_vocab)
    # self.candidates_vec = vectorize_candidates_sparse(self.candidates, self.word_idx)
    self.candidates_vec = vectorize_candidates(
        self.candidates, self.word_idx, self.candidate_sentence_size)
    # Model initialisation
    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate,
                                       epsilon=self.epsilon)
    self.sess = tf.Session()
    self.model = MemN2NDialog(self.batch_size, self.vocab_size, self.n_cand,
                              self.sentence_size, self.embedding_size,
                              self.candidates_vec, profiles_idx_set,
                              session=self.sess, hops=self.hops,
                              max_grad_norm=self.max_grad_norm, alpha=alpha,
                              optimizer=optimizer, task_id=task_id,
                              verbose=verbose)
    self.saver = tf.train.Saver(max_to_keep=50)
    # self.summary_writer = tf.train.SummaryWriter(self.model.root_dir, self.model.graph_output.graph)
    self.summary_writer = tf.summary.FileWriter(
        self.model.root_dir, self.model.graph_output.graph)
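
# A minimal sketch of the profile-mapping persistence used above: pickle
# round-trips the dict so later runs can reuse the same profile indices.
# Toy mapping and a hypothetical file path.
import pickle

profiles_mapping = {'male_young': 0, 'female_elderly': 1}  # toy mapping
with open('profiles.pkl', 'wb') as f:                      # hypothetical path
    pickle.dump(profiles_mapping, f)
with open('profiles.pkl', 'rb') as f:
    assert pickle.load(f) == profiles_mapping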