class Runner:
    """Train and evaluate the charge-prediction model named by ``args['model_arch']``.

    All configuration is read from, and partly written back to, the
    module-level ``args`` dictionary.
    """

    # Architectures evaluated as multi-label predictors (exact match + F1).
    _MULTI_LABEL_ARCHS = ('lstmiterib', 'lstmgrid', 'lstmgmib')
    # GAN variants that also pin ``args['task']``.
    _GAN_TASKS = {'lstmibgan_law': 'law', 'lstmibgan_toi': 'toi'}

    def __init__(self):
        """Build the checkpoint path from ``args['rootDir']`` and ``args['model_arch']``."""
        self.model_path = (args['rootDir'] + '/chargemodel_'
                           + args['model_arch'] + '.mdl')

    def main(self):
        """Load the 'cail' data, fill in derived ``args`` entries and start training.

        An unrecognised ``args['model_arch']`` loads the data and then does
        nothing further.
        """
        args['datasetsize'] = 'big'
        arch = args['model_arch']

        # Architecture-specific overrides applied before the data is loaded.
        if arch == 'lstmgrid':
            args['batchSize'] = 64
        elif arch == 'lstmibgan' or arch in self._GAN_TASKS:
            args['classify_type'] = 'single'
            args['batchSize'] = 64
            if arch in self._GAN_TASKS:
                args['task'] = self._GAN_TASKS[arch]

        self.textData = TextData('cail')
        self.start_token = self.textData.word2index['START_TOKEN']
        self.end_token = self.textData.word2index['END_TOKEN']
        vocab_size = self.textData.getVocabularySize()
        args['vocabularySize'] = vocab_size

        if arch == 'lstmibgan_law':
            args['chargenum'] = self.textData.getLawNum()
        elif arch == 'lstmibgan_toi':
            args['chargenum'] = 11
        else:
            args['chargenum'] = self.textData.getChargeNum()
        print(vocab_size)

        # arch -> (banner, lazy class lookup). The lambdas defer the name
        # lookup so only the selected model class has to be importable.
        builders = {
            'lstm': ('Using LSTM model.', lambda: LSTM_Model),
            'lstmatt': ('Using LSTM attention model.',
                        lambda: LSTM_att_Model),
            'transformer': ('Using Transformer model.',
                            lambda: TransformerModel),
            'lstmib': ('Using LSTM information bottleneck model.',
                       lambda: LSTM_IB_Model),
            'lstmibcp': ('Using LSTM information bottleneck model. '
                         '-- complete words',
                         lambda: LSTM_IB_CP_Model),
            'lstmcapib': ('Using LSTM capsule information bottleneck model.',
                          lambda: LSTM_capsule_IB_Model),
            'lstmiterib': ('Using LSTM iteratively information bottleneck '
                           'model.',
                           lambda: LSTM_iterIB_Model),
            'lstmcap': ('Using LSTM capsule model.',
                        lambda: LSTM_capsule_Model),
            'lstmgrid': ('Using LSTM grid model.', lambda: LSTM_grid_Model),
            'lstmgmib': ('Using LSTM Gaussian Mixture IB model.',
                         lambda: LSTM_GMIB_Model),
        }

        if arch in builders:
            banner, resolve_class = builders[arch]
            print(banner)
            model_class = resolve_class()
            self.model = model_class(self.textData.word2index,
                                     self.textData.index2word)
            if arch == 'lstmgmib':
                # Only this architecture is moved explicitly here.
                self.model = self.model.to(args['device'])
            self.train()
        elif arch.startswith('lstmibgan'):
            # NOTE(review): args['task'] is only assigned above for the
            # '_law' / '_toi' variants; plain 'lstmibgan' relies on it being
            # configured elsewhere - confirm.
            print('Using LSTM information bottleneck GAN model. Task: '
                  + args['task'])
            # NOTE(review): torch.load unpickles a whole object from disk;
            # only load language-model files you trust.
            LM = torch.load(
                args['rootDir'] + '/LM' + args['datasetsize'] + '.pkl',
                map_location=args['device'])
            # Freeze the language model's parameters.
            for param in LM.parameters():
                param.requires_grad = False
            LSTM_IB_GAN.train(self.textData, LM)

    def _batch_inputs(self, batch):
        """Return the model input dict (tensors on ``args['device']``) for *batch*."""
        return {
            'enc_input': torch.LongTensor(batch.encoderSeqs).to(args['device']),
            'enc_len': batch.encoder_lens,
            'labels': torch.LongTensor(batch.label).to(args['device']),
        }

    def train(self, print_every=10000, plot_every=10, learning_rate=0.001):
        """Optimise ``self.model`` with Adam for ``args['numEpochs']`` epochs.

        After every epoch the 'test' split is scored through :meth:`test` and
        the model is saved to ``self.model_path`` whenever the score improves.

        Args:
            print_every: number of steps between progress lines.
            plot_every: number of steps between entries of the smoothed loss
                history.
            learning_rate: Adam learning rate.
        """
        start = time.time()
        arch = args['model_arch']
        multi_label = arch in self._MULTI_LABEL_ARCHS
        is_gmib = arch == 'lstmgmib'

        plot_losses = []
        print_loss_total = 0  # reset every print_every steps
        plot_loss_total = 0  # reset every plot_every steps
        print_littleloss_total = 0

        print(type(self.textData.word2index))
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate,
                               eps=1e-3, amsgrad=True)

        step = 1  # global step counter (the original shadowed builtin `iter`)
        batches = self.textData.getBatches()
        n_iters = len(batches)
        print('niters ', n_iters)

        args['trainseq2seq'] = False
        max_accu = -1

        for epoch in range(args['numEpochs']):
            losses = []
            for batch in batches:
                optimizer.zero_grad()
                x = self._batch_inputs(batch)
                if not multi_label:
                    # Single-label models train on the first label column.
                    x['labels'] = x['labels'][:, 0]

                if is_gmib:
                    loss, littleloss = self.model(x)
                    print_littleloss_total += littleloss.data
                else:
                    loss = self.model(x)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               args['clip'])
                optimizer.step()

                print_loss_total += loss.data
                plot_loss_total += loss.data
                losses.append(loss.data)

                if step % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    print_littleloss_avg = print_littleloss_total / print_every
                    print_littleloss_total = 0
                    progress = (
                        timeSince(start,
                                  step / (n_iters * args['numEpochs'])),
                        step, step / n_iters * 100, print_loss_avg)
                    if is_gmib:
                        print('%s (%d %d%%) %.4f ' % progress, end='')
                        print(print_littleloss_avg)
                    else:
                        print('%s (%d %d%%) %.4f' % progress)

                if step % plot_every == 0:
                    plot_losses.append(plot_loss_total / plot_every)
                    plot_loss_total = 0

                step += 1

            outcome = self.test('test', max_accu)
            if multi_label:
                # (accuracy, EM, p, r, acc, F_macro, F_micro, S); the exact
                # match ratio (index 1) is the model-selection score.
                accuracy, extras, score = outcome[0], outcome[1:], outcome[1]
            else:
                accuracy, extras, score = outcome, (), outcome

            if score > max_accu or max_accu == -1:
                print('accuracy = ', score, '>= min_accuracy(', max_accu,
                      '), saving model...')
                torch.save(self.model, self.model_path)
                max_accu = accuracy

            print('Epoch ', epoch, 'loss = ', sum(losses) / len(losses),
                  'Valid accuracy = ', accuracy, *extras,
                  'max accuracy=', max_accu)

    @staticmethod
    def _f_scores(tp_c, fp_c, fn_c):
        """Return ``(F_macro, F_micro)`` computed from per-class TP/FP/FN counts."""
        # Classes with an empty denominator give 0/0 = NaN and are skipped by
        # np.nanmean, so the resulting warnings are expected and silenced.
        # NOTE(review): a class with gold labels but zero true positives also
        # ends up NaN (P = R = 0) and is therefore left out of F_macro rather
        # than counted as 0 - confirm this is intended.
        with np.errstate(divide='ignore', invalid='ignore'):
            precision_c = tp_c / (tp_c + fp_c)
            recall_c = tp_c / (tp_c + fn_c)
            f_c = 2 * precision_c * recall_c / (precision_c + recall_c)
            f_macro = np.nanmean(f_c)

            tp, fp, fn = np.sum(tp_c), np.sum(fp_c), np.sum(fn_c)
            p_micro = tp / (tp + fp)
            r_micro = tp / (tp + fn)
            f_micro = 2 * p_micro * r_micro / (p_micro + r_micro)
        return f_macro, f_micro

    def _write_error_cases(self, cases):
        """Overwrite the per-architecture error file with the misclassified *cases*."""
        path = (args['rootDir'] + '/error_case_' + args['model_arch']
                + '.txt')
        # The context manager closes the file; no explicit close() is needed.
        with open(path, 'w') as wh:
            for seq, gold, predicted in cases:
                wh.write(''.join(
                    [self.textData.index2word[wid] for wid in seq]))
                wh.write('\t')
                wh.write(self.textData.lawinfo['i2c'][int(gold)])
                wh.write('\t')
                wh.write(self.textData.lawinfo['i2c'][int(predicted)])
                wh.write('\n')

    def test(self, datasetname, max_accuracy, eps=1e-20):
        """Score ``self.model`` on the batches of split *datasetname*.

        Args:
            datasetname: split name handed to ``self.textData.getBatches``.
            max_accuracy: best accuracy so far; the error-case file is
                rewritten only when this run beats it.
            eps: unused; kept so existing callers keep working.

        Returns:
            For the multi-label architectures the tuple
            ``(accuracy, exact_match_ratio, p, r, acc, F_macro, F_micro, S)``,
            where ``p``, ``r`` and ``acc`` are always 0.0 in this
            implementation; otherwise the accuracy alone.
        """
        arch = args['model_arch']
        multi_label = arch in self._MULTI_LABEL_ARCHS
        n_classes = args['chargenum']

        right = 0
        total = 0
        dset = []  # (token ids, gold label, predicted label) of each error
        exact_match = 0
        p = 0.0
        r = 0.0
        acc = 0.0
        TP_c = np.zeros(n_classes)
        FP_c = np.zeros(n_classes)
        FN_c = np.zeros(n_classes)

        with torch.no_grad():
            preview_shown = False
            for batch in self.textData.getBatches(datasetname):
                x = self._batch_inputs(batch)

                if multi_label:
                    answer = self.model.predict(x).cpu().numpy()
                    # Turn the label lists into a multi-hot matrix; the two
                    # extra one-hot columns are dropped before reducing.
                    # (assumes they are padding/sentinel ids - TODO confirm)
                    y = F.one_hot(torch.LongTensor(batch.label),
                                  num_classes=n_classes + 2)
                    y = y[:, :, :n_classes]
                    y, _ = torch.max(y, dim=1)
                    y = y.bool().numpy()

                    exact_match += ((answer == y).sum(axis=1)
                                    == n_classes).sum()
                    total += answer.shape[0]
                    TP_c += ((answer == True) & (answer == y)).sum(axis=0)
                    FP_c += ((answer == True) & (y == False)).sum(axis=0)
                    FN_c += ((answer == False) & (y == True)).sum(axis=0)
                    right = exact_match
                    continue

                output_probs, output_labels = self.model.predict(x)
                if arch in ('lstmib', 'lstmibcp', 'lstmcapib'):
                    output_labels, sampled_words, wordsamplerate = \
                        output_labels
                    if not preview_shown:
                        # Print the words kept for the first sample, once.
                        preview_shown = True
                        if arch == 'lstmcapib':
                            kept = (choice == 1 for choice in
                                    sampled_words[0, output_labels[0], :])
                        else:
                            kept = (choice[1] == 1
                                    for choice in sampled_words[0])
                        for w, keep in zip(batch.encoderSeqs[0], kept):
                            if keep:
                                print(self.textData.index2word[w], end='')
                        print('sample rate: ', wordsamplerate[0])

                batch_correct = (output_labels.cpu().numpy()
                                 == torch.LongTensor(batch.label).cpu().numpy())
                right += sum(batch_correct)
                total += x['enc_input'].size()[0]
                for ind, c in enumerate(batch_correct):
                    if not c:
                        dset.append((batch.encoderSeqs[ind],
                                     batch.label[ind], output_labels[ind]))

        accuracy = right / total

        if accuracy > max_accuracy:
            self._write_error_cases(dset)

        if multi_label:
            F_macro, F_micro = self._f_scores(TP_c, FP_c, FN_c)
            S = 100 * (F_macro + F_micro) / 2
            return (accuracy, exact_match / total, p, r, acc,
                    F_macro, F_micro, S)
        return accuracy

    def indexesFromSentence(self, sentence):
        """Map each word of *sentence* to its vocabulary id, using 'UNK' for unknown words."""
        vocab = self.textData.word2index
        unk = vocab['UNK']
        return [vocab[word] if word in vocab else unk for word in sentence]

    def tensorFromSentence(self, sentence):
        """Return the word ids of *sentence* as a ``(len, 1)`` long tensor on ``args['device']``."""
        indexes = self.indexesFromSentence(sentence)
        # The rest of the class addresses the device through args['device'];
        # the bare `device` name used here before is not defined by this class.
        return torch.tensor(indexes, dtype=torch.long,
                            device=args['device']).view(-1, 1)

    def evaluate(self, sentence, correctlabel, max_length=20):
        """Run ``self.model.predict`` on one sentence.

        Args:
            sentence: iterable of words.
            correctlabel: gold label, passed to the model as ``x['labels']``.
            max_length: unused; kept for interface compatibility.

        Returns:
            The first two values returned by ``self.model.predict(x, True)``
            (decoded words and label).
        """
        with torch.no_grad():
            input_tensor = self.tensorFromSentence(sentence)
            x = {
                # (len, 1) -> (1, len): a batch holding this one sentence.
                'enc_input': torch.transpose(input_tensor, 0, 1),
                'enc_len': [input_tensor.size()[0]],
                'labels': [correctlabel],
            }
            decoded_words, label, _ = self.model.predict(x, True)
        return decoded_words, label

    def evaluateRandomly(self, n=10):
        """Print the model output for *n* random samples of the training set."""
        for _ in range(n):
            sample = random.choice(self.textData.datasets['train'])
            print('>', sample)
            output_words, label = self.evaluate(sample[2], sample[1])
            output_sentence = ' '.join(output_words[0])  # batch of one
            print('<', output_sentence, label)
            print('')
class Runner:
    """Train and evaluate the charge-prediction model named by ``args['model_arch']``.

    All configuration is read from, and partly written back to, the
    module-level ``args`` dictionary.
    """

    # Architectures evaluated as multi-label predictors.
    _MULTI_LABEL_ARCHS = ('lstmiterib', 'lstmgrid', 'lstmgmib')

    def __init__(self):
        """Build the checkpoint path from ``args['rootDir']`` and ``args['model_arch']``."""
        self.model_path = (args['rootDir'] + '/chargemodel_'
                           + args['model_arch'] + '.mdl')

    def main(self):
        """Load the 'cail' data, fill in derived ``args`` entries and start training.

        An unrecognised ``args['model_arch']`` loads the data and then does
        nothing further.
        """
        args['batchSize'] = 32
        self.textData = TextData('cail')
        self.start_token = self.textData.word2index['START_TOKEN']
        self.end_token = self.textData.word2index['END_TOKEN']
        vocab_size = self.textData.getVocabularySize()
        args['vocabularySize'] = vocab_size
        args['chargenum'] = self.textData.getChargeNum()
        print(vocab_size)

        arch = args['model_arch']
        # arch -> (banner, lazy class lookup). The lambdas defer the name
        # lookup so only the selected model class has to be importable.
        builders = {
            'lstm': ('Using LSTM model.', lambda: LSTM_Model),
            'lstmatt': ('Using LSTM attention model.',
                        lambda: LSTM_att_Model),
            'transformer': ('Using Transformer model.',
                            lambda: TransformerModel),
            'lstmib': ('Using LSTM information bottleneck model.',
                       lambda: LSTM_IB_Model),
            'lstmibcp': ('Using LSTM information bottleneck model. '
                         '-- complete words',
                         lambda: LSTM_IB_CP_Model),
            'lstmcapib': ('Using LSTM capsule information bottleneck model.',
                          lambda: LSTM_capsule_IB_Model),
            'lstmiterib': ('Using LSTM iteratively information bottleneck '
                           'model.',
                           lambda: LSTM_iterIB_Model),
            'lstmcap': ('Using LSTM capsule model.',
                        lambda: LSTM_capsule_Model),
            'lstmgrid': ('Using LSTM grid model.', lambda: LSTM_grid_Model),
            'lstmgmib': ('Using LSTM Gaussian Mixture IB model.',
                         lambda: LSTM_GMIB_Model),
        }

        if arch == 'lstmibgan':
            print('Using LSTM information bottleneck GAN model.')
            LSTM_IB_GAN.train(self.textData)
        elif arch in builders:
            banner, resolve_class = builders[arch]
            print(banner)
            model_class = resolve_class()
            self.model = model_class(self.textData.word2index,
                                     self.textData.index2word)
            if arch == 'lstmgmib':
                self.model = nn.parallel.DataParallel(self.model)
                # The device used to be selected, and the model moved, only
                # after train() had returned. DataParallel needs the wrapped
                # module on its device before the first forward pass, so both
                # steps now happen before training.
                # NOTE(review): this overrides any args['device'] configured
                # earlier - confirm that is intended.
                args['device'] = ("cuda:0" if torch.cuda.is_available()
                                  else "cpu")
                self.model.to(args['device'])
            self.train()

    def _predictor(self):
        """Return the object exposing ``predict``.

        ``DataParallel`` does not forward custom methods such as ``predict``,
        so the wrapped module is returned when the model is wrapped.
        """
        model = self.model
        if isinstance(model, nn.parallel.DataParallel):
            return model.module
        return model

    def train(self, print_every=10000, plot_every=10, learning_rate=0.001):
        """Optimise ``self.model`` with Adam for ``args['numEpochs']`` epochs.

        After every epoch the 'test' split is scored through :meth:`test` and
        the model is saved to ``self.model_path`` whenever accuracy improves.

        Args:
            print_every: number of steps between progress lines.
            plot_every: number of steps between entries of the smoothed loss
                history.
            learning_rate: Adam learning rate.
        """
        start = time.time()
        arch = args['model_arch']
        multi_label = arch in self._MULTI_LABEL_ARCHS
        is_gmib = arch == 'lstmgmib'

        plot_losses = []
        print_loss_total = 0  # reset every print_every steps
        plot_loss_total = 0  # reset every plot_every steps
        print_littleloss_total = 0

        print(type(self.textData.word2index))
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate,
                               eps=1e-3, amsgrad=True)

        step = 1  # global step counter (the original shadowed builtin `iter`)
        batches = self.textData.getBatches()
        n_iters = len(batches)
        print('niters ', n_iters)

        args['trainseq2seq'] = False
        max_accu = -1

        for epoch in range(args['numEpochs']):
            losses = []
            for batch in batches:
                optimizer.zero_grad()
                x = {
                    'enc_input': torch.LongTensor(
                        batch.encoderSeqs).to(args['device']),
                    # A tensor here (not a list) during training.
                    'enc_len': torch.LongTensor(
                        batch.encoder_lens).to(args['device']),
                    'labels': torch.LongTensor(
                        batch.label).to(args['device']),
                }
                if not multi_label:
                    # Single-label models train on the first label column.
                    x['labels'] = x['labels'][:, 0]

                if is_gmib:
                    loss, littleloss = self.model(x)
                    print_littleloss_total += littleloss.data
                else:
                    loss = self.model(x)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               args['clip'])
                optimizer.step()

                print_loss_total += loss.data
                plot_loss_total += loss.data
                losses.append(loss.data)

                if step % print_every == 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    print_littleloss_avg = print_littleloss_total / print_every
                    print_littleloss_total = 0
                    progress = (
                        timeSince(start,
                                  step / (n_iters * args['numEpochs'])),
                        step, step / n_iters * 100, print_loss_avg)
                    if is_gmib:
                        print('%s (%d %d%%) %.4f ' % progress, end='')
                        print(print_littleloss_avg)
                    else:
                        print('%s (%d %d%%) %.4f' % progress)

                if step % plot_every == 0:
                    plot_losses.append(plot_loss_total / plot_every)
                    plot_loss_total = 0

                step += 1

            outcome = self.test('test', max_accu)
            if multi_label:
                # (accuracy, EM, p, r, acc)
                accuracy, extras = outcome[0], outcome[1:]
            else:
                accuracy, extras = outcome, ()

            if accuracy > max_accu or max_accu == -1:
                print('accuracy = ', accuracy, '>= min_accuracy(', max_accu,
                      '), saving model...')
                torch.save(self.model, self.model_path)
                max_accu = accuracy

            print('Epoch ', epoch, 'loss = ', sum(losses) / len(losses),
                  'Valid accuracy = ', accuracy, *extras,
                  'max accuracy=', max_accu)

    def _write_error_cases(self, cases):
        """Overwrite the per-architecture error file with the misclassified *cases*."""
        path = (args['rootDir'] + '/error_case_' + args['model_arch']
                + '.txt')
        # The context manager closes the file; no explicit close() is needed.
        with open(path, 'w') as wh:
            for seq, gold, predicted in cases:
                wh.write(''.join(
                    [self.textData.index2word[wid] for wid in seq]))
                wh.write('\t')
                wh.write(self.textData.lawinfo['i2c'][int(gold)])
                wh.write('\t')
                wh.write(self.textData.lawinfo['i2c'][int(predicted)])
                wh.write('\n')

    def test(self, datasetname, max_accuracy):
        """Score ``self.model`` on the batches of split *datasetname*.

        Args:
            datasetname: split name handed to ``self.textData.getBatches``.
            max_accuracy: best accuracy so far; the error-case file is
                rewritten only when this run beats it.

        Returns:
            For the multi-label architectures the tuple
            ``(accuracy, exact_match_ratio, p, r, acc)`` where ``accuracy``
            counts samples whose first predicted label equals the first gold
            label and ``p``/``r``/``acc`` are running means of per-sample
            precision, recall and Jaccard overlap; otherwise the accuracy
            alone.
        """
        arch = args['model_arch']
        multi_label = arch in self._MULTI_LABEL_ARCHS
        predictor = self._predictor()

        right = 0
        total = 0
        dset = []  # (token ids, gold label, predicted label) of each error
        exact_match = 0
        p = 0.0
        r = 0.0
        acc = 0.0

        with torch.no_grad():
            preview_shown = False
            for batch in self.textData.getBatches(datasetname):
                x = {
                    # Moved to the same device train() feeds the model from.
                    'enc_input': torch.LongTensor(
                        batch.encoderSeqs).to(args['device']),
                    'enc_len': batch.encoder_lens,
                }

                if multi_label:
                    answerlist = predictor.predict(x)
                    for anses, gold in zip(answerlist, batch.label):
                        anses = [int(ele) for ele in anses]
                        if anses[0] == gold[0]:
                            right += 1
                        # Gold labels run up to the first args['chargenum']
                        # entry, which acts as the end marker.
                        goldlist = list(gold[:gold.index(args['chargenum'])])
                        predicted_set = set(anses)
                        gold_set = set(goldlist)
                        intersect_size = len(predicted_set & gold_set)
                        joint_size = len(predicted_set | gold_set)
                        if intersect_size == joint_size:
                            exact_match += 1
                        # Incremental means over the samples seen so far.
                        acc = ((acc * total + intersect_size / joint_size)
                               / (total + 1))
                        p = ((p * total + intersect_size / len(anses))
                             / (total + 1))
                        r = ((r * total + intersect_size / len(goldlist))
                             / (total + 1))
                        total += 1
                    continue

                output_probs, output_labels = predictor.predict(x)
                if arch in ('lstmib', 'lstmibcp', 'lstmcapib'):
                    output_labels, sampled_words, wordsamplerate = \
                        output_labels
                    if not preview_shown:
                        # Print the words kept for the first sample, once.
                        preview_shown = True
                        if arch == 'lstmcapib':
                            kept = (choice == 1 for choice in
                                    sampled_words[0, output_labels[0], :])
                        else:
                            kept = (choice[1] == 1
                                    for choice in sampled_words[0])
                        for w, keep in zip(batch.encoderSeqs[0], kept):
                            if keep:
                                print(self.textData.index2word[w], end='')
                        print('sample rate: ', wordsamplerate[0])

                batch_correct = (output_labels.cpu().numpy()
                                 == torch.LongTensor(batch.label).cpu().numpy())
                right += sum(batch_correct)
                total += x['enc_input'].size()[0]
                for ind, c in enumerate(batch_correct):
                    if not c:
                        dset.append((batch.encoderSeqs[ind],
                                     batch.label[ind], output_labels[ind]))

        accuracy = right / total

        if accuracy > max_accuracy:
            self._write_error_cases(dset)

        if multi_label:
            return accuracy, exact_match / total, p, r, acc
        return accuracy

    def indexesFromSentence(self, sentence):
        """Map each word of *sentence* to its vocabulary id, using 'UNK' for unknown words."""
        vocab = self.textData.word2index
        unk = vocab['UNK']
        return [vocab[word] if word in vocab else unk for word in sentence]

    def tensorFromSentence(self, sentence):
        """Return the word ids of *sentence* as a ``(len, 1)`` long tensor on ``args['device']``."""
        indexes = self.indexesFromSentence(sentence)
        # The rest of the class addresses the device through args['device'];
        # the bare `device` name used here before is not defined by this class.
        return torch.tensor(indexes, dtype=torch.long,
                            device=args['device']).view(-1, 1)

    def evaluate(self, sentence, correctlabel, max_length=20):
        """Run the model's ``predict`` on one sentence.

        Args:
            sentence: iterable of words.
            correctlabel: gold label, passed to the model as ``x['labels']``.
            max_length: unused; kept for interface compatibility.

        Returns:
            The first two values returned by ``predict(x, True)`` (decoded
            words and label).
        """
        with torch.no_grad():
            input_tensor = self.tensorFromSentence(sentence)
            x = {
                # (len, 1) -> (1, len): a batch holding this one sentence.
                'enc_input': torch.transpose(input_tensor, 0, 1),
                'enc_len': [input_tensor.size()[0]],
                'labels': [correctlabel],
            }
            decoded_words, label, _ = self._predictor().predict(x, True)
        return decoded_words, label

    def evaluateRandomly(self, n=10):
        """Print the model output for *n* random samples of the training set."""
        for _ in range(n):
            sample = random.choice(self.textData.datasets['train'])
            print('>', sample)
            output_words, label = self.evaluate(sample[2], sample[1])
            output_sentence = ' '.join(output_words[0])  # batch of one
            print('<', output_sentence, label)
            print('')
class Chatbot:
    """Minimal PyTorch sequence-to-sequence trainer with a hard-coded configuration."""

    def __init__(self):
        """Populate ``self.args`` with the built-in defaults."""
        self.args = self.predefined_args()

    def predefined_args(self):
        """Return a fresh dict holding the default hyper-parameters and options."""
        max_length = 50
        return {
            'test': None,
            'createDataset': True,
            'playDataset': 10,
            'reset': True,
            'device': 'gpu',
            'rootDir': '/home/v-leisha/DeepQA_pytorch/',
            'watsonMode': False,
            'autoEncode': False,
            'corpus': 'cornell',
            'datasetTag': '',
            'ratioDataset': 1.0,
            'maxLength': max_length,
            'filterVocab': 1,
            'skipLines': True,
            'vocabularySize': 40000,
            'hiddenSize': 200,
            'numLayers': 2,
            'softmaxSamples': 0,
            'initEmbeddings': True,
            'embeddingSize': 120,
            'embeddingSource': "GoogleNews-vectors-negative300.bin",
            'numEpochs': 30,
            'saveEvery': 2000,
            'batchSize': 256,
            'learningRate': 0.002,
            'dropout': 0.9,
            'encunit': 'lstm',
            'decunit': 'lstm',
            'enc_numlayer': 2,
            'dec_numlayer': 2,
            'maxLengthEnco': max_length,
            # The decoder side is two positions longer than the encoder side.
            'maxLengthDeco': max_length + 2,
        }

    def main(self):
        """Load the dataset, build the model and run one training pass."""
        self.textData = TextData(self.args)
        vocab_size = self.textData.getVocabularySize()
        self.args['vocabularySize'] = vocab_size
        print(vocab_size)
        self.model = Model(self.args)
        self.train()

    def train(self, print_every=10, plot_every=10, learning_rate=0.01):
        """Run one SGD pass over ``self.textData.getBatches()`` and plot the loss.

        Args:
            print_every: number of steps between progress lines.
            plot_every: number of steps between points of the plotted curve.
            learning_rate: SGD learning rate.
        """
        start = time.time()
        plot_losses = []
        print_loss_total = 0  # reset every print_every steps
        plot_loss_total = 0  # reset every plot_every steps

        optimizer = optim.SGD(self.model.parameters(), lr=learning_rate)

        batches = self.textData.getBatches()
        n_iters = len(batches)

        for step, batch in enumerate(batches, start=1):
            print(step)
            # Gradients accumulate across backward() calls, so they must be
            # cleared before every batch.
            optimizer.zero_grad()

            x = {
                'enc_input': torch.LongTensor(batch.encoderSeqs),
                'enc_len': batch.encoder_lens,
                'dec_input': torch.LongTensor(batch.decoderSeqs),
                'dec_len': batch.decoder_lens,
                'dec_target': torch.LongTensor(batch.targetSeqs),
            }

            predictions = self.model(x)
            target_variable = x['dec_target']

            # Pick the score of each target token along the last axis. The
            # gathered tensor keeps a trailing size-1 axis that has to be
            # dropped; otherwise multiplying by the 2-D mask broadcasts to
            # the wrong shape (or raises).
            # (assumes predictions hold log-probabilities - TODO confirm)
            token_loss = -torch.gather(
                predictions, 2, torch.unsqueeze(target_variable, 2)).squeeze(2)
            # Positions whose target id is 0 contribute nothing.
            # (presumably 0 is the padding id - verify against TextData)
            mask = torch.sign(target_variable.float())
            loss_mean = torch.mean(token_loss * mask)

            loss_mean.backward()
            optimizer.step()

            # .item() extracts the Python float; indexing a 0-dim tensor with
            # [0] is rejected by current PyTorch.
            batch_loss = loss_mean.item()
            print_loss_total += batch_loss
            plot_loss_total += batch_loss

            if step % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, step / n_iters),
                                             step, step / n_iters * 100,
                                             print_loss_avg))

            if step % plot_every == 0:
                plot_losses.append(plot_loss_total / plot_every)
                plot_loss_total = 0

        showPlot(plot_losses)
class Chatbot: """ Main class which launch the training or testing mode """ class TestMode: """ Simple structure representing the different testing modes """ ALL = 'all' INTERACTIVE = 'interactive' # The user can write his own questions DAEMON = 'daemon' # The chatbot runs on background and can regularly be called to predict something def __init__(self): """ """ # Model/dataset parameters self.args = None # Task specific object self.textData = None # Dataset self.model = None # Sequence to sequence model # Tensorflow utilities for convenience saving/logging self.writer = None self.saver = None self.modelDir = '' # Where the model is saved self.globStep = 0 # Represent the number of iteration for the current model # TensorFlow main session (we keep track for the daemon) self.sess = None # Filename and directories constants self.MODEL_DIR_BASE = 'save/model' self.MODEL_NAME_BASE = 'model' self.MODEL_EXT = '.ckpt' self.CONFIG_FILENAME = 'params.ini' self.CONFIG_VERSION = '0.5' self.TEST_IN_NAME = 'data/test/samples.txt' self.TEST_OUT_SUFFIX = '_predictions.txt' self.SENTENCES_PREFIX = ['Q: ', 'A: '] @staticmethod def parseArgs(args): """ Parse the arguments from the given command line Args: args (list<str>): List of arguments to parse. 
If None, the default sys.argv will be parsed """ parser = argparse.ArgumentParser() # Global options globalArgs = parser.add_argument_group('Global options') globalArgs.add_argument('--test', nargs='?', choices=[Chatbot.TestMode.ALL, Chatbot.TestMode.INTERACTIVE, Chatbot.TestMode.DAEMON], const=Chatbot.TestMode.ALL, default=None, help='if present, launch the program try to answer all sentences from data/test/ with' ' the defined model(s), in interactive mode, the user can wrote his own sentences,' ' use daemon mode to integrate the chatbot in another program') globalArgs.add_argument('--createDataset', action='store_true', help='if present, the program will only generate the dataset from the corpus (no training/testing)') globalArgs.add_argument('--playDataset', type=int, nargs='?', const=10, default=None, help='if set, the program will randomly play some samples(can be use conjointly with createDataset if this is the only action you want to perform)') globalArgs.add_argument('--reset', action='store_true', help='use this if you want to ignore the previous model present on the model directory (Warning: the model will be destroyed with all the folder content)') globalArgs.add_argument('--verbose', action='store_true', help='When testing, will plot the outputs at the same time they are computed') globalArgs.add_argument('--debug', action='store_true', help='run DeepQA with Tensorflow debug mode. 
Read TF documentation for more details on this.') globalArgs.add_argument('--keepAll', action='store_true', help='If this option is set, all saved model will be kept (Warning: make sure you have enough free disk space or increase saveEvery)') # TODO: Add an option to delimit the max size globalArgs.add_argument('--modelTag', type=str, default=None, help='tag to differentiate which model to store/load') globalArgs.add_argument('--rootDir', type=str, default=None, help='folder where to look for the models and data') globalArgs.add_argument('--watsonMode', action='store_true', help='Inverse the questions and answer when training (the network try to guess the question)') globalArgs.add_argument('--autoEncode', action='store_true', help='Randomly pick the question or the answer and use it both as input and output') globalArgs.add_argument('--device', type=str, default=None, help='\'gpu\' or \'cpu\' (Warning: make sure you have enough free RAM), allow to choose on which hardware run the model') globalArgs.add_argument('--seed', type=int, default=None, help='random seed for replication') # Dataset options datasetArgs = parser.add_argument_group('Dataset options') datasetArgs.add_argument('--corpus', choices=TextData.corpusChoices(), default=TextData.corpusChoices()[0], help='corpus on which extract the dataset.') datasetArgs.add_argument('--datasetTag', type=str, default='', help='add a tag to the dataset (file where to load the vocabulary and the precomputed samples, not the original corpus). Useful to manage multiple versions. Also used to define the file used for the lightweight format.') # The samples are computed from the corpus if it does not exist already. There are saved in \'data/samples/\' datasetArgs.add_argument('--ratioDataset', type=float, default=1.0, help='ratio of dataset used to avoid using the whole dataset') # Not implemented, useless ? 
datasetArgs.add_argument('--maxLength', type=int, default=10, help='maximum length of the sentence (for input and output), define number of maximum step of the RNN') datasetArgs.add_argument('--filterVocab', type=int, default=1, help='remove rarelly used words (by default words used only once). 0 to keep all words.') datasetArgs.add_argument('--skipLines', action='store_true', help='Generate training samples by only using even conversation lines as questions (and odd lines as answer). Useful to train the network on a particular person.') datasetArgs.add_argument('--vocabularySize', type=int, default=40000, help='Limit the number of words in the vocabulary (0 for unlimited)') # Network options (Warning: if modifying something here, also make the change on save/loadParams() ) nnArgs = parser.add_argument_group('Network options', 'architecture related option') nnArgs.add_argument('--hiddenSize', type=int, default=512, help='number of hidden units in each RNN cell') nnArgs.add_argument('--numLayers', type=int, default=2, help='number of rnn layers') nnArgs.add_argument('--softmaxSamples', type=int, default=0, help='Number of samples in the sampled softmax loss function. 
A value of 0 deactivates sampled softmax') nnArgs.add_argument('--initEmbeddings', action='store_true', help='if present, the program will initialize the embeddings with pre-trained word2vec vectors') nnArgs.add_argument('--embeddingSize', type=int, default=64, help='embedding size of the word representation') nnArgs.add_argument('--embeddingSource', type=str, default="GoogleNews-vectors-negative300.bin", help='embedding file to use for the word representation') # Training options trainingArgs = parser.add_argument_group('Training options') trainingArgs.add_argument('--numEpochs', type=int, default=30, help='maximum number of epochs to run') trainingArgs.add_argument('--saveEvery', type=int, default=2000, help='nb of mini-batch step before creating a model checkpoint') trainingArgs.add_argument('--batchSize', type=int, default=32, help='mini-batch size') trainingArgs.add_argument('--learningRate', type=float, default=0.002, help='Learning rate') trainingArgs.add_argument('--dropout', type=float, default=0.9, help='Dropout rate (keep probabilities)') return parser.parse_args(args) def main(self, args=None): """ Launch the training and/or the interactive mode """ print('Welcome to DeepQA v0.1 !') print() print('TensorFlow detected: v{}'.format(tf.__version__)) # General initialisation self.args = self.parseArgs(args) if not self.args.rootDir: self.args.rootDir = os.getcwd() # Use the current working directory #tf.logging.set_verbosity(tf.logging.INFO) # DEBUG, INFO, WARN (default), ERROR, or FATAL self.loadModelParams() # Update the self.modelDir and self.globStep, for now, not used when loading Model (but need to be called before _getSummaryName) self.textData = TextData(self.args) # TODO: Add a mode where we can force the input of the decoder // Try to visualize the predictions for # each word of the vocabulary / decoder input # TODO: For now, the model are trained for a specific dataset (because of the maxLength which define the # vocabulary). 
Add a compatibility mode which allow to launch a model trained on a different vocabulary ( # remap the word2id/id2word variables). if self.args.createDataset: print('Dataset created! Thanks for using this program') return # No need to go further # Prepare the model with tf.device(self.getDevice()): self.model = Model(self.args, self.textData) # Saver/summaries self.writer = tf.summary.FileWriter(self._getSummaryName()) self.saver = tf.train.Saver(max_to_keep=200) # TODO: Fixed seed (WARNING: If dataset shuffling, make sure to do that after saving the # dataset, otherwise, all which cames after the shuffling won't be replicable when # reloading the dataset). How to restore the seed after loading ?? # Also fix seed for random.shuffle (does it works globally for all files ?) # Running session self.sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, # Allows backup device for non GPU-available operations (when forcing GPU) log_device_placement=False) # Too verbose ? ) # TODO: Replace all sess by self.sess (not necessary a good idea) ? 
if self.args.debug: self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess) self.sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan) print('Initialize variables...') self.sess.run(tf.global_variables_initializer()) # Reload the model eventually (if it exist.), on testing mode, the models are not loaded here (but in predictTestset) if self.args.test != Chatbot.TestMode.ALL: self.managePreviousModel(self.sess) # Initialize embeddings with pre-trained word2vec vectors if self.args.initEmbeddings: self.loadEmbedding(self.sess) if self.args.test: if self.args.test == Chatbot.TestMode.INTERACTIVE: self.mainTestInteractive(self.sess) elif self.args.test == Chatbot.TestMode.ALL: print('Start predicting...') self.predictTestset(self.sess) print('All predictions done') elif self.args.test == Chatbot.TestMode.DAEMON: print('Daemon mode, running in background...') else: raise RuntimeError('Unknown test mode: {}'.format(self.args.test)) # Should never happen else: self.mainTrain(self.sess) if self.args.test != Chatbot.TestMode.DAEMON: self.sess.close() print("The End! Thanks for using this program") def mainTrain(self, sess): """ Training loop Args: sess: The current running session """ # Specific training dependent loading self.textData.makeLighter(self.args.ratioDataset) # Limit the number of training samples mergedSummaries = tf.summary.merge_all() # Define the summary operator (Warning: Won't appear on the tensorboard graph) if self.globStep == 0: # Not restoring from previous run self.writer.add_graph(sess.graph) # First time only # If restoring a model, restore the progression bar ? and current batch ? 
print('Start training (press Ctrl+C to save and exit)...') try: # If the user exit while training, we still try to save the model for e in range(self.args.numEpochs): print() print("----- Epoch {}/{} ; (lr={}) -----".format(e+1, self.args.numEpochs, self.args.learningRate)) batches = self.textData.getBatches() # TODO: Also update learning parameters eventually tic = datetime.datetime.now() for nextBatch in tqdm(batches, desc="Training"): # Training pass encodeSentences, targetSentences, decodeSentences = \ self.model.initStatus(nextBatch) for i in range(self.args.batchSize): encodeSentence = [encodeSentences[j][i] for j in range(self.args.maxLengthEnco)] targetSentence = [targetSentences[j][i] for j in range(self.args.maxLengthDeco)] decodeSentence = [decodeSentences[j][i] for j in range(self.args.maxLengthDeco)] for j in range(2 * self.args.maxLengthDeco): actionIndex, step = self.model.getAction(encodeSentence, decodeSentence) nextDecodeSentence, r = reward.bleuScore(decodeSentence, targetSentence, actionIndex, step) loss = self.model.setPerception(encodeSentence, decodeSentence, nextDecodeSentence, actionIndex, step, r) if loss: if self.globStep % 100 == 0: perplexity = math.exp(float(loss)) if loss < 300 else float("inf") tqdm.write("----- Step %d -- Loss %.6f -- Perplexity %.6f" % ( self.globStep, loss, perplexity)) else: print 'training...' + 'epoch: ' + str(e) + ' loss: ' + str(loss) else: print 'not training...' self.globStep += 1 toc = datetime.datetime.now() print("Epoch finished in {}".format(toc-tic)) # Warning: Will overflow if an epoch takes more than 24 hours, and the output isn't really nicer except (KeyboardInterrupt, SystemExit): # If the user press Ctrl+C while testing progress print('Interruption detected, exiting the program...') self._saveSession(sess) # Ultimate saving before complete exit def predictTestset(self, sess): """ Try predicting the sentences from the samples.txt file. 
The sentences are saved on the modelDir under the same name Args: sess: The current running session """ # Loading the file to predict with open(os.path.join(self.args.rootDir, self.TEST_IN_NAME), 'r') as f: lines = f.readlines() modelList = self._getModelList() if not modelList: print('Warning: No model found in \'{}\'. Please train a model before trying to predict'.format(self.modelDir)) return # Predicting for each model present in modelDir for modelName in sorted(modelList): # TODO: Natural sorting print('Restoring previous model from {}'.format(modelName)) self.saver.restore(sess, modelName) print('Testing...') saveName = modelName[:-len(self.MODEL_EXT)] + self.TEST_OUT_SUFFIX # We remove the model extension and add the prediction suffix with open(saveName, 'w') as f: nbIgnored = 0 for line in tqdm(lines, desc='Sentences'): question = line[:-1] # Remove the endl character answer = self.singlePredict(question) if not answer: nbIgnored += 1 continue # Back to the beginning, try again predString = '{x[0]}{0}\n{x[1]}{1}\n\n'.format(question, self.textData.sequence2str(answer, clean=True), x=self.SENTENCES_PREFIX) if self.args.verbose: tqdm.write(predString) f.write(predString) print('Prediction finished, {}/{} sentences ignored (too long)'.format(nbIgnored, len(lines))) def mainTestInteractive(self, sess): """ Try predicting the sentences that the user will enter in the console Args: sess: The current running session """ # TODO: If verbose mode, also show similar sentences from the training set with the same words (include in mainTest also) # TODO: Also show the top 10 most likely predictions for each predicted output (when verbose mode) # TODO: Log the questions asked for latter re-use (merge with test/samples.txt) print('Testing: Launch interactive mode:') print('') print('Welcome to the interactive mode, here you can ask to Deep Q&A the sentence you want. Don\'t have high ' 'expectation. Type \'exit\' or just press ENTER to quit the program. 
Have fun.') while True: question = input(self.SENTENCES_PREFIX[0]) if question == '' or question == 'exit': break questionSeq = [] # Will be contain the question as seen by the encoder answer = self.singlePredict(question, questionSeq) if not answer: print('Warning: sentence too long, sorry. Maybe try a simpler sentence.') continue # Back to the beginning, try again print('{}{}'.format(self.SENTENCES_PREFIX[1], self.textData.sequence2str(answer, clean=True))) if self.args.verbose: print(self.textData.batchSeq2str(questionSeq, clean=True, reverse=True)) print(self.textData.sequence2str(answer)) print() def singlePredict(self, question, questionSeq=None): """ Predict the sentence Args: question (str): the raw input sentence questionSeq (List<int>): output argument. If given will contain the input batch sequence Return: list <int>: the word ids corresponding to the answer """ # Create the input batch batch = self.textData.sentence2enco(question) if not batch: return None if questionSeq is not None: # If the caller want to have the real input questionSeq.extend(batch.encoderSeqs) # Run the model ops, feedDict = self.model.step(batch) output = self.sess.run(ops[0], feedDict) # TODO: Summarize the output too (histogram, ...) 
answer = self.textData.deco2sentence(output) return answer def daemonPredict(self, sentence): """ Return the answer to a given sentence (same as singlePredict() but with additional cleaning) Args: sentence (str): the raw input sentence Return: str: the human readable sentence """ return self.textData.sequence2str( self.singlePredict(sentence), clean=True ) def daemonClose(self): """ A utility function to close the daemon when finish """ print('Exiting the daemon mode...') self.sess.close() print('Daemon closed.') def loadEmbedding(self, sess): """ Initialize embeddings with pre-trained word2vec vectors Will modify the embedding weights of the current loaded model Uses the GoogleNews pre-trained values (path hardcoded) """ # Fetch embedding variables from model with tf.variable_scope("embedding_rnn_seq2seq/rnn/embedding_wrapper", reuse=True): em_in = tf.get_variable("embedding") with tf.variable_scope("embedding_rnn_seq2seq/embedding_rnn_decoder", reuse=True): em_out = tf.get_variable("embedding") # Disable training for embeddings variables = tf.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES) variables.remove(em_in) variables.remove(em_out) # If restoring a model, we can leave here if self.globStep != 0: return # New model, we load the pre-trained word2vec data and initialize embeddings embeddings_path = os.path.join(self.args.rootDir, 'data', 'embeddings', self.args.embeddingSource) embeddings_format = os.path.splitext(embeddings_path)[1][1:] print("Loading pre-trained word embeddings from %s " % embeddings_path) with open(embeddings_path, "rb") as f: header = f.readline() vocab_size, vector_size = map(int, header.split()) binary_len = np.dtype('float32').itemsize * vector_size initW = np.random.uniform(-0.25,0.25,(len(self.textData.word2id), vector_size)) for line in tqdm(range(vocab_size)): word = [] while True: ch = f.read(1) if ch == b' ': word = b''.join(word).decode('utf-8') break if ch != b'\n': word.append(ch) if word in self.textData.word2id: if 
embeddings_format == 'bin': vector = np.fromstring(f.read(binary_len), dtype='float32') elif embeddings_format == 'vec': vector = np.fromstring(f.readline(), sep=' ', dtype='float32') else: raise Exception("Unkown format for embeddings: %s " % embeddings_format) initW[self.textData.word2id[word]] = vector else: if embeddings_format == 'bin': f.read(binary_len) elif embeddings_format == 'vec': f.readline() else: raise Exception("Unkown format for embeddings: %s " % embeddings_format) # PCA Decomposition to reduce word2vec dimensionality if self.args.embeddingSize < vector_size: U, s, Vt = np.linalg.svd(initW, full_matrices=False) S = np.zeros((vector_size, vector_size), dtype=complex) S[:vector_size, :vector_size] = np.diag(s) initW = np.dot(U[:, :self.args.embeddingSize], S[:self.args.embeddingSize, :self.args.embeddingSize]) # Initialize input and output embeddings sess.run(em_in.assign(initW)) sess.run(em_out.assign(initW)) def managePreviousModel(self, sess): """ Restore or reset the model, depending of the parameters If the destination directory already contains some file, it will handle the conflict as following: * If --reset is set, all present files will be removed (warning: no confirmation is asked) and the training restart from scratch (globStep & cie reinitialized) * Otherwise, it will depend of the directory content. 
If the directory contains: * No model files (only summary logs): works as a reset (restart from scratch) * Other model files, but modelName not found (surely keepAll option changed): raise error, the user should decide by himself what to do * The right model file (eventually some other): no problem, simply resume the training In any case, the directory will exist as it has been created by the summary writer Args: sess: The current running session """ print 'WARNING: ' modelName = self._getModelName() if os.listdir(self.modelDir): if self.args.reset: print('Reset: Destroying previous model at {}'.format(self.modelDir)) # Analysing directory content elif os.path.exists(modelName): # Restore the model print('Restoring previous model from {}'.format(modelName)) self.saver.restore(sess, modelName) # Will crash when --reset is not activated and the model has not been saved yet elif self._getModelList(): print('Conflict with previous models.') raise RuntimeError('Some models are already present in \'{}\'. You should check them first (or re-try with the keepAll flag)'.format(self.modelDir)) else: # No other model to conflict with (probably summary files) print('No previous model found, but some files found at {}. 
Cleaning...'.format(self.modelDir)) # Warning: No confirmation asked self.args.reset = True if self.args.reset: fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)] for f in fileList: print('Removing {}'.format(f)) os.remove(f) else: print('No previous model found, starting from clean directory: {}'.format(self.modelDir)) def _saveSession(self, sess): """ Save the model parameters and the variables Args: sess: the current session """ tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...') self.saveModelParams() model_name = self._getModelName() with open(model_name, 'w') as f: # HACK: Simulate the old model existance to avoid rewriting the file parser f.write('This file is used internally by DeepQA to check the model existance. Please do not remove.\n') self.saver.save(sess, model_name) # TODO: Put a limit size (ex: 3GB for the modelDir) tqdm.write('Model saved.') def _getModelList(self): """ Return the list of the model files inside the model directory """ return [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir) if f.endswith(self.MODEL_EXT)] def loadModelParams(self): """ Load the some values associated with the current model, like the current globStep value For now, this function does not need to be called before loading the model (no parameters restored). 
However, the modelDir name will be initialized here so it is required to call this function before managePreviousModel(), _getModelName() or _getSummaryName() Warning: if you modify this function, make sure the changes mirror saveModelParams, also check if the parameters should be reset in managePreviousModel """ # Compute the current model path self.modelDir = os.path.join(self.args.rootDir, self.MODEL_DIR_BASE) if self.args.modelTag: self.modelDir += '-' + self.args.modelTag # If there is a previous model, restore some parameters configName = os.path.join(self.modelDir, self.CONFIG_FILENAME) if not self.args.reset and not self.args.createDataset and os.path.exists(configName): # Loading config = ConfigParser.ConfigParser() config.read(configName) # Check the version currentVersion = config.get('General', 'version') if currentVersion != self.CONFIG_VERSION: raise UserWarning('Present configuration version {0} does not match {1}. You can try manual changes on \'{2}\''.format(currentVersion, self.CONFIG_VERSION, configName)) # Restoring the the parameters self.globStep = config.getint('General' ,'globStep') self.args.watsonMode = config.getboolean('General', 'watsonMode') self.args.autoEncode = config.getboolean('General', 'autoEncode') self.args.corpus = config.get('General', 'corpus') self.args.datasetTag = config.get('Dataset', 'datasetTag') self.args.maxLength = config.getint('Dataset', 'maxLength') # We need to restore the model length because of the textData associated and the vocabulary size (TODO: Compatibility mode between different maxLength) self.args.filterVocab = config.getint('Dataset', 'filterVocab') self.args.skipLines = config.getboolean('Dataset', 'skipLines') self.args.vocabularySize = config.getint('Dataset', 'vocabularySize') self.args.hiddenSize = config.getint('Network', 'hiddenSize') self.args.numLayers = config.getint('Network', 'numLayers') self.args.softmaxSamples = config.getint('Network', 'softmaxSamples') self.args.initEmbeddings = 
config.getboolean('Network', 'initEmbeddings') self.args.embeddingSize = config.getint('Network', 'embeddingSize') self.args.embeddingSource = config.get('Network', 'embeddingSource') # No restoring for training params, batch size or other non model dependent parameters # Show the restored params print() print('Warning: Restoring parameters:') print('globStep: {}'.format(self.globStep)) print('watsonMode: {}'.format(self.args.watsonMode)) print('autoEncode: {}'.format(self.args.autoEncode)) print('corpus: {}'.format(self.args.corpus)) print('datasetTag: {}'.format(self.args.datasetTag)) print('maxLength: {}'.format(self.args.maxLength)) print('filterVocab: {}'.format(self.args.filterVocab)) print('skipLines: {}'.format(self.args.skipLines)) print('vocabularySize: {}'.format(self.args.vocabularySize)) print('hiddenSize: {}'.format(self.args.hiddenSize)) print('numLayers: {}'.format(self.args.numLayers)) print('softmaxSamples: {}'.format(self.args.softmaxSamples)) print('initEmbeddings: {}'.format(self.args.initEmbeddings)) print('embeddingSize: {}'.format(self.args.embeddingSize)) print('embeddingSource: {}'.format(self.args.embeddingSource)) print() # For now, not arbitrary independent maxLength between encoder and decoder self.args.maxLengthEnco = self.args.maxLength self.args.maxLengthDeco = self.args.maxLength + 2 if self.args.watsonMode: self.SENTENCES_PREFIX.reverse() def saveModelParams(self): """ Save the params of the model, like the current globStep value Warning: if you modify this function, make sure the changes mirror loadModelParams """ config = ConfigParser.ConfigParser() config.set('General', 'version', self.CONFIG_VERSION) config.set('General', 'globStep', str(self.globStep)) config.set('General', 'watsonMode', str(self.args.watsonMode)) config.set('General', 'autoEncode', str(self.args.autoEncode)) config.set('General', 'corpus', str(self.args.corpus)) config.set('Dataset', 'datasetTag', str(self.args.datasetTag)) config.set('Dataset', 'maxLength', 
str(self.args.maxLength)) config.set('Dataset', 'filterVocab', str(self.args.filterVocab)) config.set('Dataset', 'skipLines', str(self.args.skipLines)) config.set('Dataset', 'vocabularySize', str(self.args.vocabularySize)) config.set('Network', 'hiddenSize', str(self.args.hiddenSize)) config.set('Network', 'numLayers', str(self.args.numLayers)) config.set('Network', 'softmaxSamples', str(self.args.softmaxSamples)) config.set('Network', 'initEmbeddings', str(self.args.initEmbeddings)) config.set('Network', 'embeddingSize', str(self.args.embeddingSize)) config.set('Network', 'embeddingSource', str(self.args.embeddingSource)) # Keep track of the learning params (but without restoring them) config.set('Training (won\'t be restored)', 'learningRate', str(self.args.learningRate)) config.set('Training (won\'t be restored)', 'batchSize', str(self.args.batchSize)) config.set('Training (won\'t be restored)', 'dropout', str(self.args.dropout)) with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile: config.write(configFile) def _getSummaryName(self): """ Parse the argument to decide were to save the summary, at the same place that the model The folder could already contain logs if we restore the training, those will be merged Return: str: The path and name of the summary """ return self.modelDir def _getModelName(self): """ Parse the argument to decide were to save/load the model This function is called at each checkpoint and the first time the model is load. If keepAll option is set, the globStep value will be included in the name. 
Return: str: The path and name were the model need to be saved """ modelName = os.path.join(self.modelDir, self.MODEL_NAME_BASE) if self.args.keepAll: # We do not erase the previously saved model by including the current step on the name modelName += '-' + str(self.globStep) return modelName + self.MODEL_EXT def getDevice(self): """ Parse the argument to decide on which device run the model Return: str: The name of the device on which run the program """ if self.args.device == 'cpu': return '/cpu:0' elif self.args.device == 'gpu': return '/gpu:0' elif self.args.device is None: # No specified device (default) return None else: print('Warning: Error in the device name: {}, use the default device'.format(self.args.device)) return None
class Chatbot:
    """ Stripped-down chatbot driver with hard-coded settings (no command line parsing).

    Typical use: call set_up_things() once, then get_answer() for each question; or call main(),
    which does the set-up and then runs the interactive loop (or training).
    """

    def __init__(self):
        """ Declare every attribute; nothing is loaded or opened here
        """
        # Model/dataset parameters
        self.args = None

        # Task specific object
        self.textData = None  # Dataset
        self.model = None  # Sequence to sequence model

        # Tensorflow utilities for convenience saving/logging
        self.writer = None
        self.saver = None
        self.modelDir = ''  # Where the model is saved
        self.globStep = 0  # Represent the number of iteration for the current model

        # TensorFlow main session (we keep track for the daemon)
        self.sess = None

        # Filename and directories constants
        self.MODEL_DIR_BASE = 'save/model'
        self.MODEL_NAME_BASE = 'model'
        self.MODEL_EXT = '.ckpt'
        self.CONFIG_FILENAME = 'params.ini'
        self.CONFIG_VERSION = '0.3'
        self.TEST_IN_NAME = 'data/test/samples.txt'
        self.TEST_OUT_SUFFIX = '_predictions.txt'
        self.SENTENCES_PREFIX = ['Q: ', 'A: ']

    def main(self, args=None):
        """ Set everything up, then run the interactive loop (or the training) and close the session
        Args:
            args: accepted for interface compatibility; the settings are hard-coded and it is ignored
        """
        self.set_up_things(args)
        try:
            if self.args['interactive']:
                self.mainTestInteractive(self.sess)
            else:
                self.mainTrain(self.sess)
        finally:
            self.sess.close()  # Release the session even when the run fails

    def set_up_things(self, args=None):
        """ Build the dataset, the model, the saver and the session, then restore the previous model
        Args:
            args: accepted for interface compatibility; the settings are hard-coded and it is ignored
        """
        test_yes = True
        self.args = {
            'rootDir': os.getcwd(),  # Use the current working directory
            'corpus': 'cornell',
            'maxLength': 10,
            'hiddenSize': 256,
            'numLayers': 2,
            'embeddingSize': 32,
            'softmaxSamples': 0,
            'numEpochs': 50,
            'saveEvery': 5000,
            'batchSize': 10,
            'learningRate': 0.001,
            'reset': False,
            'interactive': test_yes,
            'test': test_yes,
        }

        self.loadModelParams()  # Update the self.modelDir and self.globStep, for now, not used when loading Model (but need to be called before _getSummaryName)

        self.textData = TextData(self.args)
        self.model = Model(self.args, self.textData)

        # Saver/summaries
        self.writer = tf.train.SummaryWriter(self.modelDir)
        if '12' in tf.__version__:  # HACK: Solve new tf Saver V2 format
            self.saver = tf.train.Saver(max_to_keep=200, write_version=1)  # Arbitrary limit ?
        else:
            self.saver = tf.train.Saver(max_to_keep=200)

        self.sess = tf.Session()
        print('Initialize variables...')
        self.sess.run(tf.initialize_all_variables())

        self.managePreviousModel(self.sess)

    def get_answer(self, ques):
        """ Answer a single question
        Args:
            ques (str): the raw input sentence
        Return:
            str: the cleaned answer, or a fallback message when no answer could be predicted
        """
        questionSeq = []  # Will be contain the question as seen by the encoder
        answer = self.singlePredict(ques, questionSeq)
        if not answer:
            # Return right away: the fallback used to be overwritten by the decoding below
            return "Woah buddy, slow down! Can you enter a few less words?"
        return self.textData.sequence2str(answer, clean=True)

    def mainTrain(self, sess):
        """ Training loop
        Args:
            sess: The current running session
        """
        mergedSummaries = tf.merge_all_summaries()  # Define the summary operator (Warning: Won't appear on the tensorboard graph)
        if self.globStep == 0:  # Not restoring from previous run
            self.writer.add_graph(sess.graph)  # First time only

        print('Start training (press Ctrl+C to save and exit)...')

        try:  # If the user exit while training, we still try to save the model
            for e in range(0, self.args['numEpochs']):
                print()
                print("----- Epoch {}/{} ; (lr={}) -----".format(
                    e + 1, self.args['numEpochs'], self.args['learningRate']))

                batches = self.textData.getBatches()

                # TODO: Also update learning parameters eventually

                tic = datetime.datetime.now()
                for nextBatch in tqdm(batches, desc="Training"):
                    # Training pass
                    ops, feedDict = self.model.step(nextBatch)
                    assert len(ops) == 2  # training, loss
                    _, loss, summary = sess.run(ops + (mergedSummaries, ), feedDict)
                    self.writer.add_summary(summary, self.globStep)
                    self.globStep += 1

                    # Checkpoint
                    if self.globStep % self.args['saveEvery'] == 0:
                        self._saveSession(sess)

                toc = datetime.datetime.now()

                print("Epoch finished in {}".format(toc - tic))  # Warning: Will overflow if an epoch takes more than 24 hours, and the output isn't really nicer
        except (KeyboardInterrupt, SystemExit):  # If the user press Ctrl+C while testing progress
            print('Interruption detected, exiting the program...')

        # NOTE(review): the original indentation was lost; assumed to run after normal completion
        # too (not only on interruption) - confirm.
        self._saveSession(sess)  # Ultimate saving before complete exit

    def mainTestInteractive(self, sess):
        """ Let the bot talk to itself: every answer, shortened by a few characters, is the next question
        Stops as soon as the shortened question is empty or equal to 'exit'.
        Args:
            sess: The current running session
        """
        print('Testing: Launch interactive mode:')

        output_answer = "we know no king but the king"
        i = 0
        while True:
            question = output_answer
            # Alternate the speaker label between 1 and 2
            i = i + 1
            if i == 3:
                i = 1

            # Drop up to 3 trailing characters (at most 1 for very short sentences)
            x = len(question) - randint(0, 3 if len(question) >= 4 else 1)
            print("BOT" + str(i) + ": " + question)
            question = output_answer[:x]
            print()
            if question == '' or question == 'exit':
                break

            questionSeq = []  # Will be contain the question as seen by the encoder
            answer = self.singlePredict(question, questionSeq)
            if not answer:
                print('Warning: sentence too long, sorry. Maybe try a simpler sentence.')
                output_answer = output_answer[:-1]
                continue  # Back to the beginning, try again

            output_answer = self.textData.sequence2str(answer, clean=True)
            time.sleep(2)

    def singlePredict(self, question, questionSeq=None):
        """ Predict the sentence
        Args:
            question (str): the raw input sentence
            questionSeq (list): output argument. If given will contain the input batch sequence
        Return:
            the decoded answer (as returned by textData.deco2sentence), or None when
            textData.sentence2enco() yields no batch for the question
        """
        batch = self.textData.sentence2enco(question)
        if not batch:
            return None
        if questionSeq is not None:  # If the caller want to have the real input
            questionSeq.extend(batch.encoderSeqs)

        # Run the model
        ops, feedDict = self.model.step(batch)
        output = self.sess.run(ops[0], feedDict)  # TODO: Summarize the output too (histogram, ...)
        return self.textData.deco2sentence(output)

    def _saveSession(self, sess):
        """ Save the model parameters and the variables
        Args:
            sess: the current session
        """
        tqdm.write('Checkpoint reached: saving model (don\'t stop the run)...')
        self.saveModelParams()
        self.saver.save(sess, self._getModelName())  # TODO: Put a limit size (ex: 3GB for the modelDir)
        tqdm.write('Model saved.')

    def loadModelParams(self):
        """ Compute modelDir and, when a params file exists there, restore the model parameters
        Warning: if you modify this function, make sure the changes mirror saveModelParams
        """
        self.modelDir = os.path.join(self.args['rootDir'], self.MODEL_DIR_BASE)

        # If there is a previous model, restore some parameters
        configName = os.path.join(self.modelDir, self.CONFIG_FILENAME)
        if os.path.exists(configName):
            # Loading
            config = configparser.ConfigParser()
            config.read(configName)

            # Restoring the the parameters
            self.globStep = config['General'].getint('globStep')
            self.args['maxLength'] = config['General'].getint('maxLength')  # We need to restore the model length because of the textData associated and the vocabulary size (TODO: Compatibility mode between different maxLength)
            for key in ('hiddenSize', 'numLayers', 'embeddingSize', 'softmaxSamples'):
                self.args[key] = config['Network'].getint(key)

        # For now, not arbitrary independent maxLength between encoder and decoder
        self.args['maxLengthEnco'] = self.args['maxLength']
        self.args['maxLengthDeco'] = self.args['maxLength'] + 2

    def saveModelParams(self):
        """ Save the params of the model, like the current globStep value
        Warning: if you modify this function, make sure the changes mirror loadModelParams
        """
        config = configparser.ConfigParser()
        config['General'] = {
            'version': self.CONFIG_VERSION,
            'globStep': str(self.globStep),
            'maxLength': str(self.args['maxLength']),
        }
        config['Network'] = {
            key: str(self.args[key])
            for key in ('hiddenSize', 'numLayers', 'embeddingSize', 'softmaxSamples')
        }

        # Keep track of the learning params (but without restoring them)
        config['Training (won\'t be restored)'] = {
            'learningRate': str(self.args['learningRate']),
            'batchSize': str(self.args['batchSize']),
        }

        with open(os.path.join(self.modelDir, self.CONFIG_FILENAME), 'w') as configFile:
            config.write(configFile)

    def _getModelName(self):
        """ Return the path of the (single) checkpoint file inside modelDir
        """
        return os.path.join(self.modelDir, self.MODEL_NAME_BASE) + self.MODEL_EXT

    def managePreviousModel(self, sess):
        """ Restore the previous model if its checkpoint exists, otherwise empty the model directory
        Warning: when the directory holds files but no checkpoint, all of them are removed
        without confirmation.
        Args:
            sess: The current running session
        """
        print('WARNING: ', end='')

        modelName = self._getModelName()

        if os.listdir(self.modelDir):
            # Analysing directory content
            if os.path.exists(modelName):  # Restore the model
                print('Restoring previous model from {}'.format(modelName))
                self.saver.restore(sess, modelName)  # Will crash when --reset is not activated and the model has not been saved yet
                print('Model restored.')
            else:  # No other model to conflict with (probably summary files)
                print('No previous model found, but some files found at {}. Cleaning...'.format(self.modelDir))  # Warning: No confirmation asked
                self.args['reset'] = True

            if self.args['reset']:
                fileList = [os.path.join(self.modelDir, f) for f in os.listdir(self.modelDir)]
                for f in fileList:
                    print('Removing {}'.format(f))
                    os.remove(f)
        else:
            print('No previous model found, starting from clean directory: {}'.format(self.modelDir))
class BidirectionLSTM(object):
    """ Regression network: a bidirectional LSTM over one token sequence plus a plain LSTM over a
    second one, followed by fully connected layers producing a single value per sample.
    """

    def __init__(self, encoder_hidden_units, input_embedding_size, bath_size):
        """ Load the dataset and build the graph
        Args:
            encoder_hidden_units (int): number of units of every LSTM cell
            input_embedding_size (int): size of the token embeddings
            bath_size (int): size of the training batches (historical spelling kept for callers)
        """
        self.textData = TextData("train.tsv", "data", 100, "test_", 800)
        self.vocab_size = self.textData.getVocabularySize()
        self.input_embedding_size = input_embedding_size
        self.encoder_hidden_units = encoder_hidden_units
        self.batch_size = bath_size
        self.buildNetwork()

    def buildNetwork(self):
        """ Build the graph: placeholders, embeddings, both encoders, dense head, cost and optimizer
        """
        tf.reset_default_graph()
        with tf.name_scope("minibatch"):
            # Both encoders are unrolled with time_major=True
            self.encoder_inputs = tf.placeholder(
                shape=(None, None), dtype=tf.int32, name="encoder_inputs")
            self.other_encoder = tf.placeholder(
                shape=(None, None), dtype=tf.int32, name="other_encoder_inputs")
            self.encoder_inputs_length = tf.placeholder(
                shape=(None, ), dtype=tf.int32, name='encoder_inputs_length')
            self.other_encoder_inputs_length = tf.placeholder(
                shape=(None, ), dtype=tf.int32, name='other_encoder_inputs_length')
            self.decoder_targets = tf.placeholder(
                shape=(None, None), dtype=tf.float32, name="decoder_targets")

        with tf.name_scope("embedding"):
            # One embedding matrix shared by both inputs
            embeddings = tf.Variable(
                tf.random_uniform([self.vocab_size, self.input_embedding_size], -1.0, 1.0),
                dtype=tf.float32)
            encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, self.encoder_inputs)
            other_encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, self.other_encoder)

        other_encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.encoder_hidden_units)
        encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(self.encoder_hidden_units)

        _, other_final_state = tf.nn.dynamic_rnn(
            cell=other_encoder_cell,
            inputs=other_encoder_inputs_embedded,
            sequence_length=self.other_encoder_inputs_length,
            time_major=True,
            dtype=tf.float32)

        # NOTE(review): the same cell object is passed for both directions; whether that shares
        # the weights or is rejected depends on the TensorFlow version - confirm this is intended.
        ((_, _), (encoder_fw_final_state, encoder_bw_final_state)) = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=encoder_cell,
            cell_bw=encoder_cell,
            inputs=encoder_inputs_embedded,
            sequence_length=self.encoder_inputs_length,
            dtype=tf.float32,
            time_major=True)

        # Sample representation: the three final hidden states side by side
        encoder_final_state_h = tf.concat(
            (encoder_fw_final_state.h, encoder_bw_final_state.h, other_final_state.h), 1)

        fc_layer = tf.contrib.layers.fully_connected
        full_connect_units = 1024
        ouput_m = fc_layer(encoder_final_state_h, full_connect_units)
        ouput_m1 = fc_layer(ouput_m, 512)
        self.final_output = fc_layer(ouput_m1, 1, activation_fn=None)

        # Half the sum of squared errors (training objective)
        self.cost = tf.reduce_sum(tf.pow(self.final_output - self.decoder_targets, 2)) / 2
        # Root mean squared logarithmic error (reported only)
        self.error = tf.sqrt(
            tf.reduce_mean(
                tf.pow(tf.log(self.final_output + 1) - tf.log(self.decoder_targets + 1), 2)))
        self.optimizer = tf.train.AdamOptimizer(0.01).minimize(self.cost)

    def batchHandle(self, inputs, max_sequence_length=None):
        """ Zero-pad a batch of token sequences and return it time-major
        Args:
            inputs (list<list<int>>): one token id sequence per sample
            max_sequence_length (int): padded length; defaults to the longest sequence. Longer
                sequences are truncated to it (they used to raise IndexError)
        Return:
            (np.ndarray, list<int>): int32 array of shape [max_sequence_length, batch_size] and the
                number of tokens kept for each sample
        """
        sequence_lengths = [len(seq) for seq in inputs]
        batch_size = len(inputs)

        if max_sequence_length is None:
            max_sequence_length = max(sequence_lengths, default=0)  # An empty batch is valid
        else:
            # The reported lengths must never exceed the padded time dimension
            sequence_lengths = [min(length, max_sequence_length) for length in sequence_lengths]

        inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32)  # == PAD
        for i, (seq, length) in enumerate(zip(inputs, sequence_lengths)):
            if length:
                inputs_batch_major[i, :length] = seq[:length]

        inputs_time_major = inputs_batch_major.swapaxes(0, 1)
        return inputs_time_major, sequence_lengths

    def train(self):
        """ Run 100 passes over the batches, printing the loss and the error every 100 mini-batches
        """
        batches = self.textData.getBatches(self.batch_size)
        batches_in_epoch = 100
        sess = tf.InteractiveSession()
        try:
            sess.run(tf.global_variables_initializer())
            j = 0
            for i in range(100):
                for nextBatch in tqdm(batches, desc="batch_train"):
                    feedDict = {}
                    feedDict[self.encoder_inputs], feedDict[self.encoder_inputs_length] = \
                        self.batchHandle(nextBatch.desc, max(nextBatch.desc_length))
                    feedDict[self.other_encoder], feedDict[self.other_encoder_inputs_length] = \
                        self.batchHandle(nextBatch.other_elem, max(nextBatch.other_elem_length))
                    feedDict[self.decoder_targets] = [nextBatch.target]
                    _, l, output_, error_ = sess.run(
                        [self.optimizer, self.cost, self.final_output, self.error], feedDict)
                    j += 1
                    # j is incremented before the check, so the former "j == 0" test never fired
                    if j % batches_in_epoch == 0:
                        print("batch {}".format(i))
                        print(" minibatch loss:{}".format(l))
                        print("error rate ", error_)
                        print()
        finally:
            sess.close()  # The session used to be leaked