import os
import random
import time

import numpy as np
import torch
from torch import nn, optim


def init():
    print("\tInitialising sentences")
    print("\t\tLoading and cleaning json files")
    json_of_convs = load_all_json_conv('./Dataset/messages')
    print("\t\tLoading two person convs")
    duo_conversations = get_chat_friend_and_me(json_of_convs)
    print("\t\tMaking two person convs discussions")
    discussions = get_discussions(duo_conversations)
    print("\t\tCreating pairs for training")
    pairs_of_sentences = make_pairs(discussions)
    print(f"\t\t{len(pairs_of_sentences)} different pairs")
    print("\t\tCreating Vocabulary")
    voc = Voc()
    print("\t\tPopulating Vocabulary")
    voc.createVocFromPairs(pairs_of_sentences)
    print(f"\t\tVocabulary of {voc.num_words} different words")

    print('\tBuilding encoder and decoder ...')
    embedding = nn.Embedding(voc.num_words, HIDDEN_SIZE)
    encoder = EncoderRNN(HIDDEN_SIZE, embedding, ENCODER_N_LAYERS, DROPOUT)
    decoder = LuongAttnDecoderRNN(ATTN_MODEL, embedding, HIDDEN_SIZE, voc.num_words,
                                  DECODER_N_LAYERS, DROPOUT)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=LEARNING_RATE * DECODER_LEARNING_RATIO)

    checkpoint = None
    if LOADFILENAME:
        print("\t\tLoading last training")
        checkpoint = torch.load(LOADFILENAME)
        # If loading a model trained on GPU to CPU:
        # checkpoint = torch.load(LOADFILENAME, map_location=torch.device('cpu'))
        encoder_sd = checkpoint['en']
        decoder_sd = checkpoint['de']
        encoder_optimizer_sd = checkpoint['en_opt']
        decoder_optimizer_sd = checkpoint['de_opt']
        embedding_sd = checkpoint['embedding']
        voc.__dict__ = checkpoint['voc_dict']
        print("\t\tPopulating from last training")
        embedding.load_state_dict(embedding_sd)
        encoder.load_state_dict(encoder_sd)
        decoder.load_state_dict(decoder_sd)
        encoder_optimizer.load_state_dict(encoder_optimizer_sd)
        decoder_optimizer.load_state_dict(decoder_optimizer_sd)

    encoder = encoder.to(DEVICE)
    decoder = decoder.to(DEVICE)
    return (encoder, decoder, encoder_optimizer, decoder_optimizer,
            embedding, voc, pairs_of_sentences, checkpoint)
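# init() relies on module-level constants that are not defined in this snippet. The block
# below is a minimal, illustrative sketch of how they might be set; the values are borrowed
# from the cornell movie-dialogs script further down in this file, not from this script's
# original configuration.
ATTN_MODEL = 'dot'
HIDDEN_SIZE = 500
ENCODER_N_LAYERS = 2
DECODER_N_LAYERS = 2
DROPOUT = 0.1
LEARNING_RATE = 0.0001
DECODER_LEARNING_RATIO = 5.0
LOADFILENAME = None  # path to a saved checkpoint .tar, or None to start from scratch
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Example usage (illustrative):
# encoder, decoder, enc_opt, dec_opt, embedding, voc, pairs, checkpoint = init()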
def main():
    input_lang = Lang('data/WORDMAP_en.json')
    output_lang = Lang('data/WORDMAP_zh.json')
    print("input_lang.n_words: " + str(input_lang.n_words))
    print("output_lang.n_words: " + str(output_lang.n_words))

    train_data = TranslationDataset('train')
    val_data = TranslationDataset('valid')

    # Initialize encoder & decoder models
    encoder = EncoderRNN(input_lang.n_words, hidden_size, encoder_n_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, hidden_size, output_lang.n_words,
                                  decoder_n_layers, dropout)

    # Use appropriate device
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Initialize optimizers
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    # Initializations
    print('Initializing ...')
    train_batch_time = ExpoAverageMeter()  # forward prop. + back prop. time
    train_losses = ExpoAverageMeter()      # loss (per word decoded)
    val_batch_time = ExpoAverageMeter()
    val_losses = ExpoAverageMeter()

    best_loss = 100000
    epochs_since_improvement = 0

    # Epochs
    for epoch in range(start_epoch, epochs):
        # Decay learning rate if there is no improvement for 8 consecutive epochs,
        # and terminate training after 20
        if epochs_since_improvement == 20:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(decoder_optimizer, 0.8)
            adjust_learning_rate(encoder_optimizer, 0.8)

        # One epoch's training
        # Ensure dropout layers are in train mode
        encoder.train()
        decoder.train()

        start = time.time()

        # Batches
        for i_batch in range(len(train_data)):
            input_variable, lengths, target_variable, mask, max_target_len = train_data[i_batch]
            train_loss = train(input_variable, lengths, target_variable, mask, max_target_len,
                               encoder, decoder, encoder_optimizer, decoder_optimizer)

            # Keep track of metrics
            train_losses.update(train_loss)
            train_batch_time.update(time.time() - start)
            start = time.time()

            # Print status
            if i_batch % print_every == 0:
                print('[{0}] Epoch: [{1}][{2}/{3}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          timestamp(), epoch, i_batch, len(train_data),
                          batch_time=train_batch_time, loss=train_losses))

        # One epoch's validation
        start = time.time()

        # Batches
        for i_batch in range(len(val_data)):
            input_variable, lengths, target_variable, mask, max_target_len = val_data[i_batch]
            val_loss = valid(input_variable, lengths, target_variable, mask, max_target_len,
                             encoder, decoder)

            # Keep track of metrics
            val_losses.update(val_loss)
            val_batch_time.update(time.time() - start)
            start = time.time()

            # Print status
            if i_batch % print_every == 0:
                print('Validation: [{0}/{1}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          i_batch, len(val_data),
                          batch_time=val_batch_time, loss=val_losses))

        val_loss = val_losses.avg
        print('\n * LOSS - {loss:.3f}\n'.format(loss=val_loss))

        # Check if there was an improvement
        is_best = val_loss < best_loss
        best_loss = min(best_loss, val_loss)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement,))
        else:
            epochs_since_improvement = 0

        save_checkpoint(epoch, encoder, decoder, encoder_optimizer, decoder_optimizer,
                        input_lang, output_lang, val_loss, is_best)

        # Initialize search module
        searcher = GreedySearchDecoder(encoder, decoder)
        for input_sentence, target_sentence in pick_n_valid_sentences(input_lang, output_lang, 10):
            decoded_words = evaluate(searcher, input_sentence, input_lang, output_lang)
            print('> {}'.format(input_sentence))
            print('= {}'.format(target_sentence))
            print('< {}'.format(''.join(decoded_words)))

        # Reshuffle train and valid samples
        np.random.shuffle(train_data.samples)
        np.random.shuffle(val_data.samples)
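# The training loop above assumes two small helpers that are not shown in this snippet.
# The sketch below is one plausible, minimal implementation (an exponentially weighted
# running average and a learning-rate shrink step); the original project's versions may
# differ in detail.
class ExpoAverageMeter:
    """Keeps an exponentially weighted moving average of a metric."""

    def __init__(self, beta=0.9):
        self.beta = beta
        self.val = 0.0
        self.avg = 0.0

    def update(self, val):
        self.val = val
        self.avg = self.beta * self.avg + (1 - self.beta) * val


def adjust_learning_rate(optimizer, shrink_factor):
    """Multiplies the learning rate of every parameter group by shrink_factor."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * shrink_factor
    print("New learning rate: {:.6f}".format(optimizer.param_groups[0]['lr']))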
def main():
    train_loader = ChatbotDataset('train')
    val_loader = ChatbotDataset('valid')

    # Initialize word embeddings
    embedding = nn.Embedding(voc.num_words, hidden_size)

    # Initialize encoder & decoder models
    encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words,
                                  decoder_n_layers, dropout)

    # Use appropriate device
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Initialize optimizers
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    # Initializations
    print('Initializing ...')
    batch_time = AverageMeter()  # forward prop. + back prop. time
    losses = AverageMeter()      # loss (per word decoded)

    # Epochs
    for epoch in range(start_epoch, epochs):
        # One epoch's training
        # Ensure dropout layers are in train mode
        encoder.train()
        decoder.train()

        start = time.time()

        # Batches
        for i in range(len(train_loader)):
            input_variable, lengths, target_variable, mask, max_target_len = train_loader[i]
            loss = train(input_variable, lengths, target_variable, mask, max_target_len,
                         encoder, decoder, encoder_optimizer, decoder_optimizer)

            # Keep track of metrics
            losses.update(loss, max_target_len)
            batch_time.update(time.time() - start)
            start = time.time()

            if i % print_every == 0:
                print('[{0}] Epoch: [{1}][{2}/{3}]\t'
                      'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                          timestamp(), epoch, i, len(train_loader),
                          batch_time=batch_time, loss=losses))

        # One epoch's validation
        val_loss = validate(val_loader, encoder, decoder)
        print('\n * LOSS - {loss:.3f}\n'.format(loss=val_loss))

        # Initialize search module
        searcher = GreedySearchDecoder(encoder, decoder)
        for sentence in pick_n_valid_sentences(10):
            decoded_words = evaluate(searcher, sentence)
            print('Human: {}'.format(sentence))
            print('Bot: {}'.format(''.join(decoded_words)))

        # Save checkpoint
        if epoch % save_every == 0:
            directory = save_dir
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'epoch': epoch,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc': voc.__dict__
            }, os.path.join(directory, '{}_{}_{}.tar'.format('checkpoint', epoch, val_loss)))
# Fragment: tail of a checkpoint-saving call (its opening lines are not included in this snippet)
                'embedding': embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))


print('Building encoder and decoder ...')
# word embedding
embedding = nn.Embedding(VOC.num_words, hp.hidden_size)
encoder = EncoderRNN(hp.hidden_size, embedding, hp.n_layers, hp.dropout)
decoder = LuongAttnDecoderRNN(hp.attn_model, embedding, hp.hidden_size, VOC.num_words,
                              hp.n_layers, hp.dropout)
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

encoder.train()
decoder.train()

print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=hp.lr)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=hp.lr * hp.decoder_learning_ratio)
encoder_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(encoder_optimizer, 5)
decoder_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(decoder_optimizer, 5)

if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
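# The CosineAnnealingLR schedulers above are only constructed in this fragment; stepping
# them is not shown. The function below is an illustrative sketch of the usual pattern
# (step each scheduler once per epoch, after that epoch's optimizer updates); the
# train_one_epoch callable and the epochs count are hypothetical names, not from the original.
def run_training_epochs(train_one_epoch, epochs):
    for epoch in range(epochs):
        train_one_epoch(encoder, decoder, encoder_optimizer, decoder_optimizer)
        encoder_scheduler.step()
        decoder_scheduler.step()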
def main():
    corpus_name = "cornell movie-dialogs corpus"
    corpus = os.path.join("data", corpus_name)
    printLines(os.path.join(corpus, "movie_lines.txt"))

    # Define path to new file
    datafile = os.path.join(corpus, "formatted_movie_lines.txt")
    linefile = os.path.join(corpus, "movie_lines.txt")
    conversationfile = os.path.join(corpus, "movie_conversations.txt")

    # Initialize lines dict, conversations list, and field ids
    MOVIE_LINES_FIELDS = ["lineID", "characterID", "movieID", "character", "text"]
    MOVIE_CONVERSATIONS_FIELDS = ["character1ID", "character2ID", "movieID", "utteranceIDs"]

    # Load lines and process conversations
    preprocess = Preprocess(datafile, linefile, conversationfile,
                            MOVIE_LINES_FIELDS, MOVIE_CONVERSATIONS_FIELDS)
    preprocess.loadLines()
    preprocess.loadConversations()
    preprocess.writeCSV()

    # Load/Assemble voc and pairs
    save_dir = os.path.join("data", "save")
    dataset = Dataset(corpus, corpus_name, datafile)
    voc, pairs = dataset.loadPrepareData()
    # # Print some pairs to validate
    # print("\npairs:")
    # for pair in pairs[:10]:
    #     print(pair)

    # Trim voc and pairs
    pairs = dataset.trimRareWords(voc, pairs, MIN_COUNT)

    # Example for validation
    small_batch_size = 5
    batches = dataset.batch2TrainData(voc, [random.choice(pairs) for _ in range(small_batch_size)])
    input_variable, lengths, target_variable, mask, max_target_len = batches
    print("input_variable:", input_variable)
    print("lengths:", lengths)
    print("target_variable:", target_variable)
    print("mask:", mask)
    print("max_target_len:", max_target_len)

    # Configure models
    model_name = 'cb_model'
    attn_model = 'dot'
    # attn_model = 'general'
    # attn_model = 'concat'
    hidden_size = 500
    encoder_n_layers = 2
    decoder_n_layers = 2
    dropout = 0.1
    batch_size = 64

    # Set checkpoint to load from; set to None if starting from scratch
    loadFilename = None
    checkpoint_iter = 4000
    # loadFilename = os.path.join(save_dir, model_name, corpus_name,
    #                             '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
    #                             '{}_checkpoint.tar'.format(checkpoint_iter))

    if loadFilename:
        # If loading on same machine the model was trained on
        checkpoint = torch.load(loadFilename)
        # If loading a model trained on GPU to CPU
        # checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
        encoder_sd = checkpoint['en']
        decoder_sd = checkpoint['de']
        encoder_optimizer_sd = checkpoint['en_opt']
        decoder_optimizer_sd = checkpoint['de_opt']
        embedding_sd = checkpoint['embedding']
        voc.__dict__ = checkpoint['voc_dict']

    print('Building encoder and decoder ...')
    # Initialize word embeddings
    embedding = nn.Embedding(voc.num_words, hidden_size)
    if loadFilename:
        embedding.load_state_dict(embedding_sd)
    # Initialize encoder & decoder models
    encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words,
                                  decoder_n_layers, dropout)
    if loadFilename:
        encoder.load_state_dict(encoder_sd)
        decoder.load_state_dict(decoder_sd)
    # Use appropriate device
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    print('Models built and ready to go!')

    # Configure training/optimization
    clip = 50.0
    teacher_forcing_ratio = 1.0
    learning_rate = 0.0001
    decoder_learning_ratio = 5.0
    n_iteration = 4000
    print_every = 1
    save_every = 500

    # Ensure dropout layers are in train mode
    encoder.train()
    decoder.train()

    # Initialize optimizers
    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
    if loadFilename:
        encoder_optimizer.load_state_dict(encoder_optimizer_sd)
        decoder_optimizer.load_state_dict(decoder_optimizer_sd)

    # Run training iterations
    print("Starting Training!")
    model = Model(dataset.batch2TrainData, teacher_forcing_ratio)
    model.trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer,
                     decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers,
                     save_dir, n_iteration, batch_size, print_every, save_every, clip,
                     corpus_name, loadFilename)

    # Set dropout layers to eval mode
    encoder.eval()
    decoder.eval()

    # Initialize search module
    searcher = GreedySearchDecoder(encoder, decoder)
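# The searcher built at the end of main() is not exercised in this snippet. The loop below
# is an illustrative sketch of how it might drive interactive chatting; the
# evaluate(searcher, sentence) helper is assumed to behave like the one used in the chatbot
# script earlier in this file (returning a list of decoded words), which may not match the
# original project's signature.
def chat(searcher):
    while True:
        try:
            input_sentence = input('> ')
            if input_sentence in ('q', 'quit'):
                break
            decoded_words = evaluate(searcher, input_sentence)
            print('Bot:', ' '.join(decoded_words))
        except KeyError:
            print("Error: encountered an unknown word.")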