def main():
    # Load vocabulary wrapper (pickle files must be opened in binary mode).
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(4096, embed_dim)
    decoder = DecoderRNN(embed_dim, hidden_size, len(vocab), num_layers_rnn)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # Load data
    with open(image_data_file, 'rb') as f:
        image_data = pickle.load(f)
    image_features = si.loadmat(image_feature_file)
    img_features = image_features['fc7'][0]
    img_features = np.concatenate(img_features)

    iteration = 0
    save_loss = []
    for i in range(10):  # epoch
        use_caption = i % 5
        print('Epoch', i)
        for x, y in make_mini_batch(img_features, image_data, use_caption=use_caption):
            word_padding, lengths = make_word_padding(y, vocab)
            x = Variable(torch.from_numpy(x).cuda())
            word_index = Variable(torch.from_numpy(word_padding).cuda())

            encoder.zero_grad()
            decoder.zero_grad()
            features = encoder(x)
            targets = pack_padded_sequence(word_index, lengths, batch_first=True)[0]
            outputs = decoder(features, word_index, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if iteration % 100 == 0:
                print('loss', loss.data[0])
                save_loss.append(loss.data[0])
            iteration += 1

    torch.save(decoder.state_dict(), 'decoder.pkl')
    torch.save(encoder.state_dict(), 'encoder.pkl')
    with open('losses.txt', 'w') as f:
        print(save_loss, file=f)  # was `print >> f, losses`; `losses` was undefined
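# Almost every script in this collection assumes EncoderCNN/DecoderRNN modules
# with the interface used above: the encoder maps an image (or a precomputed
# feature vector) to an embedding, and the decoder consumes that embedding plus
# the padded captions and per-caption lengths. A minimal sketch of that assumed
# interface, for orientation only -- the real modules differ per script:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence


class EncoderCNN(nn.Module):
    def __init__(self, feat_dim, embed_size):
        super().__init__()
        self.linear = nn.Linear(feat_dim, embed_size)  # the only encoder layers trained above
        self.bn = nn.BatchNorm1d(embed_size)

    def forward(self, feats):
        return self.bn(self.linear(feats))


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions, lengths):
        # Prepend the image embedding as the first "token" of each sequence,
        # then pack so padding never reaches the LSTM.
        embeddings = torch.cat((features.unsqueeze(1), self.embed(captions)), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        # Flat (sum_of_lengths, vocab_size) logits, aligned with the packed targets.
        return self.linear(hiddens[0])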
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Build the models (assumes `vocab` and `data_loader` were built earlier in the script)
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def epoch_training(train_iter, val_iter, num_epoch=100, learning_rate=1e-4,
                   hidden_size=100, early_stop=False, patience=2, epsilon=1e-4):
    # Define models
    encoder = EncoderRNN(input_size=len(EN.vocab), hidden_size=hidden_size)
    decoder = DecoderRNN(hidden_size=hidden_size, output_size=len(DE.vocab))

    # Define loss criterion and optimizers
    criterion = nn.NLLLoss(ignore_index=PAD_token)
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    res_loss = float('inf')  # was an arbitrary sentinel value (13)
    res_encoder = None
    res_decoder = None
    res_epoch = 0
    base_bleu = 0
    not_updated = 0
    for epoch in range(num_epoch):
        tl = train(train_iter, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        loss, val_bleu = evaluate(val_iter, encoder, decoder, criterion)
        logging.warning('******Epoch: ' + str(epoch) + ' Training Loss: ' + str(tl) +
                        ' Validation Loss: ' + str(loss) +
                        ' Validation Bleu: ' + str(val_bleu) + '*********')
        # Keep the model with the highest validation BLEU
        # (the original comment said "lowest validation loss", which is not what the code does)
        if base_bleu <= val_bleu:
            base_bleu = val_bleu
            res_loss = loss
            res_encoder = encoder
            res_decoder = decoder
            res_epoch = epoch
            not_updated = 0
            logging.warning('Updated validation loss as ' + str(res_loss) +
                            ' with validation Bleu as ' + str(base_bleu) +
                            ' at epoch ' + str(res_epoch))
        else:
            not_updated += 1
            if not_updated == patience:
                break

    print('Stop at Epoch: ' + str(res_epoch) + ', With Validation Loss: ' + str(res_loss) +
          ', Validation Bleu: ' + str(base_bleu))
    logging.warning('Stop at Epoch: ' + str(res_epoch) + ', With Validation Loss: ' +
                    str(res_loss) + ', Validation Bleu: ' + str(base_bleu))
    return res_loss, res_encoder, res_decoder, base_bleu
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing: compose all steps; the result is a (C, H, W) tensor
    # with values in [0, 1] before normalization.
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main():
    # Configuration for hyper-parameters
    config = Config()

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path, json_path, vocab, transform,
                                   config.batch_size, shuffle=True,
                                   num_workers=config.num_threads)
    total_step = len(train_loader)

    # Build models
    teachercnn = EncoderCNN(config.embed_size)
    teachercnn.eval()
    studentcnn = StudentCNN_Model1(config.embed_size)
    # Load the best teacher model
    teachercnn.load_state_dict(
        torch.load(os.path.join('../TrainedModels/TeacherCNN', config.trained_encoder)))
    # Integer division: layer/width counts must be ints in Python 3
    studentlstm = DecoderRNN(config.embed_size, config.hidden_size // 2,
                             len(vocab), config.num_layers // 2)
    if torch.cuda.is_available():
        teachercnn.cuda()
        studentcnn.cuda()
        studentlstm.cuda()

    # Loss and optimizers
    criterion_lstm = nn.CrossEntropyLoss()
    criterion_cnn = nn.MSELoss()
    params = list(studentlstm.parameters()) + list(studentcnn.parameters())
    optimizer_lstm = torch.optim.Adam(params, lr=config.learning_rate)
    optimizer_cnn = torch.optim.Adam(studentcnn.parameters(), lr=config.cnn_learningrate)

    print('entering the training loop')
    # Train the models
    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths, img_ids) in enumerate(train_loader):
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            optimizer_lstm.zero_grad()
            optimizer_cnn.zero_grad()
            features_tr = teachercnn(images)
            features_st = studentcnn(images)
            outputs = studentlstm(features_st, captions, lengths)
            # Distillation loss (MSE to the detached teacher features) + captioning loss;
            # the original called an undefined `criterion` -- it must be criterion_cnn.
            loss = criterion_cnn(features_st, features_tr.detach()) \
                   + criterion_lstm(outputs, targets)
            loss.backward()
            optimizer_cnn.step()
            optimizer_lstm.step()

            # Print log info
            if i % config.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, config.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % config.save_step == 0:
                torch.save(studentlstm.state_dict(), os.path.join(
                    config.student_lstm_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(studentcnn.state_dict(), os.path.join(
                    config.student_cnn_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    # (fragment: the DecoderRNN construction continues from an earlier line)
    h_RNN_layers=RNN_HIDDEN_LAYERS, h_RNN=RNN_HIDDEN_NODES,
    h_FC_dim=RNN_FC, dropout=DROPOUT, num_classes=2).to(device)

# Parallelize model to multiple GPUs
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs!")
    cnn_encoder = nn.DataParallel(cnn_encoder)
    rnn_decoder = nn.DataParallel(rnn_decoder)

    # Combine all EncoderCNN + DecoderRNN parameters
    # (after DataParallel wrapping, submodules live under `.module`)
    crnn_params = list(cnn_encoder.module.fc1.parameters()) + list(cnn_encoder.module.bn1.parameters()) + \
                  list(cnn_encoder.module.fc2.parameters()) + list(cnn_encoder.module.bn2.parameters()) + \
                  list(cnn_encoder.module.fc3.parameters()) + list(rnn_decoder.parameters())
elif torch.cuda.device_count() == 1:
    print("Using", torch.cuda.device_count(), "GPU!")
    # Combine all EncoderCNN + DecoderRNN parameters
    crnn_params = list(cnn_encoder.fc1.parameters()) + list(cnn_encoder.bn1.parameters()) + \
                  list(cnn_encoder.fc2.parameters()) + list(cnn_encoder.bn2.parameters()) + \
                  list(cnn_encoder.fc3.parameters()) + list(rnn_decoder.parameters())
else:
    crnn_params = list(cnn_encoder.fc1.parameters()) + list(cnn_encoder.bn1.parameters()) + \
                  list(cnn_encoder.fc2.parameters()) + list(cnn_encoder.bn2.parameters()) + \
                  list(cnn_encoder.fc3.parameters()) + list(rnn_decoder.parameters())

optimizer = torch.optim.Adam(crnn_params, lr=LEARNING_RATE)
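# Design note: the three-way branch above exists only because nn.DataParallel
# moves submodules under `.module`. Since the wrapped module shares the same
# parameter objects, a common way to avoid the duplication is to collect the
# parameter list first and wrap afterwards (a sketch, not the author's code):
crnn_params = list(cnn_encoder.fc1.parameters()) + list(cnn_encoder.bn1.parameters()) + \
              list(cnn_encoder.fc2.parameters()) + list(cnn_encoder.bn2.parameters()) + \
              list(cnn_encoder.fc3.parameters()) + list(rnn_decoder.parameters())
if torch.cuda.device_count() > 1:
    cnn_encoder = nn.DataParallel(cnn_encoder)   # same underlying parameters
    rnn_decoder = nn.DataParallel(rnn_decoder)
optimizer = torch.optim.Adam(crnn_params, lr=LEARNING_RATE)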
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Log files for per-epoch loss/perplexity and per-step loss
    save_in_file_loss = open('/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_epoch_loss402.txt', "w")
    save_in_file_perplex = open('/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_epoch_perplex402.txt', "w")
    save_in_file = open('/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_step_loss402.txt', "w")
    loss_per_epoch = {}
    perplex_per_epoch = {}

    # Train the models
    total_step = len(data_loader)
    print('total_step:', total_step)
    for epoch in range(args.num_epochs):
        total_loss = 0
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))
                total_loss += loss.item()
                text = ('Epoch : ' + str(epoch) + '\nStep : ' + str(i) +
                        '\nLoss : ' + str(loss.item()) +
                        '\nPerplexity : ' + str(np.exp(loss.item())))
                print(text)
                save_in_file.write(text)

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                print('saving')
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

        loss_per_epoch[epoch + 1] = total_loss / (total_step * args.batch_size)
        loss_text = str(epoch + 1) + ' : ' + str(loss_per_epoch[epoch + 1])
        save_in_file_loss.write(loss_text + '\n')
        print('loss_text : ' + loss_text)

        perplex_per_epoch[epoch + 1] = np.exp(loss_per_epoch[epoch + 1])
        perplex_text = str(epoch + 1) + ' : ' + str(perplex_per_epoch[epoch + 1])
        save_in_file_perplex.write(perplex_text + '\n')
        print('perplex_text : ' + perplex_text)

    save_in_file.close()
    save_in_file_loss.close()     # the original never closed these two handles
    save_in_file_perplex.close()
# Number of input char types
char_vocab = len(string.printable)
# Number of output classes = vocab size
numOutputClass = len(labelCorpus.dictionary)
print("Number of Classes: " + str(numOutputClass))

# Initialize models and start training
encoder = CharCNN(char_vocab, args.hidden_size)
decoder = DecoderRNN(args.hidden_size, numOutputClass)
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=args.learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate)
criterion = nn.CrossEntropyLoss()

if args.cuda:
    criterion.cuda()
    encoder.cuda()
    decoder.cuda()

start = time.time()
all_losses = []
loss_avg = 0

try:
    print("Training for %d epochs..." % args.n_epochs)
    # Integer division so the batch count is an int in Python 3
    numMiniBatches = len(linesInTrain) // args.batch_size
def main(args):
    train_losses = []
    train_acc = []

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        losses = []
        accuracy = 0.0
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            # Record accuracy and loss
            losses.append(loss.item())
            topv, topi = outputs.topk(1, dim=1)
            targets = targets.unsqueeze(-1)
            accuracy += float((topi == targets).sum()) / targets.shape[0]

            # Update params
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}, Accuracy: {:.4f}'
                      .format(epoch + 1, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item()), accuracy / float(i + 1)))
                with open('my_train_loss_t4_resnext.txt', 'a') as fi:
                    fi.write('\n' + 'epoch = {}, i = {}, tr_loss = {}, acc = {}'
                             .format(epoch + 1, i + 1, loss.item(), accuracy / float(i + 1)))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'my-decoder-{}-{}-t4-resnext.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'my-encoder-{}-{}-t4-resnext.ckpt'.format(epoch + 1, i + 1)))

        train_losses.append(sum(losses) / total_step)
        train_acc.append(accuracy / total_step)

    # Save losses over epochs
    with open("train_loss.txt", "a") as f:
        f.write(str(train_losses))
    # Save accuracies over epochs
    with open("train_acc.txt", "a") as f:
        f.write(str(train_acc))
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    # transform = transforms.Compose([
    #     transforms.RandomCrop(args.crop_size),
    #     transforms.RandomHorizontalFlip(),
    #     transforms.ToTensor(),
    #     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    # data_loader = get_loader(args.image_dir, args.caption_path, vocab,
    #                          transform, args.batch_size,
    #                          shuffle=True, num_workers=args.num_workers)
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform, shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(frogger_data_loader):
            # NOTE: volatile=True disables autograd on this branch in old
            # PyTorch, which is questionable inside a training loop.
            images = to_var(images, volatile=True)
            if list(images.size())[0] != 1:  # skip degenerate batches of size 1
                captions = to_var(captions)
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                decoder.zero_grad()
                encoder.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          % (epoch, args.num_epochs, i, total_step,
                             loss.data[0], np.exp(loss.data[0])))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(decoder.state_dict(), os.path.join(
                        args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(encoder.state_dict(), os.path.join(
                        args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    configure(os.path.join(args['exp_dir'], 'log_dir'))

    transform = transforms.Compose([
        transforms.RandomCrop(args['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    data_loader = get_loader({'data_dir': args['data_dir'],
                              'exp_dir': args['exp_dir'],
                              'raw_data_dir': args['raw_data_dir'],
                              'batch_size': args['batch_size'],
                              'transform': transform,
                              'num_workers': args['num_workers'],
                              'shuffle': args['shuffle'],
                              'mode': 'train'})
    # valid_data_loader = get_loader({'data_dir': args['data_dir'],
    #                                 'raw_data_dir': args['raw_data_dir'],
    #                                 'batch_size': int(args['batch_size'] / 4),
    #                                 'transform': transform,
    #                                 'num_workers': args['num_workers'],
    #                                 'shuffle': args['shuffle'],
    #                                 'mode': 'validate'})

    args['vocab_size'] = len(Vocabulary.load_vocab(args['exp_dir']))
    encoder = EncoderCNN(args).train()
    decoder = DecoderRNN(args).train()

    if args['pretrained']:
        checkpoint_path = Checkpoint.get_latest_checkpoint(args['exp_dir'])
        checkpoint = Checkpoint.load(checkpoint_path)
        encoder.load_state_dict(checkpoint.encoder)
        decoder.load_state_dict(checkpoint.decoder)
        step = checkpoint.step
        epoch = checkpoint.epoch
        omit = True
    else:
        step = 0
        epoch = 0
        omit = False

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    # params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args['lr'])
    scheduler = StepLR(optimizer, step_size=40, gamma=0.1)
    # optimizer = YFOptimizer(params)

    total_step = len(data_loader)
    min_valid_loss = float('inf')
    for epoch in range(epoch, args['num_epochs']):
        scheduler.step()
        for idx, (images, captions, leng) in enumerate(data_loader):
            # When resuming, skip batches already seen in the checkpointed epoch
            if omit:
                if idx < (step - total_step * epoch):
                    logger.info('idx:{}, step:{}, epoch:{}, total_step:{}, diff:{}'
                                .format(idx, step, epoch, total_step, step - total_step * epoch))
                    continue
                else:
                    omit = False

            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, leng, batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, leng)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), 5)
            optimizer.step()

            log_value('loss', loss.item(), step)
            step += 1

            if step % args['log_step'] == 0:
                logger.info('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                            .format(epoch, args['num_epochs'], idx, total_step,
                                    loss.item(), np.exp(loss.item())))

            if step % args['valid_step'] == 0:
                # valid_loss = validate(encoder.eval(), decoder, criterion, valid_data_loader)
                # if valid_loss < min_valid_loss:
                #     min_valid_loss = valid_loss
                Checkpoint(encoder, decoder, optimizer, epoch, step).save(args['exp_dir'])
    target_tensor = tensor_from_sentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    input_lang, output_lang, pairs = load_data()

    hidden_size = 256
    encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
    decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

    criterion = nn.NLLLoss()
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=0.01)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=0.01)

    # Pre-sample 75,000 random training pairs
    training_pairs = [
        tensors_from_pair(input_lang, output_lang, random.choice(pairs))
        for i in range(75000)
    ]

    for epoch in range(2):
        for i, pair in enumerate(training_pairs):
            encoder_hidden = encoder.init_hidden().to(device)
            input_tensor = pair[0]
            output_tensor = pair[1]
            encoder_optimizer.zero_grad()
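            # The snippet cuts off here. In this tutorial-style seq2seq setup the
            # step usually continues roughly as below -- a sketch under assumptions
            # (one-token-at-a-time decoding, an SOS_token start symbol, teacher
            # forcing), not the author's exact code:
            decoder_optimizer.zero_grad()
            loss = 0
            # Encode the input sequence token by token.
            for t in range(input_tensor.size(0)):
                encoder_output, encoder_hidden = encoder(input_tensor[t], encoder_hidden)
            # Decode, feeding back the ground-truth target (teacher forcing).
            decoder_input = torch.tensor([[SOS_token]], device=device)
            decoder_hidden = encoder_hidden
            for t in range(output_tensor.size(0)):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                loss += criterion(decoder_output, output_tensor[t])
                decoder_input = output_tensor[t]  # teacher forcing
            loss.backward()
            encoder_optimizer.step()
            decoder_optimizer.step()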
                                        output_seq_len=out_seq_len)
        inp = Variable(torch.FloatTensor(in_seq).view(batch_size, in_seq_len, input_size))
        epoch_loss = train(inp, out_seq, encoder, decoder, enc_opt, dec_opt,
                           criterion, teacher_forcing_prob=0.5)
        loss_total += epoch_loss
        if epoch % print_every == print_every - 1:
            print("[%d/%d] Avg. loss per epoch: %0.3f"
                  % (epoch + 1, num_epochs, loss_total / print_every))
            loss_total = 0.0


encoder = EncoderRNN(input_size, hidden_size, n_layers=enc_layers)
# Uniform init for 1-D tensors (biases), Xavier for weight matrices
for tensor in encoder.parameters():
    if len(list(tensor.size())) < 2:
        torch.nn.init.uniform(tensor)
    else:
        torch.nn.init.xavier_uniform(tensor)

decoder = DecoderRNN(output_size, hidden_size, n_layers=dec_layers)
for tensor in decoder.parameters():
    if len(list(tensor.size())) < 2:
        torch.nn.init.uniform(tensor)
    else:
        torch.nn.init.xavier_uniform(tensor)

train_epochs(1000, encoder, decoder, 0.005)


def evaluate(in_seqs, num_steps, output_size):
    '''Generate predictions for a configurable batch size and number of steps to predict.'''
    inp = Variable(torch.FloatTensor(in_seqs))
    enc_hidden = encoder.initHidden(inp.size()[0])
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
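# Nearly every loop in this collection builds its targets with
# pack_padded_sequence(captions, lengths, batch_first=True)[0]. A tiny
# self-contained demo of what that produces (values are illustrative):
import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[1, 2, 3, 4],
                         [5, 6, 0, 0]])   # second caption padded to length 4
lengths = [4, 2]                          # true lengths, sorted descending
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)  # tensor([1, 5, 2, 6, 3, 4]) -- padding dropped, time-major order
# The decoder's packed outputs are flattened the same way, so
# criterion(outputs, targets) compares position for position and never
# scores the padding tokens.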
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    worker_thread_count = 1
    retry_for_failed = 2

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        # transforms.RandomCrop(args.crop_size),
        # transforms.RandomHorizontalFlip(),
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    # transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    criterion = nn.L1Loss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            processed_items = []
            threads = []
            has_data_to_process = True

            def do_request(item):
                position = item['position']
                retry = retry_for_failed
                while retry:
                    r = requests.post('http://localhost:4567/', data=item)
                    if r.status_code == 200:
                        pil = Image.open(io.BytesIO(r.content)).convert('RGB')
                        processed_items[position] = transform(pil)
                        break
                    else:
                        print("should be here")
                        time.sleep(2)
                        retry -= 1

            # Set mini-batch dataset
            image_tensors = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_tensors)
            outputs = decoder(features, captions, lengths)
            codes = []

            def worker():
                # `items_to_process` is assumed to be a module-level queue.Queue()
                while items_to_process.qsize() > 0 or has_data_to_process:
                    item = items_to_process.get()
                    if item is None:
                        break
                    do_request(item)
                    items_to_process.task_done()
                print("ended thread processing")

            for j in range(worker_thread_count):
                t = threading.Thread(target=worker)
                t.daemon = True  # thread dies when the main (only non-daemon) thread exits
                t.start()
                threads.append(t)

            for ii, image in enumerate(images):
                image_tensor = to_var(image.unsqueeze(0), volatile=True)
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids.cpu().data.numpy()
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)
                payload = {'code': sentence}
                data = {'position': ii, 'code': sentence}
                items_to_process.put(data)
                processed_items.append('failed')
                codes.append(sentence)

            has_data_to_process = False
            print(codes)
            print(items_to_process.qsize())
            print(image.size())
            print("waiting for threads")
            for t in threads:
                t.join()
            print("done reassembling images")
            for t in threads:
                t.shutdown = True
                t.join()

            # Skip the batch if any item failed to convert
            bad_value = False
            for pi in processed_items:
                if isinstance(pi, str) and pi == "failed":
                    bad_value = True
            if bad_value:
                print("failed conversion, skipping batch")
                continue

            output_tensor = torch.FloatTensor(len(processed_items), 3,
                                              images.size()[2], images.size()[3])
            for ii, image_tensor in enumerate(processed_items):
                output_tensor[ii] = processed_items[ii]
            output_var = to_var(output_tensor, False)
            target_var = to_var(images, False)
            loss = criterion(output_var, target_var)  # was commented out, leaving `loss` undefined below
            print("loss")
            print(loss)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    # Set up tensorboard (Crayon)
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        # if args.name in cc.get_experiment_names():
        try:
            cc.remove_experiment(args.name)
        except:
            print("experiment didn't exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write(json.dumps(vars(args)))

    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Scale(20),
        transforms.ToTensor()
    ])

    # Load (or build) the vocabulary wrapper.
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open(full_model_path + "/vocab.pkl", 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loaders
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir, vocab=vocab, transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    # params = list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()

            # Set mini-batch dataset
            image_ts = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            count = images.size()[0]

            # Forward, backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Training accuracy on this batch
            total = targets.size(0)
            max_index = outputs.max(dim=1)[1]
            # correct = (max_index == targets).sum()
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity", np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], accuracy, np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    full_model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    full_model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            # Validation pass (disabled: `1 == 2` is always False)
            if 1 == 2 and i % int(train_size / 10) == 0:
                encoder.eval()
                # decoder.eval()
                correct = 0
                for ti, (timages, tcaptions, tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions, tlengths, batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    # correct = (ttargets.eq(toutputs[0].long())).sum()
                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % accuracy)
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)

    torch.save(decoder.state_dict(), os.path.join(
        full_model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    torch.save(encoder.state_dict(), os.path.join(
        full_model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    end_time = time.time()
    # was `print("...%d", [(end_time - start_time)])`, which never formats
    print("finished training, runtime: %d" % (end_time - start_time))
# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) + \
         list(encoder.embed.parameters())  # we don't want to retrain the resnet

# TODO #4: Define the optimizer.
optimizer = torch.optim.RMSprop(params)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths)
                       / data_loader.batch_sampler.batch_size)

################################################################

import torch.utils.data as data
import numpy as np
import os
def train(batch_size=32, vocab_threshold=5, vocab_from_file=True,
          embed_size=256, hidden_size=512, num_epochs=10,
          latest_model=None, cocoapi_dir="./Coco/"):
    # Keep track of train and validation losses and validation Bleu-4 scores by epoch
    train_losses = []

    # Define a transform to pre-process the training images
    transform_train = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Build data loader, applying the transforms
    train_loader = get_loader(transform=transform_train,
                              mode='train',
                              batch_size=batch_size,
                              vocab_threshold=vocab_threshold,
                              vocab_from_file=vocab_from_file,
                              cocoapi_loc=cocoapi_dir)

    # The size of the vocabulary
    vocab_size = len(train_loader.dataset.vocab)

    # Initialize the encoder and decoder, resuming from a checkpoint if given
    checkpoint = None
    if latest_model:
        checkpoint = torch.load(latest_model)
    start_epoch = 1
    if checkpoint:
        train_losses = checkpoint['train_losses']
        val_losses = checkpoint['val_losses']
        start_epoch = checkpoint['epoch']  # note: this re-runs the saved epoch
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    if checkpoint:
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])

    # Move models to GPU if CUDA is available
    if torch.cuda.is_available():
        torch.cuda.set_device(1)
        encoder.cuda()
        decoder.cuda()

    # Define the loss function
    loss = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

    # Specify the learnable parameters of the model
    params = list(decoder.parameters()) + list(encoder.embed.parameters()) + list(encoder.bn.parameters())

    # Define the optimizer
    optimizer = torch.optim.Adam(params=params, lr=0.001)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Set the total number of training and validation steps per epoch
    total_train_step = math.ceil(len(train_loader.dataset.caption_lengths)
                                 / train_loader.batch_sampler.batch_size)

    start_time = time.time()
    for epoch in range(start_epoch, num_epochs + 1):
        train_loss = train_one(train_loader, encoder, decoder, loss, optimizer,
                               vocab_size, epoch, total_train_step)
        train_losses.append(train_loss)

        # Save the entire model every epoch, regardless of whether it is the best so far
        filename = os.path.join("./models", "model-{}.pkl".format(epoch))
        save_epoch(filename, encoder, decoder, optimizer, train_losses, epoch)
        print("Epoch [%d/%d] took %ds" % (epoch, num_epochs, time.time() - start_time))
        start_time = time.time()
#                           transform, args.batch_size,
#                           shuffle=True, num_workers=args.num_workers)
trainloader = get_loader(train_image_dir, train_caption_path, vocab,
                         transform_train, batch_size, shuffle=True, num_workers=8)
testloader = get_loader(test_image_dir, test_caption_path, vocab,
                        transform_test, batch_size, shuffle=False, num_workers=8)

checkpoints = os.listdir('checkpoint')
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers=1)
encoder = encoder.to(device)
decoder = decoder.to(device)

params = list(decoder.parameters()) + list(encoder.linear.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)

# Resume from the newest checkpoint for this model name, if any
cur_epoch = 0
if checkpoints:
    num_checkpoint = -1
    for cp in checkpoints:
        name, num = cp[:-4].split('_')  # strip the '.tar' extension
        num = int(num)
        if name == model_name and num_checkpoint < num:
            num_checkpoint = num
    if num_checkpoint > -1:
        state_dict = torch.load('checkpoint/{}_{}.tar'.format(model_name, num_checkpoint))
        encoder.load_state_dict(state_dict['encoder_state_dict'])
        decoder.load_state_dict(state_dict['decoder_state_dict'])
        optimizer.load_state_dict(state_dict['optimizer_state_dict'])
def main(args):
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        # transforms.RandomCrop(args.crop_size),
        # transforms.RandomHorizontalFlip(),
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.MSCOCO_result, args.coco_detection_result,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers,
                             dummy_object=99, yolo=False)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    # The layout encoder's hidden state size must match the decoder's input size
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size, 100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        layout_encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(layout_encoder.parameters()) + list(decoder.parameters()) + \
             list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, label_seqs, location_seqs,
                visual_seqs, layout_lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            # decoder.zero_grad()
            # layout_encoder.zero_grad()
            # encoder.zero_grad()
            # Modify this part to use visual features or not:
            # features = encoder(images)
            layout_encoding = layout_encoder(label_seqs, location_seqs, layout_lengths)
            # comb_features = features + layout_encoding
            comb_features = layout_encoding
            outputs = decoder(comb_features, captions, lengths)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(layout_encoder.state_dict(), os.path.join(
                    args.model_path, 'layout_encoding-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    model_name = args.model_name
    model_path = os.path.join(args.model_path, model_name)

    # Create model directory
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    # Create results directory
    if not os.path.isdir("./results"):
        os.system('mkdir ./results')

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize(args.crop_size),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Get train annotation ids
    ids = []
    with open('TrainImageIds.csv', 'r') as f:
        reader = csv.reader(f)
        trainIds = list(reader)
    trainIds = [int(i) for i in trainIds[0]]
    coco = COCO('./data/annotations/captions_train2014.json')
    for img_id in trainIds:
        for entry in coco.imgToAnns[img_id]:
            ids.append(entry['id'])

    # Get val annotation ids
    val_ids = []
    with open('ValImageIds.csv', 'r') as f:
        reader = csv.reader(f)
        valIds = list(reader)
    valIds = [int(i) for i in valIds[0]]
    coco = COCO('./data/annotations/captions_train2014.json')
    for img_id in valIds:
        for entry in coco.imgToAnns[img_id]:
            val_ids.append(entry['id'])

    # Build data loaders
    train_loader = get_loader(args.image_dir, args.caption_path, ids, vocab, transform,
                              args.batch_size_train, shuffle=True, num_workers=args.num_workers)
    val_loader = get_loader(args.val_image_dir, args.caption_path, val_ids, vocab, transform,
                            args.batch_size_val, shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    # Load pretrained model (optional)
    # encoder.load_state_dict(torch.load('./models/rnn/encoder-best.ckpt'))  # put checkpoint name
    # decoder.load_state_dict(torch.load('./models/rnn/decoder-best.ckpt'))

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    def train(init_epoch=0):
        total_step = len(train_loader)
        train_losses = []
        val_losses = []
        prev_loss = float('inf')  # was -100, which counted the first epoch as an increase
        loss_increase_counter = 0
        early_stop = True
        early_stop_threshold = 5
        best_model = None
        for epoch in range(init_epoch, args.num_epochs):
            running_loss = 0.0
            for i, (images, captions, lengths) in enumerate(train_loader):
                # Set mini-batch dataset
                images = images.to(device)
                captions = captions.to(device)
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                features = encoder(images)
                outputs = decoder(features, captions, lengths, pretrained=args.pretrained)
                loss = criterion(outputs, targets)
                decoder.zero_grad()
                encoder.zero_grad()
                loss.backward()
                optimizer.step()
                running_loss += loss.item() * images.size(0)

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                          .format(epoch, args.num_epochs, i, total_step,
                                  loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (epoch + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(model_path, 'decoder-{}.ckpt'.format(epoch + 1)))
                torch.save(encoder.state_dict(), os.path.join(model_path, 'encoder-{}.ckpt'.format(epoch + 1)))

            train_loss = running_loss / len(ids)
            train_losses.append(train_loss)
            val_loss = val(epoch)
            val_losses.append(val_loss)
            if val_loss == min(val_losses):
                torch.save(decoder.state_dict(), os.path.join(model_path, 'decoder-best.ckpt'))
                torch.save(encoder.state_dict(), os.path.join(model_path, 'encoder-best.ckpt'))

            # Write results to csv
            with open("./results/{}_results.csv".format(model_name), 'a+', newline='') as csv_file:
                writer = csv.writer(csv_file, delimiter=',')
                # writer.writerow(["Epoch", "Train Loss", "Val Loss"])
                writer.writerow([epoch + 1, train_loss, val_loss])

            if val_loss > prev_loss:
                loss_increase_counter += 1
            else:
                loss_increase_counter = 0
            if early_stop and loss_increase_counter > early_stop_threshold:
                print("Early stopping..")
                break
            prev_loss = val_loss

    def val(epoch):
        running_loss = 0.0
        for i, (images, captions, lengths) in enumerate(val_loader):
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            features = encoder(images)
            outputs = decoder(features, captions, lengths, pretrained=args.pretrained)
            loss = criterion(outputs, targets)
            running_loss += loss.item() * images.size(0)
        return running_loss / len(val_ids)

    train(0)
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.dictionary, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    # encoder = EncoderCNN(args.embed_size).to(device)
    dictionary = pd.read_csv(args.dictionary, header=0,
                             encoding='unicode_escape', error_bad_lines=False)
    dictionary = list(dictionary['keys'])
    decoder = DecoderRNN(len(dictionary), args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters())  # + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (array, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            array = array.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            # features = encoder(images)
            outputs = decoder(array, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            # encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main():
    # Load vocab data
    with open('VocabData.pkl', 'rb') as f:
        VocabData = pickle.load(f)
    with open('FullImageCaps.pkl', 'rb') as f:
        FullImageCaps = pickle.load(f)
    # FullImageCaps_sub = loadData("full_image_descriptions.json")
    coco = loadCoco('captions_train2017.json')
    data = FullImageCaps + coco
    print(len(data) / 128)

    recovery = sys.argv[2]
    mode = sys.argv[1]

    lmdata = LMDataset(VocabData, data)
    lmloader = lmdata.getLoader(batchSize=128, shuffle=True)
    testloader = lmdata.getLoader(batchSize=1, shuffle=False)
    embedding = torch.Tensor(lmdata.embedding)
    vocab_size = len(lmdata.wordDict)
    max_len = 100
    hidden_size = 1024
    embedding_size = 300
    max_epoch = 10
    sos_id = lmdata.sos_id
    eos_id = lmdata.eos_id
    pad_id = lmdata.pad_id

    # Reverse vocabulary: index -> word
    wordDict = VocabData['word_dict']
    rev_vocab = [''] * vocab_size
    for word in wordDict:
        rev_vocab[wordDict[word]] = word

    # Build two probe sentences as one-hot sequences: a degenerate one
    # ("they are are are are are") and a regular one
    # ("they are students from that school")
    they = torch.zeros(1, vocab_size)
    are = torch.zeros(1, vocab_size)
    students = torch.zeros(1, vocab_size)
    _from = torch.zeros(1, vocab_size)
    that = torch.zeros(1, vocab_size)
    school = torch.zeros(1, vocab_size)
    they_id = wordDict['they']
    are_id = wordDict['are']
    students_id = wordDict['students']
    from_id = wordDict['from']
    that_id = wordDict['that']
    school_id = wordDict['school']
    they[0, they_id] = 1
    are[0, are_id] = 1
    students[0, students_id] = 1
    _from[0, from_id] = 1
    that[0, that_id] = 1
    school[0, school_id] = 1
    strange_sentence = torch.cat([they, are, are, are, are, are], 0).unsqueeze(0)
    regular_sentence = torch.cat([they, are, students, _from, that, school], 0).unsqueeze(0)

    PATH = 'LMcheckpoint(1)'
    model = DecoderRNN(vocab_size, max_len, hidden_size, embedding_size,
                       sos_id, eos_id, embedding_parameter=embedding, rnn_cell='lstm')
    if recovery == '1':
        model = loadCheckpoint(PATH, model)
    optimizer = optim.Adam(model.parameters(), lr=0.0002)
    criterion = nn.NLLLoss(ignore_index=pad_id)
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()

    if mode == 'train':
        train_LM(lmloader, model, optimizer, criterion, pad_id, max_epoch, max_len)
    if mode == 'test':
        lm_loss = LanguageModelLoss(PATH, vocab_size, max_len, hidden_size,
                                    embedding_size, sos_id, eos_id, use_prob_vector=True)
        loss1 = lm_loss(strange_sentence)
        loss2 = lm_loss(regular_sentence)
        print(loss1.item(), loss2.item())
        sampleSentence(model, testloader, rev_vocab)
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    val_loader = get_loader('./data/val_resized2014/',
                            './data/annotations/captions_val2014.json',
                            vocab, transform, 1, False, 1)

    start_epoch = 0
    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)

    if args.restart:
        encoder_state, decoder_state = 'new', 'new'
    if encoder_state == '':
        encoder_state = 'new'
    if decoder_state == '':
        decoder_state = 'new'
    if decoder_state != 'new':
        start_epoch = int(decoder_state.split('-')[1])
    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Make logfile and log output
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Training on vanilla loss (using new model). Started {} .\n"
                .format(str(datetime.now())))
        # Log the actual states (the original hard-coded "new" here)
        f.write("Using encoder: {}\nUsing decoder: {}\n\n".format(encoder_state, decoder_state))

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_acc = []

    # Train the models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            out = decoder(features, captions, lengths)
            loss = criterion(out, targets)
            batch_loss.append(loss.data[0])
            loss.backward()
            optimizer.step()

            # # Print log info
            # if i % args.log_step == 0:
            #     print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f'
            #           % (epoch, args.num_epochs, i, total_step,
            #              loss.data[0], np.exp(loss.data[0]), acc, gt_acc))
            #     with open(args.model_path + args.logfile, 'a') as f:
            #         f.write('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f\n'
            #                 % (epoch, args.num_epochs, i, total_step,
            #                    loss.data[0], np.exp(loss.data[0]), acc, gt_acc))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    # Pickle needs binary mode ('wb+'), not text mode ('w+')
    with open(args.model_path + 'training_loss.pkl', 'wb+') as f:
        pickle.dump(batch_loss, f)
    with open(args.model_path + 'training_val.pkl', 'wb+') as f:
        pickle.dump(batch_acc, f)
    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.033, 0.032, 0.033), (0.027, 0.027, 0.027))
    ])

    # Build vocab
    vocab = build_vocab(args.root_path, threshold=0)
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    len_vocab = vocab.idx
    print(vocab.idx2word)

    # Build data loader
    data_loader = get_loader(args.root_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = ResNet(ResidualBlock, [3, 3, 3], args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    # Build attention models
    if torch.cuda.is_available():
        encoder.cuda(1)
        decoder.cuda(1)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Make one-hot targets (disabled)
            # cap_ = torch.unsqueeze(captions, 2)
            # one_hot_ = torch.FloatTensor(captions.size(0), captions.size(1), len_vocab).zero_()
            # one_hot_caption = one_hot_.scatter_(2, cap_, 1)

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            # captions_ = to_var(one_hot_caption)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            optimizer.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            captions = captions.view(-1)
            outputs = outputs.view(-1, len_vocab)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

                # Test-set style accuracy on this batch.
                # NOTE: the inner index is `j`; the original reused `i`, which
                # clobbered the batch counter used by the save logic below.
                outputs_np = outputs.max(1)[1].cpu().data.numpy()
                targets_np = targets.cpu().data.numpy()
                print(outputs_np)
                print(targets_np)
                location_match = 0
                size_match = 0
                shape_match = 0
                exact_match = 0
                for j in range(len(targets_np)):
                    if outputs_np[j] == targets_np[j]:
                        exact_match += 1
                    if args.batch_size <= j < args.batch_size * 2 and outputs_np[j] == targets_np[j]:
                        shape_match += 1
                    elif args.batch_size * 2 <= j < args.batch_size * 3 and outputs_np[j] == targets_np[j]:
                        location_match += 1
                    elif args.batch_size * 3 <= j < args.batch_size * 4 and outputs_np[j] == targets_np[j]:
                        size_match += 1
                print('location match : %.4f, shape match : %.4f, exact_match: %.4f'
                      % (location_match / args.batch_size,
                         shape_match / args.batch_size,
                         exact_match / len(targets_np)))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) + \
    list(encoder.embed.parameters())  # We don't want to retrain the resnet

# TODO #4: Define the optimizer.
optimizer = torch.optim.RMSprop(params)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) /
                       data_loader.batch_sampler.batch_size)

################################################################

import torch.utils.data as data
import numpy as np
import os
import time
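# Passing only the decoder and encoder.embed parameters to the optimizer
# already keeps the backbone weights fixed; freezing them explicitly also
# avoids allocating their gradients. A sketch, under the assumption that the
# EncoderCNN stores its backbone as encoder.resnet (as in the
# torchvision-based variants elsewhere in this collection):
for param in encoder.resnet.parameters():
    param.requires_grad = False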
def main(args):
    print(args)
    epochs_since_improvement = 0

    # Create model directory
    make_dir(args.model_path)

    # Image pre-processing, normalization for the pre-trained ResNet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    vocab_path = args.vocab_path
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    train_root = args.image_dir + cfg['train']['TRAIN_DIR']
    train_json = args.caption_path + cfg['train']['train_annotation']
    val_root = args.image_dir + cfg['train']['VAL_DIR']
    val_json = args.caption_path + cfg['train']['valid_annotation']

    # After patience epochs without improvement, stop training
    patience = cfg['train']['patience']
    early_stopping = EarlyStopping(patience=patience, verbose=True)

    old_vocab_size = 0
    if args.check_point and os.path.isfile(args.check_point):
        checkpoint = torch.load(args.check_point)
        if args.fine_tuning:
            encoder = checkpoint['encoder']
            decoder = checkpoint['decoder']
            print("Fine tuning with check point {}".format(args.check_point))
            vocab, old_vocab_size = append_vocab(args.check_point_vocab, vocab)
            with open(vocab_path, 'wb') as v:
                print("Dump {} entries to vocab {}".format(vocab.idx, vocab_path))
                pickle.dump(vocab, v)
            vocab_size = len(vocab)

            # Get decoder's previous state: the first old_vocab_size rows
            # (previously hard-coded as 4124, the S19 vocab size)
            old_embed = decoder.embed.weight.data[:old_vocab_size]
            old_weight = decoder.linear.weight.data[:old_vocab_size]
            old_bias = decoder.linear.bias.data[:old_vocab_size]

            # Initialize new embedding and linear layers
            decoder.embed = nn.Embedding(vocab_size, args.embed_size)
            decoder.linear = nn.Linear(args.hidden_size, vocab_size)

            if args.freeze_cri or args.lwf or args.distill:
                # Assign old neurons to the newly-initialized layers;
                # plain fine-tuning skips this step
                print("Assigning old neurons of embedding and linear layer to new decoder...")
                decoder.embed.weight.data[:old_vocab_size, :] = old_embed
                decoder.linear.weight.data[:old_vocab_size] = old_weight
                decoder.linear.bias.data[:old_vocab_size] = old_bias

            encoder.to(device)
            decoder.to(device)
    else:
        # Normal training procedure
        encoder = EncoderCNN(args.embed_size).to(device)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers).to(device)

    if args.freeze_enc:
        args.task_name += '_freeze_enc'
    elif args.freeze_dec:
        args.task_name += '_freeze_dec'
    elif args.freeze_cri:
        args.task_name += '_freeze_cri'
    elif args.lwf:
        args.task_name += '_lwf'
    elif args.distill and args.kd1:
        args.task_name += '_kd1'
    elif args.distill and args.kd2:
        args.task_name += '_kd2'

    if args.task_type == 'seq':
        args.model_path = cfg['model']['model_path_format'].format(
            args.task_type, args.task_name + '_seq', 'models')
        args.cpkt_path = cfg['model']['model_path_format'].format(
            args.task_type, args.task_name + '_seq', 'best')
    else:
        args.model_path = cfg['model']['model_path_format'].format(
            args.task_type, args.task_name, 'models')
        args.cpkt_path = cfg['model']['model_path_format'].format(
            args.task_type, args.task_name, 'best')

    # Create model directory
    make_dir(args.model_path)

    # Pseudo-labeling option
    if args.lwf:
        print("Running pseudo-labeling option...")
        # Infer pseudo-labels using the previous model
        pseudo_labels = infer_caption(img_path=train_root,
                                      json_path=train_json,
                                      model=args.check_point,
                                      vocab_path=vocab_path,
                                      prediction_path=None,
                                      id2class_path=None)
        # Freeze the LSTM and the encoder until the later joint optimization
        for param in decoder.lstm.parameters():
            param.requires_grad_(False)
        for param in encoder.parameters():
            param.requires_grad_(False)

        data = append_json(pseudo_labels, train_json)
        # Create a new json file from the train_json
        train_json = args.caption_path + 'captions_train_lwf.json'
        with open(train_json, 'w') as file:
            json.dump(data, file)

    # Knowledge distillation option
    if args.distill:
        print("Running knowledge distillation...")
        # Teacher, kept in eval mode so its batch-norm statistics and
        # dropout masks stay fixed during distillation
        teacher_cnn = checkpoint['encoder']
        teacher_lstm = checkpoint['decoder']
        teacher_cnn.eval()
        teacher_lstm.eval()

        # Initialize a totally new captioning model - Student
        encoder = EncoderCNN(args.embed_size).to(device)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers).to(device)
        student_cnn = encoder
        student_lstm = decoder

        # Move teacher to cuda
        teacher_cnn.to(device)
        teacher_lstm.to(device)

        # Loss between GT caption and the prediction
        criterion_lstm = nn.CrossEntropyLoss()
        # Loss between predictions of teacher and student
        criterion_distill = nn.MSELoss()

        # Params of student
        params_st = list(student_lstm.parameters()) + list(student_cnn.parameters())
        optimizer_lstm = torch.optim.Adam(params_st, lr=1e-4)
        optimizer_distill = torch.optim.Adam(student_cnn.parameters(), lr=1e-5)

    if args.freeze_enc:
        print("Freeze encoder technique!")
        for param in encoder.parameters():
            param.requires_grad_(False)
    if args.freeze_dec:
        print("Freeze decoder technique!")
        for param in decoder.lstm.parameters():
            param.requires_grad_(False)
    if args.freeze_cri:
        print("Critical Freezing technique!")
        for layer_idx, child in enumerate(encoder.resnet.children()):
            if layer_idx == 0 or layer_idx == 4:  # blocks 1 & 2
                for param in child.parameters():
                    param.requires_grad = False

    train_loader = get_loader(root=train_root, json=train_json, vocab=vocab,
                              transform=transform, batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)
    val_loader = get_loader(root=val_root, json=val_json, vocab=vocab,
                            transform=transform, batch_size=args.batch_size,
                            shuffle=True, num_workers=args.num_workers)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # These vars are for plotting
    avg_train_losses = []
    avg_val_losses = []

    for epoch in range(args.num_epochs):
        if args.distill:
            print("Training with distillation option!")
            train_step, train_loss_step = train_distill(
                epoch, train_loader=train_loader,
                student_cnn=student_cnn, student_lstm=student_lstm,
                teacher_cnn=teacher_cnn, teacher_lstm=teacher_lstm,
                criterion_lstm=criterion_lstm,
                criterion_distill=criterion_distill,
                optimizer_lstm=optimizer_lstm,
                optimizer_distill=optimizer_distill)
            # Validate after an epoch
            recent_val_loss, val_step, val_loss_step = validate(
                epoch, val_loader=val_loader, encoder=student_cnn,
                decoder=student_lstm, criterion=criterion)
        else:
            train_step, train_loss_step = train(
                epoch, train_loader=train_loader, encoder=encoder,
                decoder=decoder, criterion=criterion, optimizer=optimizer,
                first_training=True, old_vocab_size=old_vocab_size)
            # Validate after an epoch
            recent_val_loss, val_step, val_loss_step = validate(
                epoch, val_loader=val_loader, encoder=encoder,
                decoder=decoder, criterion=criterion)

        train_loss = np.average(train_loss_step)
        val_loss = np.average(val_loss_step)
        avg_train_losses.append(train_loss)
        avg_val_losses.append(val_loss)

        # Save checkpoint
        make_dir(args.cpkt_path)
        early_stopping(args.cpkt_path, cfg['train']['data_name'], epoch,
                       epochs_since_improvement, encoder, decoder,
                       optimizer, optimizer, val_loss)
        if early_stopping.early_stop:
            print("Early Stopping!")
            break

    if args.lwf:
        # Make everything trainable again for joint optimization
        for param in decoder.linear.parameters():
            param.requires_grad_(True)
        for param in decoder.embed.parameters():
            param.requires_grad_(True)
        for param in decoder.lstm.parameters():
            param.requires_grad_(True)
        for param in encoder.parameters():
            param.requires_grad_(True)
        print("Unfreezing parameters ...")

        if args.freeze_cri:
            # Re-apply critical freezing during joint optimization
            print("Critical Freezing technique!")
            for layer_idx, child in enumerate(encoder.resnet.children()):
                if layer_idx == 0 or layer_idx == 4:  # blocks 1 & 2
                    for param in child.parameters():
                        param.requires_grad = False

        # Joint optimization starts
        early_stopping = EarlyStopping(patience=patience, verbose=True)
        for epoch in range(args.num_epochs):
            train_step, train_loss_step = train(
                epoch, train_loader=train_loader, encoder=encoder,
                decoder=decoder, criterion=criterion, optimizer=optimizer,
                first_training=False, old_vocab_size=old_vocab_size)
            # Validate after an epoch
            recent_val_loss, val_step, val_loss_step = validate(
                epoch, val_loader=val_loader, encoder=encoder,
                decoder=decoder, criterion=criterion)

            train_loss = np.average(train_loss_step)
            val_loss = np.average(val_loss_step)
            avg_train_losses.append(train_loss)
            avg_val_losses.append(val_loss)

            # Save checkpoint
            make_dir(args.cpkt_path)
            early_stopping(args.cpkt_path, cfg['train']['data_name'], epoch,
                           epochs_since_improvement, encoder, decoder,
                           optimizer, optimizer, val_loss)
            if early_stopping.early_stop:
                print("Early Stopping!")
                break
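# train_distill above is defined elsewhere; a minimal sketch of the step it
# presumably performs (cross-entropy against the ground truth plus MSE
# between teacher and student CNN features), under the assumption that
# distillation is applied to the feature vectors and that both optimizers
# are stepped together -- the function name and alpha weight are illustrative:
import torch

def distill_step(images, captions, lengths, targets,
                 student_cnn, student_lstm, teacher_cnn, teacher_lstm,
                 criterion_lstm, criterion_distill,
                 optimizer_lstm, optimizer_distill, alpha=0.5):
    # Student forward pass.
    feats_st = student_cnn(images)
    outputs = student_lstm(feats_st, captions, lengths)
    # Teacher features; no gradient is needed on the teacher side.
    with torch.no_grad():
        feats_te = teacher_cnn(images)
    loss = criterion_lstm(outputs, targets) \
        + alpha * criterion_distill(feats_st, feats_te)
    optimizer_lstm.zero_grad()
    optimizer_distill.zero_grad()
    loss.backward()
    optimizer_lstm.step()
    optimizer_distill.step()
    return loss.item()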
batch_size = 20
num_epochs = 100

img_dataset = ImageFolderWithPaths(root='./train', transform=transform)
dataset_loader = torch.utils.data.DataLoader(img_dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=4,
                                             collate_fn=collate_func)
vocab_size = vocab.index

cnn = EncoderCNN(512).to(device)
rnn = DecoderRNN(512, 512, vocab_size).to(device)

criterion = nn.CrossEntropyLoss()
params = list(cnn.linear.parameters()) + list(rnn.parameters())
optimizer = torch.optim.Adam(params, lr=1e-3)

for epoch in range(num_epochs):
    tic = time.time()
    for i, (image, captions, lengths) in enumerate(dataset_loader):
        image = image.to(device)
        captions = captions.to(device)
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        cnn.zero_grad()
        rnn.zero_grad()
        cnn_out = cnn(image)
        # The snippet originally stopped at the encoder forward pass; the
        # rest of the standard step (decode, loss, backward, update) is
        # filled in for completeness:
        rnn_out = rnn(cnn_out, captions, lengths)
        loss = criterion(rnn_out, targets)
        loss.backward()
        optimizer.step()
    print('Epoch %d finished in %.1f s, last loss %.4f'
          % (epoch, time.time() - tic, loss.item()))
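# Every loop in this collection builds targets with pack_padded_sequence; a
# tiny worked example of what that produces (batch of 2 captions with
# lengths 3 and 2, where 0 is padding):
import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[5, 6, 7], [8, 9, 0]])
lengths = [3, 2]
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)  # tensor([5, 8, 6, 9, 7]) -- padding removed, time-major order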
def main():
    # Configuration for hyper-parameters
    torch.cuda.set_device(0)
    config = Config()

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    train_image_path = os.path.join(config.image_path, 'train2017')
    json_path = os.path.join(config.caption_path, 'captions_train2017.json')
    train_loader = get_data_loader(train_image_path, json_path, vocab,
                                   transform, config.batch_size,
                                   shuffle=False,
                                   num_workers=config.num_threads)

    val_image_path = os.path.join(config.image_path, 'val2017')
    json_path = os.path.join(config.caption_path, 'captions_val2017.json')
    val_loader = get_data_loader(val_image_path, json_path, vocab, transform,
                                 config.batch_size, shuffle=False,
                                 num_workers=config.num_threads)

    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()  # keep the pretrained CNN's batch-norm statistics fixed
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)

    print('entering into training loop')

    # Train the Models
    with open('train1_log.txt', 'w') as logfile:
        logfile.write('Validation Error,Training Error\n')
        for epoch in range(config.num_epochs):
            for i, (images, captions, lengths, img_ids) in enumerate(train_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]

                # Forward, Backward and Optimize
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % config.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          % (epoch, config.num_epochs, i, total_step,
                             loss.item(), np.exp(loss.item())))

                # Save the Model
                if (i + 1) % config.save_step == 0:
                    torch.save(encoder.state_dict(),
                               os.path.join(config.teacher_cnn_path,
                                            'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(decoder.state_dict(),
                               os.path.join(config.teacher_lstm_path,
                                            'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            print('Just completed an epoch, initiate validation error test')
            # Average the validation loss over the batches seen, instead of
            # halving a running pair, and skip gradient work entirely.
            total_val_loss = 0.0
            with torch.no_grad():
                for j, (images, captions, lengths, img_ids) in enumerate(val_loader):
                    images = Variable(images)
                    captions = Variable(captions)
                    if torch.cuda.is_available():
                        images = images.cuda()
                        captions = captions.cuda()
                    targets = pack_padded_sequence(captions, lengths,
                                                   batch_first=True)[0]
                    features = encoder(images)
                    outputs = decoder(features, captions, lengths)
                    valloss = criterion(outputs, targets)
                    total_val_loss += valloss.item()
                    if (j + 1) % 1000 == 0:
                        avgvalloss = total_val_loss / (j + 1)
                        print('Average Validation Loss: %.4f' % avgvalloss)
                        logfile.write(str(avgvalloss) + ',' +
                                      str(loss.item()) + '\n')
                        # Only a 1000-batch sample of the validation set is used.
                        break
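# The per-epoch validation above could also live in a helper; a compact
# sketch in the same style, assuming the loader yields
# (images, captions, lengths, img_ids) tuples as it does above:
import torch
from torch.nn.utils.rnn import pack_padded_sequence

def validate(encoder, decoder, val_loader, criterion, device):
    encoder.eval()
    decoder.eval()
    total, batches = 0.0, 0
    with torch.no_grad():
        for images, captions, lengths, _ in val_loader:
            images, captions = images.to(device), captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            outputs = decoder(encoder(images), captions, lengths)
            total += criterion(outputs, targets).item()
            batches += 1
    encoder.train()
    decoder.train()
    return total / max(batches, 1)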
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
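# Several variants above optimize only encoder.linear and encoder.bn; a
# sketch of the kind of EncoderCNN that implies (a pretrained ResNet backbone
# kept frozen, with a trainable projection head). This is an assumption,
# since the class itself is defined elsewhere:
import torch
import torch.nn as nn
import torchvision.models as models

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]  # drop the final fc layer
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        with torch.no_grad():  # keep the backbone frozen
            features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        return self.bn(self.linear(features))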
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader (the literal 75 is presumably a maximum caption length)
    data_loader = get_caption_loader(args.caption_path, vocab, 75,
                                     args.batch_size, shuffle=True,
                                     num_workers=args.num_workers)

    # Build the models
    encoder = EncoderRNN(len(vocab), args.embed_size,
                         args.hidden_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.embedding.parameters()) + list(encoder.rnn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (captions_src, captions_tgt, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            captions_src = captions_src.to(device)
            captions_tgt = captions_tgt.to(device)
            targets = pack_padded_sequence(captions_tgt, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize: the decoder is seeded with the
            # encoder's last-layer hidden state in place of image features
            enc_output, enc_hidden = encoder(captions_src)
            outputs = decoder(enc_hidden[:, -1:, :], captions_tgt, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
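# A shape sketch of the enc_hidden[:, -1:, :] slice above, assuming the
# EncoderRNN rearranges hidden states to (batch, num_layers, hidden_size).
# Note that PyTorch's raw h_n from nn.GRU/nn.LSTM is laid out as
# (num_layers, batch, hidden_size), so a permute would be needed first:
import torch

batch, num_layers, hidden_size = 4, 2, 256
h_n = torch.randn(num_layers, batch, hidden_size)  # raw PyTorch layout
enc_hidden = h_n.permute(1, 0, 2)                  # (batch, num_layers, hidden)
features = enc_hidden[:, -1:, :]                   # top layer only: (batch, 1, hidden)
print(features.shape)                              # torch.Size([4, 1, 256])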
# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# TODO #4: Define the optimizer.
optimizer = torch.optim.Adam(params=params, lr=0.001)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) /
                       data_loader.batch_sampler.batch_size)

# ## Step 2: Train your Model
#
# Once you have executed the code cell in Step 1, the training procedure
# below should run without issue.
#
# It is completely fine to leave the code cell below as-is without
# modifications to train your model. However, if you would like to modify the
# training code, you must ensure that your changes are easily parsed by your
# reviewer. In other words, make sure to provide appropriate comments to
# describe how your code works!