def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    transform = transforms.Compose([
        # transforms.ColorJitter(contrast=0.3, saturation=0.3),
        # transforms.RandomChoice([transforms.RandomHorizontalFlip(), transforms.RandomVerticalFlip()]),
        transforms.RandomAffine(0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize((0.8, 0.7, 0.8), (1, 1, 1))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform, shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer (note: the ResNet is fine-tuned here as well)
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()) + list(encoder.resnet.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    stransform = transforms.ToPILImage()
    img2vec = Img2Vec()

    # Train the models
    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(frogger_data_loader):
            images = to_var(images, volatile=True)
            # Skip degenerate batches of size 1
            if list(images.size())[0] != 1:
                captions = to_var(captions)
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                decoder.zero_grad()
                encoder.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          % (epoch, args.num_epochs, i, total_step,
                             loss.data[0], np.exp(loss.data[0])))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(decoder.state_dict(),
                               os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(encoder.state_dict(),
                               os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
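# NOTE: several of the older scripts in this file call a `to_var` helper that
# is not defined here. A minimal sketch, assuming the PyTorch 0.3-era Variable
# API these scripts otherwise use (not necessarily the original helper):
from torch.autograd import Variable

def to_var(x, volatile=False):
    # Move a tensor to the GPU when available and wrap it in a Variable.
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)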
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models (generator)
    # TODO: put these in generator
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)

    # Build the models (discriminator)
    discriminator = Discriminator(args.embed_size, args.hidden_size, len(vocab), args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        discriminator.cuda()

    # Loss and optimizer (generator)
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Optimizer (discriminator)
    params_disc = list(discriminator.parameters())
    optimizer_disc = torch.optim.Adam(params_disc)

    # Train the models
    total_step = len(data_loader)
    disc_losses = []
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, wrong_captions, wrong_lengths) in enumerate(data_loader):
            # TODO: train the discriminator before the generator

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            wrong_captions = to_var(wrong_captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize the generator
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)

            # Sample captions from the decoder and measure their lengths:
            # a caption ends at the first '<end>' token, or at max length.
            sampled_captions = decoder.sample(features)
            sampled_lengths = []
            for row in range(sampled_captions.size(0)):
                for index, word_id in enumerate(sampled_captions[row, :]):
                    word = vocab.idx2word[word_id.cpu().data.numpy()[0]]
                    if word == '<end>':
                        sampled_lengths.append(index + 1)
                        break
                    elif index == sampled_captions.size(1) - 1:
                        sampled_lengths.append(sampled_captions.size(1))
                        break
            # pack_padded_sequence expects lengths sorted in decreasing order
            sampled_lengths = np.array(sampled_lengths)
            sampled_lengths[::-1].sort()
            sampled_lengths = sampled_lengths.tolist()

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Train the discriminator on real, sampled and mismatched captions
            discriminator.zero_grad()
            rewards_real = discriminator(images, captions, lengths)
            rewards_fake = discriminator(images, sampled_captions, sampled_lengths)
            rewards_wrong = discriminator(images, wrong_captions, wrong_lengths)
            # Clamp the log terms to avoid -inf when a reward saturates at 1
            real_loss = -torch.mean(torch.log(rewards_real))
            fake_loss = -torch.mean(torch.clamp(torch.log(1 - rewards_fake), min=-1000))
            wrong_loss = -torch.mean(torch.clamp(torch.log(1 - rewards_wrong), min=-1000))
            loss_disc = real_loss + fake_loss + wrong_loss
            disc_losses.append(loss_disc.cpu().data.numpy()[0])
            loss_disc.backward()
            optimizer_disc.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models at the last iteration of each epoch instead of
            # every args.save_step steps (jm).
            if (i + 1) % total_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(discriminator.state_dict(),
                           os.path.join(args.model_path, 'discriminator-%d-%d.pkl' % (epoch + 1, i + 1)))

        # Plot the discriminator loss at the end of every epoch
        plt.plot(disc_losses, label='disc loss')
        plt.savefig('disc_losses.png')
        plt.clf()
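# NOTE: the clamps above guard against log(0) when the discriminator output
# saturates. A more numerically stable alternative (a sketch, assuming the
# discriminator is changed to return raw logits instead of probabilities;
# not the formulation used above):
def disc_loss_from_logits(logits_real, logits_fake, logits_wrong):
    # BCEWithLogitsLoss computes log-sigmoid internally, so no clamping is needed.
    bce = nn.BCEWithLogitsLoss()
    return (bce(logits_real, torch.ones_like(logits_real)) +
            bce(logits_fake, torch.zeros_like(logits_fake)) +
            bce(logits_wrong, torch.zeros_like(logits_wrong)))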
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
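# NOTE: the .to(device) variants in this file assume a module-level device
# handle and the usual imports. A minimal sketch of that preamble, covering
# the names these scripts actually use:
import math
import os
import pickle
import sys

import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')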
def main(args):
    random.seed()

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder_img = EncoderCNN(args.hidden_size)
    encoder_capt = EncoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    mlp = MLPNN(args.hidden_size + args.hidden_size)
    encoder_img_e = EncoderCNN(args.hidden_size)
    encoder_capt_e = EncoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)

    # Load the (frozen) reward model
    encoder_img_e.load_state_dict(torch.load(args.encoder_path_e_img))
    encoder_capt_e.load_state_dict(torch.load(args.encoder_path_e_capt))

    if torch.cuda.is_available():
        encoder_img.cuda()
        encoder_capt.cuda()
        mlp.cuda()
        encoder_img_e.cuda()
        encoder_capt_e.cuda()

    # Loss and optimizer
    criterion = nn.MSELoss()
    params = (list(encoder_capt.parameters()) + list(encoder_img.linear.parameters()) +
              list(encoder_img.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)

            # Score each (image, caption) pair with the reward model; the
            # diagonal of the score matrix holds the matched pairs.
            features = encoder_img_e(images)
            outputs = encoder_capt_e(captions, lengths)
            scores = torch.mm(features, outputs.transpose(1, 0))
            diagonal = scores.diag()
            rvals = diagonal.detach()  # one reward value per example in the batch

            # Forward, backward and optimize
            encoder_capt.zero_grad()
            encoder_img.zero_grad()
            mlp.zero_grad()
            img_features = encoder_img(images)

            # Randomly truncate each caption to a partial caption: pick a cut
            # point t[k], write the '<end>' token (index 2) there and zero out
            # the remainder.
            n = captions[0].size(0)
            t = n * torch.rand(captions.size(0), device=torch.device("cuda"))
            t = t.type(torch.long)
            for k in range(captions.size(0)):
                if t[k] < lengths[k]:
                    captions[k][t[k]] = 2
                    captions[k][t[k] + 1:n] = torch.zeros(n - int(t[k]) - 1,
                                                          device=torch.device("cuda"))
            lengths = t + 1

            # pack_padded_sequence needs decreasing lengths, so sort the batch
            lengths, indices = torch.sort(torch.tensor(lengths), descending=True)
            captions.index_copy_(0, indices, captions)
            img_features.index_copy_(0, indices, img_features)
            rvals.index_copy_(0, indices, rvals)

            cap_features = encoder_capt(captions, lengths)
            outputs = mlp(img_features, cap_features)
            loss = criterion(outputs, rvals)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(encoder_capt.state_dict(),
                           os.path.join(args.model_path, 'encoder-capt-%d-%d-v.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder_img.state_dict(),
                           os.path.join(args.model_path, 'encoder-img-%d-%d-v.pkl' % (epoch + 1, i + 1)))
                torch.save(mlp.state_dict(),
                           os.path.join(args.model_path, 'mlp-%d-%d-v.pkl' % (epoch + 1, i + 1)))
def main(cfg):
    # Create the model directory
    if not os.path.exists(hydra.utils.to_absolute_path(cfg.train.model_path)):
        os.makedirs(hydra.utils.to_absolute_path(cfg.train.model_path))

    # Image preprocessing and normalization
    transform = transforms.Compose([
        transforms.RandomCrop(cfg.image.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])

    # Load vocabulary wrapper
    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # Build the data loader
    data_loader = get_loader(hydra.utils.to_absolute_path(cfg.train.image_dir),
                             hydra.utils.to_absolute_path(cfg.train.caption_path),
                             vocab, transform, cfg.train.batch_size,
                             shuffle=True, num_workers=cfg.train.num_workers)

    # Build the models
    encoder = EncoderCNN(cfg.train.embed_size).to(device)
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size, len(vocab),
                         cfg.train.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=cfg.train.learning_rate)

    # Train
    total_step = len(data_loader)
    for epoch in range(cfg.train.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            if i % cfg.train.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, cfg.train.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save a model checkpoint every save_step steps
            if (i + 1) % cfg.train.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    hydra.utils.to_absolute_path(cfg.train.model_path),
                    'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    hydra.utils.to_absolute_path(cfg.train.model_path),
                    'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
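# NOTE: the cfg-driven variant above expects to be launched through Hydra. A
# minimal sketch of the entry point, assuming a conf/config.yaml that defines
# the cfg.train.* and cfg.image.* fields referenced above (in the actual
# script the decorator would sit directly on main(cfg)):
import hydra
from omegaconf import DictConfig

@hydra.main(config_path='conf', config_name='config')
def hydra_entry(cfg: DictConfig) -> None:
    main(cfg)

if __name__ == '__main__':
    hydra_entry()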
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
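# NOTE: every variant in this file builds its targets with
# pack_padded_sequence(...)[0]. A small self-contained example of what that
# produces -- the [0] element is the packed data tensor, with padding removed
# and time steps interleaved across the batch:
import torch
from torch.nn.utils.rnn import pack_padded_sequence

caps = torch.tensor([[1, 2, 3, 4],
                     [5, 6, 0, 0]])   # batch of 2 padded captions
lens = [4, 2]                         # true lengths, sorted descending
packed = pack_padded_sequence(caps, lens, batch_first=True)
print(packed.data)                    # tensor([1, 5, 2, 6, 3, 4]) -- no padding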
def main():
    min_train_loss = 100.0

    # Create model directory
    if not os.path.exists(args['model_path']):
        os.makedirs(args['model_path'])

    fp_loss = open(args['model_path'] +
                   'training_loss_resnet50_finetune_attention_lstm_node08.txt', 'w+')

    # Image preprocessing
    transform = transforms.Compose([
        transforms.RandomCrop(args['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.9638, 0.9638, 0.9638), (0.1861, 0.1861, 0.1861))
    ])

    # Load vocabulary wrapper.
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args['image_dir'], args['caption_path'], vocab,
                             transform, args['batch_size'],
                             shuffle=True, num_workers=args['num_workers'])

    # Build the models
    encoder = EncoderCNN(args['embed_size'])
    decoder = DecoderRNN(args['embed_size'], args['hidden_size'], len(vocab),
                         args['num_layers'], max_seq_length=50)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args['learning_rate'])  # original optimizer

    # Train the Models
    total_step = len(data_loader)
    total = 1
    for epoch in range(args['num_epochs']):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.cuda()
            captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features, cnn_features = encoder(images)
            outputs = decoder(features, cnn_features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Log the per-step training loss
            fp_loss.write(str(total))
            fp_loss.write("\t")
            fp_loss.write(str(loss.item()))
            fp_loss.write("\n")
            total = total + 1

            # Print log info
            if i % args['log_step'] == 0:
                print('Epoch [%d/%d], Step [%d/%d], training-loss: %.4f'
                      % (epoch, args['num_epochs'], i, total_step, loss.item()))

            # Keep only the checkpoint with the smallest training loss so far
            if min_train_loss > loss.item():
                min_train_loss = loss.item()
                torch.save(decoder.state_dict(),
                           os.path.join(args['model_path'],
                                        'decoder_resnet50_finetune_attention_lstm_node08.pkl'))
                torch.save(encoder.state_dict(),
                           os.path.join(args['model_path'],
                                        'encoder_resnet50_finetune_attention_lstm_node08.pkl'))

    fp_loss.close()
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Read rationalization data
    rationalizations = []
    max_length = 0
    lengths = []
    bad_worker_ids = [
        'A2CNSIECB9UP05', 'A23782O23HSPLA', 'A2F9ZBSR6AXXND', 'A3GI86L18Z71XY',
        'AIXTI8PKSX1D2', 'A2QWHXMFQI18GQ', 'A3SB7QYI84HYJT', 'A2Q2A7AB6MMFLI',
        'A2P1KI42CJVNIA', 'A1IJXPKZTJV809', 'A2WZ0RZMKQ2WGJ', 'A3EKETMVGU2PM9',
        'A1OCEC1TBE3CWA', 'AE1RYK54MH11G', 'A2ADEPVGNNXNPA', 'A15QGLWS8CNJFU',
        'A18O3DEA5Z4MJD', 'AAAL4RENVAPML', 'A3TZBZ92CQKQLG', 'ABO9F0JD9NN54',
        'A8F6JFG0WSELT', 'ARN9ET3E608LJ', 'A2TCYNRAZWK8CC', 'A32BK0E1IPDUAF',
        'ANNV3E6CIVCW4'
    ]
    with open('./Log/Rationalizations.txt') as f:
        for line in f:
            # Lowercase, keep only letters, spaces and apostrophes, then map
            # each word to its vocabulary index.
            line = line.lower()
            line = re.sub("[^a-z ']+", " ", line)
            words = line.split()
            length = len(words)
            lengths.append(length)
            if length > max_length:
                max_length = length
            for index, word in enumerate(words):
                words[index] = vocab.word2idx[word]
            rationalizations.append(words)
    rationalizations = [np.array(xi) for xi in rationalizations]

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    frogger_data_loader = get_images('./data/FroggerDataset/', args.batch_size, transform)

    # Train the Models
    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, x in enumerate(frogger_data_loader):
            images = to_var(x[0], volatile=True)

            # Pair each image batch with its rationalizations, padded to the
            # longest one and sorted by decreasing length for packing.
            captions = []
            max_length = max(lengths[i:i + 2])
            rats = rationalizations[i:i + 2]
            rats.sort(key=lambda s: len(s))
            rats.reverse()
            for index, r in enumerate(rats):
                r = np.lib.pad(r, (0, max_length - len(r)), 'constant')
                captions.append(r)
            captions = to_var(torch.from_numpy(np.asarray(captions)))

            new_lengths = lengths[i:i + 2]
            new_lengths.sort()
            new_lengths.reverse()
            targets = pack_padded_sequence(captions, new_lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, new_lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    #################### YOUR CODE BEGINS HERE ##################################
    # TODO [YOU] - Perform image preprocessing and data augmentation by defining transform.
    transform = None
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)
    #################### YOUR CODE ENDS HERE ####################################

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Loss, optimizer and training pipeline
    #################### YOUR CODE BEGINS HERE ##################################
    criterion = None  # Define the appropriate loss function
    params = None     # Get the full list of parameters to be optimized
    optimizer = None  # Define the optimizer. Make sure to specify the parameters to be optimized over

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = None  # Get the features from the encoder
            outputs = None   # Pass these features into the decoder, along with any other required arguments
            loss = None      # Compute the loss from the output and targets

            # TODO [YOU]
            # 1. zero out the gradients for encoder, decoder
            # 2. perform backward pass
            # 3. perform a gradient step
            #################### YOUR CODE ENDS HERE ####################################

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
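# NOTE: one possible way to fill in the TODO blocks of the skeleton above,
# mirroring the completed scripts elsewhere in this file. These are fragments
# meant for the marked slots, not the assignment's reference solution.

# Possible transform (the normalization used by the other scripts here):
# transform = transforms.Compose([
#     transforms.RandomCrop(args.crop_size),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

# Possible loss/optimizer setup:
# criterion = nn.CrossEntropyLoss()
# params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
# optimizer = torch.optim.Adam(params, lr=args.learning_rate)

# Possible inner-loop body:
# features = encoder(images)
# outputs = decoder(features, captions, lengths)
# loss = criterion(outputs, targets)
# decoder.zero_grad()
# encoder.zero_grad()
# loss.backward()
# optimizer.step()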
def train_main(args):
    if not os.path.exists(args.base_dir + "model/"):
        os.mkdir(args.base_dir + "model/")

    transform = transforms.Compose([
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(args.base_dir + "vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)

    # Build the data loader
    loader = get_loader(args.base_dir, args.part, vocab, transform,
                        args.batch_size, shuffle=True, num_workers=args.num_workers)

    # Randomly display one image and its caption
    # plotting(loader, args)

    # Instantiate the encoder and decoder
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, vocab_size, args.hidden_size,
                         args.num_layers, max_seq=20)

    num_captions = 5
    num_examples = len(loader)
    loss_func = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(encoder.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = Adam(params, 0.001)

    for epoch in range(args.num_epoch):
        for i, (images, captions, lengths) in enumerate(loader):
            # Each image comes with num_captions reference captions; train on
            # each of them in turn.
            for j in range(num_captions):
                caption = captions[:, j, :]
                length = torch.Tensor(lengths)[:, j]
                length, _ = torch.sort(length, dim=0, descending=True)
                targets = pack_padded_sequence(caption, length, batch_first=True)[0]

                # Forward and backward pass, then optimize
                features = encoder(images)
                outputs = decoder(features, caption, length)
                loss = loss_func(outputs, targets)
                decoder.zero_grad()
                encoder.zero_grad()
                loss.backward()
                optimizer.step()

            if i % 10 == 0:
                print("Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}"
                      .format(epoch + 1, args.num_epoch, i, num_examples,
                              loss.item(), np.exp(loss.item())))

        torch.save(decoder.state_dict(),
                   os.path.join(args.model_path, 'decoder-epoch-{}.ckpt'.format(epoch + 1)))
        torch.save(encoder.state_dict(),
                   os.path.join(args.model_path, 'encoder-epoch-{}.ckpt'.format(epoch + 1)))
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    if args.use_policy:
        # --- Policy-gradient training ---

        # Build the models
        encoder = EncoderCNN(args.embed_size)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
        est_encoder = EncoderCNN(args.embed_size)
        estimator = Estimator(args.embed_size, len(vocab), args.hidden_size, args.num_layers)

        # Optionally start from pretrained weights
        if args.use_pretrained:
            encoder.load_state_dict(torch.load(args.pretrained_encoder))
            decoder.load_state_dict(torch.load(args.pretrained_decoder))
            est_encoder.load_state_dict(torch.load(args.pretrained_est_encoder))
            estimator.load_state_dict(torch.load(args.pretrained_estimator))

        if torch.cuda.is_available():
            encoder.cuda()
            decoder.cuda()
            est_encoder.cuda()
            estimator.cuda()

        # Loss and optimizers
        BCE_loss = nn.BCELoss()
        label_real = to_var(torch.ones(args.batch_size, 1))
        label_fake = to_var(torch.zeros(args.batch_size, 1))
        cap_params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
                      list(encoder.bn.parameters()))
        est_params = (list(est_encoder.linear.parameters()) +
                      list(est_encoder.bn.parameters()) + list(estimator.parameters()))
        cap_optimizer = torch.optim.Adam(cap_params, lr=args.learning_rate)
        est_optimizer = torch.optim.Adam(est_params, lr=args.learning_rate)

        # Training
        total_step = len(data_loader)
        for epoch in range(args.num_epochs):
            for i, (images, captions, lengths) in enumerate(data_loader):
                # Leave the last batch out: it may be smaller than batch_size
                # and would not match the fixed-size real/fake labels.
                if i == total_step - 1:
                    print('leaving last batch out because not enough data...')
                    continue

                # Set mini-batch dataset. Images are volatile because we don't
                # calculate gradients through the CNN here.
                images = to_var(images, volatile=True)
                captions = to_var(captions)

                # Forward, backward and optimize
                decoder.zero_grad()
                encoder.zero_grad()
                est_encoder.zero_grad()
                estimator.zero_grad()
                features = encoder(images)

                # outputs is a batch of sampled captions
                outputs, log_probs = decoder(features, captions, lengths, True)
                # Cut off the backward pass between estimator and decoder
                outputs = Variable(outputs.data)
                est_features = est_encoder(images)

                # Get the rewards of the generated captions and real captions
                rewards_fake = estimator(est_features, outputs)
                rewards_real = estimator(est_features, captions)

                # Backprop the loss for the estimator
                est_loss_real = BCE_loss(rewards_real, label_real)
                est_loss_fake = BCE_loss(rewards_fake, label_fake)
                est_loss = est_loss_real + est_loss_fake
                est_loss.backward()
                est_optimizer.step()

                # REINFORCE loss for the caption generator: weight each
                # log-probability by the (detached) reward of its caption.
                rewards_fake = Variable(rewards_fake.data)
                cap_loss = []
                for r in range(rewards_fake.shape[0]):
                    for l in range(log_probs.shape[1]):
                        cap_loss.append(-log_probs[r][l] * rewards_fake[r])
                cap_loss = torch.cat(cap_loss).sum()
                # Generator update is currently disabled:
                # cap_loss.backward()
                # cap_optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Estimator Loss: %.4f, Generator Loss: %.4f'
                          % (epoch, args.num_epochs, i, total_step,
                             est_loss.data[0], cap_loss.data[0]))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(decoder.state_dict(),
                               os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(encoder.state_dict(),
                               os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(est_encoder.state_dict(),
                               os.path.join(args.model_path, 'est_encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(estimator.state_dict(),
                               os.path.join(args.model_path, 'estimator-%d-%d.pkl' % (epoch + 1, i + 1)))
    else:
        # --- Strict-matching (cross-entropy) training ---

        # Build the models
        encoder = EncoderCNN(args.embed_size)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
        if torch.cuda.is_available():
            encoder.cuda()
            decoder.cuda()

        # Loss and Optimizer
        params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
                  list(encoder.bn.parameters()))
        optimizer = torch.optim.Adam(params, lr=args.learning_rate)
        criterion = nn.CrossEntropyLoss()

        # Training
        total_step = len(data_loader)
        for epoch in range(args.num_epochs):
            for i, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                images = to_var(images, volatile=True)
                captions = to_var(captions)

                # Forward, Backward and Optimize
                decoder.zero_grad()
                encoder.zero_grad()
                features = encoder(images)
                # pack_padded_sequence packs a padded batch in time-step order
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
                outputs = decoder(features, captions, lengths, False)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          % (epoch, args.num_epochs, i, total_step,
                             loss.data[0], np.exp(loss.data[0])))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(decoder.state_dict(),
                               os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(encoder.state_dict(),
                               os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
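# NOTE: the nested cap_loss loop above scales poorly with caption length. A
# vectorized equivalent (a sketch; it assumes log_probs has shape
# [batch, seq_len] and rewards has shape [batch, 1], as in the loop above):
def reinforce_loss(log_probs, rewards):
    # Broadcast each caption's reward across its time steps, then sum.
    return -(log_probs * rewards).sum()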
def train(
    num_epochs: int,
    lr: float,
    batch_size: int,
    vocab_threshold: int,
    vocab_from_file: bool,
    embed_size: int,
    hidden_size: int,
    save_every: int,
    print_every: int,
    log_file: str
) -> None:
    """
    Train the captioning network with the required parameters. The training
    logs are saved in log_file.

    num_epochs: Number of epochs to train the model.
    lr: Learning rate of the optimizer.
    batch_size: Mini-batch size for training.
    vocab_threshold: Minimum word count threshold for vocabulary initialisation.
        A word that appears in the dataset fewer times than vocab_threshold is
        discarded and will not appear in the vocabulary dictionary; the smaller
        the threshold, the bigger the vocabulary.
    vocab_from_file: Whether to load the vocabulary from a pre-initialized file.
    embed_size: Dimensionality of image and word embeddings.
    hidden_size: Number of features in the hidden state of the RNN decoder.
    save_every: Number of epochs between each checkpoint saving.
    print_every: Number of batches between printing the average loss.
    log_file: Name of the training log file. Saves loss and perplexity.
    """
    transform_train = transforms.Compose([
        transforms.Resize(256),             # smaller edge of image resized to 256
        transforms.RandomCrop(224),         # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(),  # horizontally flip image with probability=0.5
        transforms.ToTensor(),              # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),   # normalize image for pre-trained model
                             (0.229, 0.224, 0.225))])

    # Build data loader.
    data_loader = get_loader(transform=transform_train,
                             mode='train',
                             batch_size=batch_size,
                             vocab_threshold=vocab_threshold,
                             vocab_from_file=vocab_from_file)

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder.
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move models to GPU if CUDA is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)

    # Define the loss function.
    criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

    # Parameters to update. We do not re-train the CNN here.
    params = list(encoder.embed.parameters()) + list(decoder.parameters())

    # TODO: add learning rate scheduler
    # Optimizer for minimum search.
    optimizer = optim.Adam(params, lr=lr)

    # Set the total number of training steps per epoch.
    total_step = math.ceil(len(data_loader.dataset.caption_lengths) /
                           data_loader.batch_sampler.batch_size)

    # Open the training log file.
    f = open(log_file, 'w')

    for epoch in range(1, num_epochs + 1):
        for i_step in range(1, total_step + 1):
            # Randomly sample a caption length, and sample indices with that length.
            indices = data_loader.dataset.get_train_indices()
            # Create and assign a batch sampler to retrieve a batch with the sampled indices.
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch.
            images, captions = next(iter(data_loader))

            # Move batch of images and captions to GPU if CUDA is available.
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)

            # Calculate the batch loss.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (
                epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))

            # Print training statistics (on the same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on a new line).
            if i_step % print_every == 0:
                print('\r' + stats)

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(decoder.state_dict(),
                       os.path.join('./models', f"{device}_{hidden_size}_decoder-{epoch}.pkl"))
            torch.save(encoder.state_dict(),
                       os.path.join('./models', f"{device}_{hidden_size}_encoder-{epoch}.pkl"))

    # Close the training log file.
    f.close()
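# NOTE: a sketch of how train() might be invoked; the hyperparameter values
# here are illustrative, not the ones used by the original author.
if __name__ == '__main__':
    train(num_epochs=3, lr=0.001, batch_size=128,
          vocab_threshold=5, vocab_from_file=True,
          embed_size=256, hidden_size=512,
          save_every=1, print_every=100,
          log_file='training_log.txt')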
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features, cnn_features = encoder(images)
            outputs = decoder(features, cnn_features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    # val_loader = get_loader('./data/val_resized2014/',
    #                         './data/annotations/captions_val2014.json',
    #                         vocab, transform, 1, False, 1)

    start_epoch = 0
    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = VRNN(args.embed_size, args.hidden_size, len(vocab),
                   args.latent_size, args.num_layers)

    if args.restart:
        encoder_state, decoder_state = 'new', 'new'
    if encoder_state == '':
        encoder_state = 'new'
    if decoder_state == '':
        decoder_state = 'new'
    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Resume from the epoch encoded in the decoder checkpoint name, if any.
    try:
        start_epoch = int(float(decoder_state.split('-')[1]))
    except:
        pass

    if encoder_state != 'new':
        encoder.load_state_dict(torch.load(encoder_state))
    if decoder_state != 'new':
        decoder.load_state_dict(torch.load(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Make logfile and log output
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Using encoder: {}\nUsing decoder: {}\n\n".format(encoder_state, decoder_state))

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer
    cross_entropy = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_loss_det = []
    batch_kl = []
    batch_ml = []
    batch_acc = []

    # Train the models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):
            # Get lengths excluding the <start> symbol
            lengths = [l - 1 for l in lengths]

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)

            # The variational step sits z_step tokens into the caption, so
            # every caption in the batch must be long enough.
            assert min(lengths) > args.z_step + 2

            # Get targets from captions (excluding <start> tokens)
            targets_var = captions[:, args.z_step + 1]
            targets_det = pack_padded_sequence(captions[:, args.z_step + 2:],
                                               [l - args.z_step - 1 for l in lengths],
                                               batch_first=True)[0]

            # Get prior and approximate distributions
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            prior, q_z, q_x, det_x = decoder(features, captions, lengths, z_step=args.z_step)

            # Calculate KL divergence between approximate posterior and prior
            kl = torch.mean(kl_divergence(*q_z + prior))

            # Get marginal likelihood from log likelihood of the correct symbol
            index = (torch.cuda.LongTensor(range(q_x.shape[0])), targets_var)
            ml = torch.mean(q_x[index])

            # Get cross-entropy loss for the deterministic decoder
            ce = cross_entropy(det_x, targets_det)

            # Minimize the negative ELBO plus the deterministic loss
            elbo = ml - kl
            loss_var = -elbo
            loss_det = ce
            loss = loss_var + loss_det

            batch_loss.append(loss.data[0])
            batch_loss_det.append(loss_det.data[0])
            batch_kl.append(kl.data[0])
            batch_ml.append(ml.data[0])

            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))
                with open(args.model_path + args.logfile, 'a') as f:
                    f.write('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f\n'
                            % (epoch, args.num_epochs, i, total_step,
                               loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                if args.train_encoder:
                    torch.save(encoder.state_dict(),
                               os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    # pickle requires binary mode ('wb+'), not text mode ('w+')
    with open(args.model_path + 'training_loss.pkl', 'wb+') as f:
        pickle.dump(batch_loss, f)
    with open(args.model_path + 'training_val.pkl', 'wb+') as f:
        pickle.dump(batch_acc, f)
    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
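# NOTE: kl_divergence above is a helper from the surrounding repo and is not
# shown here. For reference, a standard closed-form KL between two diagonal
# Gaussians N(mu_q, sigma_q^2) and N(mu_p, sigma_p^2) -- a sketch under the
# assumption that q_z and prior carry means and log-variances:
def kl_diag_gaussians(mu_q, logvar_q, mu_p, logvar_p):
    # KL(q || p), summed over latent dimensions, one value per batch element.
    var_q = logvar_q.exp()
    var_p = logvar_p.exp()
    return 0.5 * ((logvar_p - logvar_q) +
                  (var_q + (mu_q - mu_p) ** 2) / var_p - 1).sum(dim=-1)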
# Variant of the statistics/checkpoint tail of train() above: accumulate an
# average epoch loss and, when save_every == -1, keep only the best weights.
# Assumes epoch_loss and smallest_loss are initialized outside the loops.

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (
                epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))

            # Print training statistics (on the same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on a new line).
            if i_step % print_every == 0:
                print('\r' + stats)

            epoch_loss += loss.item()

        epoch_loss /= total_step

        # Save the weights.
        if save_every == -1:
            # Only save the best one so far!
            if epoch_loss <= smallest_loss:
                torch.save(decoder.state_dict(),
                           os.path.join('./models', "{:02d}-decoder-{:.4f}.pkl".format(epoch, epoch_loss)))
                torch.save(encoder.state_dict(),
                           os.path.join('./models', "{:02d}-encoder-{:.4f}.pkl".format(epoch, epoch_loss)))
                smallest_loss = epoch_loss
        elif epoch % save_every == 0:
            torch.save(decoder.state_dict(),
                       os.path.join('./models', "{:02d}-decoder-{:.4f}.pkl".format(epoch, epoch_loss)))
            torch.save(encoder.state_dict(),
                       os.path.join('./models', "{:02d}-encoder-{:.4f}.pkl".format(epoch, epoch_loss)))

    # Close the training log file.
    f.close()
def main(args):
    checkpoint = True

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    init_folder = 'results'
    if not os.path.exists(init_folder):
        os.makedirs(init_folder)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    if args.with_glove == 'True':
        # Get GloVe pickles
        glove_path = args.glove_path
        vectors = bcolz.open(f'{glove_path}/6B.{args.embed_size}.dat')[:]
        words = pickle.load(open(f'{glove_path}/6B.{args.embed_size}_words.pkl', 'rb'))
        word2idx = pickle.load(open(f'{glove_path}/6B.{args.embed_size}_idx.pkl', 'rb'))
        glove = {w: vectors[word2idx[w]] for w in words}

        # Build the weights matrix by comparing the built vocabulary with the
        # GloVe word vectors: the GloVe vector where the word is covered, a
        # random normal vector otherwise.
        weights_matrix = np.zeros((len(vocab), args.embed_size))
        words_found = 0
        for i in range(len(vocab)):
            try:
                word = vocab.idx2word[i]
                weights_matrix[i] = glove[word]
                words_found += 1
            except KeyError:
                weights_matrix[i] = np.random.normal(scale=0.6, size=(args.embed_size,))

        # Build the models
        encoder = EncoderCNN(args.embed_size).to(device)
        decoder = DecoderRNNGlove(args.hidden_size, weights_matrix, args.num_layers).to(device)
    else:
        # Build models normally
        encoder = EncoderCNN(args.embed_size).to(device)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)

    # Build data loaders
    train_data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                                   transform, args.batch_size,
                                   shuffle=True, num_workers=args.num_workers)
    val_data_loader = get_loader(args.val_image_dir, args.val_caption_path, vocab,
                                 transform, args.batch_size,
                                 shuffle=True, num_workers=args.num_workers)

    if not args.reset_training:
        if isfile(os.path.join(args.model_path, 'best_encoder.ckpt')):
            encoder.load_state_dict(torch.load(os.path.join(args.model_path, 'best_encoder.ckpt')))
            print('Encoder weights loaded!')
        else:
            print('Weights file for encoder does not exist. '
                  'Encoder will be initialized with default values.')
        if isfile(os.path.join(args.model_path, 'best_decoder.ckpt')):
            decoder.load_state_dict(torch.load(os.path.join(args.model_path, 'best_decoder.ckpt')))
            print('Decoder weights loaded!')
        else:
            print('Weights file for decoder does not exist. '
                  'Decoder will be initialized with default values.')
        if isfile(os.path.join(args.model_path, 'last_best_bleu4.npy')):
            temp = np.load(os.path.join(args.model_path, 'last_best_bleu4.npy'),
                           allow_pickle='TRUE').item()
            best_bleu4 = temp['best_bleu4']
            train_encoder = temp['train_encoder']
            print(f'Previous best bleu4 score: {best_bleu4}, training_encoder: {train_encoder}')
        else:
            best_bleu4 = 0
            train_encoder = False
    else:
        best_bleu4 = 0
        train_encoder = False
    best_epoch = 0

    # Loss and optimizers
    criterion = nn.CrossEntropyLoss()
    # Note: both optimizers are constructed over the same parameter list here.
    params = (list(decoder.parameters()) + list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    encoder_optimizer = torch.optim.Adam(params, lr=args.encoder_learning_rate)
    decoder_optimizer = torch.optim.Adam(params, lr=args.decoder_learning_rate)

    train_losses = []
    val_losses = []
    bleu1_scores = []
    bleu2_scores = []
    bleu3_scores = []
    bleu4_scores = []
    cider_scores = []
    rouge_scores = []

    for epoch in range(1, args.num_epochs + 1):
        train_loss = train(train_data_loader, encoder, decoder, criterion,
                           encoder_optimizer, decoder_optimizer, epoch, train_encoder)
        score_dict, val_loss = validate(val_data_loader, encoder, decoder,
                                        criterion, vocab, epoch)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        bleu1_scores.append(score_dict['Bleu_1'])
        bleu2_scores.append(score_dict['Bleu_2'])
        bleu3_scores.append(score_dict['Bleu_3'])
        bleu4_scores.append(score_dict['Bleu_4'])
        cider_scores.append(score_dict['CIDEr'])
        rouge_scores.append(score_dict['ROUGE_L'])

        # Check if there was an improvement
        bleu4_score = score_dict['Bleu_4']
        print(f'Last best score {best_bleu4}, at epoch {best_epoch}')
        if bleu4_score > best_bleu4:
            best_bleu4 = bleu4_score
            best_epoch = epoch
            print(f'New best score {best_bleu4}, at epoch {best_epoch}')
            torch.save(decoder.state_dict(), os.path.join(args.model_path, 'best_decoder.ckpt'))
            torch.save(encoder.state_dict(), os.path.join(args.model_path, 'best_encoder.ckpt'))
            np.save(os.path.join(args.model_path, 'last_best_bleu4.npy'),
                    {'best_bleu4': best_bleu4, 'train_encoder': train_encoder})
        else:
            # Alternate which sub-network is trained whenever BLEU-4 stalls.
            if train_encoder:
                train_encoder = False
                print('No improvement in Bleu4 score. Switching from training Encoder to Decoder')
            else:
                train_encoder = True
                print('No improvement in Bleu4 score. Switching from training Decoder to Encoder')
            np.save(os.path.join(args.model_path, 'last_best_bleu4.npy'),
                    {'best_bleu4': best_bleu4, 'train_encoder': train_encoder})

    plot_loss_graph(args.num_epochs, train_losses, val_losses, init_folder)
    plot_score_graph(args.num_epochs, bleu1_scores, bleu2_scores, bleu3_scores,
                     bleu4_scores, cider_scores, rouge_scores, init_folder)
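# NOTE: DecoderRNNGlove is defined elsewhere in the repo. A hypothetical
# sketch of how such a decoder might consume weights_matrix, using the
# standard nn.Embedding.from_pretrained API (class and parameter names here
# are illustrative, not the repo's):
import torch
import torch.nn as nn

class GloveEmbedding(nn.Module):
    def __init__(self, weights_matrix, trainable=True):
        super().__init__()
        # Initialize the embedding table from the precomputed GloVe matrix.
        self.embed = nn.Embedding.from_pretrained(
            torch.tensor(weights_matrix, dtype=torch.float),
            freeze=not trainable)

    def forward(self, token_ids):
        return self.embed(token_ids)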