def main():
    """Fine-tune the caption (word) encoder against a frozen image encoder.

    Both encoders are restored from ``searchimage.pkl`` / ``searchword.pkl``.
    The image encoder's parameters are frozen; only ``word_encoder`` is
    optimised.  The loss is the mean squared difference between the image
    embedding and the caption embedding of each mini-batch.

    Relies on module-level configuration (``vocab_path``, ``embed_dim``,
    ``num_layers_rnn``, ``image_data_file``, ``image_feature_file``) and on
    the helpers ``make_mini_batch`` / ``make_word_padding`` defined elsewhere.

    NOTE(review): the original indentation was lost.  The running-loss reset
    is placed inside the ``iteration % 100`` branch and the checkpoints are
    written at the end of every epoch -- confirm against the original file.
    """
    use_cuda = torch.cuda.is_available()

    def to_variable(array):
        # Move inputs to the GPU only when the models were moved there too;
        # the original called .cuda() unconditionally and failed on CPU hosts.
        tensor = torch.from_numpy(array)
        if use_cuda:
            tensor = tensor.cuda()
        return Variable(tensor)

    # Load vocabulary wrapper.  Pickles are binary, hence 'rb'.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(4096, embed_dim)
    encoder.load_state_dict(torch.load('searchimage.pkl'))
    for p in encoder.parameters():
        p.requires_grad = False

    word_encoder = EncoderRNN(embed_dim, embed_dim, len(vocab),
                              num_layers_rnn)
    word_encoder.load_state_dict(torch.load('searchword.pkl'))

    if use_cuda:
        encoder.cuda()
        word_encoder.cuda()

    # Optimizer: only the word encoder is trained.
    params = list(word_encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=2e-6, weight_decay=0.001)

    # Load data.
    with open(image_data_file, 'rb') as f:
        image_data = pickle.load(f)
    image_features = si.loadmat(image_feature_file)
    img_features = image_features['fc7'][0]
    img_features = np.concatenate(img_features)
    print('here')

    iteration = 0
    for i in range(10):  # epoch
        use_caption = i % 5
        print('Epoch %d' % i)
        losses = []
        for x, y in make_mini_batch(img_features, image_data,
                                    use_caption=use_caption):
            encoder.zero_grad()
            word_encoder.zero_grad()

            word_padding, lengths = make_word_padding(y, vocab)
            x = to_variable(x)
            word_index = to_variable(word_padding)

            features = encoder(x)
            outputs = word_encoder(word_index, lengths)
            loss = torch.mean((features - outputs).pow(2))
            loss.backward()
            optimizer.step()

            losses.append(loss.data[0])
            if iteration % 100 == 0:
                # Report the mean loss since the previous report.
                print('loss %s' % (sum(losses) / float(len(losses))))
                losses = []
            iteration += 1

        torch.save(word_encoder.state_dict(), 'searchword.pkl')
        torch.save(encoder.state_dict(), 'searchimage.pkl')
def main():
    """Train the CNN-feature encoder and RNN decoder for image captioning.

    Pre-extracted ``fc7`` features are projected by ``EncoderCNN`` and
    decoded by ``DecoderRNN`` with a cross-entropy objective over the packed
    caption tokens.  Every 100 iterations the current loss is printed and
    recorded; after training the recorded losses are written to
    ``losses.txt``.

    Relies on module-level configuration (``vocab_path``, ``embed_dim``,
    ``hidden_size``, ``num_layers_rnn``, ``image_data_file``,
    ``image_feature_file``) and on ``make_mini_batch`` /
    ``make_word_padding`` defined elsewhere.

    NOTE(review): the original indentation was lost.  Loss recording is
    placed inside the ``iteration % 100`` branch and the checkpoints at the
    end of every epoch -- confirm against the original file.
    """
    use_cuda = torch.cuda.is_available()

    def to_variable(array):
        # Only move inputs to the GPU when the models live there as well.
        tensor = torch.from_numpy(array)
        if use_cuda:
            tensor = tensor.cuda()
        return Variable(tensor)

    # Load vocabulary wrapper.  Pickles are binary, hence 'rb'.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(4096, embed_dim)
    decoder = DecoderRNN(embed_dim, hidden_size, len(vocab), num_layers_rnn)
    if use_cuda:
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # Load data.
    with open(image_data_file, 'rb') as f:
        image_data = pickle.load(f)
    image_features = si.loadmat(image_feature_file)
    img_features = image_features['fc7'][0]
    img_features = np.concatenate(img_features)
    print('here')

    iteration = 0
    save_loss = []
    for i in range(10):  # epoch
        use_caption = i % 5
        print('Epoch %d' % i)
        for x, y in make_mini_batch(img_features, image_data,
                                    use_caption=use_caption):
            word_padding, lengths = make_word_padding(y, vocab)
            x = to_variable(x)
            word_index = to_variable(word_padding)

            encoder.zero_grad()
            decoder.zero_grad()
            features = encoder(x)
            # Flatten the padded captions to the packed token sequence the
            # decoder output is compared against.
            targets = pack_padded_sequence(word_index, lengths,
                                           batch_first=True)[0]
            outputs = decoder(features, word_index, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if iteration % 100 == 0:
                print('loss %s' % loss.data[0])
                save_loss.append(loss.data[0])
            iteration += 1

        torch.save(decoder.state_dict(), 'decoder.pkl')
        torch.save(encoder.state_dict(), 'encoder.pkl')

    # The original wrote the undefined name `losses` here (NameError);
    # the recorded values live in `save_loss`.
    with open('losses.txt', 'w') as f:
        f.write('%s\n' % save_loss)
def main(args):
    """Train the encoder/decoder captioning model and checkpoint periodically.

    ``device``, ``vocab`` and ``data_loader`` are not created here; they are
    expected to exist at module level.

    Args:
        args: namespace providing ``model_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``learning_rate``,
            ``num_epochs``, ``log_step`` and ``save_step``.
    """
    # Make sure the checkpoint directory exists.
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Only the decoder and the encoder's linear/bn layers are optimised.
    criterion = nn.CrossEntropyLoss()
    trainable = list(decoder.parameters())
    trainable += list(encoder.linear.parameters())
    trainable += list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.learning_rate)

    def save_checkpoint(model, name_template, epoch_no, step_no):
        # Write one state dict under args.model_path.
        file_name = name_template.format(epoch_no, step_no)
        torch.save(model.state_dict(),
                   os.path.join(args.model_path, file_name))

    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, batch in enumerate(data_loader):
            images, captions, lengths = batch

            # Move the mini-batch to the training device.
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            # Backward pass and parameter update
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodic console log
            if i % args.log_step == 0:
                loss_value = loss.item()
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step,
                            loss_value, np.exp(loss_value)))

            # Periodic checkpoints
            if (i + 1) % args.save_step == 0:
                save_checkpoint(decoder, 'decoder-{}-{}.ckpt',
                                epoch + 1, i + 1)
                save_checkpoint(encoder, 'encoder-{}-{}.ckpt',
                                epoch + 1, i + 1)
def main(args):
    """Build the data pipeline and train the image-captioning model.

    Args:
        args: namespace providing ``model_path``, ``crop_size``,
            ``vocab_path``, ``image_dir``, ``caption_path``, ``batch_size``,
            ``num_workers``, ``embed_size``, ``hidden_size``, ``num_layers``,
            ``learning_rate``, ``num_epochs``, ``log_step`` and ``save_step``.
    """
    # Checkpoint directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Augmentation plus the normalisation constants used for the
    # pretrained resnet.
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ])

    # Vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer: decoder plus the encoder's linear/bn layers.
    criterion = nn.CrossEntropyLoss()
    trainable = list(decoder.parameters())
    trainable += list(encoder.linear.parameters())
    trainable += list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.learning_rate)

    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, batch in enumerate(data_loader):
            images, captions, lengths = batch
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            # Backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Console log
            if i % args.log_step == 0:
                loss_value = loss.item()
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss_value, np.exp(loss_value)))

            # Checkpoints
            if (i + 1) % args.save_step == 0:
                decoder_file = 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)
                encoder_file = 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, decoder_file))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, encoder_file))
def main(args):
    """Train loop that round-trips sampled captions through a local HTTP service.

    For every mini-batch the function runs the encoder/decoder forward pass,
    samples a caption per image, posts each caption ("code") to
    ``http://localhost:4567/`` from worker threads, and collects the images
    returned by the service into ``processed_items``.

    NOTE(review): the original indentation was lost; the nesting below is a
    reconstruction and should be confirmed against the original file.
    NOTE(review): ``items_to_process`` is not defined in this function --
    presumably a module-level queue; verify.
    NOTE(review): the only assignment to ``loss`` is commented out, so
    ``print(loss)`` / ``loss.backward()`` will fail unless ``loss`` exists
    at module level.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    worker_thread_count = 1
    retry_for_failed = 2
    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        # transforms.RandomCrop(args.crop_size),
        # transforms.RandomHorizontalFlip(),
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    #transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # Build data loader
    data_loader = get_loader(args.image_dir,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    # Loss and Optimizer
    criterion = nn.L1Loss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Per-batch state shared with the nested worker closures.
            processed_items = []
            threads = []
            has_data_to_process = True

            def do_request(item):
                # Post one caption to the service and store the transformed
                # response image at the item's batch position; retried up to
                # `retry_for_failed` times on a non-200 status.
                position = item['position']
                #print(position)
                #print(item)
                retry = retry_for_failed
                while retry:
                    r = requests.post('http://localhost:4567/', data=item)
                    if r.status_code == 200:
                        pil = Image.open(io.BytesIO(r.content)).convert('RGB')
                        # NOTE(review): assumes the producer has already
                        # appended the 'failed' placeholder for this
                        # position -- confirm ordering with the put() below.
                        processed_items[position] = transform(pil)
                        #print(position, processed_items[position])
                        break
                    else:
                        print("shouldb be here")
                        time.sleep(2)
                        retry -= 1

            # Set mini-batch dataset
            image_tensors = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            #print(images.size())
            #print(torch.equal(images[0] ,images[1]))
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_tensors)
            outputs = decoder(features, captions, lengths)
            codes = []

            def worker():
                # Drain the queue until it is empty and the producer has
                # signalled completion, or a None sentinel arrives.
                # NOTE(review): get() is called without a timeout; if the
                # queue is already empty while `has_data_to_process` is still
                # True this looks like it can block indefinitely -- confirm.
                while items_to_process.qsize() > 0 or has_data_to_process:
                    item = items_to_process.get()
                    if item is None:
                        break
                    do_request(item)
                    items_to_process.task_done()
                print("ended thread processing")

            for j in range(worker_thread_count):
                t = threading.Thread(target=worker)
                t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
                t.start()
                threads.append(t)
            # Sample one caption per image and enqueue it for the workers.
            for ii, image in enumerate(images):
                image_tensor = to_var(image.unsqueeze(0), volatile=True)
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids.cpu().data.numpy()
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)
                payload = {'code': sentence}  # unused below
                data = {'position': ii, 'code': sentence}
                items_to_process.put(data)
                # Placeholder that do_request overwrites on success.
                processed_items.append('failed')
                codes.append(sentence)
            has_data_to_process = False
            print(codes)
            print(items_to_process.qsize())
            print(image.size())
            print("waiting for threads")
            for t in threads:
                t.join()
            print("done reassembling images")
            # NOTE(review): the threads were already joined above; the
            # `shutdown` attribute is not read anywhere in this function.
            for t in threads:
                t.shutdown = True
                t.join()
            # Skip the batch if any request never produced an image.
            bad_value = False
            for pi in processed_items:
                if isinstance(pi, str) and pi == "failed":
                    bad_value = True
            if bad_value == True:
                print("failed conversion,skipping batch")
                continue
            output_tensor = torch.FloatTensor(len(processed_items), 3,
                                              images.size()[2],
                                              images.size()[3])
            for ii, image_tensor in enumerate(processed_items):
                output_tensor[ii] = processed_items[ii]
            output_var = to_var(output_tensor, False)
            target_var = to_var(images, False)
            # NOTE(review): with the next line disabled `loss` is never
            # assigned in this function.
            #loss = criterion(output_var,target_var)
            print("loss")
            print(loss)
            loss.backward()
            optimizer.step()
            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))
            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train the VRNN caption decoder (optionally resuming from checkpoints).

    The loss is ``-(ml - kl) + ce``: the negative ELBO of the variational
    step at position ``args.z_step`` plus the cross-entropy of the
    deterministic decoder on the remaining tokens.  Progress is printed and
    appended to ``args.model_path + args.logfile``; the per-batch losses are
    pickled after training.

    Args:
        args: namespace providing ``model_path``, ``vocab_path``,
            ``crop_size``, ``encoder``, ``decoder``, ``restart``,
            ``train_encoder``, ``embed_size``, ``hidden_size``,
            ``latent_size``, ``num_layers``, ``image_dir``, ``caption_path``,
            ``batch_size``, ``num_workers``, ``logfile``, ``learning_rate``,
            ``num_epochs``, ``z_step``, ``log_step`` and ``save_step``.

    NOTE(review): the original indentation was lost.  The log-file write is
    placed inside the ``log_step`` branch and the pickle dumps after the
    training loop -- confirm against the original file.
    NOTE(review): ``torch.cuda.LongTensor`` is used unconditionally below,
    so the loop requires CUDA even though the models are moved only when it
    is available.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    start_epoch = 0
    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = VRNN(args.embed_size, args.hidden_size, len(vocab),
                   args.latent_size, args.num_layers)

    # 'new' means "do not load a checkpoint".
    if args.restart:
        encoder_state, decoder_state = 'new', 'new'
    if encoder_state == '':
        encoder_state = 'new'
    if decoder_state == '':
        decoder_state = 'new'
    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Resume from the epoch encoded in the checkpoint name
    # ('decoder-<epoch>-<step>.pkl'); names without that field keep epoch 0.
    try:
        start_epoch = int(float(decoder_state.split('-')[1]))
    except (IndexError, ValueError):
        pass

    if encoder_state != 'new':
        encoder.load_state_dict(torch.load(encoder_state))
    if decoder_state != 'new':
        decoder.load_state_dict(torch.load(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Record which checkpoints this run actually started from (the original
    # always wrote "new", even when resuming).
    log_path = args.model_path + args.logfile
    with open(log_path, 'a+') as f:
        f.write("Using encoder: {}\nUsing decoder: {}\n\n".format(
            encoder_state, decoder_state))

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Optimizer
    cross_entropy = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_loss_det = []
    batch_kl = []
    batch_ml = []
    batch_acc = []  # never filled in this function; dumped as-is below

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):
            # get lengths excluding <start> symbol
            lengths = [l - 1 for l in lengths]

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)

            # Every caption must extend past the variational step.
            assert min(lengths) > args.z_step + 2

            # Target of the variational step, and packed targets of the
            # deterministic decoder for the tokens after it.
            targets_var = captions[:, args.z_step + 1]
            targets_det = pack_padded_sequence(
                captions[:, args.z_step + 2:],
                [l - args.z_step - 1 for l in lengths],
                batch_first=True)[0]

            # Get prior and approximate distributions
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            prior, q_z, q_x, det_x = decoder(features, captions, lengths,
                                             z_step=args.z_step)

            # Calculate KL Divergence
            kl = torch.mean(kl_divergence(*q_z + prior))

            # Get marginal likelihood from log likelihood of the correct symbol
            index = (torch.cuda.LongTensor(range(q_x.shape[0])), targets_var)
            ml = torch.mean(q_x[index])

            # Get Cross-Entropy loss for deterministic decoder
            ce = cross_entropy(det_x, targets_det)

            elbo = ml - kl
            loss_var = -elbo
            loss_det = ce
            loss = loss_var + loss_det

            batch_loss.append(loss.data[0])
            batch_loss_det.append(loss_det.data[0])
            batch_kl.append(kl.data[0])
            batch_ml.append(ml.data[0])

            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                message = (
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))
                print(message)
                with open(log_path, 'a') as f:
                    # One entry per line; the original omitted the newline.
                    f.write(message + '\n')

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                if args.train_encoder:
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(args.model_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    # Pickle output is binary, so the files must be opened in 'wb'.
    with open(args.model_path + 'training_loss.pkl', 'wb') as f:
        pickle.dump(batch_loss, f)
    with open(args.model_path + 'training_val.pkl', 'wb') as f:
        pickle.dump(batch_acc, f)
    with open(log_path, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
def main(args):
    """Train the captioning model and write step/epoch loss logs to disk.

    Besides the periodic console log and checkpoints, three text files are
    written under a fixed directory: one entry per training step, one
    average loss per epoch and one perplexity per epoch.

    Args:
        args: namespace providing ``model_path``, ``crop_size``,
            ``vocab_path``, ``image_dir``, ``caption_path``, ``batch_size``,
            ``num_workers``, ``embed_size``, ``hidden_size``, ``num_layers``,
            ``learning_rate``, ``num_epochs``, ``log_step`` and ``save_step``.

    NOTE(review): the original indentation was lost.  Loss accumulation and
    the step-log write are placed at loop level (every step) -- confirm
    against the original file.
    NOTE(review): the epoch loss divides by ``total_step * args.batch_size``
    although ``nn.CrossEntropyLoss`` already averages within a batch; kept
    as in the original, but the reported perplexity is probably understated.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    loss_per_epoch = {}
    perplex_per_epoch = {}
    total_step = len(data_loader)
    print('\ntotal-step\n')
    print(total_step)

    # Open all three logs in one `with` so each is flushed and closed even
    # if training raises (the original closed only the step log).
    with open(
            '/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_epoch_loss402.txt',
            "w") as save_in_file_loss, open(
                '/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_epoch_perplex402.txt',
                "w") as save_in_file_perplex, open(
                    '/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_step_loss402.txt',
                    "w") as save_in_file:
        for epoch in range(args.num_epochs):
            total_loss = 0
            for i, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                images = images.to(device)
                captions = captions.to(device)
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]

                # Forward, backward and optimize
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                decoder.zero_grad()
                encoder.zero_grad()
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print(
                        'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                        .format(epoch, args.num_epochs, i, total_step,
                                loss.item(), np.exp(loss.item())))

                total_loss += loss.item()
                text = 'Epoch : ' + str(epoch) + '\nStep : ' + str(
                    i) + '\nLoss : ' + str(
                        loss.item()) + '\nPerplexity : ' + str(
                            np.exp(loss.item()))
                print('\ntext\n')
                print(text)
                # Terminate each entry so consecutive entries do not run
                # together on one line.
                save_in_file.write(text + '\n')

                # Save the model checkpoints
                if (i + 1) % args.save_step == 0:
                    print('saving')
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

            # Per-epoch summary
            loss_per_epoch[epoch + 1] = total_loss / (total_step *
                                                      args.batch_size)
            loss_text = str(epoch + 1) + ' : ' + str(
                loss_per_epoch[epoch + 1])
            save_in_file_loss.write(loss_text)
            save_in_file_loss.write('\n')
            print('\nloss_text : ' + loss_text)

            perplex_per_epoch[epoch + 1] = np.exp(loss_per_epoch[epoch + 1])
            perplex_text = str(epoch + 1) + ' : ' + str(
                perplex_per_epoch[epoch + 1])
            save_in_file_perplex.write(perplex_text)
            save_in_file_perplex.write('\n')
            print('\nperplex_text : ' + perplex_text)
def main(args):
    """Train a captioning model conditioned on image features plus a layout encoding.

    The decoder input is the element-wise sum of the CNN image features and
    the output of ``LayoutEncoder``.  The layout encoder, the decoder and
    the encoder's linear/bn layers are optimised jointly.

    Args:
        args: namespace providing ``seed``, ``model_path``, ``crop_size``,
            ``vocab_path``, ``image_dir``, ``caption_path``,
            ``coco_detection_result``, ``batch_size``, ``num_workers``,
            ``embed_size``, ``layout_embed_size``, ``hidden_size``,
            ``num_layers``, ``learning_rate``, ``num_epochs``, ``log_step``
            and ``save_step``.
    """
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.coco_detection_result, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    # the layout encoder hidden state size must be the same with decoder input size
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size,
                                   100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        layout_encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(layout_encoder.parameters()) + list(decoder.parameters()) + \
        list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, label_seqs, location_seqs,
                layout_lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            layout_encoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            layout_encoding = layout_encoder(label_seqs, location_seqs,
                                             layout_lengths)
            comb_features = features + layout_encoding
            outputs = decoder(comb_features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                # The layout encoder is trained too; without its weights the
                # saved decoder/encoder pair cannot reproduce the model.
                torch.save(layout_encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'layout_encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train the image encoder and the ``Autoencoder`` with a combined loss.

    The objective is ``0.2 * L_ling + 0.8 * L_vis`` as returned by the
    autoencoder.  A "best" checkpoint is overwritten whenever a mini-batch
    loss improves on the best seen so far, numbered checkpoints are written
    every ``save_step`` steps, and the losses are logged to TensorBoard.

    Args:
        args: namespace providing ``model_path``, ``crop_size``,
            ``vocab_path``, ``image_dir``, ``caption_path``, ``batch_size``,
            ``num_workers``, ``embed_size``, ``embeddings_path``,
            ``hidden_size``, ``num_layers``, ``learning_rate``,
            ``num_epochs``, ``log_step`` and ``save_step``.

    NOTE(review): the original indentation was lost.  The TensorBoard
    scalars are placed inside the ``log_step`` branch -- confirm against
    the original file.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    autoencoder = Autoencoder(args.embed_size, args.embeddings_path,
                              args.hidden_size, len(vocab),
                              args.num_layers).to(device)
    print(len(vocab))

    # Optimizer over trainable parameters only.  The autoencoder's first
    # parameter is skipped (presumably the frozen embedding table -- verify).
    candidates = list(autoencoder.parameters())[1:] + list(
        encoder.linear.parameters())
    params = [p for p in candidates if p.requires_grad]
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Define summary writer
    writer = SummaryWriter()

    # Loss tracker
    best_loss = float('inf')

    # Train the models
    total_step = len(data_loader)
    try:
        for epoch in range(args.num_epochs):
            for i, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                images = images.to(device)
                captions = captions.to(device)

                # Forward, backward and optimize
                features = encoder(images)
                L_ling, L_vis = autoencoder(features, captions, lengths)
                # Want visual loss to have bigger impact
                loss = 0.2 * L_ling + 0.8 * L_vis
                autoencoder.zero_grad()
                encoder.zero_grad()
                loss.backward()
                optimizer.step()

                loss_value = loss.item()

                # Save the model checkpoints when loss improves
                if loss_value < best_loss:
                    # Keep a plain float; storing the tensor would keep its
                    # autograd graph alive.
                    best_loss = loss_value
                    print("Saving checkpoints")
                    torch.save(
                        autoencoder.state_dict(),
                        os.path.join(args.model_path,
                                     'autoencoder-frozen-best.ckpt'))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(args.model_path,
                                     'encoder-frozen-best.ckpt'))

                # Print log info
                if i % args.log_step == 0:
                    print(
                        'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                        .format(epoch, args.num_epochs, i, total_step,
                                loss_value, np.exp(loss_value)))

                    # Log train loss on tensorboard
                    global_step = epoch * total_step + i
                    writer.add_scalar('frozen-loss/L_ling', L_ling.item(),
                                      global_step)
                    writer.add_scalar('frozen-loss/L_vis', L_vis.item(),
                                      global_step)
                    writer.add_scalar('frozen-loss/combined', loss_value,
                                      global_step)

                # Save the model checkpoints
                if (i + 1) % args.save_step == 0:
                    torch.save(
                        autoencoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'autoencoder-frozen-{}-{}.ckpt'.format(
                                epoch + 1, i + 1)))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'encoder-frozen-{}-{}.ckpt'.format(
                                epoch + 1, i + 1)))
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        writer.close()
def main(args):
    """Train the captioning model on data served by ``SASR_Data_Loader``.

    Mini-batches containing a single image are skipped.

    Args:
        args: namespace providing ``model_path``, ``vocab_path``,
            ``data_file``, ``init_flag``, ``batch_size``, ``num_workers``,
            ``embed_size``, ``hidden_size``, ``num_layers``,
            ``learning_rate``, ``num_epochs``, ``log_step`` and ``save_step``.

    NOTE(review): the original indentation was lost; the whole remainder of
    the loop body (training, logging and saving) is taken to be guarded by
    the batch-size check -- confirm against the original file.
    """
    # Checkpoint directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing (flip only, no cropping)
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Data source
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform, shuffle=True,
        num_workers=args.num_workers)

    # Models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer: decoder plus the encoder's linear/bn layers.
    criterion = nn.CrossEntropyLoss()
    trainable = list(decoder.parameters())
    trainable += list(encoder.linear.parameters())
    trainable += list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.learning_rate)

    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, batch in enumerate(frogger_data_loader):
            images, captions, lengths = batch
            images = to_var(images, volatile=True)

            # Skip single-image batches entirely.
            if list(images.size())[0] == 1:
                continue

            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Console log
            if i % args.log_step == 0:
                loss_value = loss.data[0]
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss_value,
                       np.exp(loss_value)))

            # Checkpoints
            if (i + 1) % args.save_step == 0:
                decoder_file = 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)
                encoder_file = 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, decoder_file))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, encoder_file))
def main(args):
    """Train the image-conditioned object/hashtag sequence model.

    For each mini-batch the target sequence is unrolled one step at a time
    through ``EncoderRNN`` and ``Model``, summing the cross-entropy of every
    step (index 0 is ignored as padding).  Each batch either feeds the
    ground-truth token or the model's own top-1 prediction as the next
    input, chosen at random.

    Args:
        args: namespace providing ``model_path``, ``crop_size``,
            ``image_dir``, ``instance_path``, ``tag_path``, ``batch_size``,
            ``num_workers``, ``inverse_object_id_mapping``, ``embed_size``,
            ``hidden_size``, ``learning_rate``, ``num_epochs``, ``log_step``
            and ``save_step``.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Build data loader
    data_loader = get_loader(args.image_dir, args.instance_path,
                             args.tag_path, transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Persist the id mapping built by the dataset so it can be reused later.
    inverse_mapping = data_loader.dataset.inverse_object_id_mapping
    num_objects = len(inverse_mapping.keys())
    with open(args.inverse_object_id_mapping, "wb") as f:
        pickle.dump(inverse_mapping, f)

    # Build the models
    encoderCNN = EncoderCNN(args.embed_size).to(device)
    encoderRNN = EncoderRNN(num_objects, args.embed_size,
                            args.hidden_size).to(device)
    model = Model(num_objects, args.embed_size).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    params = list(encoderRNN.parameters()) + list(
        encoderCNN.linear.parameters()) + list(
            encoderCNN.bn.parameters()) + list(model.parameters())
    optimizer = torch.optim.RMSprop(params, lr=args.learning_rate,
                                    weight_decay=0.0001, momentum=0.91)
    teacher_forcing_ratio = 0.5

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, objects, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            targets = objects.to(device).unsqueeze(-1)

            # Forward, backward and optimize
            image_features = encoderCNN(images)

            # Size the recurrent state from the actual batch: the final
            # batch of an epoch may hold fewer than args.batch_size samples.
            batch_size = images.size(0)
            h0 = torch.zeros((1, batch_size, args.hidden_size)).to(device)
            c0 = torch.zeros((1, batch_size, args.hidden_size)).to(device)

            loss = 0
            # Same draw/comparison as the original implementation.
            teacher_forcing = np.random.random() > teacher_forcing_ratio
            step_input = targets[:, 0]
            for j in range(targets.shape[1] - 1):
                hashtag_features, (h0, c0), Ul = encoderRNN(
                    step_input, h0, c0)
                outputs = model(image_features, hashtag_features, Ul)
                loss += criterion(outputs, targets[:, j + 1].squeeze(1))
                if teacher_forcing:
                    # Feed the ground-truth token.
                    step_input = targets[:, j + 1]
                else:
                    # Feed the model's own best guess.
                    _, top1 = outputs.topk(1, dim=1)
                    step_input = top1

            encoderCNN.zero_grad()
            encoderRNN.zero_grad()
            model.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step,
                            loss.item(), np.exp(loss.item())))

            # Save the model checkpoints (only on every 5th epoch)
            if (i + 1) % args.save_step == 0 and (epoch + 1) % 5 == 0:
                torch.save(
                    encoderCNN.state_dict(),
                    os.path.join(
                        args.model_path,
                        'encoderCNN-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(
                    encoderRNN.state_dict(),
                    os.path.join(
                        args.model_path,
                        'encoderRNN-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(
                    model.state_dict(),
                    os.path.join(
                        args.model_path,
                        'model-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main(args):
    """Train the captioning model, evaluate after every epoch and plot the metrics.

    After each epoch the model is evaluated on the validation loader; the
    accuracy, BLEU-1..4 and average training loss are appended to
    ``stem_freeze_freq1000.txt``.  When training ends the per-epoch metrics
    are plotted to ``accuracy_BLEU.png``.

    The per-epoch ``evaluate`` call was commented out in the original, which
    left ``acc`` / ``b1``..``b4`` undefined when the report was written and
    the metric lists empty when plotted; it is restored here.

    Args:
        args: namespace providing ``model_path``, ``crop_size``,
            ``vocab_path``, ``image_dir``, ``caption_path``, ``val_dir``,
            ``val_caption_path``, ``batch_size``, ``num_workers``,
            ``embed_size``, ``hidden_size``, ``num_layers``,
            ``learning_rate``, ``num_epochs``, ``log_step`` and ``save_step``.

    NOTE(review): ``evaluate`` must be available at module level with the
    signature ``evaluate(loader, encoder, decoder, vocab)`` returning
    ``(acc, b1, b2, b3, b4)`` -- taken from the disabled call; confirm.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    transform_val = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loaders
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)
    val_loader = get_loader(args.val_dir, args.val_caption_path, vocab,
                            transform_val, args.batch_size,
                            shuffle=False, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    encoder.freeze_bottom()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    accs, b1s, b2s, b3s, b4s = [], [], [], [], []
    for epoch in range(args.num_epochs):
        # Re-enter training mode at the start of every epoch.
        decoder.train()
        encoder.train()
        losses = []
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            losses.append(loss.item())
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch + 1, args.num_epochs, i, total_step,
                            loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

        # Validation metrics for this epoch.
        acc, b1, b2, b3, b4 = evaluate(val_loader, encoder, decoder, vocab)
        accs.append(acc)
        b1s.append(b1)
        b2s.append(b2)
        b3s.append(b3)
        b4s.append(b4)

        avg_loss = sum(losses) / total_step
        print('Epoch {} Average Training Loss: {:.4f}'.format(
            epoch + 1, avg_loss))
        with open('stem_freeze_freq1000.txt', 'a') as file:
            file.write("Epoch {} \n".format(epoch + 1))
            file.write('Average Accuracy: {} \n'.format(acc))
            file.write('Average Loss: {} \n'.format(avg_loss))
            file.write('Average BLEU gram1: {} \n'.format(b1))
            file.write('Average BLEU gram2: {} \n'.format(b2))
            file.write('Average BLEU gram3: {} \n'.format(b3))
            file.write('Average BLEU gram4: {} \n'.format(b4))
            file.write('\n')

    # One point per epoch for every metric.
    epochs_axis = np.arange(1, args.num_epochs + 1)
    plt.title("Accuracy vs BLEU score")
    plt.plot(epochs_axis, accs, label='accuracy')
    plt.plot(epochs_axis, b1s, label='BLEU 1')
    plt.plot(epochs_axis, b2s, label='BLEU 2')
    plt.plot(epochs_axis, b3s, label='BLEU 3')
    plt.plot(epochs_axis, b4s, label='BLEU 4')
    plt.xlabel("epochs")
    plt.xticks(epochs_axis)
    plt.legend(loc='upper left')
    plt.savefig('accuracy_BLEU.png')
    plt.clf()
def main(args):
    """Train the captioning model for one encoder ``model_type`` / ``mode``.

    Only the decoder and the encoder's mode-specific projection layer are
    optimised (fine-tuning). Checkpoints are written to the directory
    ``<args.model_path>_<args.model_type>`` as ``decoder.pt`` / ``encoder.pt``.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``model_type``, ``mode``, ``vocab_path``, ``image_dir``,
            ``caption_path``, ``batch_size``, ``num_workers``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``learning_rate``,
            ``num_epochs`` and ``log_step``.

    Raises:
        ValueError: If ``args.mode`` is neither ``"side_by_side"`` nor
            ``"depthwise"``.
    """
    # Create model directory
    model_dir = args.model_path + "_" + args.model_type
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers,
                             mode=args.mode)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.model_type,
                         args.mode).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()

    # Fine-tune: pick the projection layer that belongs to the input mode.
    # An unknown mode used to leave ``params`` unbound and crash later with
    # a NameError; fail with an explicit message instead.
    if args.mode == "side_by_side":
        projection = encoder.linear_to_embed_size
    elif args.mode == "depthwise":
        projection = encoder.linear_to_embed_size_concat
    else:
        raise ValueError(
            "unsupported mode {!r}; expected 'side_by_side' or 'depthwise'"
            .format(args.mode))
    params = list(decoder.parameters()) + list(projection.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

        # Save the model checkpoints (fixed names, so each save overwrites
        # the previous one).
        # NOTE(review): the original indentation was lost; saving once per
        # epoch is assumed here -- confirm the intended frequency.
        torch.save(decoder.state_dict(),
                   os.path.join(model_dir, 'decoder.pt'))
        torch.save(encoder.state_dict(),
                   os.path.join(model_dir, 'encoder.pt'))
def main(args):
    """Train the captioning model on data served by ``SASR_Data_Loader``.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``vocab_path``, ``data_file``, ``init_flag``, ``batch_size``,
            ``num_workers``, ``embed_size``, ``hidden_size``, ``num_layers``,
            ``learning_rate``, ``num_epochs``, ``log_step`` and ``save_step``.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    transform = transforms.Compose([
        transforms.RandomAffine(0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize((0.8, 0.7, 0.8), (1, 1, 1)),
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform, shuffle=True,
        num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    # NOTE(review): encoder.resnet parameters are handed to the optimizer,
    # but whether they receive gradients depends on EncoderCNN.forward and
    # on the volatile flag used for the images below -- confirm.
    params = (list(decoder.parameters()) +
              list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()) +
              list(encoder.resnet.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(frogger_data_loader):
            images = to_var(images, volatile=True)

            # Batches holding a single sample are not trained on.
            # NOTE(review): the original indentation was lost; logging and
            # checkpointing are assumed to be skipped for such batches too,
            # since ``loss`` would otherwise be stale or undefined.
            if images.size(0) == 1:
                continue

            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info. ``loss.item()`` replaces ``loss.data[0]``,
            # which raises on the 0-dim loss tensors of PyTorch >= 0.5.
            if i % args.log_step == 0:
                loss_value = loss.item()
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss_value, np.exp(loss_value)))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path,
                    'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train the caption generator and a caption discriminator side by side.

    Each step first updates the generator (encoder head + decoder) with a
    cross-entropy loss, then updates the discriminator on three inputs:
    the ground-truth captions, captions sampled from the decoder, and the
    ``wrong_captions`` delivered by the data loader.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``crop_size``, ``vocab_path``, ``image_dir``, ``caption_path``,
            ``batch_size``, ``num_workers``, ``embed_size``, ``hidden_size``,
            ``num_layers``, ``learning_rate``, ``num_epochs`` and
            ``log_step``.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models (Gen)
    # TODO: put these in generator
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Build the models (Disc)
    discriminator = Discriminator(args.embed_size, args.hidden_size,
                                  len(vocab), args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        discriminator.cuda()

    # Loss and Optimizer (Gen)
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) +
              list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Loss and Optimizer (Disc)
    params_disc = list(discriminator.parameters())
    optimizer_disc = torch.optim.Adam(params_disc)

    # Train the Models
    total_step = len(data_loader)
    disc_losses = []
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, wrong_captions,
                wrong_lengths) in enumerate(data_loader):
            # TODO: train disc before gen
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            wrong_captions = to_var(wrong_captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            # Sampled before the generator update, as in the original order.
            sampled_captions = decoder.sample(features)

            # Length of every sampled caption: position of the first
            # '<end>' token (inclusive), or the full width if none occurs.
            # The ids are copied to the host once instead of once per token.
            sampled_ids = sampled_captions.cpu().data.numpy()
            max_len = sampled_ids.shape[1]
            sampled_lengths = np.empty(sampled_ids.shape[0], dtype=np.int64)
            for row, row_ids in enumerate(sampled_ids):
                sampled_lengths[row] = max_len
                for index, word_id in enumerate(row_ids):
                    if vocab.idx2word[int(word_id)] == '<end>':
                        sampled_lengths[row] = index + 1
                        break

            # Order the fake batch by decreasing length. The lengths used to
            # be sorted on their own, which detached them from their caption
            # rows; the same permutation is now applied to the captions and
            # to the images they belong to.
            order = np.argsort(-sampled_lengths, kind='mergesort')
            sampled_lengths = sampled_lengths[order].tolist()
            order = to_var(torch.from_numpy(order.astype(np.int64)))
            fake_captions = sampled_captions.index_select(0, order)
            fake_images = images.index_select(0, order)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Train discriminator
            discriminator.zero_grad()
            rewards_real = discriminator(images, captions, lengths)
            rewards_fake = discriminator(fake_images, fake_captions,
                                         sampled_lengths)
            rewards_wrong = discriminator(images, wrong_captions,
                                          wrong_lengths)
            real_loss = -torch.mean(torch.log(rewards_real))
            # clamp keeps log(0) = -inf out of the loss
            fake_loss = -torch.mean(
                torch.clamp(torch.log(1 - rewards_fake), min=-1000))
            wrong_loss = -torch.mean(
                torch.clamp(torch.log(1 - rewards_wrong), min=-1000))
            loss_disc = real_loss + fake_loss + wrong_loss
            disc_losses.append(loss_disc.item())
            loss_disc.backward()
            optimizer_disc.step()

            # Print log info
            if i % args.log_step == 0:
                loss_value = loss.item()
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss_value, np.exp(loss_value)))

            # Save the models at the last iteration of every epoch
            if (i + 1) % total_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path,
                    'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(discriminator.state_dict(), os.path.join(
                    args.model_path,
                    'discriminator-%d-%d.pkl' % (epoch + 1, i + 1)))

        # plot at the end of every epoch
        plt.plot(disc_losses, label='disc loss')
        plt.savefig('disc_losses.png')
        plt.clf()
def main(args):
    """Train a value model that predicts rewards for partial captions.

    A pre-trained image/caption encoder pair (the "reward model", loaded
    from ``args.encoder_path_e_img`` / ``args.encoder_path_e_capt``) scores
    every image against its full caption. The captions are then cut at a
    random position and a second encoder pair plus an MLP is trained with an
    MSE loss to reproduce that score from the image and the partial caption.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``crop_size``, ``vocab_path``, ``image_dir``, ``caption_path``,
            ``batch_size``, ``num_workers``, ``embed_size``, ``hidden_size``,
            ``num_layers``, ``encoder_path_e_img``, ``encoder_path_e_capt``,
            ``learning_rate``, ``num_epochs``, ``log_step`` and
            ``save_step``.
    """
    random.seed()

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder_img = EncoderCNN(args.hidden_size)
    encoder_capt = EncoderRNN(args.embed_size, args.hidden_size, len(vocab),
                              args.num_layers)
    mlp = MLPNN(args.hidden_size + args.hidden_size)
    encoder_img_e = EncoderCNN(args.hidden_size)
    encoder_capt_e = EncoderRNN(args.embed_size, args.hidden_size,
                                len(vocab), args.num_layers)

    # load the reward model
    encoder_img_e.load_state_dict(torch.load(args.encoder_path_e_img))
    encoder_capt_e.load_state_dict(torch.load(args.encoder_path_e_capt))

    if torch.cuda.is_available():
        encoder_img.cuda()
        encoder_capt.cuda()
        mlp.cuda()
        encoder_img_e.cuda()
        encoder_capt_e.cuda()

    # Loss and Optimizer. The MLP is part of the trained model (it is
    # zero-graded and checkpointed below), so its parameters must be handed
    # to the optimizer as well; they were missing before.
    criterion = nn.MSELoss()
    params = (list(encoder_capt.parameters()) +
              list(encoder_img.linear.parameters()) +
              list(encoder_img.bn.parameters()) +
              list(mlp.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)

            # Regression targets: similarity of every image with its own
            # full caption under the reward model (diagonal of the score
            # matrix), cut off from the autograd graph.
            features = encoder_img_e(images)
            outputs = encoder_capt_e(captions, lengths)
            scores = torch.mm(features, outputs.transpose(1, 0))
            rvals = scores.diag().detach()

            # Forward, Backward and Optimize
            encoder_capt.zero_grad()
            encoder_img.zero_grad()
            mlp.zero_grad()
            img_features = encoder_img(images)

            # Randomly shorten each caption. Cut positions are drawn on the
            # host so the code no longer depends on a hard-coded CUDA device.
            batch_size = captions.size(0)
            max_len = captions.size(1)
            cuts = (max_len * torch.rand(batch_size)).long().tolist()
            partial_lengths = []
            for k, cut in enumerate(cuts):
                if cut < lengths[k]:
                    # 2 is presumably the <end> token id -- TODO confirm
                    captions[k, cut] = 2
                    if cut + 1 < max_len:
                        captions[k, cut + 1:] = 0
                    partial_lengths.append(cut + 1)
                else:
                    # Cut lies beyond the caption: it stays untouched, so it
                    # keeps its own length rather than cut + 1, which would
                    # count padding as caption tokens.
                    partial_lengths.append(lengths[k])

            # Re-order the batch by decreasing partial length. The rows are
            # gathered with index_select; the previous in-place
            # index_copy_(0, idx, self) scattered a tensor onto itself,
            # which does not yield the sorted order.
            order = sorted(range(batch_size),
                           key=lambda k: partial_lengths[k], reverse=True)
            lengths = [partial_lengths[k] for k in order]
            order = to_var(torch.LongTensor(order))
            captions = captions.index_select(0, order)
            img_features = img_features.index_select(0, order)
            rvals = rvals.index_select(0, order)

            cap_features = encoder_capt(captions, lengths)
            outputs = mlp(img_features, cap_features)
            loss = criterion(outputs, rvals)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                loss_value = loss.item()
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss_value, np.exp(loss_value)))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(encoder_capt.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-capt-%d-%d-v.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder_img.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-img-%d-%d-v.pkl' % (epoch + 1, i + 1)))
                torch.save(mlp.state_dict(), os.path.join(
                    args.model_path,
                    'mlp-%d-%d-v.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train the CNN encoder / RNN decoder captioning model.

    Optimises the decoder and the encoder's ``linear`` and ``bn`` layers
    with a cross-entropy loss and writes periodic checkpoints into
    ``args.model_path``.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``crop_size``, ``vocab_path``, ``image_dir``, ``caption_path``,
            ``batch_size``, ``num_workers``, ``embed_size``, ``hidden_size``,
            ``num_layers``, ``learning_rate``, ``num_epochs``, ``log_step``
            and ``save_step``.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) +
              list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info. ``loss.item()`` replaces ``loss.data[0]``,
            # which raises on the 0-dim loss tensors of PyTorch >= 0.5.
            if i % args.log_step == 0:
                loss_value = loss.item()
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss_value, np.exp(loss_value)))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path,
                    'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train (or resume training of) the captioning model.

    ``args`` is a plain dict. The keys read here are ``exp_dir``,
    ``crop_size``, ``data_dir``, ``raw_data_dir``, ``batch_size``,
    ``num_workers``, ``shuffle``, ``pretrained``, ``lr``, ``num_epochs``,
    ``log_step`` and ``valid_step``; ``vocab_size`` is written into it.
    When ``args['pretrained']`` is truthy, the latest checkpoint found in
    ``exp_dir`` supplies the model weights and the step/epoch to resume at.
    """
    configure(os.path.join(args['exp_dir'], 'log_dir'))

    augmentation = transforms.Compose([
        transforms.RandomCrop(args['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
    loader_options = {
        'data_dir': args['data_dir'],
        'exp_dir': args['exp_dir'],
        'raw_data_dir': args['raw_data_dir'],
        'batch_size': args['batch_size'],
        'transform': augmentation,
        'num_workers': args['num_workers'],
        'shuffle': args['shuffle'],
        'mode': 'train',
    }
    data_loader = get_loader(loader_options)

    args['vocab_size'] = len(Vocabulary.load_vocab(args['exp_dir']))
    encoder = EncoderCNN(args).train()
    decoder = DecoderRNN(args).train()

    # Fresh run by default; a checkpoint overrides the counters and turns on
    # the skipping of batches that were already consumed.
    step, epoch, omit = 0, 0, False
    if args['pretrained']:
        latest = Checkpoint.get_latest_checkpoint(args['exp_dir'])
        checkpoint = Checkpoint.load(latest)
        encoder.load_state_dict(checkpoint.encoder)
        decoder.load_state_dict(checkpoint.decoder)
        step, epoch, omit = checkpoint.step, checkpoint.epoch, True

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()
    trainable = list(decoder.parameters())
    trainable = trainable + list(encoder.linear.parameters())
    trainable = trainable + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args['lr'])
    scheduler = StepLR(optimizer, step_size=40, gamma=0.1)

    total_step = len(data_loader)
    min_valid_loss = float('inf')  # only referenced by disabled validation
    first_epoch = epoch
    for epoch in range(first_epoch, args['num_epochs']):
        scheduler.step()
        for idx, (images, captions, leng) in enumerate(data_loader):
            if omit:
                # Number of batches of this epoch the checkpoint had
                # already processed.
                consumed = step - total_step * epoch
                if idx < consumed:
                    logger.info(
                        'idx:{},step:{}, epoch:{}, total_step:{}, diss:{}'.
                        format(idx, step, epoch, total_step, consumed))
                    continue
                omit = False

            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, leng,
                                           batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, leng)
            loss = criterion(outputs, targets)

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), 5)
            optimizer.step()

            log_value('loss', loss.item(), step)
            step += 1

            if step % args['log_step'] == 0:
                logger.info(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args['num_epochs'], idx, total_step,
                            loss.item(), np.exp(loss.item())))

            if step % args['valid_step'] == 0:
                snapshot = Checkpoint(encoder, decoder, optimizer, epoch,
                                      step)
                snapshot.save(args['exp_dir'])
def _append_log(model_path, text):
    """Append ``text`` to ``log.txt`` inside ``model_path``."""
    with open(os.path.join(model_path, "log.txt"), "a+") as f_log:
        f_log.write(text)


def main(args):
    """Train the captioning model and validate it after every epoch.

    Per-batch loss and BLEU values of both phases are collected and stored
    as ``train_loss.npy``, ``val_loss.npy``, ``train_bleu.npy`` and
    ``val_bleu.npy`` in ``args.model_path``; progress is also appended to
    ``log.txt`` in the same directory.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``crop_size``, ``vocab_path``, ``image_dir``, ``caption_path``,
            ``val_image_dir``, ``val_caption_path``, ``batch_size``,
            ``num_workers``, ``embed_size``, ``hidden_size``, ``num_layers``,
            ``learning_rate``, ``num_epochs``, ``log_step`` and
            ``save_step``.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loaders
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)
    val_loader = get_loader(args.val_image_dir, args.val_caption_path, vocab,
                            transform, args.batch_size,
                            shuffle=False, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) +
              list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    train_loss_arr = []
    val_loss_arr = []
    train_bleu_arr = []
    val_bleu_arr = []
    for epoch in range(1, args.num_epochs + 1):
        # Validation below switches to eval mode, so switch back each epoch.
        encoder.train()
        decoder.train()
        iteration_loss = []
        iteration_bleu = []
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            iteration_loss.append(loss.item())
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # BLEU score for this batch; sampling needs no gradients
            with torch.no_grad():
                sampled_ids = decoder.sample(features)
            sampled_ids = sampled_ids.cpu().numpy()
            bleu_score_batch = get_bleu(captions, sampled_ids, vocab)
            iteration_bleu.append(bleu_score_batch)

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Bleu: '
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item()) + str(bleu_score_batch))
                _append_log(
                    args.model_path,
                    "Epoch: " + str(epoch) + "/" + str(args.num_epochs) +
                    " Step: " + str(i) + "/" + str(total_step) +
                    " loss: " + str(loss.item()) +
                    " Bleu: " + str(bleu_score_batch) + "\n")

            # Save the model checkpoints
            # NOTE(review): ``epoch`` is already 1-based in this loop, so
            # the ``epoch + 1`` in the file names is one ahead of the epoch
            # shown in the log. Kept as-is to avoid renaming outputs --
            # confirm whether that offset is intended.
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path,
                    'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

        train_loss_arr.append(np.array(iteration_loss))
        train_bleu_arr.append(np.array(iteration_bleu))

        # Validation: eval mode keeps batch-norm statistics untouched by
        # validation data, and no_grad avoids building autograd graphs.
        encoder.eval()
        decoder.eval()
        val_iteration_loss = []
        val_iteration_bleu = []
        with torch.no_grad():
            for images_val, captions_val, lengths_val in val_loader:
                images_val = images_val.to(device)
                captions_val = captions_val.to(device)
                targets = pack_padded_sequence(captions_val, lengths_val,
                                               batch_first=True)[0]

                features = encoder(images_val)
                outputs = decoder(features, captions_val, lengths_val)
                val_iteration_loss.append(criterion(outputs, targets).item())

                # BLEU score for this batch
                sampled_ids = decoder.sample(features).cpu().numpy()
                bleu_score_batch = get_bleu(captions_val, sampled_ids, vocab)
                val_iteration_bleu.append(bleu_score_batch)

        # An empty validation loader yields NaN instead of a division error.
        if val_iteration_loss:
            val_loss = sum(val_iteration_loss) / len(val_iteration_loss)
        else:
            val_loss = float('nan')

        # The BLEU value reported here is the one of the last batch scored.
        print('Epoch [{}/{}], Val Loss: {:.4f}, Bleu: '.format(
            epoch, args.num_epochs, val_loss) + str(bleu_score_batch))
        _append_log(
            args.model_path,
            "Epoch: " + str(epoch) + "/" + str(args.num_epochs) +
            " val loss: " + str(val_loss) +
            " Bleu: " + str(bleu_score_batch) + "\n\n")
        val_loss_arr.append(np.array(val_iteration_loss))
        val_bleu_arr.append(np.array(val_iteration_bleu))

        # Histories are rewritten after every epoch, so an interrupted run
        # still leaves the curves of the completed epochs on disk.
        np.save(os.path.join(args.model_path, "train_loss.npy"),
                np.array(train_loss_arr))
        np.save(os.path.join(args.model_path, "val_loss.npy"),
                np.array(val_loss_arr))
        np.save(os.path.join(args.model_path, "train_bleu.npy"),
                np.array(train_bleu_arr))
        np.save(os.path.join(args.model_path, "val_bleu.npy"),
                np.array(val_bleu_arr))
def main(args):
    """Train the captioning model, optionally reporting to a Crayon server.

    Checkpoints, the run parameters (``parameters.json``) and the vocabulary
    (``vocab.pkl``) are stored in ``<args.model_path>/<args.name>``.

    The permanently disabled (``if 1 == 2``) mid-epoch evaluation of the
    original, together with the dataset split and loaders that only it
    used, has been removed.

    Args:
        args: Parsed command-line namespace. Reads ``tensorboard``, ``name``,
            ``model_path``, ``crop_size``, ``vocab_path``, ``image_dir``,
            ``batch_size``, ``num_workers``, ``embed_size``, ``train_cnn``,
            ``hidden_size``, ``num_layers``, ``learning_rate``,
            ``num_epochs``, ``log_step`` and ``save_step``.
    """
    # setup tensorboard
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        # Start from a clean experiment; a failed removal is not fatal.
        try:
            cc.remove_experiment(args.name)
        except Exception:
            print("experiment didnt exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write(json.dumps(vars(args)))

    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    # Load vocabulary wrapper.
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
    # NOTE(review): the original indentation was lost; the vocabulary is
    # assumed to be stored next to the model in both cases -- confirm.
    with open(full_model_path + "/vocab.pkl", 'wb') as f:
        pickle.dump(vocab, f)

    # Build data loader
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) +
              list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        decoder.train()
        encoder.train()
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            image_ts = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Token-level accuracy of the arg-max predictions. The count is
            # converted to a Python float first so the percentage is not
            # computed with integer tensor arithmetic.
            total = targets.size(0)
            _, predicted = torch.max(outputs.data, 1)
            correct = float(predicted.eq(targets.data).sum())
            accuracy = 100. * correct / total

            loss_value = loss.item()
            perplexity = np.exp(loss_value)

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss_value)
                cc_server.add_scalar_value("perplexity", perplexity)
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss_value, accuracy, perplexity))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    full_model_path,
                    'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    full_model_path,
                    'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    # Final checkpoint, named after the last epoch/step that was run.
    # NOTE(review): the original indentation was lost; this save is assumed
    # to follow the training loop rather than belong to the removed dead
    # branch -- confirm.
    if args.num_epochs > 0 and total_step > 0:
        torch.save(decoder.state_dict(), os.path.join(
            full_model_path,
            'decoder-%d-%d.pkl' % (args.num_epochs, total_step)))
        torch.save(encoder.state_dict(), os.path.join(
            full_model_path,
            'encoder-%d-%d.pkl' % (args.num_epochs, total_step)))

    end_time = time.time()
    # The runtime used to be passed as a second print argument, so the
    # ``%d`` placeholder was printed literally.
    print("finished training, runtime: %d" % (end_time - start_time))
def main(args):
    """Train the CNN encoder with the attention RNN decoder.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``crop_size``, ``vocab_path``, ``image_dir``, ``caption_path``,
            ``batch_size``, ``num_workers``, ``embed_size``, ``hidden_size``,
            ``num_layers``, ``learning_rate``, ``num_epochs``, ``log_step``
            and ``save_step``.
    """
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    encoder = EncoderCNN(args.embed_size)
    decoder = AttnDecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters()) +
              list(encoder.linear.parameters()) +
              list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(data_loader)
    # The same initial hidden state object is passed to every batch.
    # NOTE(review): presumably the decoder does not modify it in place --
    # confirm against AttnDecoderRNN.forward.
    decoder_hidden = decoder.init_hidden()

    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = cuda_variable(images, volatile=True)
            captions = cuda_variable(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(captions, decoder_hidden, features, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # ``loss.item()`` replaces ``loss.data[0]``, which raises on the
            # 0-dim loss tensors of PyTorch >= 0.5.
            if i % args.log_step == 0:
                loss_value = loss.item()
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss_value, np.exp(loss_value)))

            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path,
                    'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train the captioning model while tracking loss and token accuracy.

    Per-epoch averages are appended to ``train_loss.txt`` and
    ``train_acc.txt`` when training ends; intermediate values go to
    ``my_train_loss_t4_resnext.txt``.

    NOTE(review): the source's indentation was lost. The write to
    ``my_train_loss_t4_resnext.txt`` is assumed to belong to the log-step
    branch and the two summary files to be written once after the last
    epoch -- confirm.

    Args:
        args: Parsed command-line namespace. Reads ``model_path``,
            ``crop_size``, ``vocab_path``, ``image_dir``, ``caption_path``,
            ``batch_size``, ``num_workers``, ``embed_size``, ``hidden_size``,
            ``num_layers``, ``learning_rate``, ``num_epochs``, ``log_step``
            and ``save_step``.
    """
    train_losses = []
    train_acc = []

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    augmentations = [
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
    transform = transforms.Compose(augmentations)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    trainable = list(decoder.parameters())
    trainable = trainable + list(encoder.linear.parameters())
    trainable = trainable + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        epoch_no = epoch + 1
        losses = []
        accuracy = 0.0  # running sum of per-batch accuracies
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            # Record loss and the share of tokens whose top-1 prediction
            # equals the target
            losses.append(loss.item())
            _, best = outputs.topk(1, dim=1)
            hits = (best == targets.unsqueeze(-1)).sum()
            accuracy += float(hits) / targets.shape[0]

            # Backward pass and parameter update
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                running_acc = accuracy / float(i + 1)
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}, Accuracy: {:.4f}'
                      .format(epoch_no, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item()), running_acc))
                with open('my_train_loss_t4_resnext.txt', 'a') as fi:
                    fi.write('\n' +
                             'epoch = {}, i = {}, tr_loss = {}, acc = {}'.
                             format(epoch_no, i + 1, loss.item(),
                                    running_acc))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                decoder_file = 'my-decoder-{}-{}-t4-resnext.ckpt'.format(
                    epoch_no, i + 1)
                encoder_file = 'my-encoder-{}-{}-t4-resnext.ckpt'.format(
                    epoch_no, i + 1)
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path, decoder_file))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path, encoder_file))

        train_losses.append(sum(losses) / total_step)
        train_acc.append(accuracy / total_step)

    # save losses over epoch
    with open("train_loss.txt", "a") as loss_file:
        loss_file.write(str(train_losses))

    # save accuracies over epoch
    with open("train_acc.txt", "a") as acc_file:
        acc_file.write(str(train_acc))
def main():
    """Train the captioning encoder/decoder using settings from ``Config``.

    Reads ``model_path``, ``train_transform``, ``vocab_path``,
    ``image_path``, ``caption_path``, ``batch_size``, ``num_threads``,
    ``embed_size``, ``hidden_size``, ``num_layers``, ``learning_rate``,
    ``num_epochs``, ``log_step`` and ``save_step`` from a fresh ``Config()``.

    Side effects: creates ``config.model_path`` if needed and writes
    ``decoder-<epoch>-<step>.pkl`` / ``encoder-<epoch>-<step>.pkl`` there
    every ``save_step`` batches.
    """
    # Configuration for hyper-parameters
    config = Config()

    # Create model directory
    if not os.path.exists(config.model_path):
        os.makedirs(config.model_path)

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path,
                                   json_path,
                                   vocab,
                                   transform,
                                   config.batch_size,
                                   shuffle=True,
                                   num_workers=config.num_threads)
    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    # The original statement was missing its colon, which made the whole
    # module a SyntaxError.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer. Only the decoder and the encoder's `resnet.fc`
    # layer are handed to the optimizer.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)

    # Train the Models
    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths) in enumerate(train_loader):
            # Set mini-batch dataset
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions,
                                           lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            # NOTE(review): `loss.data[0]` matches the Variable-era API used
            # throughout this block; newer PyTorch needs `loss.item()`.
            if i % config.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, config.num_epochs, i, total_step,
                       loss.data[0], np.exp(loss.data[0])))

            # Save the Model
            if (i + 1) % config.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(config.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(config.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
# Randomly sample a caption length, and sample indices with that length. indices = data_loader.dataset.get_train_indices() # Create and assign a batch sampler to retrieve a batch with the sampled indices. new_sampler = data.sampler.SubsetRandomSampler(indices=indices) data_loader.batch_sampler.sampler = new_sampler # Obtain the batch. images, captions = next(iter(data_loader)) # print(images.shape) # Move batch of images and captions to GPU if CUDA is available. images = images.to(device) captions = captions.to(device) # Zero the gradients. decoder.zero_grad() encoder.zero_grad() # Pass the inputs through the CNN-RNN model. features = encoder(images) outputs = decoder(features, captions) # Calculate the batch loss. loss = criterion(outputs.view(-1, vocab_size), captions.view(-1)) # Backward pass. loss.backward() # Update the parameters in the optimizer. optimizer.step() # Get training statistics.
def main(args):
    """Run caption-model training, echoing tensor shapes for debugging.

    Hyper-parameters and paths come from ``args`` (``model_path``,
    ``crop_size``, ``vocab_path``, ``image_dir``, ``caption_path``,
    ``batch_size``, ``num_workers``, ``embed_size``, ``hidden_size``,
    ``num_layers``, ``learning_rate``, ``num_epochs``, ``log_step``,
    ``save_step``). Checkpoints are written to ``args.model_path`` every
    ``save_step`` batches.
    """
    # Checkpoint directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing: compose all steps into one pipeline that yields a
    # (C, H, W) tensor, then normalize each channel with mean 0.5 / std 0.5.
    preprocess = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Data loader
    loader = get_loader(args.image_dir,
                        args.caption_path,
                        vocab,
                        preprocess,
                        args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers)

    # Models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer over the decoder plus the encoder's final fc layer
    criterion = nn.CrossEntropyLoss()
    trainable = list(decoder.parameters())
    trainable += list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.learning_rate)

    log_template = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'

    # Training loop
    steps_per_epoch = len(loader)
    for epoch in range(args.num_epochs):
        for step, (images, captions, lengths) in enumerate(loader):
            # Wrap the mini-batch and move it to the GPU when there is one
            images = Variable(images)
            captions = Variable(captions)
            print("cap size %s" % str(captions.size()))
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            packed = pack_padded_sequence(captions, lengths, batch_first=True)
            targets = packed[0]
            print(targets)

            # Forward, backward, optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            print("cnn feats %s" % str(features.size()))
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Periodic console log
            if step % args.log_step == 0:
                print(log_template %
                      (epoch, args.num_epochs, step, steps_per_epoch,
                       loss.data[0], np.exp(loss.data[0])))

            # Periodic checkpoints
            if (step + 1) % args.save_step == 0:
                tag = (epoch + 1, step + 1)
                decoder_file = os.path.join(args.model_path,
                                            'decoder-%d-%d.pkl' % tag)
                torch.save(decoder.state_dict(), decoder_file)
                encoder_file = os.path.join(args.model_path,
                                            'encoder-%d-%d.pkl' % tag)
                torch.save(encoder.state_dict(), encoder_file)
def main(args):
    """Train the captioning model and persist the per-batch loss history.

    Reads from ``args``: ``model_path``, ``vocab_path``, ``crop_size``,
    ``encoder``, ``decoder``, ``embed_size``, ``train_encoder``,
    ``hidden_size``, ``num_layers``, ``restart``, ``image_dir``,
    ``caption_path``, ``batch_size``, ``num_workers``, ``logfile``,
    ``learning_rate``, ``num_epochs`` and ``save_step``.

    Side effects: creates ``args.model_path`` if needed, appends to the log
    file, writes encoder/decoder checkpoints every ``save_step`` batches and
    pickles the loss/accuracy histories to ``training_loss.pkl`` and
    ``training_val.pkl``.

    NOTE(review): log and pickle paths are built as
    ``args.model_path + name``, so ``model_path`` is expected to end with a
    path separator - confirm against callers.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # NOTE(review): the validation loader is built but never read below;
    # kept in case get_loader has side effects.
    val_loader = get_loader('./data/val_resized2014/',
                            './data/annotations/captions_val2014.json',
                            vocab, transform, 1, False, 1)

    start_epoch = 0
    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Resolve which saved state each model is labelled with; 'new' means
    # start from scratch.
    if args.restart:
        encoder_state, decoder_state = 'new', 'new'
    if encoder_state == '':
        encoder_state = 'new'
    if decoder_state == '':
        decoder_state = 'new'
    if decoder_state != 'new':
        # The second '-'-separated field of the decoder name is the epoch.
        start_epoch = int(decoder_state.split('-')[1])

    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Make logfile and log output
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Training on vanilla loss (using new model). Started {} .\n".
                format(str(datetime.now())))
        f.write("Using encoder: new\nUsing decoder: new\n\n")

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    # Never appended to in this function; pickled as an empty list.
    batch_acc = []

    # Train the Models
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions,
                                           lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            out = decoder(features, captions, lengths)
            loss = criterion(out, targets)
            batch_loss.append(loss.data[0])
            loss.backward()
            optimizer.step()

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    # pickle writes bytes, so the files must be opened in binary mode; the
    # original text mode ('w+') raises TypeError on Python 3.
    with open(args.model_path + 'training_loss.pkl', 'wb') as f:
        pickle.dump(batch_loss, f)
    with open(args.model_path + 'training_val.pkl', 'wb') as f:
        pickle.dump(batch_acc, f)

    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
def train(
    num_epochs: int,
    lr: float,
    batch_size: int,
    vocab_threshold: int,
    vocab_from_file: bool,
    embed_size: int,
    hidden_size: int,
    save_every: int,
    print_every: int,
    log_file: str
) -> None:
    """Train the captioning network and log loss/perplexity to ``log_file``.

    Args:
        num_epochs: Number of epochs to train the model.
        lr: Learning rate passed to the Adam optimizer.
        batch_size: Mini-batch size for training.
        vocab_threshold: Minimum word count threshold for vocabulary
            initialisation. A word that appears fewer times than this in the
            dataset is discarded, so a smaller threshold gives a bigger
            vocabulary.
        vocab_from_file: Whether to load the vocabulary from a
            pre-initialized file.
        embed_size: Dimensionality of image and word embeddings.
        hidden_size: Number of features in hidden state of the RNN decoder.
        save_every: Number of epochs between each checkpoint saving.
        print_every: Number of batches between progress lines that are kept
            on screen.
        log_file: Name of the training log file (one line per step).

    Side effects: overwrites ``log_file`` and writes encoder/decoder weights
    under ``./models`` every ``save_every`` epochs.
    """
    transform_train = transforms.Compose([
        transforms.Resize(256),             # smaller edge of image resized to 256
        transforms.RandomCrop(224),         # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(),  # horizontally flip image with probability=0.5
        transforms.ToTensor(),              # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),   # normalize image for pre-trained model
                             (0.229, 0.224, 0.225))])

    # Build data loader.
    data_loader = get_loader(transform=transform_train,
                             mode='train',
                             batch_size=batch_size,
                             vocab_threshold=vocab_threshold,
                             vocab_from_file=vocab_from_file)

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder.
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move models to GPU if CUDA is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)

    # Define the loss function on the same device as the models.
    criterion = nn.CrossEntropyLoss().to(device)

    # Parameters to update: the encoder's `embed` layer and the decoder.
    # The rest of the CNN is not handed to the optimizer.
    params = list(encoder.embed.parameters()) + list(decoder.parameters())

    # TODO: add learning rate scheduler
    optimizer = optim.Adam(params, lr=lr)

    # Set the total number of training steps per epoch.
    total_step = math.ceil(len(data_loader.dataset.caption_lengths) /
                           data_loader.batch_sampler.batch_size)

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs('./models', exist_ok=True)

    # Open the training log file; `with` closes it even if training raises.
    with open(log_file, 'w') as f:
        for epoch in range(1, num_epochs + 1):
            for i_step in range(1, total_step + 1):
                # Randomly sample a caption length, and sample indices with
                # that length.
                indices = data_loader.dataset.get_train_indices()
                # Create and assign a batch sampler to retrieve a batch with
                # the sampled indices.
                new_sampler = data.sampler.SubsetRandomSampler(
                    indices=indices)
                data_loader.batch_sampler.sampler = new_sampler

                # Obtain the batch.
                images, captions = next(iter(data_loader))

                # Move batch of images and captions to GPU if available.
                images = images.to(device)
                captions = captions.to(device)

                # Zero the gradients.
                decoder.zero_grad()
                encoder.zero_grad()

                # Pass the inputs through the CNN-RNN model.
                features = encoder(images)
                outputs = decoder(features, captions)

                # Calculate the batch loss.
                loss = criterion(outputs.view(-1, vocab_size),
                                 captions.view(-1))

                # Backward pass and parameter update.
                loss.backward()
                optimizer.step()

                # Get training statistics.
                loss_value = loss.item()
                stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (
                    epoch, num_epochs, i_step, total_step, loss_value,
                    np.exp(loss_value))

                # Print training statistics (on same line).
                print('\r' + stats, end="")
                sys.stdout.flush()

                # Print training statistics to file.
                f.write(stats + '\n')
                f.flush()

                # Print training statistics (on different line).
                if i_step % print_every == 0:
                    print('\r' + stats)

            # Save the weights.
            if epoch % save_every == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(
                        './models',
                        f"{device}_{hidden_size}_decoder-{epoch}.pkl"))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(
                        './models',
                        f"{device}_{hidden_size}_encoder-{epoch}.pkl"))
# Training-loop fragment for a CNN encoder (`cnn`) and an RNN decoder
# (`rnn`). `cnn`, `vocab_size`, `device`, `num_epochs` and `dataset_loader`
# must already exist in the enclosing scope.
rnn = DecoderRNN(512, 512, vocab_size).to(device)

criterion = nn.CrossEntropyLoss()
# Only the CNN's `linear` layer and the decoder are handed to the optimizer.
params = list(cnn.linear.parameters()) + list(rnn.parameters())
optimizer = torch.optim.Adam(params, lr=1e-3)

for epoch in range(num_epochs):
    # Wall-clock time at the start of the epoch.
    tic = time.time()
    for i, (image, captions, lengths) in enumerate(dataset_loader):
        image = image.to(device)
        captions = captions.to(device)
        # Packing the padded captions gives the flat target tensor.
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        cnn.zero_grad()
        rnn.zero_grad()

        cnn_out = cnn.forward(image)
        lstm_out = rnn.forward(cnn_out, captions, lengths)
        loss = criterion(lstm_out, targets)
        loss.backward()
        optimizer.step()

        # Log every 1000 batches.
        if i % 1000 == 0:
            print(
                'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                .format(epoch, num_epochs, i, len(dataset_loader),
                        loss.item(), np.exp(loss.item())))
    # Wall-clock time at the end of the epoch; not used in this fragment.
    toc = time.time()
class Worker:
    """One rank of a mixed CPU/GPU data-parallel captioning trainer.

    The constructor builds the ``dist`` extension, reads the rank topology
    from it and derives the per-rank batch size. ``train`` runs the training
    loop and, unless ``args.no_partition`` is set, calls
    ``average_gradients`` before every optimizer step.
    """

    def __init__(self, args):
        """Set up the communication layer, RNG and batch sizes.

        Reads ``gpu_only``, ``reproducible``, ``num_threads``,
        ``batch_size`` and ``cpu_batch_size`` from ``args``.

        Raises:
            AssertionError: if the GPU share of the batch does not divide
                evenly among the GPU ranks.
        """
        # Initialize MPI/NCCL and set topology variables
        self.init_dist(args.gpu_only)
        self.rank = self.dist.get_rank()
        self.world_size = self.dist.get_world_size()
        self.local_rank = self.dist.get_local_rank()
        self.local_size = self.dist.get_local_size()
        self.n_gpus = self.dist.get_n_gpus()
        # Integer division keeps the node and worker counts integral, the
        # same way `node` is computed on the next line.
        self.n_nodes = self.world_size // self.local_size
        self.node = self.rank // self.local_size
        self.n_cpu_workers = (self.local_size - self.n_gpus) * self.n_nodes
        self.n_gpu_workers = self.n_gpus * self.n_nodes

        # Set RNG seed for reproducibility, can be left on
        torch.manual_seed(1234)

        # CuDNN reproducibility
        if args.reproducible:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        # Set number of threads (the actual call is disabled; only the
        # message is printed)
        if self.dist.is_cpu_rank():
            #torch.set_num_threads(args.num_threads)
            print("[Rank {}] Setting number of OMP threads to {}".format(
                self.rank, args.num_threads),
                  flush=True)

        # Calculate batch sizes
        self.total_batch_size = args.batch_size
        self.cpu_batch_size = args.cpu_batch_size
        # NOTE(review): n_cpu_workers already contains a factor of n_nodes,
        # so this multiplies by n_nodes twice - kept as in the original,
        # confirm intent.
        cpu_share = self.cpu_batch_size * self.n_cpu_workers * self.n_nodes
        gpu_ranks = self.n_gpus * self.n_nodes
        assert ((self.total_batch_size - cpu_share) % gpu_ranks == 0), \
            "GPU batch size is not an integer"
        self.gpu_batch_size = int(
            (self.total_batch_size - cpu_share) / gpu_ranks)
        self.batch_size = (self.cpu_batch_size
                           if self.dist.is_cpu_rank() else
                           self.gpu_batch_size)

        print("[Rank {}] Current CUDA device: {}".format(
            self.rank, torch.cuda.current_device()),
              flush=True)

    def init_dist(self, gpu_only):
        """JIT-compile ``dist.cu`` and store a ``DistManager`` in ``self.dist``.

        Include, library and build paths are hard-coded for one specific
        cluster installation.
        """
        # C++ extension module with JIT compilation
        dist_module = load(
            name="dist",
            sources=["dist.cu"],
            verbose=True,
            with_cuda=True,
            extra_cuda_cflags=[
                '-ccbin',
                'g++',
                '-std=c++11',
                '-O3',
                #'-I/usr/mpi/gcc/openmpi-2.1.2-hfi/include',
                #'-I/usr/mpi/gcc/mvapich2-2.3b-hfi/include',
                '-I/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/include',
                #'-I/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/include64',
                '-I/pylon5/ac7k4vp/jchoi157/pytorch/build/nccl/include'
            ],
            extra_ldflags=[
                '-L/opt/packages/cuda/9.2/lib64',
                '-lcudart',
                '-lrt',
                #'-L/usr/mpi/gcc/openmpi-2.1.2-hfi/lib64', '-lmpi',
                #'-L/usr/mpi/gcc/mvapich2-2.3b-hfi/lib', '-lmpi',
                '-L/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/lib',
                '-lmpi',
                #'-L/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/lib64', '-lmpi',
                '-L/pylon5/ac7k4vp/jchoi157/pytorch/build/nccl/lib',
                '-lnccl'
            ],
            build_directory="/home/jchoi157/torch_extensions")
        self.dist = dist_module.DistManager(gpu_only, False)

    def average_gradients(self):
        """Scale each decoder gradient by this rank's batch share and all-reduce it.

        CPU ranks copy the gradient to CUDA device 0 for the collective and
        move it back to host memory afterwards.
        """
        # Only all-reduce decoder parameters since encoder is pre-trained
        for param in self.decoder.parameters():
            if self.dist.is_cpu_rank():
                param.grad.data = param.grad.data.cuda(0, non_blocking=True)
                param.grad.data *= (self.cpu_batch_size /
                                    self.total_batch_size)
            else:
                param.grad.data *= (self.gpu_batch_size /
                                    self.total_batch_size)
            self.dist.hetero_allreduce(param.grad.data)
            if self.dist.is_cpu_rank():
                param.grad.data = param.grad.data.cpu()

    def train(self, args):
        """Run the training loop for this rank.

        Reads ``model_path``, ``crop_size``, ``vocab_path``, ``image_dir``,
        ``caption_path``, ``reproducible``, ``no_partition``,
        ``embed_size``, ``hidden_size``, ``num_layers``, ``learning_rate``,
        ``num_epochs``, ``log_step`` and ``save_step`` from ``args``.
        Checkpoints go to ``args.model_path`` every ``save_step`` batches
        and a timing/loss summary is printed after each epoch.
        """
        # Create model directory. `exist_ok` keeps ranks that start at the
        # same time from racing between the existence check and the mkdir.
        os.makedirs(args.model_path, exist_ok=True)

        # Image preprocessing, normalization for the pretrained resnet
        transform = transforms.Compose([
            transforms.RandomCrop(args.crop_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))
        ])

        # Load vocabulary wrapper
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        # Build data loader
        data_loader = get_loader(args.image_dir,
                                 args.caption_path,
                                 vocab,
                                 transform,
                                 self.rank,
                                 self.world_size,
                                 self.local_size,
                                 self.n_gpus,
                                 self.total_batch_size,
                                 self.cpu_batch_size,
                                 self.gpu_batch_size,
                                 self.batch_size,
                                 shuffle=not args.reproducible,
                                 no_partition=args.no_partition)
        self.num_batches = len(data_loader)
        print("[Rank {}] batch size {}, num batches {}".format(
            self.rank,
            self.total_batch_size if args.no_partition else self.batch_size,
            self.num_batches),
              flush=True)

        # Build the models
        self.encoder = EncoderCNN(args.embed_size)
        self.decoder = DecoderRNN(args.embed_size, args.hidden_size,
                                  len(vocab), args.num_layers)
        if self.dist.is_gpu_rank():
            self.encoder = self.encoder.cuda(self.local_rank)
            self.decoder = self.decoder.cuda(self.local_rank)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()
        params = list(self.decoder.parameters()) + list(
            self.encoder.linear.parameters()) + list(
                self.encoder.bn.parameters())
        optimizer = torch.optim.Adam(params, lr=args.learning_rate)

        # Train the models
        total_step = len(data_loader)
        # Defined up front so the epoch summary still prints (as nan) when
        # the loader yields no batches.
        saved_loss = float('nan')
        for epoch in range(args.num_epochs):
            epoch_start_time = time.time()
            # *_sum / processed_batches cover the current logging window;
            # the *_total counters cover the whole epoch.
            batch_time_sum = 0
            batch_time_total = 0
            processed_batches = 0
            processed_batches_total = 0
            batch_start_time = time.time()
            for i, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                if self.dist.is_gpu_rank():
                    images = images.cuda(self.local_rank)
                    captions = captions.cuda(self.local_rank)
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]

                # Forward, backward, all-reduce and optimize
                features = self.encoder(images)
                outputs = self.decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                self.decoder.zero_grad()
                self.encoder.zero_grad()
                loss.backward()
                if not args.no_partition:
                    self.average_gradients()
                optimizer.step()

                batch_time = time.time() - batch_start_time
                batch_time_sum += batch_time
                batch_time_total += batch_time
                processed_batches += 1
                processed_batches_total += 1

                saved_loss = loss.item()

                # Print log info
                if i % args.log_step == 0 and i != 0:
                    print(
                        'Rank [{}], Epoch [{}/{}], Step [{}/{}], Average time: {:.6f}, Loss: {:.4f}, Perplexity: {:5.4f}'
                        .format(self.rank, epoch, args.num_epochs, i,
                                total_step,
                                batch_time_sum / processed_batches,
                                saved_loss, np.exp(saved_loss)),
                        flush=True)
                    batch_time_sum = 0
                    processed_batches = 0

                # Save the model checkpoints
                if (i + 1) % args.save_step == 0:
                    torch.save(
                        self.decoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                    torch.save(
                        self.encoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

                batch_start_time = time.time()

            epoch_time = time.time() - epoch_start_time
            # max(..., 1) avoids a ZeroDivisionError on an empty loader.
            print(
                '!!! Rank [{}], Epoch [{}], Time: {:.6f}, Average batch time: {:.6f}, Loss: {:.4f}, Perplexity: {:5.4f}'
                .format(self.rank, epoch, epoch_time,
                        batch_time_total / max(processed_batches_total, 1),
                        saved_loss, np.exp(saved_loss)),
                flush=True)
def main(args):
    """Train the CNN encoder and RNN decoder for image captioning.

    Configuration is taken from ``args`` (``model_path``, ``crop_size``,
    ``vocab_path``, ``image_dir``, ``caption_path``, ``batch_size``,
    ``num_workers``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``learning_rate``, ``num_epochs``, ``log_step``, ``save_step``).
    Progress is printed every ``log_step`` batches and both state dicts are
    saved under ``args.model_path`` every ``save_step`` batches.
    """
    # Checkpoint directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Augmentation plus the normalization expected by the pretrained resnet
    preprocess = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Data loader
    loader = get_loader(args.image_dir,
                        args.caption_path,
                        vocab,
                        preprocess,
                        args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers)

    # Models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer; the optimizer sees the decoder and the encoder's
    # `linear` and `bn` layers only
    criterion = nn.CrossEntropyLoss()
    trainable = list(decoder.parameters())
    trainable += list(encoder.linear.parameters())
    trainable += list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.learning_rate)

    log_template = 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'

    # Training loop
    steps_per_epoch = len(loader)
    for epoch in range(args.num_epochs):
        for step, (images, captions, lengths) in enumerate(loader):
            # Move the mini-batch to the training device
            images = images.to(device)
            captions = captions.to(device)
            packed = pack_padded_sequence(captions, lengths, batch_first=True)
            targets = packed[0]

            # Forward pass
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            # Backward pass and update
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodic console log
            if step % args.log_step == 0:
                print(
                    log_template.format(epoch, args.num_epochs, step,
                                        steps_per_epoch, loss.item(),
                                        np.exp(loss.item())))

            # Periodic checkpoints
            if (step + 1) % args.save_step == 0:
                decoder_file = os.path.join(
                    args.model_path,
                    'decoder-{}-{}.ckpt'.format(epoch + 1, step + 1))
                torch.save(decoder.state_dict(), decoder_file)
                encoder_file = os.path.join(
                    args.model_path,
                    'encoder-{}-{}.ckpt'.format(epoch + 1, step + 1))
                torch.save(encoder.state_dict(), encoder_file)
def main(args):
    """Train an encoder/decoder on generated data and plot the loss curves.

    Each step draws a fresh batch from ``generate_training_data``, wraps
    every token sequence in ``'<start>'``/``'<end>'``, pads all sequences to
    the longest one with ``'pad'``, and runs one optimisation step.

    Reads ``model_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``learning_rate``, ``num_iters``, ``num_epochs``, ``batch_size``,
    ``log_step`` and ``save_step`` from ``args``.

    Side effects: writes checkpoints to ``args.model_path`` every
    ``save_step`` steps and saves the loss/perplexity plot to
    ``plots/plots_cnn/cnn4_gpu``.
    """
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Build the models, can use a feedforward/convolutional encoder and an
    # RNN decoder
    encoder = EncoderCNN(args.embed_size).to(
        device)  # can be sequential or convolutional
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_training_steps = args.num_iters
    losses = []
    perplexity = []
    for epoch in range(args.num_epochs):
        for i in range(total_training_steps):
            prog_data = generate_training_data(args.batch_size)

            images = [sample[0] for sample in prog_data]
            # Named `sequences` so the local does not shadow the commonly
            # imported `transforms` module.
            sequences = [sample[1] for sample in prog_data]

            # Add start/end tokens, then pad in place to a common length.
            for seq in sequences:
                seq.insert(0, '<start>')
                seq.append('<end>')
            maximum_len = max(len(seq) for seq in sequences)
            for seq in sequences:
                seq.extend(['pad'] * (maximum_len - len(seq)))
            # After padding every sequence has length `maximum_len`, so the
            # padding positions are part of the packed targets.
            padded_lengths = [len(seq) for seq in sequences]

            token_ids = torch.tensor(
                [[word_to_int(word) for word in seq] for seq in sequences],
                device=device)
            images = torch.tensor(images, device=device)
            images = images.unsqueeze(
                1)  # adds a dimension at index 1; needed with EncoderCNN
            # pack_padded_sequence requires a lengths tensor to be on the
            # CPU, so it is deliberately not created on `device`.
            padded_lengths = torch.tensor(padded_lengths)

            targets = pack_padded_sequence(token_ids,
                                           padded_lengths,
                                           batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, token_ids, padded_lengths)
            loss = criterion(outputs, targets)

            loss_value = loss.item()
            losses.append(loss_value)
            perplexity.append(np.exp(loss_value))

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f},Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_training_steps,
                            loss_value, np.exp(loss_value)))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

    # Plot both curves against the global iteration index.
    x = np.arange(len(losses))
    plt.plot(x, losses, label='Cross Entropy Loss')
    plt.plot(x, perplexity, label='Perplexity')
    plt.xlabel('Iterations')
    plt.ylabel('Cross Entropy Loss and Perplexity')
    plt.title("Cross Entropy Loss and Model Perplexity During Training")
    plt.legend()

    # Create the output directory first so a full training run is not lost
    # to a missing folder at the very end.
    plot_dir = 'plots/plots_cnn'
    if not os.path.isdir(plot_dir):
        os.makedirs(plot_dir)
    plt.savefig('plots/plots_cnn/cnn4_gpu', dpi=100)
# Randomly sample a caption length, and sample indices with that length. indices = data_loader.dataset.get_train_indices() # Create and assign a batch sampler to retrieve a batch with the sampled indices. new_sampler = data.sampler.SubsetRandomSampler(indices=indices) data_loader.batch_sampler.sampler = new_sampler # Obtain the batch. images, captions = next(iter(data_loader)) # Move batch of images and captions to GPU if CUDA is available. images = images.to(device) captions = captions.to(device) # Zero the gradients. decoder.zero_grad() encoder.zero_grad() # Pass the inputs through the CNN-RNN model. features = encoder(images) outputs = decoder(features, captions) # Calculate the batch loss. loss = criterion(outputs.view(-1, vocab_size), captions.view(-1)) # Backward pass. loss.backward() # Update the parameters in the optimizer. optimizer.step() # Get training statistics.
def main(args):
    """Train the image-captioning encoder/decoder pair.

    Settings come from ``args`` (``model_path``, ``crop_size``,
    ``vocab_path``, ``image_dir``, ``caption_path``, ``batch_size``,
    ``num_workers``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``learning_rate``, ``num_epochs``, ``log_step``, ``save_step``). Loss
    and perplexity are printed every ``log_step`` batches; model state dicts
    are saved to ``args.model_path`` every ``save_step`` batches.
    """
    # Folder that holds the saved model files
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing and normalization
    preprocess = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Vocabulary wrapper
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Data loader
    loader = get_loader(args.image_dir,
                        args.caption_path,
                        vocab,
                        preprocess,
                        args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers)

    # Encoder and decoder, moved to the GPU when one is available
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer; the optimizer sees the decoder and the encoder's
    # `linear` and `bn` layers only
    criterion = nn.CrossEntropyLoss()
    trainable = list(decoder.parameters())
    trainable += list(encoder.linear.parameters())
    trainable += list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(trainable, lr=args.learning_rate)

    log_template = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'

    # Training loop
    steps_per_epoch = len(loader)
    for epoch in range(args.num_epochs):
        for step, (images, captions, lengths) in enumerate(loader):
            # Wrap the mini-batch
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            packed = pack_padded_sequence(captions, lengths, batch_first=True)
            targets = packed[0]

            # Forward and backward
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print loss and perplexity
            if step % args.log_step == 0:
                print(log_template %
                      (epoch, args.num_epochs, step, steps_per_epoch,
                       loss.data[0], np.exp(loss.data[0])))

            # Save both state dicts
            if (step + 1) % args.save_step == 0:
                tag = (epoch + 1, step + 1)
                decoder_file = os.path.join(args.model_path,
                                            'decoder-%d-%d.pkl' % tag)
                torch.save(decoder.state_dict(), decoder_file)
                encoder_file = os.path.join(args.model_path,
                                            'encoder-%d-%d.pkl' % tag)
                torch.save(encoder.state_dict(), encoder_file)