def main(): # Load vocabulary wrapper. with open(vocab_path, 'rb') as f: vocab = pickle.load(f) encoder = EncoderCNN(4096, embed_dim) decoder = DecoderRNN(embed_dim, hidden_size, len(vocab), num_layers_rnn) if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # Loss and Optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.linear.parameters()) optimizer = torch.optim.Adam(params, lr=0.001) # Load data with open(image_data_file, 'rb') as f: image_data = pickle.load(f) image_features = si.loadmat(image_feature_file) img_features = image_features['fc7'][0] img_features = np.concatenate(img_features) print('loaded data') iteration = 0 save_loss = [] for i in range(10): # epoch use_caption = i % 5 print('Epoch', i) for x, y in make_mini_batch(img_features, image_data, use_caption=use_caption): word_padding, lengths = make_word_padding(y, vocab) x = Variable(torch.from_numpy(x).cuda()) word_index = Variable(torch.from_numpy(word_padding).cuda()) encoder.zero_grad() decoder.zero_grad() features = encoder(x) targets = pack_padded_sequence(word_index, lengths, batch_first=True)[0] outputs = decoder(features, word_index, lengths) loss = criterion(outputs, targets) loss.backward() optimizer.step() if iteration % 100 == 0: print('loss', loss.item()) save_loss.append(loss.item()) iteration += 1 torch.save(decoder.state_dict(), 'decoder.pkl') torch.save(encoder.state_dict(), 'encoder.pkl') with open('losses.txt', 'w') as f: print(save_loss, file=f)
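# --- Aside: a minimal, self-contained sketch of the target-packing idiom shared by
# --- every training loop in this file. The toy token ids and pad index 0 are
# --- made-up values for illustration, not taken from any variant above.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[1, 4, 5, 2, 0],
                         [1, 7, 2, 0, 0]])  # padded token ids, pad = 0
lengths = [4, 3]                            # true lengths, sorted descending

# Packing drops the padding, so CrossEntropyLoss never sees pad tokens.
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)  # tensor([1, 1, 4, 7, 5, 2, 2]) -- tokens interleaved by time step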
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Build the models (this variant assumes `vocab` and `data_loader` were built beforehand, as in the variants below) encoder = EncoderCNN(args.embed_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) # For each epoch for epoch in range(args.num_epochs): for i, (images, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset images = images.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, backward and optimize features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) # Save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
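# --- Aside: the "Perplexity" these loops log is simply exp of the mean per-token
# --- cross-entropy, since nn.CrossEntropyLoss averages over tokens; a one-line
# --- sketch (the loss value is made up).
import numpy as np

mean_token_loss = 2.75            # mean cross-entropy in nats
print(np.exp(mean_token_loss))    # ~15.64, the reported perplexity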
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Image preprocessing # Compose all preprocessing steps; ToTensor yields a (C, H, W) tensor with values in [0, 1] transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) # Load vocabulary wrapper. with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models encoder = EncoderCNN(args.embed_size) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # Loss and Optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the Models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (images, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset images = Variable(images) captions = Variable(captions) print("cap size %s" % str(captions.size())) if torch.cuda.is_available(): images = images.cuda() captions = captions.cuda() targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] print(targets) # Forward, Backward and Optimize decoder.zero_grad() encoder.zero_grad() features = encoder(images) print("cnn feats %s" % str(features.size())) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, args.num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0]))) # Save the models if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
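# --- Aside: this variant normalizes with mean/std 0.5 while fine-tuning a pretrained
# --- ResNet head; the ImageNet statistics used by most other variants in this file
# --- are what the pretrained weights were trained with. A sketch of that transform:
from torchvision import transforms

imagenet_normalize = transforms.Normalize((0.485, 0.456, 0.406),
                                          (0.229, 0.224, 0.225))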
for epoch in range(num_epochs): tic = time.time() for i, (image, captions, lengths) in enumerate(dataset_loader): image = image.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] cnn.zero_grad() rnn.zero_grad() cnn_out = cnn(image) # call the module, not .forward(), so hooks run lstm_out = rnn(cnn_out, captions, lengths) loss = criterion(lstm_out, targets) loss.backward() optimizer.step() if i % 1000 == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch, num_epochs, i, len(dataset_loader), loss.item(), np.exp(loss.item()))) toc = time.time() print('epoch %d time %.2f mins' % (epoch, (toc - tic) / 60)) torch.save(cnn.state_dict(), 'cnn.pkl') torch.save(rnn.state_dict(), 'rnn.pkl')
# Get training statistics. stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item())) # Print training statistics (on same line). print('\r' + stats, end="") sys.stdout.flush() # Print training statistics to file. f.write(stats + '\n') f.flush() # Print training statistics (on different line). if i_step % print_every == 0: print('\r' + stats) epoch_loss += loss.item() epoch_loss /= total_step # Save the weights. if save_every == -1: # Only save the best one so far! if epoch_loss <= smallest_loss: torch.save(decoder.state_dict(), os.path.join('./models', "{:02d}-decoder-{:.4f}.pkl".format(epoch, epoch_loss))) torch.save(encoder.state_dict(), os.path.join('./models', "{:02d}-encoder-{:.4f}.pkl".format(epoch, epoch_loss))) smallest_loss = epoch_loss elif epoch % save_every == 0: torch.save(decoder.state_dict(), os.path.join('./models', "{:02d}-decoder-{:.4f}.pkl".format(epoch, epoch_loss))) torch.save(encoder.state_dict(), os.path.join('./models', "{:02d}-encoder-{:.4f}.pkl".format(epoch, epoch_loss))) # Close the training log file. f.close()
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Image preprocessing, normalization for the pretrained resnet transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models encoder = EncoderCNN(args.embed_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models save_in_file_loss = open( '/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_epoch_loss402.txt', "w") save_in_file_perplex = open( '/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_epoch_perplex402.txt', "w") save_in_file = open( '/media/raid6/shivam/imagecaption/simple_cnn_img_attention/mod_step_loss402.txt', "w") loss_per_epoch = {} perplex_per_epoch = {} total_step = len(data_loader) print('\ntotal-step\n') print(total_step) for epoch in range(args.num_epochs): total_loss = 0 for i, (images, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset images = images.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, backward and optimize features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) total_loss += loss.item() text = 'Epoch : ' + str(epoch) + '\nStep : ' + str( i) + '\nLoss : ' + str( loss.item()) + '\nPerplexity : ' + str( np.exp(loss.item())) print('\ntext\n') print(text) save_in_file.write(text) # Save the model checkpoints if (i + 1) % args.save_step == 0: print('saving') torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) loss_per_epoch[epoch + 1] = total_loss / (total_step * args.batch_size) loss_text = str(epoch + 1) + ' : ' + str(loss_per_epoch[epoch + 1]) save_in_file_loss.write(loss_text) save_in_file_loss.write('\n') print('\nloss_text : ' + loss_text) perplex_per_epoch[epoch + 1] = np.exp(loss_per_epoch[epoch + 1]) perplex_text = str(epoch + 1) + ' : ' + str( perplex_per_epoch[epoch + 1]) save_in_file_perplex.write(perplex_text) save_in_file_perplex.write('\n') print('\nperplex_text : ' + perplex_text) save_in_file.close()
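# --- Aside: loss.item() from nn.CrossEntropyLoss is already a per-token mean, so a
# --- conventional epoch average divides the running sum by the number of batches
# --- only; the variant above additionally divides by batch_size, which shrinks the
# --- reported epoch loss. A sketch with made-up per-batch means:
batch_losses = [2.9, 2.7, 2.5]
epoch_loss = sum(batch_losses) / len(batch_losses)  # ~2.7, not divided by batch size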
# Update the parameters in the optimizer. optimizer.step() # Get training statistics. stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item())) # Print training statistics (on same line). print('\r' + stats, end="") sys.stdout.flush() # Print training statistics to file. f.write(stats + '\n') f.flush() # Print training statistics (on different line). if i_step % print_every == 0: print('\r' + stats) # if epoch == 3 and i_step % 5000 == 0: # torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d-%d.pkl' % epoch, i_step)) # torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d-%d.pkl' % epoch, i_step)) # Save the weights. if epoch % save_every == 0: torch.save(decoder.state_dict(), os.path.join(output_path, 'decoder-%d.pkl' % epoch)) torch.save(encoder.state_dict(), os.path.join(output_path, 'encoder-%d.pkl' % epoch)) scheduler.step() # Close the training log file. f.close()
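# --- Aside: `scheduler` is created outside this excerpt; a minimal sketch of a
# --- per-epoch learning-rate schedule with torch.optim.lr_scheduler (the StepLR
# --- step size and decay factor are assumptions, not taken from the code above).
import torch

model = torch.nn.Linear(4, 2)  # stand-in for the encoder/decoder parameters
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

for epoch in range(10):
    # ... per-batch forward/backward and optimizer.step() would go here ...
    scheduler.step()  # decay the learning rate once per epoch, as above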
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_caption_loader(args.caption_path, vocab, 75, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models encoder = EncoderRNN(len(vocab), args.embed_size, args.hidden_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.embedding.parameters()) + list(encoder.rnn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (captions_src, captions_tgt, lengths) in enumerate(data_loader): # Set mini-batch dataset captions_src = captions_src.to(device) captions_tgt = captions_tgt.to(device) targets = pack_padded_sequence(captions_tgt, lengths, batch_first=True)[0] # Forward, backward and optimize enc_output, enc_hidden = encoder(captions_src) outputs = decoder(enc_hidden[:, -1:, :], captions_tgt, lengths) loss = criterion(outputs, targets) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) # Save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
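# --- Aside: a shape sketch of feeding the encoder's final state to the decoder as
# --- its "feature" vector. The GRU and batch_first layout are assumptions; the
# --- EncoderRNN above may be wired differently.
import torch
import torch.nn as nn

rnn = nn.GRU(input_size=32, hidden_size=64, batch_first=True)
src = torch.randn(8, 10, 32)        # (batch, seq_len, input_size)
enc_output, enc_hidden = rnn(src)   # enc_output: (8, 10, 64)
feature = enc_output[:, -1:, :]     # (8, 1, 64): final time step of each sequence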
def main(args): if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Build the models, can use a feedforward/convolutional encoder and an RNN decoder encoder = EncoderCNN(args.embed_size).to( device) #can be sequential or convolutional decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion1 = nn.CrossEntropyLoss() criterion2 = nn.NLLLoss() softmax = nn.LogSoftmax(dim=1) params = list(decoder.parameters()) + list(encoder.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) total_training_steps = args.num_iters losses = [] perplexity = [] for epoch in range(args.num_epochs): for i in range(total_training_steps): prog_data = generate_training_data(args.batch_size) images = [im[0] for im in prog_data] transforms = [transform[1] for transform in prog_data] [ele.insert(0, '<start>') for ele in transforms] #start token for each sequence [ele.append('<end>') for ele in transforms] #end token for each sequence lengths = [len(trans) for trans in transforms] maximum_len = max(lengths) for trans in transforms: if len(trans) != maximum_len: trans.extend(['pad'] * (maximum_len - len(trans))) padded_lengths = [len(trans) for trans in transforms] transforms = [[word_to_int(word) for word in transform] for transform in transforms] transforms = torch.tensor(transforms, device=device) images = torch.tensor(images, device=device) images = images.unsqueeze( 1) #Uncomment this line when training using EncoderCNN lengths = torch.tensor(lengths, device=device) padded_lengths = torch.tensor(padded_lengths, device=device) targets = pack_padded_sequence(transforms, padded_lengths, batch_first=True)[0] features = encoder(images) outputs = decoder(features, transforms, padded_lengths) #print(outputs) loss = criterion1(outputs, targets) losses.append(loss.item()) perplexity.append(np.exp(loss.item())) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f},Perplexity: {:5.4f}' .format(epoch, args.num_epochs, i, total_training_steps, loss.item(), np.exp(loss.item()))) # Save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) y = losses z = perplexity x = np.arange(len(losses)) plt.plot(x, y, label='Cross Entropy Loss') plt.plot(x, z, label='Perplexity') plt.xlabel('Iterations') plt.ylabel('Cross Entropy Loss and Perplexity') plt.title("Cross Entropy Loss and Model Perplexity During Training") plt.legend() plt.savefig('plots/plots_cnn/cnn4_gpu', dpi=100)
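# --- Aside: the manual 'pad'-extension above can also be done with
# --- torch.nn.utils.rnn.pad_sequence once tokens are integer ids; a sketch with
# --- made-up token ids and pad index 0.
import torch
from torch.nn.utils.rnn import pad_sequence

seqs = [torch.tensor([1, 4, 5, 2]), torch.tensor([1, 7, 2])]
padded = pad_sequence(seqs, batch_first=True, padding_value=0)
print(padded)  # tensor([[1, 4, 5, 2], [1, 7, 2, 0]])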
class Worker: def __init__(self, args): # Initialize MPI/NCCL and set topology variables self.init_dist(args.gpu_only) self.rank = self.dist.get_rank() self.world_size = self.dist.get_world_size() self.local_rank = self.dist.get_local_rank() self.local_size = self.dist.get_local_size() self.n_gpus = self.dist.get_n_gpus() self.n_nodes = self.world_size / self.local_size self.node = self.rank // self.local_size self.n_cpu_workers = (self.local_size - self.n_gpus) * self.n_nodes self.n_gpu_workers = self.n_gpus * self.n_nodes # Set RNG seed for reproducibility, can be left on torch.manual_seed(1234) # CuDNN reproducibility if args.reproducible: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False # Set number of threads if self.dist.is_cpu_rank(): #torch.set_num_threads(args.num_threads) print("[Rank {}] Setting number of OMP threads to {}".format( self.rank, args.num_threads), flush=True) # Calculate batch sizes self.total_batch_size = args.batch_size self.cpu_batch_size = args.cpu_batch_size assert ((self.total_batch_size - self.cpu_batch_size * self.n_cpu_workers * self.n_nodes) \ % (self.n_gpus * self.n_nodes) == 0), "GPU batch size is not an integer" self.gpu_batch_size = int((self.total_batch_size - self.cpu_batch_size * self.n_cpu_workers * self.n_nodes) \ / (self.n_gpus * self.n_nodes)) self.batch_size = self.cpu_batch_size if self.dist.is_cpu_rank( ) else self.gpu_batch_size print("[Rank {}] Current CUDA device: {}".format( self.rank, torch.cuda.current_device()), flush=True) def init_dist(self, gpu_only): # C++ extension module with JIT compilation dist_module = load( name="dist", sources=["dist.cu"], verbose=True, with_cuda=True, extra_cuda_cflags=[ '-ccbin', 'g++', '-std=c++11', '-O3', #'-I/usr/mpi/gcc/openmpi-2.1.2-hfi/include', #'-I/usr/mpi/gcc/mvapich2-2.3b-hfi/include', '-I/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/include', #'-I/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/include64', '-I/pylon5/ac7k4vp/jchoi157/pytorch/build/nccl/include' ], extra_ldflags=[ '-L/opt/packages/cuda/9.2/lib64', '-lcudart', '-lrt', #'-L/usr/mpi/gcc/openmpi-2.1.2-hfi/lib64', '-lmpi', #'-L/usr/mpi/gcc/mvapich2-2.3b-hfi/lib', '-lmpi', '-L/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/lib', '-lmpi', #'-L/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/lib64', '-lmpi', '-L/pylon5/ac7k4vp/jchoi157/pytorch/build/nccl/lib', '-lnccl' ], build_directory="/home/jchoi157/torch_extensions") self.dist = dist_module.DistManager(gpu_only, False) def average_gradients(self): # Only all-reduce decoder parameters since encoder is pre-trained for param in self.decoder.parameters(): if self.dist.is_cpu_rank(): param.grad.data = param.grad.data.cuda(0, non_blocking=True) param.grad.data *= (self.cpu_batch_size / self.total_batch_size) else: param.grad.data *= (self.gpu_batch_size / self.total_batch_size) self.dist.hetero_allreduce(param.grad.data) if self.dist.is_cpu_rank(): param.grad.data = param.grad.data.cpu() def train(self, args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Image preprocessing, normalization for the pretrained resnet transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader( args.image_dir, 
args.caption_path, vocab, transform, self.rank, self.world_size, self.local_size, self.n_gpus, self.total_batch_size, self.cpu_batch_size, self.gpu_batch_size, self.batch_size, shuffle=(False if args.reproducible else True), no_partition=args.no_partition) self.num_batches = len(data_loader) print("[Rank {}] batch size {}, num batches {}".format( self.rank, self.total_batch_size if args.no_partition else self.batch_size, self.num_batches), flush=True) # Build the models self.encoder = EncoderCNN(args.embed_size) self.decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) if self.dist.is_gpu_rank(): self.encoder = self.encoder.cuda(self.local_rank) self.decoder = self.decoder.cuda(self.local_rank) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(self.decoder.parameters()) + list( self.encoder.linear.parameters()) + list( self.encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) for epoch in range(args.num_epochs): epoch_start_time = time.time() batch_time_sum = 0 batch_time_total = 0 processed_batches = 0 processed_batches_total = 0 batch_start_time = time.time() for i, (images, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset if self.dist.is_gpu_rank(): images = images.cuda(self.local_rank) captions = captions.cuda(self.local_rank) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, backward, all-reduce and optimize features = self.encoder(images) outputs = self.decoder(features, captions, lengths) loss = criterion(outputs, targets) self.decoder.zero_grad() self.encoder.zero_grad() loss.backward() if not args.no_partition: self.average_gradients() optimizer.step() batch_time = time.time() - batch_start_time batch_time_sum += batch_time batch_time_total += batch_time processed_batches += 1 processed_batches_total += 1 saved_loss = loss.item() # Print log info if i % args.log_step == 0 and i != 0: print( 'Rank [{}], Epoch [{}/{}], Step [{}/{}], Average time: {:.6f}, Loss: {:.4f}, Perplexity: {:5.4f}' .format(self.rank, epoch, args.num_epochs, i, total_step, batch_time_sum / processed_batches, saved_loss, np.exp(saved_loss)), flush=True) batch_time_sum = 0 processed_batches = 0 # Save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( self.decoder.state_dict(), os.path.join( args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( self.encoder.state_dict(), os.path.join( args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) batch_start_time = time.time() epoch_time = time.time() - epoch_start_time print( '!!! Rank [{}], Epoch [{}], Time: {:.6f}, Average batch time: {:.6f}, Loss: {:.4f}, Perplexity: {:5.4f}' .format(self.rank, epoch, epoch_time, batch_time_total / processed_batches_total, saved_loss, np.exp(saved_loss)), flush=True)
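# --- Aside: a minimal sketch of the same gradient-averaging step written against the
# --- standard torch.distributed API instead of the custom MPI/NCCL extension above.
# --- It assumes dist.init_process_group() has already run, and it weights every
# --- worker uniformly rather than by per-worker batch size as average_gradients does.
import torch.distributed as dist

def allreduce_mean_gradients(model, world_size):
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size  # sum across workers, then average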
def main(args): # random set manualSeed = random.randint(1, 100) # print("Random Seed: ", manualSeed) random.seed(manualSeed) torch.manual_seed(manualSeed) torch.cuda.manual_seed_all(manualSeed) # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) audio_len, comment_len, mfcc_dim = caculate_max_len(args.audio_dir,args.text_path, vocab) # mfcc_features = audio_preprocess(args.audio_dir, N, AUDIO_LEN, MFCC_DIM).astype(np.float32) # Build data loader data_loader = data_get(args.audio_dir,audio_len, args.text_path, comment_len, vocab ) # Build the models encoder = EncoderRNN(mfcc_dim, args.embed_size, args.hidden_size).to(device) decoder = DecoderRNN(args.embed_size+Z_DIM, args.hidden_size, len(vocab), args.num_layers).to(device) # decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, ((audio,audio_len), (comment,comment_len)) in enumerate(data_loader): audio = audio.to(device) audio = audio.unsqueeze(0) comment = comment.to(device) comment = comment.unsqueeze(0) targets = pack_padded_sequence(comment, [comment_len], batch_first=True)[0] # Forward, backward and optimize audio_features = encoder(audio, [audio_len]) if(Z_DIM>0): z = Variable(torch.randn(audio_features.shape[0], Z_DIM)).cuda() audio_features = torch.cat([z,audio_features],1) outputs = decoder(audio_features, comment, [comment_len]) loss = criterion(outputs, targets) optimizer.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'.format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) # Save the model checkpoints if (epoch+1) % args.save_step == 0: torch.save(decoder.state_dict(), os.path.join( args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1))) torch.save(encoder.state_dict(), os.path.join( args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))
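# --- Aside: a shape sketch of the z-concatenation above (the Z_DIM and embed sizes
# --- are made up); note the decoder's input size must be embed_size + Z_DIM, which
# --- is why DecoderRNN is built with args.embed_size + Z_DIM.
import torch

Z_DIM = 16
audio_features = torch.randn(1, 256)                # (batch, embed_size)
z = torch.randn(audio_features.shape[0], Z_DIM)     # one noise vector per sample
audio_features = torch.cat([z, audio_features], 1)  # (1, 256 + Z_DIM)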
def main(): # Configuration for hyper-parameters config = Config() # Create model directory if not os.path.exists(config.model_path): os.makedirs(config.model_path) # Image preprocessing transform = config.train_transform # Load vocabulary wrapper with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f: vocab = pickle.load(f) # Build data loader image_path = os.path.join(config.image_path, 'train2014') json_path = os.path.join(config.caption_path, 'captions_train2014.json') train_loader = get_data_loader(image_path, json_path, vocab, transform, config.batch_size, shuffle=True, num_workers=config.num_threads) total_step = len(train_loader) # Build Models encoder = EncoderCNN(config.embed_size) decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab), config.num_layers) if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # Loss and Optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters()) optimizer = torch.optim.Adam(params, lr=config.learning_rate) # Train the Models for epoch in range(config.num_epochs): for i, (images, captions, lengths) in enumerate(train_loader): # Set mini-batch dataset images = Variable(images) captions = Variable(captions) if torch.cuda.is_available(): images = images.cuda() captions = captions.cuda() targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, Backward and Optimize decoder.zero_grad() encoder.zero_grad() features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) loss.backward() optimizer.step() # Print log info if i % config.log_step == 0: print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' %(epoch, config.num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0]))) # Save the Model if (i+1) % config.save_step == 0: torch.save(decoder.state_dict(), os.path.join(config.model_path, 'decoder-%d-%d.pkl' %(epoch+1, i+1))) torch.save(encoder.state_dict(), os.path.join(config.model_path, 'encoder-%d-%d.pkl' %(epoch+1, i+1)))
best_val_accuracy = 0 for epoch in range(args.num_epoch): # optimizer.step() train(args, trainloader, epoch) val_acc = validation(args, valloader, epoch) if val_acc > best_val_accuracy: print("Saving the models") torch.save(force_encoder_model.state_dict(), os.path.join( args.model_dir, 'force_encoder.ckpt')) torch.save(rgb_mask_encoder_model.state_dict(), os.path.join( args.model_dir, 'rgb_mask_encoder.ckpt')) torch.save(decoder_model.state_dict(), os.path.join( args.model_dir, 'decoder.ckpt')) best_val_accuracy = val_acc
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) worker_thread_count = 1 retry_for_failed = 2 # Image preprocessing # For normalization, see https://github.com/pytorch/vision#models transform = transforms.Compose([ # transforms.RandomCrop(args.crop_size), # transforms.RandomHorizontalFlip(), transforms.Scale(args.crop_size), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ]) #transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) # Load vocabulary wrapper. with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models encoder = EncoderCNN(args.embed_size) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # Loss and Optimizer criterion = nn.L1Loss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the Models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (images, captions, lengths) in enumerate(data_loader): processed_items = [] threads = [] has_data_to_process = True def do_request(item): position = item['position'] #print(position) #print(item) retry = retry_for_failed while retry: r = requests.post('http://localhost:4567/', data=item) if r.status_code == 200: pil = Image.open(io.BytesIO(r.content)).convert('RGB') processed_items[position] = transform(pil) #print(position, processed_items[position]) break else: print("request failed, retrying") time.sleep(2) retry -= 1 # Set mini-batch dataset image_tensors = to_var(images, volatile=True) captions = to_var(captions) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] #print(images.size()) #print(torch.equal(images[0] ,images[1])) # Forward, Backward and Optimize decoder.zero_grad() encoder.zero_grad() features = encoder(image_tensors) outputs = decoder(features, captions, lengths) codes = [] def worker(): while items_to_process.qsize() > 0 or has_data_to_process: item = items_to_process.get() if item is None: break do_request(item) items_to_process.task_done() print("ended thread processing") for j in range(worker_thread_count): t = threading.Thread(target=worker) t.daemon = True # thread dies when main thread (only non-daemon thread) exits. 
t.start() threads.append(t) for ii, image in enumerate(images): image_tensor = to_var(image.unsqueeze(0), volatile=True) feature = encoder(image_tensor) sampled_ids = decoder.sample(feature) sampled_ids = sampled_ids.cpu().data.numpy() sampled_caption = [] for word_id in sampled_ids: word = vocab.idx2word[word_id] sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption) payload = {'code': sentence} data = {'position': ii, 'code': sentence} items_to_process.put(data) processed_items.append('failed') codes.append(sentence) has_data_to_process = False print(codes) print(items_to_process.qsize()) print(image.size()) print("waiting for threads") for t in threads: t.join() print("done reassembling images") for t in threads: t.shutdown = True t.join() bad_value = False for pi in processed_items: if isinstance(pi, str) and pi == "failed": bad_value = True if bad_value: print("failed conversion, skipping batch") continue output_tensor = torch.FloatTensor(len(processed_items), 3, images.size()[2], images.size()[3]) for ii, image_tensor in enumerate(processed_items): output_tensor[ii] = processed_items[ii] output_var = to_var(output_tensor, False) target_var = to_var(images, False) loss = criterion(output_var, target_var) # L1 reconstruction loss; must be computed before backward print("loss") print(loss) loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, args.num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0]))) # Save the models if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args): if not os.path.exists( args.model_path ): # create model folder to keep model checkpoint files os.makedirs(args.model_path) # image preprocessing and normalization transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # load vocabulary wrapper file # get data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) encoder = EncoderCNN(args.embed_size) # build encoder decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) # build decoder if torch.cuda.is_available(): # move models to GPU if available encoder.cuda() decoder.cuda() criterion = nn.CrossEntropyLoss() # loss params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # optimizer # train the Models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (images, captions, lengths) in enumerate(data_loader): # set mini batch dataset images = to_var(images, volatile=True) captions = to_var(captions) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # forward and backward decoder.zero_grad() encoder.zero_grad() features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) loss.backward() optimizer.step() # optimization # Print loss and perplexity if i % args.log_step == 0: print( 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, args.num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0]))) # save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Image preprocessing, normalization for the pretrained resnet transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) val_loader = get_loader(args.val_image_dir, args.val_caption_path, vocab, transform, args.batch_size, shuffle=False, num_workers=args.num_workers) # Build the models encoder = EncoderCNN(args.embed_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) train_loss_arr = [] val_loss_arr = [] train_bleu_arr = [] val_bleu_arr = [] for epoch in range(1, args.num_epochs + 1, 1): iteration_loss = [] iteration_bleu = [] for i, (images, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset images = images.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, backward and optimize features = encoder(images) outputs = decoder(features, captions, lengths) #print(outputs.shape, targets.shape) loss = criterion(outputs, targets) iteration_loss.append(loss.item()) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() #get BLEU score for corresponding batch sampled_ids = decoder.sample(features) sampled_ids = sampled_ids.cpu().numpy() bleu_score_batch = get_bleu(captions, sampled_ids, vocab) iteration_bleu.append(bleu_score_batch) # Print log info if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Bleu: '.format( epoch, args.num_epochs, i, total_step, loss.item()) + str(bleu_score_batch)) f_log = open(os.path.join(args.model_path, "log.txt"), "a+") f_log.write("Epoch: " + str(epoch) + "/" + str(args.num_epochs) + " Step: " + str(i) + "/" + str(total_step) + " loss: " + str(loss.item()) + " Bleu: " + str(bleu_score_batch) + "\n") f_log.close() # Save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) train_loss_arr.append(np.array(iteration_loss)) train_bleu_arr.append(np.array(iteration_bleu)) val_loss = 0 val_steps = 0 val_iteration_loss = [] val_iteration_bleu = [] for j, (images_val, captions_val, lengths_val) in enumerate(val_loader): # Set mini-batch dataset images_val = images_val.to(device) captions_val = captions_val.to(device) targets = pack_padded_sequence(captions_val, lengths_val, batch_first=True)[0] # Forward, backward and optimize features = encoder(images_val) outputs = decoder(features, captions_val, lengths_val) #print(outputs.shape, targets.shape) loss = criterion(outputs, targets).item() val_loss += loss val_iteration_loss.append(loss) val_steps += 1 #get BLEU score for corresponding batch sampled_ids = 
decoder.sample(features) sampled_ids = sampled_ids.cpu().numpy() bleu_score_batch = get_bleu(captions_val, sampled_ids, vocab) val_iteration_bleu.append(bleu_score_batch) val_loss /= val_steps print('Epoch [{}/{}], Val Loss: {:.4f}, Bleu: '.format( epoch, args.num_epochs, val_loss) + str(bleu_score_batch)) f_log = open(os.path.join(args.model_path, "log.txt"), "a+") f_log.write("Epoch: " + str(epoch) + "/" + str(args.num_epochs) + " val loss: " + str(val_loss) + " Bleu: " + str(bleu_score_batch) + "\n\n") f_log.close() val_loss_arr.append(np.array(val_iteration_loss)) val_bleu_arr.append(np.array(val_iteration_bleu)) np.save(os.path.join(args.model_path, "train_loss.npy"), np.array(train_loss_arr)) np.save(os.path.join(args.model_path, "val_loss.npy"), np.array(val_loss_arr)) np.save(os.path.join(args.model_path, "train_bleu.npy"), np.array(train_bleu_arr)) np.save(os.path.join(args.model_path, "val_bleu.npy"), np.array(val_bleu_arr))
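# --- Aside: a self-contained sketch of the smoothed sentence-level BLEU that
# --- get_bleu presumably computes (the tokens are made up). sentence_bleu expects
# --- the references as a list of token lists, one list per reference caption.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

references = [['a', 'dog', 'runs', 'on', 'the', 'beach']]
hypothesis = ['a', 'dog', 'is', 'running', 'on', 'the', 'beach']
score = sentence_bleu(references, hypothesis,
                      smoothing_function=SmoothingFunction().method4)
print(round(score, 4))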
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build the models encoder = EncoderCNN(args.embed_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.linear.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) obj = data_loader.MsvdDataset() datas = obj.getAll() #print(len(datas)) os.chdir(r'E:/jupyterNotebook/our_project/') # Train the models total_step = len(datas) for epoch in range(args.num_epochs): for i, (images, captions, lengths) in enumerate(datas): #print(epoch,i,images.shape) # Set mini-batch dataset images = images.to(device) # Forward, backward and optimize features = encoder(images) features = features.cpu().detach().numpy() features = features.mean(axis=0) features = torch.from_numpy(features).view(1, -1).to(device) #print(features.shape) for j in range(1): #for j in range(len(captions)): captions[j] = captions[j].long() captions[j] = captions[j].view(1, -1).to(device) targets = pack_padded_sequence(captions[j], lengths[j], batch_first=True)[0] outputs = decoder(features, captions[j], lengths[j]) #print(targets.shape) #print(outputs.shape) loss = criterion(outputs, targets) decoder.zero_grad() #encoder.zero_grad() loss.backward() optimizer.step() print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch + 1, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) #print(os.path) if (i + 1) % 25 == 0: #args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(r'E:\jupyterNotebook\our_project\models', 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(r'E:\jupyterNotebook\our_project\models', 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main(args): train_losses = [] train_acc = [] # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Image preprocessing, normalization for the pretrained resnet transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models encoder = EncoderCNN(args.embed_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) for epoch in range(args.num_epochs): losses = [] accuracy = 0.0 for i, (images, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset images = images.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, backward and optimize features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) # record accuracy and loss losses.append(loss.item()) topv, topi = outputs.topk(1, dim=1) targets = targets.unsqueeze(-1) accuracy += float((topi == targets).sum()) / targets.shape[0] # update params decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}, Accuracy: {:.4f}' .format(epoch + 1, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()), accuracy / float(i + 1))) with open('my_train_loss_t4_resnext.txt', 'a') as fi: fi.write('\n' + 'epoch = {}, i = {}, tr_loss = {}, acc = {}'. format(epoch + 1, i + 1, loss.item(), accuracy / float(i + 1))) # Save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join( args.model_path, 'my-decoder-{}-{}-t4-resnext.ckpt'.format( epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join( args.model_path, 'my-encoder-{}-{}-t4-resnext.ckpt'.format( epoch + 1, i + 1))) train_losses.append(sum(losses) / total_step) train_acc.append(accuracy / total_step) # save losses over epoch f = open("train_loss.txt", "a") f.write(str(train_losses)) f.close() # save accuracies over epoch f = open("train_acc.txt", "a") f.write(str(train_acc)) f.close()
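# --- Aside: a minimal sketch of the topk-based token accuracy computed above; the
# --- logits and targets are random stand-ins for the packed model outputs.
import torch

outputs = torch.randn(7, 100)             # (num_packed_tokens, vocab_size) logits
targets = torch.randint(0, 100, (7,))
_, topi = outputs.topk(1, dim=1)          # greedy prediction per token
accuracy = (topi == targets.unsqueeze(-1)).float().mean().item()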
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Image preprocessing # For normalization, see https://github.com/pytorch/vision#models transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) # Load vocabulary wrapper. with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models encoder = EncoderCNN(args.embed_size) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) if torch.cuda.is_available(): encoder.cuda() decoder.cuda() # Loss and Optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the Models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (images, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset images = to_var(images, volatile=True) captions = to_var(captions) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, Backward and Optimize decoder.zero_grad() encoder.zero_grad() features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' %(epoch, args.num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0]))) # Save the models if (i+1) % args.save_step == 0: torch.save(decoder.state_dict(), os.path.join(args.model_path, 'decoder-%d-%d.pkl' %(epoch+1, i+1))) torch.save(encoder.state_dict(), os.path.join(args.model_path, 'encoder-%d-%d.pkl' %(epoch+1, i+1)))
def main(args): torch.manual_seed(args.seed) if torch.cuda.is_available(): torch.cuda.manual_seed(args.seed) # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Image preprocessing # For normalization, see https://github.com/pytorch/vision#models transform = transforms.Compose([ # transforms.RandomCrop(args.crop_size), # transforms.RandomHorizontalFlip(), transforms.Scale(args.crop_size), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper. with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, args.MSCOCO_result, args.coco_detection_result, transform, args.batch_size, shuffle=True, num_workers=args.num_workers, dummy_object=99, yolo=False) # Build the models encoder = EncoderCNN(args.embed_size) # the layout encoder hidden state size must be the same with decoder input size layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size, 100, args.num_layers) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers) if torch.cuda.is_available(): encoder.cuda() layout_encoder.cuda() decoder.cuda() # Loss and Optimizer criterion = nn.CrossEntropyLoss() params = list(layout_encoder.parameters()) + list(decoder.parameters()) + \ list(encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the Models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (images, captions, lengths, label_seqs, location_seqs, visual_seqs, layout_lengths) in enumerate(data_loader): # Set mini-batch dataset images = to_var(images) captions = to_var(captions) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, Backward and Optimize # decoder.zero_grad() # layout_encoder.zero_grad() # encoder.zero_grad() # Modify This part for using visual features or not # features = encoder(images) layout_encoding = layout_encoder(label_seqs, location_seqs, layout_lengths) # comb_features = features + layout_encoding comb_features = layout_encoding outputs = decoder(comb_features, captions, lengths) loss = criterion(outputs, targets) optimizer.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, args.num_epochs, i, total_step, loss.data[0], np.exp(loss.data[0]))) # Save the models if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1))) torch.save( layout_encoder.state_dict(), os.path.join( args.model_path, 'layout_encoding-%d-%d.pkl' % (epoch + 1, i + 1)))
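# --- Aside: the commented-out fusion above adds the CNN features and the layout
# --- encoding elementwise, which only works if both share embed_size; a shape
# --- sketch with made-up dimensions.
import torch

features = torch.randn(4, 256)          # CNN features (batch, embed_size)
layout_encoding = torch.randn(4, 256)   # layout encoder output, same shape
comb_features = features + layout_encoding  # elementwise fusion, still (4, 256)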
class ImageDescriptor(): def __init__(self, args, encoder): assert args.mode in ('train', 'val', 'test') self.__args = args self.__mode = args.mode self.__attention_mechanism = args.attention self.__stats_manager = ImageDescriptorStatsManager() self.__validate_when_training = args.validate_when_training self.__history = [] if not os.path.exists(args.model_dir): os.makedirs(args.model_dir) self.__config_path = os.path.join( args.model_dir, f'config-{args.encoder}{args.encoder_ver}.txt') # Device configuration self.__device = torch.device( 'cuda' if torch.cuda.is_available() else 'cpu') # training set vocab with open(args.vocab_path, 'rb') as f: self.__vocab = pickle.load(f) # validation set vocab with open(args.vocab_path.replace('train', 'val'), 'rb') as f: self.__vocab_val = pickle.load(f) # coco dataset self.__coco_train = CocoDataset( args.image_dir, args.caption_path, self.__vocab, args.crop_size) self.__coco_val = CocoDataset( args.image_dir, args.caption_path.replace('train', 'val'), self.__vocab_val, args.crop_size) # data loader self.__train_loader = torch.utils.data.DataLoader(dataset=self.__coco_train, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) self.__val_loader = torch.utils.data.DataLoader(dataset=self.__coco_val, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Build the models self.__encoder = encoder.to(self.__device) self.__decoder = DecoderRNN(args.embed_size, args.hidden_size, len(self.__vocab), args.num_layers, attention_mechanism=self.__attention_mechanism).to(self.__device) # Loss and optimizer self.__criterion = nn.CrossEntropyLoss() self.__params = list(self.__decoder.parameters( )) + list(self.__encoder.linear.parameters()) + list(self.__encoder.bn.parameters()) self.__optimizer = torch.optim.Adam( self.__params, lr=args.learning_rate) # Load checkpoint and check compatibility if os.path.isfile(self.__config_path): with open(self.__config_path, 'r') as f: content = f.read()[:-1] if content != repr(self): # save the error info with open('config.err', 'w') as f: print(f'f.read():\n{content}', file=f) print(f'repr(self):\n{repr(self)}', file=f) raise ValueError( "Cannot create this experiment: " "I found a checkpoint conflicting with the current setting.") self.load(file_name=args.checkpoint) else: self.save() def setting(self): ''' Return the setting of the experiment. ''' return {'Net': (self.__encoder, self.__decoder), 'Optimizer': self.__optimizer, 'BatchSize': self.__args.batch_size} @property def epoch(self): return len(self.__history) @property def history(self): return self.__history # @property # def mode(self): # return self.__args.mode # @mode.setter # def mode(self, m): # self.__args.mode = m def __repr__(self): ''' Pretty printer showing the setting of the experiment. This is what is displayed when doing `print(experiment)`. This is also what is saved in the `config.txt` file. ''' string = '' for key, val in self.setting().items(): string += '{}({})\n'.format(key, val) return string def state_dict(self): ''' Returns the current state of the model. ''' return {'Net': (self.__encoder.state_dict(), self.__decoder.state_dict()), 'Optimizer': self.__optimizer.state_dict(), 'History': self.__history} def save(self): ''' Saves the model on disk, i.e., creates/updates the last checkpoint. 
''' file_name = os.path.join( self.__args.model_dir, '{}{}-epoch-{}.ckpt'.format(self.__args.encoder, self.__args.encoder_ver, self.epoch)) torch.save(self.state_dict(), file_name) with open(self.__config_path, 'w') as f: print(self, file=f) print(f'Save to {file_name}.') def load(self, file_name=None): ''' Loads the model from the last checkpoint saved on disk. Args: file_name (str): path to the checkpoint file ''' if not file_name: # find the latest .ckpt file try: file_name = max( glob.iglob(os.path.join(self.__args.model_dir, '*.ckpt')), key=os.path.getctime) print(f'Load from {file_name}.') except: raise FileNotFoundError( 'No checkpoint file in the model directory.') else: file_name = os.path.join(self.__args.model_dir, file_name) print(f'Load from {file_name}.') try: checkpoint = torch.load(file_name, map_location=self.__device) except: raise FileNotFoundError( 'Please check --checkpoint, the name of the file') self.load_state_dict(checkpoint) del checkpoint def load_state_dict(self, checkpoint): ''' Loads the model from the input checkpoint. Args: checkpoint: an object saved with torch.save() from a file. ''' self.__encoder.load_state_dict(checkpoint['Net'][0]) self.__decoder.load_state_dict(checkpoint['Net'][1]) self.__optimizer.load_state_dict(checkpoint['Optimizer']) self.__history = checkpoint['History'] # The following loops are used to fix a bug that was # discussed here: https://github.com/pytorch/pytorch/issues/2830 # (it is supposed to be fixed in recent PyTorch version) for state in self.__optimizer.state.values(): for k, v in state.items(): if isinstance(v, torch.Tensor): state[k] = v.to(self.__device) def train(self, plot_loss=None): ''' Train the network using backpropagation based on the optimizer and the training set. Args: plot_loss (func, optional): if not None, should be a function taking a single argument being an experiment (meant to be `self`). Similar to a visitor pattern, this function is meant to inspect the current state of the experiment and display/plot/save statistics. For example, if the experiment is run from a Jupyter notebook, `plot` can be used to display the evolution of the loss with `matplotlib`. If the experiment is run on a server without display, `plot` can be used to show statistics on `stdout` or save statistics in a log file. 
(default: None) ''' self.__encoder.train() self.__decoder.train() self.__stats_manager.init() total_step = len(self.__train_loader) start_epoch = self.epoch print("Start/Continue training from epoch {}".format(start_epoch)) if plot_loss is not None: plot_loss(self) for epoch in range(start_epoch, self.__args.num_epochs): t_start = time.time() self.__stats_manager.init() for i, (images, captions, lengths) in enumerate(self.__train_loader): # Set mini-batch dataset if not self.__attention_mechanism: images = images.to(self.__device) captions = captions.to(self.__device) else: with torch.no_grad(): images = images.to(self.__device) captions = captions.to(self.__device) targets = pack_padded_sequence( captions, lengths, batch_first=True)[0] # Forward, backward and optimize if not self.__attention_mechanism: features = self.__encoder(images) outputs = self.__decoder(features, captions, lengths) self.__decoder.zero_grad() self.__encoder.zero_grad() else: self.__encoder.zero_grad() self.__decoder.zero_grad() features, cnn_features = self.__encoder(images) outputs = self.__decoder( features, captions, lengths, cnn_features=cnn_features) loss = self.__criterion(outputs, targets) loss.backward() self.__optimizer.step() with torch.no_grad(): self.__stats_manager.accumulate( loss=loss.item(), perplexity=np.exp(loss.item())) # Print log info each iteration if i % self.__args.log_step == 0: print('[Training] Epoch: {}/{} | Step: {}/{} | Loss: {:.4f} | Perplexity: {:5.4f}' .format(epoch+1, self.__args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) if not self.__validate_when_training: self.__history.append(self.__stats_manager.summarize()) print("Epoch {} | Time: {:.2f}s\nTraining Loss: {:.6f} | Training Perplexity: {:.6f}".format( self.epoch, time.time() - t_start, self.__history[-1]['loss'], self.__history[-1]['perplexity'])) else: self.__history.append( (self.__stats_manager.summarize(), self.evaluate())) print("Epoch {} | Time: {:.2f}s\nTraining Loss: {:.6f} | Training Perplexity: {:.6f}\nEvaluation Loss: {:.6f} | Evaluation Perplexity: {:.6f}".format( self.epoch, time.time() - t_start, self.__history[-1][0]['loss'], self.__history[-1][0]['perplexity'], self.__history[-1][1]['loss'], self.__history[-1][1]['perplexity'])) # Save the model checkpoints self.save() if plot_loss is not None: plot_loss(self) print("Finish training for {} epochs".format(self.__args.num_epochs)) def evaluate(self, print_info=False): ''' Evaluates the experiment, i.e., forward propagates the validation set through the network and returns the statistics computed by the stats manager. 
Args: print_info (bool): print the results of loss and perplexity ''' self.__stats_manager.init() self.__encoder.eval() self.__decoder.eval() total_step = len(self.__val_loader) with torch.no_grad(): for i, (images, captions, lengths) in enumerate(self.__val_loader): images = images.to(self.__device) captions = captions.to(self.__device) targets = pack_padded_sequence( captions, lengths, batch_first=True)[0] # Forward if not self.__attention_mechanism: features = self.__encoder(images) outputs = self.__decoder(features, captions, lengths) else: features, cnn_features = self.__encoder(images) outputs = self.__decoder( features, captions, lengths, cnn_features=cnn_features) loss = self.__criterion(outputs, targets) self.__stats_manager.accumulate( loss=loss.item(), perplexity=np.exp(loss.item())) if i % self.__args.log_step == 0: print('[Validation] Step: {}/{} | Loss: {:.4f} | Perplexity: {:5.4f}' .format(i, total_step, loss.item(), np.exp(loss.item()))) summarize = self.__stats_manager.summarize() if print_info: print( f'[Validation] Average loss for this epoch is {summarize["loss"]:.6f}') print( f'[Validation] Average perplexity for this epoch is {summarize["perplexity"]:.6f}\n') self.__encoder.train() self.__decoder.train() return summarize def mode(self, mode=None): ''' Get the current mode or change mode. Args: mode (str): 'train' or 'eval' mode ''' if not mode: return self.__mode self.__mode = mode def __load_image(self, image): ''' Preprocess the given PIL image for evaluation. Args: image (PIL Image): image ''' image = image.resize([224, 224], Image.LANCZOS) transform = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) image = transform(image).unsqueeze(0) return image def test(self, image_path=None, plot=False): ''' Evaluate the model by generating the caption for the corresponding image at `image_path`. Note: This function does not compute a BLEU score. Args: image_path (str): file path of the evaluation image plot (bool): plot or not ''' self.__encoder.eval() self.__decoder.eval() with torch.no_grad(): if not image_path: image_path = self.__args.image_path image = Image.open(image_path) # only process with RGB image if np.array(image).ndim == 3: img = self.__load_image(image).to(self.__device) # generate a caption if not self.__attention_mechanism: feature = self.__encoder(img) sampled_ids = self.__decoder.sample(feature) sampled_ids = sampled_ids[0].cpu().numpy() else: feature, cnn_features = self.__encoder(img) sampled_ids = self.__decoder.sample(feature, cnn_features) sampled_ids = sampled_ids.cpu().data.numpy() # Convert word_ids to words sampled_caption = [] for word_id in sampled_ids: word = self.__vocab.idx2word[word_id] sampled_caption.append(word) if word == '<end>': break sentence = ' '.join(sampled_caption[1:-1]) # Print out the image and the generated caption print(sentence) if plot: image = Image.open(image_path) plt.imshow(np.asarray(image)) else: print('Non-RGB images are not supported.') self.__encoder.train() self.__decoder.train() def coco_image(self, idx, ds='val'): ''' Access image_id (which is part of the file name) and corresponding image caption of index `idx` in COCO dataset. 
    def coco_image(self, idx, ds='val'):
        '''
        Access the image_id (which is part of the file name) and the
        corresponding image caption at index `idx` of the COCO dataset.

        Note: For jupyter notebook

        Args:
            idx (int): index into the COCO dataset

        Returns:
            (dict)
        '''
        assert ds in ('train', 'val')
        if ds == 'train':
            ann_id = self.__coco_train.ids[idx]
            return self.__coco_train.coco.anns[ann_id]
        else:
            ann_id = self.__coco_val.ids[idx]
            return self.__coco_val.coco.anns[ann_id]

    @property
    def len_of_train_set(self):
        ''' Number of samples in the training set '''
        return len(self.__coco_train)

    @property
    def len_of_val_set(self):
        ''' Number of samples in the validation set '''
        return len(self.__coco_val)

    def bleu_score(self, idx, ds='val', plot=False, show_caption=False):
        '''
        Evaluate the BLEU score for index `idx` of the COCO dataset.

        Note: For jupyter notebook

        Args:
            idx (int): index
            ds (str): training or validation dataset
            plot (bool): plot the image or not

        Returns:
            score (float): BLEU score
        '''
        assert ds in ('train', 'val')
        self.__encoder.eval()
        self.__decoder.eval()
        with torch.no_grad():
            try:
                if ds == 'train':
                    ann_id = self.__coco_train.ids[idx]
                    coco_ann = self.__coco_train.coco.anns[ann_id]
                else:
                    ann_id = self.__coco_val.ids[idx]
                    coco_ann = self.__coco_val.coco.anns[ann_id]
            except (IndexError, KeyError):
                raise IndexError('Invalid index')

            # Zero-pad the image id to six digits to match the file name
            image_id = str(coco_ann['image_id']).zfill(6)
            image_path = f'{self.__args.image_dir}/COCO_train2014_000000{image_id}.jpg'
            if ds == 'val':
                image_path = image_path.replace('train', 'val')

            coco_list = coco_ann['caption'].split()
            image = Image.open(image_path)

            # Only process RGB images
            if np.array(image).ndim == 3:
                img = self.__load_image(image).to(self.__device)

                # Generate a caption
                if not self.__attention_mechanism:
                    feature = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature)
                    sampled_ids = sampled_ids[0].cpu().numpy()
                else:
                    feature, cnn_features = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature, cnn_features)
                    sampled_ids = sampled_ids.cpu().data.numpy()

                # Convert word ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = self.__vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break

                # Strip punctuation and spacing; sentence_bleu expects the
                # references as a list of token lists, hence [coco_list]
                sampled_list = [c for c in sampled_caption[1:-1]
                                if c not in punctuation]
                score = sentence_bleu([coco_list], sampled_list,
                                      smoothing_function=SmoothingFunction().method4)

                if plot:
                    plt.figure()
                    image = Image.open(image_path)
                    plt.imshow(np.asarray(image))
                    plt.title(f'score: {score}')
                    plt.xlabel(f'file: {image_path}')

                # Print out the generated caption
                if show_caption:
                    print(f'Sampled caption:\n{sampled_list}')
                    print(f'COCO caption:\n{coco_list}')
            else:
                print('Non-RGB images are not supported.')
                return
        return score
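# NOTE: a standalone example of the NLTK call used above, showing the
# reference-list shape it expects (token values chosen for illustration):
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = ['a', 'man', 'rides', 'a', 'horse']       # one reference caption
hypothesis = ['a', 'man', 'riding', 'a', 'horse']     # sampled caption
score = sentence_bleu([reference], hypothesis,
                      smoothing_function=SmoothingFunction().method4)
print('BLEU: {:.4f}'.format(score))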
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.dictionary, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    # encoder = EncoderCNN(args.embed_size).to(device)
    dictionary = pd.read_csv(args.dictionary, header=0,
                             encoding='unicode_escape', error_bad_lines=False)
    dictionary = list(dictionary['keys'])
    decoder = DecoderRNN(len(dictionary), args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters())
    # + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (array, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            array = array.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            # features = encoder(images)
            outputs = decoder(array, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            # encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
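# NOTE: every training loop in this file builds its targets the same way.
# A small self-contained demonstration of what pack_padded_sequence produces
# for padded captions (token values chosen for illustration):
import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Two captions padded to length 4, with true lengths 3 and 2
captions = torch.tensor([[1, 2, 3, 0],
                         [4, 5, 0, 0]])
lengths = [3, 2]
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)  # tensor([1, 4, 2, 5, 3]): the non-pad tokens, time-major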
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    # transform = transforms.Compose([
    #     transforms.RandomCrop(args.crop_size),
    #     transforms.RandomHorizontalFlip(),
    #     transforms.ToTensor(),
    #     transforms.Normalize((0.485, 0.456, 0.406),
    #                          (0.229, 0.224, 0.225))])
    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # data_loader = get_loader(args.image_dir, args.caption_path, vocab,
    #                          transform, args.batch_size,
    #                          shuffle=True, num_workers=args.num_workers)
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform, shuffle=True,
        num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(frogger_data_loader):
            images = to_var(images, volatile=True)
            # Skip mini-batches of size one
            if list(images.size())[0] != 1:
                captions = to_var(captions)
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]
                decoder.zero_grad()
                encoder.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          % (epoch, args.num_epochs, i, total_step,
                             loss.data[0], np.exp(loss.data[0])))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(decoder.state_dict(), os.path.join(
                        args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(encoder.state_dict(), os.path.join(
                        args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
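# NOTE: `to_var(..., volatile=True)` is pre-0.4 PyTorch. The helper is not
# shown in this file; a sketch of how these older scripts typically defined
# it (an assumption), and the modern equivalent:
import torch
from torch.autograd import Variable

def to_var(x, volatile=False):
    """Legacy helper: move to GPU if available and wrap in a Variable."""
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)

# On PyTorch >= 0.4, tensors are Variables and `volatile` is gone; the
# equivalent of volatile=True is to disable autograd explicitly:
# with torch.no_grad():
#     images = images.to(device)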
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    val_loader = get_loader('./data/val_resized2014/',
                            './data/annotations/captions_val2014.json',
                            vocab, transform, 1, False, 1)

    start_epoch = 0
    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if args.restart:
        encoder_state, decoder_state = 'new', 'new'
    if encoder_state == '':
        encoder_state = 'new'
    if decoder_state == '':
        decoder_state = 'new'
    if decoder_state != 'new':
        # Resume from the epoch encoded in the checkpoint name
        start_epoch = int(decoder_state.split('-')[1])

    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    """ Make logfile and log output """
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Training on vanilla loss (using new model). Started {} .\n"
                .format(str(datetime.now())))
        f.write("Using encoder: new\nUsing decoder: new\n\n")

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_acc = []

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            out = decoder(features, captions, lengths)
            loss = criterion(out, targets)
            batch_loss.append(loss.data[0])
            loss.backward()
            optimizer.step()

            # # Print log info
            # if i % args.log_step == 0:
            #     print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f'
            #           % (epoch, args.num_epochs, i, total_step,
            #              loss.data[0], np.exp(loss.data[0]), acc, gt_acc))
            #     with open(args.model_path + args.logfile, 'a') as f:
            #         f.write('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f\n'
            #                 % (epoch, args.num_epochs, i, total_step,
            #                    loss.data[0], np.exp(loss.data[0]), acc, gt_acc))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

        # Pickle requires binary mode
        with open(args.model_path + 'training_loss.pkl', 'wb') as f:
            pickle.dump(batch_loss, f)
        with open(args.model_path + 'training_val.pkl', 'wb') as f:
            pickle.dump(batch_acc, f)

    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
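# NOTE: the script above parses a resume epoch out of the checkpoint name but
# never reloads the weights in this excerpt. A minimal sketch of the missing
# step (assuming encoder_state/decoder_state hold checkpoint file paths):
if decoder_state != 'new':
    decoder.load_state_dict(torch.load(decoder_state))
if encoder_state != 'new':
    encoder.load_state_dict(torch.load(encoder_state))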
def main(args):
    # Setup tensorboard (Crayon)
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        # if args.name in cc.get_experiment_names():
        try:
            cc.remove_experiment(args.name)
        except:
            print("experiment didn't exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write(json.dumps(vars(args)))

    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Scale(20),
        transforms.ToTensor()
    ])

    # Load vocabulary wrapper.
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open(full_model_path + "/vocab.pkl", 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loader
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir, vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    # params = list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()

            # Set mini-batch dataset
            image_ts = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            count = images.size()[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Token-level accuracy over the packed targets
            total = targets.size(0)
            max_index = outputs.max(dim=1)[1]
            # correct = (max_index == targets).sum()
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity", np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], accuracy, np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    full_model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    full_model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            # Disabled validation pass (dead code kept for reference)
            if 1 == 2 and i % int(train_size / 10) == 0:
                encoder.eval()
                # decoder.eval()
                correct = 0
                for ti, (timages, tcaptions, tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions, tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    # correct = (ttargets.eq(toutputs[0].long())).sum()
                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % accuracy)
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)

    torch.save(decoder.state_dict(), os.path.join(
        full_model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    torch.save(encoder.state_dict(), os.path.join(
        full_model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    end_time = time.time()
    print("finished training, runtime: %ds" % (end_time - start_time))
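# NOTE: the accuracy computed above is token-level accuracy over the packed
# targets; the same computation in isolation (shapes chosen for illustration):
import torch

outputs = torch.randn(7, 100)            # (packed_length, vocab_size) logits
targets = torch.randint(0, 100, (7,))    # packed ground-truth word ids
predicted = outputs.max(dim=1)[1]        # most likely word at each position
correct = predicted.eq(targets).sum().item()
accuracy = 100.0 * correct / targets.size(0)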
def main():
    # Configuration for hyper-parameters
    config = Config()

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path, json_path, vocab, transform,
                                   config.batch_size, shuffle=True,
                                   num_workers=config.num_threads)
    total_step = len(train_loader)

    # Build Models
    teachercnn = EncoderCNN(config.embed_size)
    teachercnn.eval()
    studentcnn = StudentCNN_Model1(config.embed_size)
    # Load the best teacher model
    teachercnn.load_state_dict(torch.load(
        os.path.join('../TrainedModels/TeacherCNN', config.trained_encoder)))
    studentlstm = DecoderRNN(config.embed_size, config.hidden_size // 2,
                             len(vocab), config.num_layers // 2)

    if torch.cuda.is_available():
        teachercnn.cuda()
        studentcnn.cuda()
        studentlstm.cuda()

    # Loss and Optimizer
    criterion_lstm = nn.CrossEntropyLoss()
    criterion_cnn = nn.MSELoss()
    params = list(studentlstm.parameters()) + list(studentcnn.parameters())
    optimizer_lstm = torch.optim.Adam(params, lr=config.learning_rate)
    optimizer_cnn = torch.optim.Adam(studentcnn.parameters(),
                                     lr=config.cnn_learningrate)

    print('entering into training loop')

    # Train the Models
    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths, img_ids) in enumerate(train_loader):
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            optimizer_lstm.zero_grad()
            optimizer_cnn.zero_grad()
            features_tr = teachercnn(images)
            features_st = studentcnn(images)
            outputs = studentlstm(features_st, captions, lengths)
            # Distillation loss: match student features to the (detached)
            # teacher features, plus the usual captioning cross-entropy
            loss = criterion_cnn(features_st, features_tr.detach()) \
                + criterion_lstm(outputs, targets)
            loss.backward()
            optimizer_cnn.step()
            optimizer_lstm.step()

            # Print log info
            if i % config.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, config.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the Model
            if (i + 1) % config.save_step == 0:
                torch.save(studentlstm.state_dict(), os.path.join(
                    config.student_lstm_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(studentcnn.state_dict(), os.path.join(
                    config.student_cnn_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
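# NOTE: the student/teacher setup above is feature-level knowledge
# distillation. The two-term loss in isolation (shapes are illustrative
# assumptions, not the model's real dimensions):
import torch
import torch.nn as nn

mse = nn.MSELoss()
ce = nn.CrossEntropyLoss()

features_teacher = torch.randn(8, 256)                 # frozen teacher features
features_student = torch.randn(8, 256, requires_grad=True)
logits = torch.randn(40, 1000, requires_grad=True)     # packed decoder outputs
targets = torch.randint(0, 1000, (40,))

# detach() keeps gradients from flowing back into the teacher
loss = mse(features_student, features_teacher.detach()) + ce(logits, targets)
loss.backward()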
            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

            epoch_loss += loss.item()

        epoch_loss /= total_step

        # Save the weights.
        if save_every == -1:
            # Only save the best one so far!
            if epoch_loss <= smallest_loss:
                torch.save(decoder.state_dict(), os.path.join(
                    './models', "{:02d}-decoder-{:.4f}.pkl".format(epoch, epoch_loss)))
                torch.save(encoder.state_dict(), os.path.join(
                    './models', "{:02d}-encoder-{:.4f}.pkl".format(epoch, epoch_loss)))
                smallest_loss = epoch_loss
        elif epoch % save_every == 0:
            torch.save(decoder.state_dict(), os.path.join(
                './models', "{:02d}-decoder-{:.4f}.pkl".format(epoch, epoch_loss)))
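# NOTE: for the best-only branch above to trigger on the first epoch,
# `smallest_loss` must be seeded before the epoch loop, presumably:
smallest_loss = float('inf')  # any completed epoch improves on this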
            # Calculate the batch loss.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' \
                % (epoch, args.num_epochs, i_step, total_step,
                   loss.item(), np.exp(loss.item()))

            # Log the loss to Azure ML
            run.log('loss', loss.item())
            run.log('perplexity', np.exp(loss.item()))
            run.log('stats', stats)

            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics (on different line).
            if i_step % args.print_every == 0:
                print('\r' + stats)

        # Save the weights.
        if epoch % args.save_every == 0:
            torch.save(decoder.state_dict(),
                       os.path.join('./models', 'decoder-%d.pkl' % epoch))
            torch.save(encoder.state_dict(),
                       os.path.join('./models', 'encoder-%d.pkl' % epoch))
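# NOTE: the `run` object above is Azure ML's run context. The usual way these
# scripts obtain it (assuming the azureml-core SDK is installed):
from azureml.core import Run

# Inside a submitted job this returns the live run; when executed locally it
# returns an offline run object, so the run.log(...) calls stay harmless.
run = Run.get_context()
run.log('loss', 1.234)  # illustrative value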
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.033, 0.032, 0.033),
                             (0.027, 0.027, 0.027))
    ])

    # Build vocab
    vocab = build_vocab(args.root_path, threshold=0)
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    len_vocab = vocab.idx
    print(vocab.idx2word)

    # Build data loader
    data_loader = get_loader(args.root_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = ResNet(ResidualBlock, [3, 3, 3], args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    # Build atten models
    if torch.cuda.is_available():
        encoder.cuda(1)
        decoder.cuda(1)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Make one hot
            # cap_ = torch.unsqueeze(captions, 2)
            # one_hot_ = torch.FloatTensor(captions.size(0), captions.size(1), len_vocab).zero_()
            # one_hot_caption = one_hot_.scatter_(2, cap_, 1)

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            # captions_ = to_var(one_hot_caption)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, Backward and Optimize
            optimizer.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            captions = captions.view(-1)
            outputs = outputs.view(-1, len_vocab)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

                # Test set accuracy
                outputs_np = outputs.max(1)[1].cpu().data.numpy()
                targets_np = targets.cpu().data.numpy()
                print(outputs_np)
                print(targets_np)

                location_match = 0
                size_match = 0
                shape_match = 0
                exact_match = 0
                # Use a separate index so the batch counter `i` is not clobbered
                for j in range(len(targets_np)):
                    if outputs_np[j] == targets_np[j]:
                        exact_match += 1
                    if args.batch_size <= j < args.batch_size * 2 \
                            and outputs_np[j] == targets_np[j]:
                        shape_match += 1
                    elif args.batch_size * 2 <= j < args.batch_size * 3 \
                            and outputs_np[j] == targets_np[j]:
                        location_match += 1
                    elif args.batch_size * 3 <= j < args.batch_size * 4 \
                            and outputs_np[j] == targets_np[j]:
                        size_match += 1
                print('location match : %.4f, shape match : %.4f, exact_match: %.4f'
                      % (location_match / args.batch_size,
                         shape_match / args.batch_size,
                         exact_match / len(targets_np)))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
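# NOTE: the commented-out one-hot construction above works as follows in
# isolation (values chosen for illustration):
import torch

captions = torch.tensor([[2, 0, 1]])      # (batch=1, seq_len=3) word ids
len_vocab = 4
cap_ = captions.unsqueeze(2)              # (1, 3, 1) index tensor
one_hot = torch.zeros(1, 3, len_vocab).scatter_(2, cap_, 1)
# one_hot[0] -> [[0,0,1,0], [1,0,0,0], [0,1,0,0]]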
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
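# NOTE: this variant assumes a module-level `device` defined once near the
# imports, typically:
import torch

# Use the GPU when available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')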
def main():
    # Configuration for hyper-parameters
    torch.cuda.set_device(0)
    config = Config()

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    train_image_path = os.path.join(config.image_path, 'train2017')
    json_path = os.path.join(config.caption_path, 'captions_train2017.json')
    train_loader = get_data_loader(train_image_path, json_path, vocab,
                                   transform, config.batch_size,
                                   shuffle=False,
                                   num_workers=config.num_threads)

    val_image_path = os.path.join(config.image_path, 'val2017')
    json_path = os.path.join(config.caption_path, 'captions_val2017.json')
    val_loader = get_data_loader(val_image_path, json_path, vocab,
                                 transform, config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_threads)

    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()
    decoder = DecoderRNN(config.embed_size, config.hidden_size,
                         len(vocab), config.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)

    print('entering into training loop')

    # Train the Models
    with open('train1_log.txt', 'w') as logfile:
        logfile.write('Validation Error,Training Error\n')
        for epoch in range(0, 25):
            for i, (images, captions, lengths, img_ids) in enumerate(train_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, Backward and Optimize
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % config.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          % (epoch, config.num_epochs, i, total_step,
                             loss.data[0], np.exp(loss.data[0])))

                # Save the Model
                if (i + 1) % config.save_step == 0:
                    torch.save(encoder.state_dict(), os.path.join(
                        config.teacher_cnn_path, 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(decoder.state_dict(), os.path.join(
                        config.teacher_lstm_path, 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            print('Just Completed an Epoch, Initiate Validation Error Test')
            avgvalloss = 0
            for j, (images, captions, lengths, img_ids) in enumerate(val_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                valloss = criterion(outputs, targets)
                if j == 0:
                    avgvalloss = valloss.data[0]
                # Running (exponentially weighted) average of the validation loss
                avgvalloss = (avgvalloss + valloss.data[0]) / 2
                if (j + 1) % 1000 == 0:
                    print('Average Validation Loss: %.4f' % avgvalloss)
                    logfile.write(str(avgvalloss) + ',' + str(loss.data[0]) + '\n')
                    break
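# NOTE: `(avgvalloss + valloss) / 2` above is an exponentially weighted
# average that mostly reflects recent batches. If a true mean over the
# validation batches is wanted, a sketch of the alternative:
def mean_validation_loss(losses):
    """Arithmetic mean of per-batch validation losses."""
    return sum(losses) / max(len(losses), 1)

# e.g. collect valloss.item() for each batch into a list, then:
# avg_val_loss = mean_validation_loss(batch_losses)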
            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(decoder.state_dict(),
                       os.path.join('./models', 'decoder-%d.pkl' % epoch))
            torch.save(encoder.state_dict(),
                       os.path.join('./models', 'encoder-%d.pkl' % epoch))

    # Close the training log file.
    f.close()

# <a id='step3'></a>
# ## Step 3: (Optional) Validate your Model
#
# To assess potential overfitting, one approach is to assess performance on a
# validation set. If you decide to do this **optional** task, you are required
# to first complete all of the steps in the next notebook in the sequence
# (**3_Inference.ipynb**); as part of that notebook, you will write and test
# code (specifically, the `sample` method in the `DecoderRNN` class) that uses
# your RNN decoder to generate captions. That code will prove incredibly
# useful here.
#
# If you decide to validate your model, please do not edit the data loader in
# **data_loader.py**. Instead, create a new file named **data_loader_val.py**
# containing the code for obtaining the data loader for the validation data.
# You can access:
# - the validation images at filepath `'/opt/cocoapi/images/train2014/'`, and
# - the validation image caption annotation file at filepath
#   `'/opt/cocoapi/annotations/captions_val2014.json'`.
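# NOTE: a sketch of the data_loader_val.py the notebook asks for. The exact
# get_loader signature lives in the course's data_loader.py, so the loader
# call is left commented; only the deterministic validation transform is
# concrete here:
# from data_loader import get_loader
from torchvision import transforms

transform_val = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),          # no random augmentation at eval time
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),
                         (0.229, 0.224, 0.225))])

# val_loader = get_loader(transform=transform_val, mode='val', ...)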