import json
import os
import shutil

import numpy as np
import torch
import torch.nn as nn
from torchvision import transforms
from torchvision.datasets import coco
from tqdm import tqdm

# EncoderCNN, parse_arguments, TRAIN_DIR_IMGS, TRAIN_DIR_CAPTIONS and
# DATA_DIR are defined elsewhere in this project.


def main():
    args = parse_arguments()
    if args.enable_cuda and torch.cuda.is_available():
        args.device = torch.device('cuda')
    else:
        args.device = torch.device('cpu')

    # Standard ImageNet preprocessing for the pretrained encoder.
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    coco_train = coco.CocoCaptions(TRAIN_DIR_IMGS, TRAIN_DIR_CAPTIONS,
                                   transform=preprocess)
    data_loader = torch.utils.data.DataLoader(dataset=coco_train,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=2)

    # Run the encoder in inference mode.
    model = EncoderCNN()
    model = nn.DataParallel(model.train(False).to(args.device))

    vectors, captions = [], []
    for img_batch, capt_batch in tqdm(data_loader):
        capt_batch = list(zip(*capt_batch))  # regroup captions per image
        with torch.no_grad():  # no autograd state needed for extraction
            vec_batch = model(img_batch.to(args.device))
        captions.extend(capt_batch)
        vectors.extend([vec.cpu() for vec in vec_batch])

    captions_tokenized = [[caption.lower() for caption in caption_list]
                          for caption_list in captions]

    # Start from a fresh DATA_DIR. The directory must be recreated after
    # shutil.rmtree, otherwise the saves below fail.
    if os.path.exists(DATA_DIR):
        shutil.rmtree(DATA_DIR)
    os.mkdir(DATA_DIR)

    np.save(DATA_DIR + "image_codes.npy", np.asarray(vectors))
    with open(DATA_DIR + "captions_tokenized.json", "w") as file_capt:
        json.dump(captions_tokenized, file_capt)
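# Downstream code can reload the dumped artifacts directly. A minimal sketch
# of the round trip (the variable names here are illustrative, not taken
# from the source):
image_codes = np.load(DATA_DIR + "image_codes.npy")
with open(DATA_DIR + "captions_tokenized.json") as f:
    captions_tokenized = json.load(f)

# One row of image features per image, one caption list per image.
assert len(image_codes) == len(captions_tokenized)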
# Swap a freshly sampled subset into the loader's batch sampler.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
train_data_loader.batch_sampler.sampler = new_sampler

# Obtain the batch.
images, captions = next(iter(train_data_loader))

# Move the batch of images and captions to the GPU if CUDA is available.
images = images.to(device)
captions = captions.to(device)

# Zero the gradients.
decoder.zero_grad()
encoder.zero_grad()

# Set the encoder and decoder to training mode.
encoder.train()
decoder.train()

# Pass the inputs through the CNN-RNN model.
features = encoder(images)
outputs = decoder(features, captions)

# Calculate the batch loss.
loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

# Backward pass.
loss.backward()

# Update the parameters in the optimizer.
optimizer.step()
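# The step above assumes `indices` is already computed. A common reason to
# replace the sampler on every step is to draw a batch of captions that all
# share one length, so the batch needs no padding. A sketch of that
# sampling; the helper and `caption_lengths` are assumptions, not taken
# from the source:
import numpy as np

def sample_uniform_length_indices(caption_lengths, batch_size):
    """Pick batch_size indices whose captions share one random length."""
    lengths = np.asarray(caption_lengths)
    target_length = np.random.choice(lengths)
    candidates = np.where(lengths == target_length)[0]
    return list(np.random.choice(candidates, size=batch_size))

indices = sample_uniform_length_indices(caption_lengths, batch_size)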
# The dataset constructor's opening is missing from this excerpt; only its
# trailing keyword arguments survive.
    data_file="test_no_dup_with_category_3more_name.json",
    use_mean_img=False,
    neg_samples=False)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    collate_fn=lstm_collate_fn,
)

# Load the trained encoder and switch it to inference mode.
encoder_cnn = EncoderCNN(emb_size)
encoder_cnn.load_state_dict(torch.load("./encoder_cnn.pth"))
print("Successfully loaded trained weights...")
encoder_cnn = encoder_cnn.to(device)
encoder_cnn.train(False)

test_features = {}
for batch_num, input_data in enumerate(test_loader, 1):
    print("#{}\r".format(batch_num), end="")
    lengths, images, names, offsets, set_ids, labels, is_compat = input_data
    image_seqs = images.to(device)
    with torch.no_grad():
        emb_seqs = encoder_cnn(image_seqs)

    # Flatten the per-outfit item labels into one id list for this batch.
    batch_ids = []
    for set_id, items in zip(set_ids, labels):
        for item in items:
            batch_ids.append(item)
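# The excerpt ends before test_features is populated. A plausible
# continuation (both the id-to-embedding pairing and the output filename
# are assumptions, not taken from the source): inside the batch loop, pair
# each item id with its embedding, and after the loop persist the result.
    for item_id, emb in zip(batch_ids, emb_seqs.reshape(-1, emb_size).cpu()):
        test_features[item_id] = emb.numpy()

import pickle
with open("test_features.pkl", "wb") as f:
    pickle.dump(test_features, f)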
def main(args):
    # Set up TensorBoard-style logging via Crayon.
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        try:
            cc.remove_experiment(args.name)
        except Exception:
            print("experiment didn't exist")
        cc_server = cc.create_experiment(args.name)

    # Create the model directory and record this run's parameters.
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write(json.dumps(vars(args)))

    # Image preprocessing. (transforms.Scale is the pre-0.4 torchvision
    # name for transforms.Resize.)
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Scale(20),
        transforms.ToTensor()
    ])

    # Load the vocabulary wrapper, or build one from scratch.
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open(full_model_path + "/vocab.pkl", 'wb') as f:
            pickle.dump(vocab, f)

    # Build the data loaders.
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir, vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models.
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and optimizer: train the decoder plus the encoder's new linear
    # and batch-norm layers.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the models.
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()

            # Set the mini-batch dataset. (volatile=True must not be used
            # on the images here: it would disable autograd and break
            # loss.backward().)
            image_ts = to_var(images)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize.
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Token-level accuracy for this batch.
            total = targets.size(0)
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total
            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity", np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info.
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, '
                      'accuracy: %2.2f, Perplexity: %5.4f' %
                      (epoch, args.num_epochs, i, total_step,
                       loss.data[0], accuracy, np.exp(loss.data[0])))

            # Save the models.
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            # Periodic validation pass, deliberately disabled (the guard is
            # always false); `correct` is never accumulated inside it, so
            # the accuracy it prints would be zero.
            if False and i % int(train_size / 10) == 0:
                encoder.eval()
                #decoder.eval()
                correct = 0
                for ti, (timages, tcaptions, tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions, tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    #correct = (ttargets.eq(toutputs[0].long())).sum()
                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % accuracy)
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)

        # Save a checkpoint at the end of each epoch as well.
        torch.save(
            decoder.state_dict(),
            os.path.join(full_model_path,
                         'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
        torch.save(
            encoder.state_dict(),
            os.path.join(full_model_path,
                         'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    end_time = time.time()
    print("finished training, runtime: %d" % (end_time - start_time))
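# This script targets the Variable-era PyTorch API (to_var, volatile=True,
# loss.data[0]), which was removed in PyTorch 0.4. A sketch of the same
# training step on modern PyTorch, assuming the surrounding names (encoder,
# decoder, criterion, optimizer, device) are unchanged:
images = images.to(device)        # replaces to_var(images)
captions = captions.to(device)
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

decoder.zero_grad()
encoder.zero_grad()
outputs = decoder(encoder(images), captions, lengths)
loss = criterion(outputs, targets)
loss.backward()
optimizer.step()

loss_value = loss.item()          # replaces loss.data[0]
perplexity = np.exp(loss_value)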
def main(args):
    # Create the model directory.
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing; normalization statistics for the pretrained ResNet.
    transform = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    transform_val = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load the vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build the data loaders.
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    val_loader = get_loader(args.val_dir, args.val_caption_path, vocab,
                            transform_val, args.batch_size, shuffle=False,
                            num_workers=args.num_workers)

    # Build the models.
    encoder = EncoderCNN(args.embed_size).to(device)
    encoder.freeze_bottom()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)
    # decoder = BahdanauAttnDecoderRNN(args.hidden_size, args.embed_size,
    #                                  len(vocab)).to(device)

    # Loss and optimizer: train the decoder plus the encoder's new linear
    # and batch-norm layers.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models.
    total_step = len(data_loader)
    accs, b1s, b2s, b3s, b4s = [], [], [], [], []
    for epoch in range(args.num_epochs):
        decoder.train()
        encoder.train()
        losses = []
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set the mini-batch dataset.
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize.
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            losses.append(loss.item())
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info.
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, '
                      'Perplexity: {:5.4f}'.format(epoch + 1, args.num_epochs,
                                                   i, total_step, loss.item(),
                                                   np.exp(loss.item())))

            # Save the model checkpoints.
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

        # Per-epoch validation. The log writes and the plot below need acc
        # and the BLEU scores, so this call must run each epoch.
        acc, b1, b2, b3, b4 = evaluate(val_loader, encoder, decoder, vocab)
        accs.append(acc)
        b1s.append(b1)
        b2s.append(b2)
        b3s.append(b3)
        b4s.append(b4)

        avg_loss = sum(losses) / total_step
        print('Epoch {} Average Training Loss: {:.4f}'.format(epoch + 1,
                                                              avg_loss))
        with open('stem_freeze_freq1000.txt', 'a') as file:
            file.write("Epoch {} \n".format(epoch + 1))
            file.write('Average Accuracy: {} \n'.format(acc))
            file.write('Average Loss: {} \n'.format(avg_loss))
            file.write('Average BLEU gram1: {} \n'.format(b1))
            file.write('Average BLEU gram2: {} \n'.format(b2))
            file.write('Average BLEU gram3: {} \n'.format(b3))
            file.write('Average BLEU gram4: {} \n'.format(b4))
            file.write('\n')

    # Plot accuracy against the BLEU scores across epochs.
    plt.title("Accuracy vs BLEU score")
    plt.plot(np.arange(1, args.num_epochs + 1), accs, label='accuracy')
    plt.plot(np.arange(1, args.num_epochs + 1), b1s, label='BLEU 1')
    plt.plot(np.arange(1, args.num_epochs + 1), b2s, label='BLEU 2')
    plt.plot(np.arange(1, args.num_epochs + 1), b3s, label='BLEU 3')
    plt.plot(np.arange(1, args.num_epochs + 1), b4s, label='BLEU 4')
    plt.xlabel("epochs")
    plt.xticks(np.arange(1, args.num_epochs + 1))
    plt.legend(loc='upper left')
    plt.savefig('accuracy_BLEU.png')
    plt.clf()
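# evaluate() is not shown in this excerpt. A minimal sketch of how the four
# BLEU numbers could be computed with NLTK; this is an assumption about what
# evaluate does, not the source's implementation:
from nltk.translate.bleu_score import corpus_bleu

def bleu_1_to_4(references, hypotheses):
    """references: one list of reference token lists per sample;
    hypotheses: one hypothesis token list per sample."""
    weight_sets = [(1.0,), (0.5, 0.5), (1 / 3,) * 3, (0.25,) * 4]
    return [corpus_bleu(references, hypotheses, weights=w)
            for w in weight_sets]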