def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Load vocab_list for uniskip
    vocab_list = pd.read_csv("./data/vocab_list.csv", header=None)
    vocab_list = vocab_list.values.tolist()[0]

    # Build data loader
    data_loader = get_loader(args.image_dir, args.img_embeddings_dir,
                             args.data_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    #im_encoder = preprocess_get_model.model()
    attention = T_Att()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers, args.dropout)
    uniskip = UniSkip('./data/skip-thoughts', vocab_list)
    decoder.eval()

    if torch.cuda.is_available():
        #im_encoder.cuda()
        attention.cuda()
        decoder.cuda()
        uniskip.cuda()

    # Load the trained attention and decoder weights
    attention.load_state_dict(torch.load(args.attention_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    for i, (images, captions, cap_lengths, qa, qa_lengths, vocab_words) in enumerate(data_loader):
        # Set mini-batch dataset
        img_embeddings = to_var(images.data, volatile=True)
        captions = to_var(captions)

        # Encode the caption with skip-thoughts and attend over the image embedding
        cap_embeddings = uniskip(captions, cap_lengths)
        cap_embeddings = cap_embeddings.data
        img_embeddings = img_embeddings.data
        ctx_vec = attention(img_embeddings, cap_embeddings)

        # Decode a question-answer sequence from the context vector
        outputs = decoder.sample(ctx_vec)
        output_ids = outputs.cpu().data.numpy()
        qa = qa.numpy()
        qa = qa[0]

        # Convert the predicted word ids to words
        sample = []
        for word_id in output_ids:
            word = vocab.idx2word[word_id]
            sample.append(word)
            # An alternative post-processing step, left commented out, split the
            # generated sequence into question and answer at the first '<end>' token:
            # if word == '<end>':
            #     if flag == -1:
            #         predicted_q = sample
            #         sample = []
            #         flag = 0
            #     else:
            #         predicted_a = sample
        sample = ' '.join(sample)

        # Convert the ground-truth word ids to words
        actual = []
        for word_id in qa:
            word = vocab.idx2word[word_id]
            actual.append(word)
        actual = ' '.join(actual)

        print("actual_qa : " + actual + " | predicted_qa : " + sample)
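
# The sampling script above is driven by an `args` namespace. A minimal command-line
# entry point is sketched below for reference; the argument names mirror those used in
# main(), but the defaults, types, and file locations are assumptions rather than
# values taken from this repository.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # Paths (hypothetical defaults)
    parser.add_argument('--model_path', type=str, default='./models/')
    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl')
    parser.add_argument('--image_dir', type=str, default='./data/images/')
    parser.add_argument('--img_embeddings_dir', type=str, default='./data/img_embeddings/')
    parser.add_argument('--data_path', type=str, default='./data/qa_data.json')
    parser.add_argument('--attention_path', type=str, default='./models/attention-1-1.pkl')
    parser.add_argument('--decoder_path', type=str, default='./models/decoder-1-1.pkl')
    # Model and loader hyperparameters (assumed values)
    parser.add_argument('--embed_size', type=int, default=256)
    parser.add_argument('--hidden_size', type=int, default=512)
    parser.add_argument('--num_layers', type=int, default=1)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--num_workers', type=int, default=2)
    args = parser.parse_args()
    main(args)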
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Load vocab_list for uniskip
    vocab_list = pd.read_csv("./data/vocab_list.csv", header=None)
    vocab_list = vocab_list.values.tolist()[0]

    # Build data loader
    data_loader = get_loader(args.image_dir, args.img_embeddings_dir,
                             args.data_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    #im_encoder = preprocess_get_model.model()
    attention = T_Att()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers, args.dropout)
    uniskip = UniSkip('./data/skip-thoughts', vocab_list)

    if torch.cuda.is_available():
        #im_encoder.cuda()
        attention.cuda()
        decoder.cuda()
        uniskip.cuda()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(attention.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, cap_lengths, qa, qa_lengths, vocab_words) in enumerate(tqdm(data_loader)):
            # Re-initialize decoder hidden state
            decoder.hidden = decoder.init_hidden()

            # Set mini-batch dataset
            img_embeddings = to_var(images.data, volatile=True)
            captions = to_var(captions)
            qa = to_var(qa)
            targets = pack_padded_sequence(qa, qa_lengths, batch_first=True)[0]

            # Forward, backward and optimize
            decoder.zero_grad()
            attention.zero_grad()
            #features = encoder(images)
            #img_embeddings = im_encoder(images)
            cap_embeddings = uniskip(captions, cap_lengths)
            cap_embeddings = cap_embeddings.data
            img_embeddings = img_embeddings.data
            ctx_vec = attention(img_embeddings, cap_embeddings)
            outputs = decoder(ctx_vec, qa, qa_lengths)
            predicted = outputs.max(1)[1]
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(attention.state_dict(),
                           os.path.join(args.model_path,
                                        'attention-%d-%d.pkl' % (epoch + 1, i + 1)))
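
# As with the sampling script, training is driven by an `args` namespace. The entry
# point below is a sketch: the argument names come from main() above, while the
# defaults are assumptions and not values taken from this repository.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    # Paths (hypothetical defaults)
    parser.add_argument('--model_path', type=str, default='./models/')
    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl')
    parser.add_argument('--image_dir', type=str, default='./data/images/')
    parser.add_argument('--img_embeddings_dir', type=str, default='./data/img_embeddings/')
    parser.add_argument('--data_path', type=str, default='./data/qa_data.json')
    # Model hyperparameters (assumed values)
    parser.add_argument('--embed_size', type=int, default=256)
    parser.add_argument('--hidden_size', type=int, default=512)
    parser.add_argument('--num_layers', type=int, default=1)
    parser.add_argument('--dropout', type=float, default=0.5)
    # Training settings (assumed values)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--num_workers', type=int, default=2)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--num_epochs', type=int, default=5)
    parser.add_argument('--log_step', type=int, default=10)
    parser.add_argument('--save_step', type=int, default=1000)
    args = parser.parse_args()
    main(args)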