def main(args):
    """Train the captioning encoder/decoder and log per-epoch loss/accuracy.

    Checkpoints are written to ``args.model_path``; per-step logs are appended
    to ``my_train_loss_t4_resnext.txt`` and per-epoch summaries to
    ``train_loss.txt`` / ``train_acc.txt``.
    """
    train_losses = []  # mean training loss per epoch
    train_acc = []     # mean top-1 token accuracy per epoch

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer — only the decoder plus the encoder's linear/bn
    # layers are optimized (the CNN backbone is left out of `params`).
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        losses = []
        accuracy = 0.0
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            # record accuracy (top-1 token prediction) and loss
            losses.append(loss.item())
            _, topi = outputs.topk(1, dim=1)  # FIX: dropped unused `topv`
            targets = targets.unsqueeze(-1)
            accuracy += float((topi == targets).sum()) / targets.shape[0]

            # update params
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}, Accuracy: {:.4f}'
                    .format(epoch + 1, args.num_epochs, i, total_step,
                            loss.item(), np.exp(loss.item()),
                            accuracy / float(i + 1)))
                with open('my_train_loss_t4_resnext.txt', 'a') as fi:
                    fi.write('\n' +
                             'epoch = {}, i = {}, tr_loss = {}, acc = {}'.
                             format(epoch + 1, i + 1, loss.item(),
                                    accuracy / float(i + 1)))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'my-decoder-{}-{}-t4-resnext.ckpt'.format(
                            epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'my-encoder-{}-{}-t4-resnext.ckpt'.format(
                            epoch + 1, i + 1)))
        train_losses.append(sum(losses) / total_step)
        train_acc.append(accuracy / total_step)

    # FIX: context managers so the log files are closed even if write raises
    # save losses over epoch
    with open("train_loss.txt", "a") as f:
        f.write(str(train_losses))
    # save accuracies over epoch
    with open("train_acc.txt", "a") as f:
        f.write(str(train_acc))
def main(args):
    """Train the layout-aware captioning model (CNN encoder + layout encoder
    + RNN decoder), saving checkpoints to ``args.model_path``.
    """
    # Seed for reproducibility (CPU and, if present, GPU)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    # FIX: transforms.Scale was deprecated and removed from torchvision;
    # Resize is the drop-in replacement with identical semantics.
    transform = transforms.Compose([
        transforms.Resize(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.coco_detection_result, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    # the layout encoder hidden state size must be the same with decoder
    # input size (both args.embed_size, added elementwise below)
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size,
                                   100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        layout_encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(layout_encoder.parameters()) + list(decoder.parameters()) + \
        list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, label_seqs, location_seqs,
                layout_lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            layout_encoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            layout_encoding = layout_encoder(label_seqs, location_seqs,
                                             layout_lengths)
            comb_features = features + layout_encoding
            outputs = decoder(comb_features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                # FIX: loss.data[0] raises on PyTorch >= 0.5; use loss.item()
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.item(), np.exp(loss.item())))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train encoder/decoder on the SASR/frogger dataset.

    Batches of size 1 are skipped (the `size()[0] != 1` guard), presumably
    because of batch-norm in the encoder — TODO confirm against EncoderCNN.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build the SASR data loader
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform,
        shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer (decoder + encoder's linear/bn layers only)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(frogger_data_loader):
            images = to_var(images, volatile=True)
            if list(images.size())[0] != 1:
                captions = to_var(captions)
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]
                decoder.zero_grad()
                encoder.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    # FIX: loss.data[0] raises on PyTorch >= 0.5; use .item()
                    print(
                        'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                        % (epoch, args.num_epochs, i, total_step,
                           loss.item(), np.exp(loss.item())))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(args.model_path,
                                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(args.model_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train the captioning model with a vocabulary built from `train_dir`."""
    # Build the vocabulary directly from the training captions.
    threshold = 20
    captions_dict = load_captions(train_dir)
    vocab = Vocabulary(captions_dict, threshold)
    vocab_size = vocab.index

    # Make sure the checkpoint directory exists.
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Resize to the network input size and normalize to [-1, 1].
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Generate the training data and wrap it in a batched loader.
    dataloader = DataLoader(train_dir, vocab, transform)
    imagenumbers, captiontotal, imagetotal = dataloader.gen_data()
    data_loader = get_loader(imagenumbers, captiontotal, imagetotal,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # CNN encoder + RNN decoder on the active device.
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_size,
                         args.num_layers).to(device)

    # Only the decoder plus the encoder's linear/bn layers are optimized.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for step, batch in enumerate(data_loader):
            images, captions, lengths = batch
            images, captions = images.to(device), captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward pass and loss.
            outputs = decoder(encoder(images), captions, lengths)
            loss = criterion(outputs, targets)

            # Backward pass and parameter update.
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            if step % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, step, total_step,
                            loss.item(), np.exp(loss.item())))

            if (step + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             step + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             step + 1)))
def main(args):
    """Train encoder/decoder; here the encoder's resnet.fc is fine-tuned."""
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer (decoder + encoder's final fc layer)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                # FIX: loss.data[0] raises on PyTorch >= 0.5; use loss.item()
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step,
                       loss.item(), np.exp(loss.item())))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
class ImageDescriptor():
    '''
    Training/evaluation harness for an encoder-decoder image captioning
    model on COCO: builds vocab, datasets, loaders, models and optimizer,
    and supports checkpointed training, validation, single-image captioning
    and BLEU scoring.
    '''

    def __init__(self, args, encoder):
        # FIX: original `assert(args.mode == 'train' or 'val' or 'test')`
        # was a tautology (always True); test membership instead.
        assert args.mode in ('train', 'val', 'test')
        self.__args = args
        self.__mode = args.mode
        self.__attention_mechanism = args.attention
        self.__stats_manager = ImageDescriptorStatsManager()
        self.__validate_when_training = args.validate_when_training
        self.__history = []

        if not os.path.exists(args.model_dir):
            os.makedirs(args.model_dir)
        self.__config_path = os.path.join(
            args.model_dir, f'config-{args.encoder}{args.encoder_ver}.txt')

        # Device configuration
        self.__device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        # training set vocab
        with open(args.vocab_path, 'rb') as f:
            self.__vocab = pickle.load(f)

        # validation set vocab
        with open(args.vocab_path.replace('train', 'val'), 'rb') as f:
            self.__vocab_val = pickle.load(f)

        # coco dataset
        self.__coco_train = CocoDataset(
            args.image_dir, args.caption_path, self.__vocab, args.crop_size)
        self.__coco_val = CocoDataset(
            args.image_dir, args.caption_path.replace('train', 'val'),
            self.__vocab_val, args.crop_size)

        # data loader
        self.__train_loader = torch.utils.data.DataLoader(
            dataset=self.__coco_train,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
            collate_fn=collate_fn)
        self.__val_loader = torch.utils.data.DataLoader(
            dataset=self.__coco_val,
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.num_workers,
            collate_fn=collate_fn)

        # Build the models
        self.__encoder = encoder.to(self.__device)
        self.__decoder = DecoderRNN(
            args.embed_size, args.hidden_size, len(self.__vocab),
            args.num_layers,
            attention_mechanism=self.__attention_mechanism).to(self.__device)

        # Loss and optimizer — decoder plus the encoder's linear/bn layers
        self.__criterion = nn.CrossEntropyLoss()
        self.__params = list(self.__decoder.parameters()) \
            + list(self.__encoder.linear.parameters()) \
            + list(self.__encoder.bn.parameters())
        self.__optimizer = torch.optim.Adam(
            self.__params, lr=args.learning_rate)

        # Load checkpoint and check compatibility with the saved config
        if os.path.isfile(self.__config_path):
            with open(self.__config_path, 'r') as f:
                content = f.read()[:-1]
            if content != repr(self):
                # save the error info
                with open('config.err', 'w') as f:
                    print(f'f.read():\n{content}', file=f)
                    print(f'repr(self):\n{repr(self)}', file=f)
                raise ValueError(
                    "Cannot create this experiment: "
                    "I found a checkpoint conflicting with the current setting.")
            self.load(file_name=args.checkpoint)
        else:
            self.save()

    def setting(self):
        ''' Return the setting of the experiment. '''
        return {'Net': (self.__encoder, self.__decoder),
                'Optimizer': self.__optimizer,
                'BatchSize': self.__args.batch_size}

    @property
    def epoch(self):
        # Number of completed epochs == length of the recorded history.
        return len(self.__history)

    @property
    def history(self):
        return self.__history

    def __repr__(self):
        '''
        Pretty printer showing the setting of the experiment. This is what
        is displayed when doing `print(experiment)`. This is also what is
        saved in the `config.txt` file.
        '''
        string = ''
        for key, val in self.setting().items():
            string += '{}({})\n'.format(key, val)
        return string

    def state_dict(self):
        ''' Returns the current state of the model. '''
        return {'Net': (self.__encoder.state_dict(),
                        self.__decoder.state_dict()),
                'Optimizer': self.__optimizer.state_dict(),
                'History': self.__history}

    def save(self):
        ''' Saves the model on disk, i.e, create/update the last checkpoint. '''
        file_name = os.path.join(
            self.__args.model_dir,
            '{}{}-epoch-{}.ckpt'.format(self.__args.encoder,
                                        self.__args.encoder_ver, self.epoch))
        torch.save(self.state_dict(), file_name)
        with open(self.__config_path, 'w') as f:
            print(self, file=f)
        print(f'Save to {file_name}.')

    def load(self, file_name=None):
        '''
        Loads the model from the last checkpoint saved on disk.

        Args:
            file_name (str): path to the checkpoint file
        '''
        if not file_name:
            # find the latest .ckpt file
            try:
                file_name = max(
                    glob.iglob(os.path.join(self.__args.model_dir, '*.ckpt')),
                    key=os.path.getctime)
                print(f'Load from {file_name}.')
            # FIX: was a bare `except:`; max() on an empty iterator
            # raises ValueError
            except ValueError:
                raise FileNotFoundError(
                    'No checkpoint file in the model directory.')
        else:
            file_name = os.path.join(self.__args.model_dir, file_name)
            print(f'Load from {file_name}.')
        try:
            checkpoint = torch.load(file_name, map_location=self.__device)
        # FIX: was a bare `except:`; torch.load raises OSError for a missing
        # path and RuntimeError for a corrupt file
        except (OSError, RuntimeError):
            raise FileNotFoundError(
                'Please check --checkpoint, the name of the file')
        self.load_state_dict(checkpoint)
        del checkpoint

    def load_state_dict(self, checkpoint):
        '''
        Loads the model from the input checkpoint.

        Args:
            checkpoint: an object saved with torch.save() from a file.
        '''
        self.__encoder.load_state_dict(checkpoint['Net'][0])
        self.__decoder.load_state_dict(checkpoint['Net'][1])
        self.__optimizer.load_state_dict(checkpoint['Optimizer'])
        self.__history = checkpoint['History']

        # The following loops are used to fix a bug that was
        # discussed here: https://github.com/pytorch/pytorch/issues/2830
        # (it is supposed to be fixed in recent PyTorch version)
        for state in self.__optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(self.__device)

    def train(self, plot_loss=None):
        '''
        Train the network using backpropagation based
        on the optimizer and the training set.

        Args:
            plot_loss (func, optional): if not None, a function taking a
                single argument (this experiment) used to inspect/plot/save
                statistics at each epoch. (default: None)
        '''
        self.__encoder.train()
        self.__decoder.train()
        self.__stats_manager.init()
        total_step = len(self.__train_loader)
        start_epoch = self.epoch
        print("Start/Continue training from epoch {}".format(start_epoch))
        if plot_loss is not None:
            plot_loss(self)
        for epoch in range(start_epoch, self.__args.num_epochs):
            t_start = time.time()
            self.__stats_manager.init()
            for i, (images, captions, lengths) in enumerate(self.__train_loader):
                # Set mini-batch dataset
                if not self.__attention_mechanism:
                    images = images.to(self.__device)
                    captions = captions.to(self.__device)
                else:
                    with torch.no_grad():
                        images = images.to(self.__device)
                        captions = captions.to(self.__device)
                targets = pack_padded_sequence(
                    captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                if not self.__attention_mechanism:
                    features = self.__encoder(images)
                    outputs = self.__decoder(features, captions, lengths)
                    self.__decoder.zero_grad()
                    self.__encoder.zero_grad()
                else:
                    self.__encoder.zero_grad()
                    self.__decoder.zero_grad()
                    features, cnn_features = self.__encoder(images)
                    outputs = self.__decoder(
                        features, captions, lengths, cnn_features=cnn_features)
                loss = self.__criterion(outputs, targets)
                loss.backward()
                self.__optimizer.step()
                with torch.no_grad():
                    self.__stats_manager.accumulate(
                        loss=loss.item(), perplexity=np.exp(loss.item()))

                # Print log info each iteration
                if i % self.__args.log_step == 0:
                    print('[Training] Epoch: {}/{} | Step: {}/{} | Loss: {:.4f} | Perplexity: {:5.4f}'
                          .format(epoch+1, self.__args.num_epochs, i,
                                  total_step, loss.item(),
                                  np.exp(loss.item())))

            if not self.__validate_when_training:
                self.__history.append(self.__stats_manager.summarize())
                print("Epoch {} | Time: {:.2f}s\nTraining Loss: {:.6f} | Training Perplexity: {:.6f}".format(
                    self.epoch, time.time() - t_start,
                    self.__history[-1]['loss'],
                    self.__history[-1]['perplexity']))
            else:
                self.__history.append(
                    (self.__stats_manager.summarize(), self.evaluate()))
                print("Epoch {} | Time: {:.2f}s\nTraining Loss: {:.6f} | Training Perplexity: {:.6f}\nEvaluation Loss: {:.6f} | Evaluation Perplexity: {:.6f}".format(
                    self.epoch, time.time() - t_start,
                    self.__history[-1][0]['loss'],
                    self.__history[-1][0]['perplexity'],
                    self.__history[-1][1]['loss'],
                    self.__history[-1][1]['perplexity']))

            # Save the model checkpoints
            self.save()
            if plot_loss is not None:
                plot_loss(self)
        print("Finish training for {} epochs".format(self.__args.num_epochs))

    def evaluate(self, print_info=False):
        '''
        Evaluates the experiment, i.e., forward propagates the validation set
        through the network and returns the statistics computed by the stats
        manager.

        Args:
            print_info (bool): print the results of loss and perplexity
        '''
        self.__stats_manager.init()
        self.__encoder.eval()
        self.__decoder.eval()
        total_step = len(self.__val_loader)
        with torch.no_grad():
            for i, (images, captions, lengths) in enumerate(self.__val_loader):
                images = images.to(self.__device)
                captions = captions.to(self.__device)
                targets = pack_padded_sequence(
                    captions, lengths, batch_first=True)[0]

                # Forward
                if not self.__attention_mechanism:
                    features = self.__encoder(images)
                    outputs = self.__decoder(features, captions, lengths)
                else:
                    features, cnn_features = self.__encoder(images)
                    outputs = self.__decoder(
                        features, captions, lengths, cnn_features=cnn_features)
                loss = self.__criterion(outputs, targets)
                self.__stats_manager.accumulate(
                    loss=loss.item(), perplexity=np.exp(loss.item()))
                if i % self.__args.log_step == 0:
                    print('[Validation] Step: {}/{} | Loss: {:.4f} | Perplexity: {:5.4f}'
                          .format(i, total_step, loss.item(),
                                  np.exp(loss.item())))
        summarize = self.__stats_manager.summarize()
        if print_info:
            print(
                f'[Validation] Average loss for this epoch is {summarize["loss"]:.6f}')
            print(
                f'[Validation] Average perplexity for this epoch is {summarize["perplexity"]:.6f}\n')
        # restore training mode
        self.__encoder.train()
        self.__decoder.train()
        return summarize

    def mode(self, mode=None):
        '''
        Get the current mode or change mode.

        Args:
            mode (str): 'train' or 'eval' mode
        '''
        if not mode:
            return self.__mode
        self.__mode = mode

    def __load_image(self, image):
        '''
        Load image at `image_path` for evaluation.

        Args:
            image (PIL Image): image
        '''
        image = image.resize([224, 224], Image.LANCZOS)
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))])
        image = transform(image).unsqueeze(0)
        return image

    def test(self, image_path=None, plot=False):
        '''
        Evaluate the model by generating the caption for the
        corresponding image at `image_path`.
        Note: This function will not provide BLEU socre.

        Args:
            image_path (str): file path of the evaluation image
            plot (bool): plot or not
        '''
        self.__encoder.eval()
        self.__decoder.eval()
        with torch.no_grad():
            if not image_path:
                image_path = self.__args.image_path
            image = Image.open(image_path)

            # only process with RGB image
            if np.array(image).ndim == 3:
                img = self.__load_image(image).to(self.__device)

                # generate an caption
                if not self.__attention_mechanism:
                    feature = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature)
                    sampled_ids = sampled_ids[0].cpu().numpy()
                else:
                    feature, cnn_features = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature, cnn_features)
                    sampled_ids = sampled_ids.cpu().data.numpy()

                # Convert word_ids to words, stopping at '<end>'
                sampled_caption = []
                for word_id in sampled_ids:
                    word = self.__vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption[1:-1])

                # Print out the image and the generated caption
                print(sentence)
                if plot:
                    image = Image.open(image_path)
                    plt.imshow(np.asarray(image))
            else:
                print('Not support for non-RGB image.')
        # restore training mode
        self.__encoder.train()
        self.__decoder.train()

    def coco_image(self, idx, ds='val'):
        '''
        Access iamge_id (which is part of the file name) and corresponding
        image caption of index `idx` in COCO dataset.
        Note: For jupyter notebook

        Args:
            idx (int): index of COCO dataset

        Returns:
            (dict)
        '''
        # FIX: was the tautology `assert(ds == 'train' or 'val')`
        assert ds in ('train', 'val')
        if ds == 'train':
            ann_id = self.__coco_train.ids[idx]
            return self.__coco_train.coco.anns[ann_id]
        else:
            ann_id = self.__coco_val.ids[idx]
            return self.__coco_val.coco.anns[ann_id]

    @property
    def len_of_train_set(self):
        ''' Number of training '''
        return len(self.__coco_train)

    @property
    def len_of_val_set(self):
        return len(self.__coco_val)

    def bleu_score(self, idx, ds='val', plot=False, show_caption=False):
        '''
        Evaluate the BLEU score for index `idx` in COCO dataset.
        Note: For jupyter notebook

        Args:
            idx (int): index
            ds (str): training or validation dataset
            plot (bool): plot the image or not

        Returns:
            score (float): bleu score
        '''
        # FIX: was the tautology `assert(ds == 'train' or 'val')`
        assert ds in ('train', 'val')
        self.__encoder.eval()
        self.__decoder.eval()
        with torch.no_grad():
            try:
                if ds == 'train':
                    ann_id = self.__coco_train.ids[idx]
                    coco_ann = self.__coco_train.coco.anns[ann_id]
                else:
                    ann_id = self.__coco_val.ids[idx]
                    coco_ann = self.__coco_val.coco.anns[ann_id]
            # FIX: was a bare `except:`; list indexing raises IndexError and
            # the anns dict lookup raises KeyError
            except (IndexError, KeyError):
                raise IndexError('Invalid index')

            # zero-pad the image id to 6 digits to match COCO file names
            image_id = str(coco_ann['image_id']).zfill(6)

            image_path = f'{self.__args.image_dir}/COCO_train2014_000000{image_id}.jpg'
            if ds == 'val':
                image_path = image_path.replace('train', 'val')

            # NOTE(review): coco_list is a flat word list; sentence_bleu
            # expects a list of reference sentences — confirm intended usage.
            coco_list = coco_ann['caption'].split()

            image = Image.open(image_path)
            if np.array(image).ndim == 3:
                img = self.__load_image(image).to(self.__device)

                # generate an caption
                if not self.__attention_mechanism:
                    feature = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature)
                    sampled_ids = sampled_ids[0].cpu().numpy()
                else:
                    feature, cnn_features = self.__encoder(img)
                    sampled_ids = self.__decoder.sample(feature, cnn_features)
                    sampled_ids = sampled_ids.cpu().data.numpy()

                # Convert word_ids to words, stopping at '<end>'
                sampled_caption = []
                for word_id in sampled_ids:
                    word = self.__vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break

                # strip punctuations and spacing
                sampled_list = [c for c in sampled_caption[1:-1]
                                if c not in punctuation]

                score = sentence_bleu(coco_list, sampled_list,
                                      smoothing_function=SmoothingFunction().method4)
                if plot:
                    plt.figure()
                    image = Image.open(image_path)
                    plt.imshow(np.asarray(image))
                    plt.title(f'score: {score}')
                    plt.xlabel(f'file: {image_path}')
                # Print out the generated caption
                if show_caption:
                    print(f'Sampled caption:\n{sampled_list}')
                    print(f'COCO caption:\n{coco_list}')
            else:
                print('Not support for non-RGB image.')
                return
        return score
def main(args):
    """Train the attention + skip-thought VQA decoder.

    Uses precomputed image embeddings, UniSkip caption embeddings and a
    T_Att attention module feeding an RNN decoder over QA sequences.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Load vocab_list for uniskip
    vocab_list = pd.read_csv("./data/vocab_list.csv", header=None)
    vocab_list = vocab_list.values.tolist()[0]

    # Build data loader
    data_loader = get_loader(args.image_dir, args.img_embeddings_dir,
                             args.data_path, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    attention = T_Att()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers, args.dropout)
    uniskip = UniSkip('./data/skip-thoughts', vocab_list)
    if torch.cuda.is_available():
        attention.cuda()
        decoder.cuda()
        uniskip.cuda()

    # Loss and Optimizer (uniskip is used frozen; not in `params`)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(attention.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, cap_lengths, qa, qa_lengths,
                vocab_words) in enumerate(tqdm(data_loader)):
            # Re-initialize decoder hidden state
            decoder.hidden = decoder.init_hidden()

            # Set mini-batch dataset
            img_embeddings = to_var(images.data, volatile=True)
            captions = to_var(captions)
            qa = to_var(qa)
            targets = pack_padded_sequence(qa, qa_lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            attention.zero_grad()
            cap_embeddings = uniskip(captions, cap_lengths)
            cap_embeddings = cap_embeddings.data
            img_embeddings = img_embeddings.data
            ctx_vec = attention(img_embeddings, cap_embeddings)
            outputs = decoder(ctx_vec, qa, qa_lengths)
            # FIX: dropped unused `predicted = outputs.max(1)[1]`
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                # FIX: loss.data[0] raises on PyTorch >= 0.5; use loss.item()
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.item(), np.exp(loss.item())))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(attention.state_dict(),
                           os.path.join(args.model_path,
                                        'attention-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Train the captioning encoder/decoder; checkpoints go to args.model_path."""
    if not os.path.exists(args.model_path):
        # create model folder to keep model setting pickle files
        os.makedirs(args.model_path)

    # image preprocessing and normalization
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)  # load vocabulary wrapper file

    # get data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder
    if torch.cuda.is_available():  # load GPU
        encoder.cuda()
        decoder.cuda()

    criterion = nn.CrossEntropyLoss()  # get loss
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)  # get optimization

    # train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # set mini batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # forward and backward
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()  # optimization

            # Print loss and perplexity
            if i % args.log_step == 0:
                # FIX: loss.data[0] raises on PyTorch >= 0.5; use loss.item()
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step,
                       loss.item(), np.exp(loss.item())))

            # save the models pickle file settings
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Adversarially train a caption generator (EncoderCNN + DecoderRNN)
    against a Discriminator that scores (image, caption) pairs.

    Per batch: one generator step on the cross-entropy caption loss, then
    one discriminator step on real / sampled / mismatched captions.
    Checkpoints are written at the last iteration of every epoch, and the
    discriminator-loss curve is re-plotted after each epoch.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader. NOTE(review): this loader must yield 5-tuples
    # including wrong_captions/wrong_lengths — confirm get_loader supports it.
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models (Gen)
    # TODO: put these in generator
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    # Build the models (Disc)
    discriminator = Discriminator(args.embed_size, args.hidden_size,
                                  len(vocab), args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        discriminator.cuda()

    # Loss and Optimizer (Gen)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    # Loss and Optimizer (Disc) — uses Adam's default learning rate.
    params_disc = list(discriminator.parameters())
    optimizer_disc = torch.optim.Adam(params_disc)

    # Train the Models
    total_step = len(data_loader)
    disc_losses = []
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, wrong_captions,
                wrong_lengths) in enumerate(data_loader):
            # pdb.set_trace()
            # TODO: train disc before gen
            # Set mini-batch dataset.
            # NOTE(review): volatile=True on the images disables gradient
            # tracking through the encoder in pre-0.4 PyTorch — the encoder
            # params listed in `params` would get no gradients; confirm.
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            wrong_captions = to_var(wrong_captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize (generator).
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            sampled_captions = decoder.sample(features)
            # sampled_captions = torch.zeros_like(sampled_ids)

            # Derive a length per sampled caption: position of '<end>' + 1,
            # or the full width if no '<end>' token appears in that row.
            sampled_lengths = []
            for row in range(sampled_captions.size(0)):
                for index, word_id in enumerate(sampled_captions[row, :]):
                    # pdb.set_trace()
                    word = vocab.idx2word[word_id.cpu().data.numpy()[0]]
                    # sampled_captions[row, index].data = word
                    if word == '<end>':
                        sampled_lengths.append(index + 1)
                        break
                    elif index == sampled_captions.size(1) - 1:
                        sampled_lengths.append(sampled_captions.size(1))
                        break
            # Sort lengths descending in place (pack_padded_sequence
            # convention).
            sampled_lengths = np.array(sampled_lengths)
            sampled_lengths[::-1].sort()
            sampled_lengths = sampled_lengths.tolist()

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Train discriminator: reward real pairs, penalize sampled and
            # mismatched ("wrong") captions. log(1-x) is clamped to avoid
            # -inf when a reward saturates at 1.
            discriminator.zero_grad()
            rewards_real = discriminator(images, captions, lengths)
            rewards_fake = discriminator(images, sampled_captions,
                                         sampled_lengths)
            rewards_wrong = discriminator(images, wrong_captions,
                                          wrong_lengths)
            real_loss = -torch.mean(torch.log(rewards_real))
            fake_loss = -torch.mean(
                torch.clamp(torch.log(1 - rewards_fake), min=-1000))
            wrong_loss = -torch.mean(
                torch.clamp(torch.log(1 - rewards_wrong), min=-1000))
            loss_disc = real_loss + fake_loss + wrong_loss
            disc_losses.append(loss_disc.cpu().data.numpy()[0])
            loss_disc.backward()
            optimizer_disc.step()
            # print('iteration %i' % i)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            # if (i+1) % args.save_step == 0:
            if (i + 1) % total_step == 0:
                # jm: saving at the last iteration instead
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    discriminator.state_dict(),
                    os.path.join(
                        args.model_path,
                        'discriminator-%d-%d.pkl' % (epoch + 1, i + 1)))
        # plot at the end of every epoch
        plt.plot(disc_losses, label='disc loss')
        plt.savefig('disc_losses.png')
        plt.clf()
def main(args):
    """Train an EncoderCNN/DecoderRNN captioner on SASR/Frogger data.

    Unlike the sibling trainers, the entire encoder (including the resnet
    backbone) is in the optimizer, and batches of size 1 are skipped.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Light augmentation: small random translation only.
    # NOTE(review): the Normalize mean (0.8, 0.7, 0.8) with unit std is
    # unusual — presumably tuned for Frogger frames; confirm.
    transform = transforms.Compose([
        # transforms.ColorJitter(contrast = 0.3,saturation = 0.3),
        # transforms.RandomChoice([transforms.RandomHorizontalFlip(),transforms.RandomVerticalFlip()]),
        transforms.RandomAffine(0,translate = (0.1,0.1)),
        transforms.ToTensor(),
        transforms.Normalize((0.8, 0.7, 0.8), (1, 1, 1))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # data_loader = get_loader(args.image_dir, args.caption_path, vocab,
    #                          transform, args.batch_size,
    #                          shuffle=True, num_workers=args.num_workers)
    # Project-specific loader: ingests the raw data file, then exposes a
    # torch-style loader over it.
    sasr_data_loader = SASR_Data_Loader(vocab,transform)
    sasr_data_loader.load_data(args.data_file,args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(
        args.batch_size, transform, shuffle=True,
        num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer — note the resnet backbone is trained too here.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters()) + list(
        encoder.resnet.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    stransform = transforms.ToPILImage()  # NOTE(review): unused below
    img2vec = Img2Vec()                   # NOTE(review): unused below

    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i,(images,captions,lengths) in enumerate(frogger_data_loader):
            # NOTE(review): volatile=True disables gradient tracking in
            # pre-0.4 PyTorch, which conflicts with training the encoder
            # backbone above — confirm intended.
            images = to_var(images, volatile=True)
            # images = images.to(device)
            # Skip degenerate batches of a single image.
            if (list(images.size())[0]!=1):
                captions = to_var(captions)
                # print(images[0])
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]
                decoder.zero_grad()
                encoder.zero_grad()
                # print(images)
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          %(epoch, args.num_epochs, i, total_step,
                            loss.data[0], np.exp(loss.data[0])))

                # Save the models
                if (i+1) % args.save_step == 0:
                    torch.save(decoder.state_dict(),
                               os.path.join(args.model_path,
                                            'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                    torch.save(encoder.state_dict(),
                               os.path.join(args.model_path,
                                            'encoder-%d-%d.pkl' %(epoch+1, i+1)))
def train(
        num_epochs: int,
        lr: float,
        batch_size: int,
        vocab_threshold: int,
        vocab_from_file: bool,
        embed_size: int,
        hidden_size: int,
        save_every: int,
        print_every: int,
        log_file: str
) -> None:
    """Train the captioning network with the required parameters.

    The training logs are saved in log_file.

    Args:
        num_epochs: Number of epochs to train the model.
        lr: Learning rate for the Adam optimizer.
        batch_size: Mini-batch size for training.
        vocab_threshold: Minimum word count threshold for vocabulary
            initialisation. A word that appears fewer times than this is
            discarded — the smaller the threshold, the bigger the vocabulary.
        vocab_from_file: Whether to load the vocabulary from a
            pre-initialized file.
        embed_size: Dimensionality of image and word embeddings.
        hidden_size: Number of features in hidden state of the RNN decoder.
        save_every: Number of epochs between each checkpoint saving.
        print_every: Number of batches for printing average loss.
        log_file: Name of the training log file. Saves loss and perplexity.
    """
    transform_train = transforms.Compose([
        transforms.Resize(256),             # smaller edge of image resized to 256
        transforms.RandomCrop(224),         # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(),  # flip image with probability=0.5
        transforms.ToTensor(),              # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),    # normalize image for
                             (0.229, 0.224, 0.225))])  # pre-trained model

    # Build data loader.
    data_loader = get_loader(transform=transform_train,
                             mode='train',
                             batch_size=batch_size,
                             vocab_threshold=vocab_threshold,
                             vocab_from_file=vocab_from_file)

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder.
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move models to GPU if CUDA is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)

    # Define the loss function.
    criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

    # Parameters to update. We do not re-train the CNN here.
    params = list(encoder.embed.parameters()) + list(decoder.parameters())

    # TODO: add learning rate scheduler
    # Optimizer for minimum search.
    optimizer = optim.Adam(params, lr=lr)

    # Set the total number of training steps per epoch.
    total_step = math.ceil(len(data_loader.dataset.caption_lengths) /
                           data_loader.batch_sampler.batch_size)

    # BUGFIX: the log file was opened with a bare open()/close() pair, so
    # any exception during training leaked the handle and lost buffered
    # log lines. A context manager guarantees the file is closed.
    with open(log_file, 'w') as f:
        for epoch in range(1, num_epochs + 1):
            for i_step in range(1, total_step + 1):
                # Randomly sample a caption length, and sample indices with
                # that length.
                indices = data_loader.dataset.get_train_indices()
                # Create and assign a batch sampler to retrieve a batch with
                # the sampled indices.
                new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
                data_loader.batch_sampler.sampler = new_sampler

                # Obtain the batch.
                images, captions = next(iter(data_loader))

                # Move batch of images and captions to GPU if CUDA is available.
                images = images.to(device)
                captions = captions.to(device)

                # Zero the gradients.
                decoder.zero_grad()
                encoder.zero_grad()

                # Pass the inputs through the CNN-RNN model.
                features = encoder(images)
                outputs = decoder(features, captions)

                # Calculate the batch loss.
                loss = criterion(outputs.view(-1, vocab_size),
                                 captions.view(-1))

                # Backward pass.
                loss.backward()

                # Update the parameters in the optimizer.
                optimizer.step()

                # Get training statistics.
                stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (
                    epoch, num_epochs, i_step, total_step, loss.item(),
                    np.exp(loss.item()))

                # Print training statistics (on same line).
                print('\r' + stats, end="")
                sys.stdout.flush()

                # Print training statistics to file.
                f.write(stats + '\n')
                f.flush()

                # Print training statistics (on different line).
                if i_step % print_every == 0:
                    print('\r' + stats)

            # Save the weights.
            if epoch % save_every == 0:
                torch.save(decoder.state_dict(),
                           os.path.join('./models', f"{device}_{hidden_size}_decoder-{epoch}.pkl"))
                torch.save(encoder.state_dict(),
                           os.path.join('./models', f"{device}_{hidden_size}_encoder-{epoch}.pkl"))
def main(cfg):
    """Train the captioning encoder/decoder from a Hydra config ``cfg``.

    Paths in the config are resolved against the original working
    directory via ``hydra.utils.to_absolute_path``. Checkpoints are saved
    every ``cfg.train.save_step`` batches.
    """
    # Create the model directory if it does not exist yet.
    if not os.path.exists(hydra.utils.to_absolute_path(cfg.train.model_path)):
        os.makedirs(hydra.utils.to_absolute_path(cfg.train.model_path))

    # Image preprocessing and normalization.
    transform = transforms.Compose([
        transforms.RandomCrop(cfg.image.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])

    # Load the pickled vocabulary wrapper.
    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # Build the data loader.
    data_loader = get_loader(hydra.utils.to_absolute_path(cfg.train.image_dir),
                             hydra.utils.to_absolute_path(cfg.train.caption_path),
                             vocab, transform, cfg.train.batch_size,
                             shuffle=True, num_workers=cfg.train.num_workers)

    # Build the models.
    encoder = EncoderCNN(cfg.train.embed_size).to(device)
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size,
                         len(vocab), cfg.train.num_layers).to(device)

    # Loss and optimizer; only the decoder plus the encoder's final
    # linear/bn layers are trained.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=cfg.train.learning_rate)

    # Training loop.
    total_step = len(data_loader)
    for epoch in range(cfg.train.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Move the mini-batch to the training device.
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            if i % cfg.train.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, cfg.train.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save a model checkpoint every ``save_step`` batches.
            if (i + 1) % cfg.train.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    hydra.utils.to_absolute_path(cfg.train.model_path),
                    'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    hydra.utils.to_absolute_path(cfg.train.model_path),
                    'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main(args): tensor_board_writer = Logger() # Create model directory # if not os.path.exists(args.model_path): # os.makedirs(args.model_path) # Image preprocessing, normalization for the pretrained resnet transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) with open(args.test_vocab_path, 'rb') as f: test_vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) test_data_loader = get_loader(args.test_image_dir, args.test_caption_path, test_vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models encoder = EncoderCNN(args.embed_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) for epoch in trange(args.num_epochs): for i, (images, captions, lengths) in enumerate(tqdm(data_loader)): # Set mini-batch dataset images = images.to(device) # print(images.shape) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, backward and optimize features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step()
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_loader(args.image_dir, args.caption_path, vocab, args.dictionary, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models #encoder = EncoderCNN(args.embed_size).to(device) dictionary = pd.read_csv(args.dictionary, header=0, encoding='unicode_escape', error_bad_lines=False) dictionary = list(dictionary['keys']) decoder = DecoderRNN(len(dictionary), args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters( )) # + list(encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (array, captions, lengths) in enumerate(data_loader): # Set mini-batch dataset array = array.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] # Forward, backward and optimize #features = encoder(images) outputs = decoder(array, captions, lengths) loss = criterion(outputs, targets) decoder.zero_grad() #encoder.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) # Save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main(args):
    """Train the encoder/decoder for the selected fusion ``args.mode``.

    ``args.mode`` selects which encoder projection head is fine-tuned:
    "side_by_side" -> encoder.linear_to_embed_size,
    "depthwise"    -> encoder.linear_to_embed_size_concat.
    Checkpoints are written to ``<model_path>_<model_type>/``.
    """
    # Create model directory (one per model type).
    if not os.path.exists(args.model_path + "_" + args.model_type):
        os.makedirs(args.model_path + "_" + args.model_type)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        #transforms.RandomCrop(args.crop_size),
        #transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        #transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers, mode=args.mode)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.model_type,
                         args.mode).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer: fine-tune only the decoder plus the
    # mode-specific projection head of the encoder.
    criterion = nn.CrossEntropyLoss()
    if args.mode == "side_by_side":
        params = list(decoder.parameters()) + list(
            encoder.linear_to_embed_size.parameters())
    elif args.mode == "depthwise":
        params = list(decoder.parameters()) + list(
            encoder.linear_to_embed_size_concat.parameters())
    else:
        # BUGFIX: previously there was no else-branch, so an unrecognized
        # mode left ``params`` unbound and the Adam construction below
        # crashed with an unrelated NameError. Fail fast with a clear
        # message instead.
        raise ValueError(
            "Unknown mode: {!r} (expected 'side_by_side' or 'depthwise')"
            .format(args.mode))
    # Train-from-scratch variants, kept for reference:
    #if(args.mode == "side_by_side"): params = list(decoder.parameters()) + list(encoder.linear_to_embed_size.parameters()) + list(encoder.bn.parameters()) + list(encoder.feature_extractor.parameters())
    #elif(args.mode == "depthwise"): params = list(decoder.parameters()) + list(encoder.linear_to_embed_size_concat.parameters()) + list(encoder.bn.parameters()) + list(encoder.feature_extractor.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            #Image.fromarray((((images[0].permute(1, 2, 0).detach().cpu().numpy() + 1) / 2) * 255).astype(np.uint8)).show()
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints. The filenames carry no step
            # index, so each save overwrites the previous one — only the
            # latest weights persist.
            torch.save(decoder.state_dict(), os.path.join(
                args.model_path + "_" + args.model_type + "/", 'decoder.pt'))
            torch.save(encoder.state_dict(), os.path.join(
                args.model_path + "_" + args.model_type + "/", 'encoder.pt'))
def main(args):
    """Train a captioner on Frogger screenshots paired with free-text
    "rationalizations" read from ./Log/Rationalizations.txt.

    NOTE(review): the training loop below contains a leftover debug
    ``print(images[0][1]); exit(0)`` which terminates the process on the
    very first batch — everything after it is dead code. Remove the two
    lines to actually train.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Read rationalization data: lowercase, strip everything but letters,
    # spaces and apostrophes, then map words to vocabulary indices.
    rationalizations = []
    max_length = 0
    lengths = []
    # Mechanical-Turk workers whose annotations should be excluded.
    # NOTE(review): this list is never consulted below — confirm whether
    # filtering was meant to happen while reading the file.
    bad_worker_ids = [
        'A2CNSIECB9UP05', 'A23782O23HSPLA', 'A2F9ZBSR6AXXND',
        'A3GI86L18Z71XY', 'AIXTI8PKSX1D2', 'A2QWHXMFQI18GQ',
        'A3SB7QYI84HYJT', 'A2Q2A7AB6MMFLI', 'A2P1KI42CJVNIA',
        'A1IJXPKZTJV809', 'A2WZ0RZMKQ2WGJ', 'A3EKETMVGU2PM9',
        'A1OCEC1TBE3CWA', 'AE1RYK54MH11G', 'A2ADEPVGNNXNPA',
        'A15QGLWS8CNJFU', 'A18O3DEA5Z4MJD', 'AAAL4RENVAPML',
        'A3TZBZ92CQKQLG', 'ABO9F0JD9NN54', 'A8F6JFG0WSELT',
        'ARN9ET3E608LJ', 'A2TCYNRAZWK8CC', 'A32BK0E1IPDUAF',
        'ANNV3E6CIVCW4'
    ]
    with open('./Log/Rationalizations.txt') as f:
        for line in f:
            line = line.lower()
            line = re.sub('[^a-z\ \']+', " ", line)
            words = line.split()
            length = len(words)
            lengths.append(length)
            if length > max_length:
                max_length = length
            for index, word in enumerate(words):
                words[index] = vocab.word2idx[word]
            rationalizations.append(words)
    # max_length = max(rationalizations,key=len
    rationalizations = [np.array(xi) for xi in rationalizations]
    # for index,r in enumerate(rationalizations):
    #     r = np.lib.pad(r,(0,max_length - len(r)),'constant')
    #     rationalizations[index] = r
    # rationalizations = np.vstack(rationalizations)
    # rationalizations = torch.from_numpy(rationalizations)

    # Build data loader (built but unused below; images come from
    # get_images instead).
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    frogger_data_loader = get_images('./data/FroggerDataset/',
                                     args.batch_size, transform)
    # exit(0)
    # Train the Models
    # data = iter(frogger_data_loader)
    # imgs = data.next()[0]
    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, x in enumerate(frogger_data_loader):
            # NOTE(review): volatile=True disables gradients in pre-0.4
            # PyTorch; loss.backward() below cannot work with it.
            images = to_var(x[0], volatile=True)
            # NOTE(review): leftover debug — kills the process on batch 0.
            print(images[0][1])
            exit(0)
            # Pair each image batch with two rationalizations, padded to a
            # common length and sorted by decreasing length as required by
            # pack_padded_sequence.
            captions = []
            max_length = max(lengths[i:i + 2])
            rats = rationalizations[i:i + 2]
            rats.sort(key=lambda s: len(s))
            rats.reverse()
            for index, r in enumerate(rats):
                r = np.lib.pad(r, (0, max_length - len(r)), 'constant')
                captions.append(r)
            captions = to_var(torch.from_numpy(np.asarray(captions)))
            new_lengths = []
            new_lengths = lengths[i:i + 2]
            new_lengths.sort()
            new_lengths.reverse()
            captions = captions
            targets = pack_padded_sequence(captions, new_lengths,
                                           batch_first=True)[0]
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, new_lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
def main(args):
    """Adversarially train a poem decoder against a real/fake discriminator.

    Per batch: the discriminator takes one step on real vs. sampled poems,
    then the decoder takes one step on a policy-gradient reward term plus
    the usual cross-entropy loss. Checkpoints go to args.ckpt/args.disc
    every ``save_step`` global steps, with a final save at the end.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    with open('data/multim_poem.json') as f, open('data/unim_poem.json') as unif:
        multim = json.load(f)
        unim = json.load(unif)
    multim = util.filter_multim(multim)
    # multim = multim[:128]

    with open('data/img_features.pkl', 'rb') as fi, open('data/poem_features.pkl', 'rb') as fp:
        img_features = pickle.load(fi)
        poem_features = pickle.load(fp)

    # make sure vocab exists
    word2idx, idx2word = util.read_vocab_pickle(args.vocab_path)  # will be used in embedder

    # Choose training source: unimodal poems or multimodal (image) poems.
    if args.source == 'unim':
        data = unim
        features = poem_features
    elif args.source == 'multim':
        data = multim
        features = img_features
    else:
        print('Error: source must be unim or multim!')
        exit()

    # create data loader. the data will be in decreasing order of length
    data_loader = get_poem_poem_dataset(args.batch_size, shuffle=True,
                                        num_workers=args.num_workers,
                                        json_obj=data, features=features,
                                        max_seq_len=128, word2idx=word2idx,
                                        tokenizer=None)

    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(word2idx), device)
    decoder = DataParallel(decoder)
    if args.restore:
        decoder.load_state_dict(torch.load(args.ckpt))
    if args.load:
        decoder.load_state_dict(torch.load(args.load))
    decoder.to(device)

    # Discriminator shares the decoder's word-embedding weights.
    discriminator = Discriminator(args.embed_size, args.hidden_size,
                                  len(word2idx), num_labels=2)
    discriminator.embed.weight = decoder.module.embed.weight
    discriminator = DataParallel(discriminator)
    if args.restore:
        discriminator.load_state_dict(torch.load(args.disc))
    discriminator.to(device)

    # optimization config
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(decoder.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3, 10], gamma=0.33)
    optimizerD = torch.optim.Adam(discriminator.parameters(), lr=args.learning_rate)

    sys.stderr.write('Start training...\n')
    total_step = len(data_loader)
    decoder.train()
    global_step = 0
    running_ls = 0
    for epoch in range(args.num_epochs):
        # NOTE(review): in PyTorch >= 1.1 scheduler.step() should follow
        # optimizer.step(); calling it at epoch start skips the first LR.
        scheduler.step()
        acc_ls = 0
        start = time.time()
        for i, (batch) in enumerate(data_loader):
            poem_embed, ids, lengths = [t.to(device) for t in batch]
            # Targets drop the leading token (shifted-by-one LM objective).
            targets = pack_padded_sequence(ids[:, 1:], lengths, batch_first=True)[0]

            # train discriminator
            # train with real
            discriminator.zero_grad()
            pred_real = discriminator(ids[:, 1:], lengths)
            real_label = torch.ones(ids.size(0), dtype=torch.long).to(device)
            loss_d_real = criterion(pred_real, real_label)
            loss_d_real.backward(torch.ones_like(loss_d_real), retain_graph=True)

            # train with fake: sample a poem from the decoder's softmax.
            logits = decoder(poem_embed, ids, lengths)
            weights = F.softmax(logits, dim=-1)
            m = Categorical(probs=weights)
            generated_ids = m.sample()
            # generated_ids = torch.argmax(logits, dim=-1)
            pred_fake = discriminator(generated_ids.detach(), lengths)
            fake_label = torch.zeros(ids.size(0)).long().to(device)
            loss_d_fake = criterion(pred_fake, fake_label)
            loss_d_fake.backward(torch.ones_like(loss_d_fake), retain_graph=True)
            loss_d = loss_d_real.mean().item() + loss_d_fake.mean().item()
            optimizerD.step()

            # train generator: REINFORCE-style term weighted by the
            # discriminator's "real" probability, plus cross-entropy.
            decoder.zero_grad()
            reward = F.softmax(pred_fake, dim=-1)[:, 1].unsqueeze(-1)
            loss_r = -m.log_prob(generated_ids) * reward
            loss_r.backward(torch.ones_like(loss_r), retain_graph=True)
            loss_r = loss_r.mean().item()
            loss = criterion(pack_padded_sequence(logits, lengths, batch_first=True)[0], targets)
            loss.backward(torch.ones_like(loss))
            loss = loss.mean().item()
            # loss = loss_r
            running_ls += loss
            acc_ls += loss

            # Clip each parameter's gradient norm before the step.
            for param in decoder.parameters():
                torch.nn.utils.clip_grad_norm_(param, 0.25)
            optimizer.step()
            global_step += 1

            if global_step % args.log_step == 0:
                elapsed_time = time.time() - start
                iters_per_sec = (i + 1) / elapsed_time
                remaining = (total_step - i - 1) / iters_per_sec
                remaining_fmt = time.strftime("%H:%M:%S", time.gmtime(remaining))
                elapsed_fmt = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
                print('[{}/{}, {}/{}], ls_d:{:.2f}, ls_r:{:.2f} ls: {:.2f}, Acc: {:.2f} Perp: {:5.2f} {:.3}it/s {}<{}'
                      .format(epoch+1, args.num_epochs, i+1, total_step,
                              loss_d, loss_r, running_ls / args.log_step,
                              acc_ls / (i+1), np.exp(acc_ls / (i+1)),
                              iters_per_sec, elapsed_fmt, remaining_fmt))
                running_ls = 0

            if global_step % args.save_step == 0:
                torch.save(decoder.state_dict(), args.ckpt)
                torch.save(discriminator.state_dict(), args.disc)

    # Final save after all epochs complete.
    torch.save(decoder.state_dict(), args.save)
    torch.save(discriminator.state_dict(), args.disc)
def main(args):
    """Train a video-caption decoder on MSVD clips.

    Frame features from the (frozen-in-practice) CNN are mean-pooled into
    a single vector per clip before being fed to the RNN decoder. The
    decoder is checkpointed every 25 clips.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer.
    # NOTE(review): encoder.linear is listed here, but the features are
    # detached below, so no gradient ever reaches the encoder — in effect
    # only the decoder trains. Confirm this is intended.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    obj = data_loader.MsvdDataset()
    datas = obj.getAll()
    #print(len(datas))
    os.chdir(r'E:/jupyterNotebook/our_project/')

    # BUGFIX: the checkpoint paths were plain strings with backslashes
    # ('E:\jupyterNotebook\...'); '\j'/'\o'/'\m' only work because they are
    # not recognized escape sequences (DeprecationWarning today, SyntaxError
    # in a future Python). A raw string yields the identical path bytes.
    save_dir = r'E:\jupyterNotebook\our_project\models'

    # Train the models
    total_step = len(datas)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(datas):
            #print(epoch,i,images.shape)
            # Set mini-batch dataset
            images = images.to(device)

            # Encode all frames, then mean-pool into one clip feature.
            features = encoder(images)
            features = features.cpu().detach().numpy()
            features = features.mean(axis=0)
            features = torch.from_numpy(features).view(1, -1).to(device)
            #print(features.shape)

            # Only the first caption per clip is used for now.
            for j in range(1):
            #for j in range(len(captions)):
                captions[j] = captions[j].long()
                captions[j] = captions[j].view(1, -1).to(device)
                targets = pack_padded_sequence(captions[j], lengths[j],
                                               batch_first=True)[0]
                outputs = decoder(features, captions[j], lengths[j])
                loss = criterion(outputs, targets)
                decoder.zero_grad()
                #encoder.zero_grad()
                loss.backward()
                optimizer.step()

            print(
                'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                .format(epoch + 1, args.num_epochs, i, total_step,
                        loss.item(), np.exp(loss.item())))

            if (i + 1) % 25 == 0:  #args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(save_dir,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(save_dir,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main(args):
    """Train an image-captioning model and log per-epoch metrics.

    Fix: the per-epoch log-file block wrote `acc`, `b1`..`b4`, which are
    only defined by the (commented-out) `evaluate(...)` call — that was a
    guaranteed NameError at the end of the first epoch.  Metric lines are
    now written only when the validation metrics were actually collected.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    # Validation transform: no flip augmentation
    transform_val = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loaders (train shuffled, val not)
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    val_loader = get_loader(args.val_dir, args.val_caption_path, vocab,
                            transform_val, args.batch_size, shuffle=False,
                            num_workers=args.num_workers)

    # Build the models; lower CNN layers stay frozen
    encoder = EncoderCNN(args.embed_size).to(device)
    encoder.freeze_bottom()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)
    # decoder = BahdanauAttnDecoderRNN(args.hidden_size, args.embed_size, len(vocab)).to(device)

    # Loss and optimizer (decoder + the unfrozen encoder head)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    accs, b1s, b2s, b3s, b4s = [], [], [], [], []
    for epoch in range(args.num_epochs):
        decoder.train()
        encoder.train()
        losses = []
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            losses.append(loss.item())
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch + 1, args.num_epochs, i, total_step,
                            loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))

        # Re-enable to collect validation accuracy/BLEU per epoch:
        # acc, b1, b2, b3, b4 = evaluate(val_loader, encoder, decoder, vocab)
        # accs.append(acc)
        # b1s.append(b1)
        # b2s.append(b2)
        # b3s.append(b3)
        # b4s.append(b4)

        avg_loss = sum(losses) / total_step
        print('Epoch {} Average Training Loss: {:.4f}'.format(
            epoch + 1, avg_loss))
        with open('stem_freeze_freq1000.txt', 'a') as file:
            file.write("Epoch {} \n".format(epoch + 1))
            file.write('Average Loss: {} \n'.format(avg_loss))
            # Metric lines only when evaluate() above was run this epoch
            # (previously this referenced undefined acc/b1..b4 -> NameError)
            if accs:
                file.write('Average Accuracy: {} \n'.format(accs[-1]))
                file.write('Average BLEU gram1: {} \n'.format(b1s[-1]))
                file.write('Average BLEU gram2: {} \n'.format(b2s[-1]))
                file.write('Average BLEU gram3: {} \n'.format(b3s[-1]))
                file.write('Average BLEU gram4: {} \n'.format(b4s[-1]))
            file.write('\n')

    # Plot whatever metrics were collected (empty series plot as nothing)
    plt.title("Accuracy vs BLEU score")
    plt.plot(np.arange(1, args.num_epochs + 1), accs, label='accuracy')
    plt.plot(np.arange(1, args.num_epochs + 1), b1s, label='BLEU 1')
    plt.plot(np.arange(1, args.num_epochs + 1), b2s, label='BLEU 2')
    plt.plot(np.arange(1, args.num_epochs + 1), b3s, label='BLEU 3')
    plt.plot(np.arange(1, args.num_epochs + 1), b4s, label='BLEU 4')
    plt.xlabel("epochs")
    plt.xticks(np.arange(1, args.num_epochs + 1))
    plt.legend(loc='upper left')
    plt.savefig('accuracy_BLEU.png')
    plt.clf()
def main(args):
    """Train an image-captioning CNN encoder + RNN decoder.

    NOTE(review): written against pre-0.4 PyTorch — `to_var` with
    `volatile=True`, `loss.data[0]`, and `.pkl` checkpoints.  `loss.data[0]`
    raises on PyTorch >= 0.5 (use `loss.item()` there).
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer: only the decoder and the encoder's new head
    # (linear + bn) are trained; the pretrained CNN body is left alone.
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset (volatile=True: no grads through images)
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            # Flatten padded captions into the packed target sequence
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, args.num_epochs, i, total_step,
                        loss.data[0], np.exp(loss.data[0])))

            # Save the models
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
def main(args): if not os.path.exists(args.model_path): os.makedirs(args.model_path) transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)) ]) with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) data_loader = get_loader(args.image_dir, args.caption_path, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) encoder = EncoderCNN(args.embed_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (images, captions, lengths) in enumerate(data_loader): images = images.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main(args):
    """Train a captioning model with tensorboard_logger and checkpoint resume.

    `args` is a dict here (not an argparse Namespace).  On resume
    (`args['pretrained']`), batches already consumed in the interrupted
    epoch are skipped via the `omit` flag before training continues.
    """
    # Point tensorboard_logger at the experiment's log directory
    configure(os.path.join(args['exp_dir'], 'log_dir'))

    transform = transforms.Compose([
        transforms.RandomCrop(args['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    data_loader = get_loader({
        'data_dir': args['data_dir'],
        'exp_dir': args['exp_dir'],
        'raw_data_dir': args['raw_data_dir'],
        'batch_size': args['batch_size'],
        'transform': transform,
        'num_workers': args['num_workers'],
        'shuffle': args['shuffle'],
        'mode': 'train'
    })
    # valid_data_loader=get_loader({'data_dir' : args['data_dir'],
    #                               'raw_data_dir' : args['raw_data_dir'],
    #                               'batch_size' : int(args['batch_size']/4),
    #                               'transform' : transform,
    #                               'num_workers' : args['num_workers'],
    #                               'shuffle' : args['shuffle'],
    #                               'mode':'validate'})

    args['vocab_size'] = len(Vocabulary.load_vocab(args['exp_dir']))

    encoder = EncoderCNN(args).train()
    decoder = DecoderRNN(args).train()

    # Resume: restore weights and the global step/epoch counters; `omit`
    # marks that the partially-finished epoch's batches must be skipped.
    if args['pretrained']:
        checkpoint_path = Checkpoint.get_latest_checkpoint(args['exp_dir'])
        checkpoint = Checkpoint.load(checkpoint_path)
        encoder.load_state_dict(checkpoint.encoder)
        decoder.load_state_dict(checkpoint.decoder)
        step = checkpoint.step
        epoch = checkpoint.epoch
        omit = True
    else:
        step = 0
        epoch = 0
        omit = False

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    # params=list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args['lr'])
    # LR decays x0.1 every 40 epochs.  NOTE(review): scheduler.step() is
    # called before optimizer.step() each epoch — fine on old torch,
    # warned against on >= 1.1.
    scheduler = StepLR(optimizer, step_size=40, gamma=0.1)
    # optimizer=YFOptimizer(params)

    total_step = len(data_loader)
    min_valid_loss = float('inf')
    for epoch in range(epoch, args['num_epochs']):
        scheduler.step()
        for idx, (images, captions, leng) in enumerate(data_loader):
            # Skip batches consumed before the checkpoint was written:
            # (step - total_step * epoch) is the batch offset within the
            # resumed epoch.
            if omit:
                if idx < (step - total_step * epoch):
                    logger.info(
                        'idx:{},step:{}, epoch:{}, total_step:{}, diss:{}'.
                        format(idx, step, epoch, total_step,
                               step - total_step * epoch))
                    continue
                else:
                    omit = False

            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, leng,
                                           batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, leng)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            # Clip decoder gradients to max-norm 5 before stepping
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), 5)
            optimizer.step()

            # Global-step-indexed tensorboard scalar
            log_value('loss', loss.item(), step)
            step += 1

            if step % args['log_step'] == 0:
                logger.info(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args['num_epochs'], idx, total_step,
                            loss.item(), np.exp(loss.item())))

            # Validation is disabled; a checkpoint is written
            # unconditionally every `valid_step` global steps.
            if step % args['valid_step'] == 0:
                # valid_loss=validate(encoder.eval(),decoder,criterion,valid_data_loader)
                # if valid_loss<min_valid_loss:
                #     min_valid_loss=valid_loss
                Checkpoint(encoder, decoder, optimizer, epoch,
                           step).save(args['exp_dir'])
def main(args):
    """Train a captioning model, tracking loss and BLEU on train and val.

    Per-epoch arrays of per-iteration loss/BLEU are accumulated and dumped
    as .npy files after every epoch; a text log is appended in
    `args.model_path/log.txt`.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loaders.  NOTE(review): the val loader reuses the
    # augmenting train transform (RandomCrop + flip) — confirm intended.
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    val_loader = get_loader(args.val_image_dir, args.val_caption_path,
                            vocab, transform, args.batch_size,
                            shuffle=False, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models; epoch counts from 1 here
    total_step = len(data_loader)
    train_loss_arr = []
    val_loss_arr = []
    train_bleu_arr = []
    val_bleu_arr = []
    for epoch in range(1, args.num_epochs + 1, 1):
        iteration_loss = []
        iteration_bleu = []
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            #print(outputs.shape, targets.shape)
            loss = criterion(outputs, targets)
            iteration_loss.append(loss.item())
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            #get BLEU score for corresponding batch
            sampled_ids = decoder.sample(features)
            sampled_ids = sampled_ids.cpu().numpy()
            bleu_score_batch = get_bleu(captions, sampled_ids, vocab)
            iteration_bleu.append(bleu_score_batch)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Bleu: '.format(
                        epoch, args.num_epochs, i, total_step, loss.item())
                    + str(bleu_score_batch))

                f_log = open(os.path.join(args.model_path, "log.txt"), "a+")
                f_log.write("Epoch: " + str(epoch) + "/" +
                            str(args.num_epochs) + " Step: " + str(i) + "/" +
                            str(total_step) + " loss: " + str(loss.item()) +
                            " Bleu: " + str(bleu_score_batch) + "\n")
                f_log.close()

            # Save the model checkpoints.  NOTE(review): epoch is already
            # 1-based, so files are named with epoch + 1 (off by one vs
            # the printed epoch) — confirm intended.
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))

        train_loss_arr.append(np.array(iteration_loss))
        train_bleu_arr.append(np.array(iteration_bleu))

        # Validation pass (no optimizer updates; loss is detached via .item())
        val_loss = 0
        val_steps = 0
        val_iteration_loss = []
        val_iteration_bleu = []
        for j, (images_val, captions_val,
                lengths_val) in enumerate(val_loader):
            # Set mini-batch dataset
            images_val = images_val.to(device)
            captions_val = captions_val.to(device)
            targets = pack_padded_sequence(captions_val, lengths_val,
                                           batch_first=True)[0]

            # Forward only
            features = encoder(images_val)
            outputs = decoder(features, captions_val, lengths_val)
            #print(outputs.shape, targets.shape)
            loss = criterion(outputs, targets).item()
            val_loss += loss
            val_iteration_loss.append(loss)
            val_steps += 1

            #get BLEU score for corresponding batch
            sampled_ids = decoder.sample(features)
            sampled_ids = sampled_ids.cpu().numpy()
            bleu_score_batch = get_bleu(captions_val, sampled_ids, vocab)
            val_iteration_bleu.append(bleu_score_batch)

        val_loss /= val_steps
        # The printed/logged BLEU is the last batch's, not the epoch mean
        print('Epoch [{}/{}], Val Loss: {:.4f}, Bleu: '.format(
            epoch, args.num_epochs, val_loss) + str(bleu_score_batch))

        f_log = open(os.path.join(args.model_path, "log.txt"), "a+")
        f_log.write("Epoch: " + str(epoch) + "/" + str(args.num_epochs) +
                    " val loss: " + str(val_loss) + " Bleu: " +
                    str(bleu_score_batch) + "\n\n")
        f_log.close()

        val_loss_arr.append(np.array(val_iteration_loss))
        val_bleu_arr.append(np.array(val_iteration_bleu))

        # Overwrite the metric dumps after every epoch
        np.save(os.path.join(args.model_path, "train_loss.npy"),
                np.array(train_loss_arr))
        np.save(os.path.join(args.model_path, "val_loss.npy"),
                np.array(val_loss_arr))
        np.save(os.path.join(args.model_path, "train_bleu.npy"),
                np.array(train_bleu_arr))
        np.save(os.path.join(args.model_path, "val_bleu.npy"),
                np.array(val_bleu_arr))
def main(args):
    """Train a captioning model with optional Crayon (tensorboard) logging.

    NOTE(review): legacy pre-0.4 PyTorch (`to_var`, `volatile`,
    `loss.data[0]`, `transforms.Scale`); the evaluation branch below is
    dead code behind `if 1 == 2`.
    """
    #setup tensorboard
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        #if args.name in cc.get_experiment_names():
        try:
            cc.remove_experiment(args.name)
        except:
            # experiment did not exist on the Crayon server; nothing to remove
            print("experiment didnt exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory and persist the run's hyper-parameters
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write((json.dumps(vars(args))))

    # Image preprocessing (Scale is the deprecated name of Resize)
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose(
        [transforms.ToPILImage(),
         transforms.Scale(20),
         transforms.ToTensor()])

    # Load vocabulary wrapper, or build and persist one from the images
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open((full_model_path + "/vocab.pkl"), 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loader plus a train/val split of the same dataset
    data_loader = get_loader(args.image_dir, vocab, transform,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir, vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds,
                                               collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    #params = list(decoder.parameters()) #+ list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()
            # Set mini-batch dataset
            image_ts = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            count = images.size()[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Token-level accuracy over the packed targets
            total = targets.size(0)
            max_index = outputs.max(dim=1)[1]
            #correct = (max_index == targets).sum()
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity",
                                           np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       accuracy, np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            # Dead code: `1 == 2` disables this periodic evaluation branch
            if 1 == 2 and i % int(train_size / 10) == 0:
                encoder.eval()
                #decoder.eval()
                correct = 0
                for ti, (timages, tcaptions,
                         tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions, tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    #correct = (ttargets.eq(toutputs[0].long())).sum()
                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % (accuracy))
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))

    end_time = time.time()
    # NOTE(review): prints the format string and the list as two separate
    # print args — the %d is never substituted; likely meant `%` formatting.
    print("finished training, runtime: %d", [(end_time - start_time)])
def main(args): #create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) #image preprocessing and normalzation stuff(define transforms) transform = transforms.Compose([ transforms.RandomCrop(args.crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ]) #load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) #build data-loader data_loader = get_loader(args.image_dir, args.caption, vocab, transform, args.batch_size, shuffle=True, num_workers=args.num_workers) #build the model encoder = EncoderCNN(args.embed_size).to(device) decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size, args.num_layers).to(device) #define loss and optimizer function criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.linear.parameters()) + list(encoder.batch_norm.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) #train the model total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (images, captions, lengths) in enumerate(data_loader): images = images.to(device) captions = captions.to(device) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] #forward and backprop features = encoder(images) outputs = decoder(features, captions, lengths) loss = criterion(outputs, targets) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() #print log info if i % args.log_step == 0: print( 'Epoch {}/{} , Step {}/{} , Loss {:.4f} , Perplexity{:5.4f}' .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) #save the model if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main(args): # Create model directory if not os.path.exists(args.model_path): os.makedirs(args.model_path) # Load vocabulary wrapper with open(args.vocab_path, 'rb') as f: vocab = pickle.load(f) # Build data loader data_loader = get_caption_loader(args.caption_path, vocab, 75, args.batch_size, shuffle=True, num_workers=args.num_workers) # Build the models encoder = EncoderRNN(len(vocab), args.embed_size, args.hidden_size).to(device) decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device) # Loss and optimizer criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list( encoder.embedding.parameters()) + list(encoder.rnn.parameters()) optimizer = torch.optim.Adam(params, lr=args.learning_rate) # Train the models total_step = len(data_loader) for epoch in range(args.num_epochs): for i, (captions_src, captions_tgt, lengths) in enumerate(data_loader): # Set mini-batch dataset captions_src = captions_src.to(device) captions_tgt = captions_tgt.to(device) targets = pack_padded_sequence(captions_tgt, lengths, batch_first=True)[0] # Forward, backward and optimize enc_output, enc_hidden = encoder(captions_src) outputs = decoder(enc_hidden[:, -1:, :], captions_tgt, lengths) loss = criterion(outputs, targets) decoder.zero_grad() encoder.zero_grad() loss.backward() optimizer.step() # Print log info if i % args.log_step == 0: print( 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}' .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) # Save the model checkpoints if (i + 1) % args.save_step == 0: torch.save( decoder.state_dict(), os.path.join(args.model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1))) torch.save( encoder.state_dict(), os.path.join(args.model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main():
    """Train a captioning model driven by a Config object.

    Fix: the original `if torch.cuda.is_available()` was missing its
    colon — a SyntaxError that made the whole module unimportable.
    The rest of the body is unchanged (legacy pre-0.4 PyTorch idioms:
    `Variable`, `loss.data[0]`, `.pkl` checkpoints).
    """
    # Configuration for hyper-parameters
    config = Config()

    # Create model directory
    if not os.path.exists(config.model_path):
        os.makedirs(config.model_path)

    # Image preprocessing
    transform = config.train_transform

    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader (MSCOCO train2014)
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path, json_path, vocab, transform,
                                   config.batch_size, shuffle=True,
                                   num_workers=config.num_threads)
    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)
    # FIX: colon was missing here in the original (SyntaxError)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer (decoder + the encoder's replaced fc head)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)

    # Train the Models
    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths) in enumerate(train_loader):
            # Set mini-batch dataset
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % config.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, config.num_epochs, i, total_step,
                        loss.data[0], np.exp(loss.data[0])))

            # Save the Model
            if (i+1) % config.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(config.model_path,
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(),
                           os.path.join(config.model_path,
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
# Randomly sample a caption length, and sample indices with that length. indices = data_loader.dataset.get_train_indices() # Create and assign a batch sampler to retrieve a batch with the sampled indices. new_sampler = data.sampler.SubsetRandomSampler(indices=indices) data_loader.batch_sampler.sampler = new_sampler # Obtain the batch. images, captions = next(iter(data_loader)) # Move batch of images and captions to GPU if CUDA is available. images = images.to(device) captions = captions.to(device) # Zero the gradients. decoder.zero_grad() encoder.zero_grad() # Pass the inputs through the CNN-RNN model. features = encoder(images) outputs = decoder(features, captions) # Calculate the batch loss. loss = criterion(outputs.view(-1, vocab_size), captions.view(-1)) # Backward pass. loss.backward() # Update the parameters in the optimizer. optimizer.step()
class Worker:
    """Heterogeneous (CPU+GPU) distributed trainer for image captioning.

    Ranks are split into CPU workers and GPU workers via a JIT-compiled
    MPI/NCCL extension; each kind gets its own batch size, and decoder
    gradients are weighted by batch-size share before all-reduce.
    """

    def __init__(self, args):
        # Initialize MPI/NCCL and set topology variables
        self.init_dist(args.gpu_only)
        self.rank = self.dist.get_rank()
        self.world_size = self.dist.get_world_size()
        self.local_rank = self.dist.get_local_rank()
        self.local_size = self.dist.get_local_size()
        self.n_gpus = self.dist.get_n_gpus()
        # NOTE(review): true division — n_nodes is a float, which then
        # propagates into n_cpu_workers; confirm integer division wasn't meant.
        self.n_nodes = self.world_size / self.local_size
        self.node = self.rank // self.local_size
        self.n_cpu_workers = (self.local_size - self.n_gpus) * self.n_nodes
        self.n_gpu_workers = self.n_gpus * self.n_nodes

        # Set RNG seed for reproducibility, can be left on
        torch.manual_seed(1234)
        # CuDNN reproducibility
        if args.reproducible:
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

        # Set number of threads (the actual call is disabled; log only)
        if self.dist.is_cpu_rank():
            #torch.set_num_threads(args.num_threads)
            print("[Rank {}] Setting number of OMP threads to {}".format(
                self.rank, args.num_threads), flush=True)

        # Calculate batch sizes: CPU ranks take a fixed share, GPU ranks
        # split the remainder evenly (must divide exactly).
        self.total_batch_size = args.batch_size
        self.cpu_batch_size = args.cpu_batch_size
        assert ((self.total_batch_size - self.cpu_batch_size * self.n_cpu_workers * self.n_nodes) \
            % (self.n_gpus * self.n_nodes) == 0), "GPU batch size is not an integer"
        self.gpu_batch_size = int((self.total_batch_size - self.cpu_batch_size * self.n_cpu_workers * self.n_nodes) \
            / (self.n_gpus * self.n_nodes))
        self.batch_size = self.cpu_batch_size if self.dist.is_cpu_rank(
        ) else self.gpu_batch_size

        print("[Rank {}] Current CUDA device: {}".format(
            self.rank, torch.cuda.current_device()), flush=True)

    def init_dist(self, gpu_only):
        """JIT-compile the dist.cu extension and create the DistManager.

        NOTE(review): include/library paths are hard-coded to one cluster.
        """
        # C++ extension module with JIT compilation
        dist_module = load(
            name="dist",
            sources=["dist.cu"],
            verbose=True,
            with_cuda=True,
            extra_cuda_cflags=[
                '-ccbin', 'g++', '-std=c++11', '-O3',
                #'-I/usr/mpi/gcc/openmpi-2.1.2-hfi/include',
                #'-I/usr/mpi/gcc/mvapich2-2.3b-hfi/include',
                '-I/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/include',
                #'-I/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/include64',
                '-I/pylon5/ac7k4vp/jchoi157/pytorch/build/nccl/include'
            ],
            extra_ldflags=[
                '-L/opt/packages/cuda/9.2/lib64', '-lcudart', '-lrt',
                #'-L/usr/mpi/gcc/openmpi-2.1.2-hfi/lib64', '-lmpi',
                #'-L/usr/mpi/gcc/mvapich2-2.3b-hfi/lib', '-lmpi',
                '-L/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/intel64/lib',
                '-lmpi',
                #'-L/opt/intel/compilers_and_libraries_2017.4.196/linux/mpi/lib64', '-lmpi',
                '-L/pylon5/ac7k4vp/jchoi157/pytorch/build/nccl/lib', '-lnccl'
            ],
            build_directory="/home/jchoi157/torch_extensions")
        self.dist = dist_module.DistManager(gpu_only, False)

    def average_gradients(self):
        """Weighted all-reduce of decoder gradients across all ranks."""
        # Only all-reduce decoder parameters since encoder is pre-trained
        for param in self.decoder.parameters():
            if self.dist.is_cpu_rank():
                # CPU ranks stage grads on GPU 0 for the NCCL all-reduce
                param.grad.data = param.grad.data.cuda(0, non_blocking=True)
                param.grad.data *= (self.cpu_batch_size /
                                    self.total_batch_size)
            else:
                param.grad.data *= (self.gpu_batch_size /
                                    self.total_batch_size)
            self.dist.hetero_allreduce(param.grad.data)
            if self.dist.is_cpu_rank():
                param.grad.data = param.grad.data.cpu()

    def train(self, args):
        """Run the distributed training loop on this rank's partition."""
        # Create model directory
        if not os.path.exists(args.model_path):
            os.makedirs(args.model_path)

        # Image preprocessing, normalization for the pretrained resnet
        transform = transforms.Compose([
            transforms.RandomCrop(args.crop_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))
        ])

        # Load vocabulary wrapper
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        # Build data loader partitioned by rank topology
        data_loader = get_loader(
            args.image_dir, args.caption_path, vocab, transform, self.rank,
            self.world_size, self.local_size, self.n_gpus,
            self.total_batch_size, self.cpu_batch_size, self.gpu_batch_size,
            self.batch_size,
            shuffle=(False if args.reproducible else True),
            no_partition=args.no_partition)
        self.num_batches = len(data_loader)
        print("[Rank {}] batch size {}, num batches {}".format(
            self.rank,
            self.total_batch_size if args.no_partition else self.batch_size,
            self.num_batches), flush=True)

        # Build the models; GPU ranks move them to their local device
        self.encoder = EncoderCNN(args.embed_size)
        self.decoder = DecoderRNN(args.embed_size, args.hidden_size,
                                  len(vocab), args.num_layers)
        if self.dist.is_gpu_rank():
            self.encoder = self.encoder.cuda(self.local_rank)
            self.decoder = self.decoder.cuda(self.local_rank)

        # Loss and optimizer
        criterion = nn.CrossEntropyLoss()
        params = list(self.decoder.parameters()) + list(
            self.encoder.linear.parameters()) + list(
                self.encoder.bn.parameters())
        optimizer = torch.optim.Adam(params, lr=args.learning_rate)

        # Train the models, timing each batch and epoch
        total_step = len(data_loader)
        for epoch in range(args.num_epochs):
            epoch_start_time = time.time()
            batch_time_sum = 0
            batch_time_total = 0
            processed_batches = 0
            processed_batches_total = 0
            batch_start_time = time.time()
            for i, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                if self.dist.is_gpu_rank():
                    images = images.cuda(self.local_rank)
                    captions = captions.cuda(self.local_rank)
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]

                # Forward, backward, all-reduce and optimize
                features = self.encoder(images)
                outputs = self.decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                self.decoder.zero_grad()
                self.encoder.zero_grad()
                loss.backward()
                if not args.no_partition:
                    self.average_gradients()
                optimizer.step()

                batch_time = time.time() - batch_start_time
                batch_time_sum += batch_time
                batch_time_total += batch_time
                processed_batches += 1
                processed_batches_total += 1
                saved_loss = loss.item()

                # Print log info (window-averaged batch time)
                if i % args.log_step == 0 and i != 0:
                    print(
                        'Rank [{}], Epoch [{}/{}], Step [{}/{}], Average time: {:.6f}, Loss: {:.4f}, Perplexity: {:5.4f}'
                        .format(self.rank, epoch, args.num_epochs, i,
                                total_step,
                                batch_time_sum / processed_batches,
                                saved_loss, np.exp(saved_loss)),
                        flush=True)
                    batch_time_sum = 0
                    processed_batches = 0

                # Save the model checkpoints
                if (i + 1) % args.save_step == 0:
                    torch.save(
                        self.decoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                    torch.save(
                        self.encoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                batch_start_time = time.time()

            epoch_time = time.time() - epoch_start_time
            print(
                '!!! Rank [{}], Epoch [{}], Time: {:.6f}, Average batch time: {:.6f}, Loss: {:.4f}, Perplexity: {:5.4f}'
                .format(self.rank, epoch, epoch_time,
                        batch_time_total / processed_batches_total,
                        saved_loss, np.exp(saved_loss)),
                flush=True)
def main(args):
    """Train an image-captioning model (CNN encoder + RNN decoder).

    Builds the preprocessing pipeline, vocabulary, data loader, and models
    from ``args``, runs the training loop, logs progress every
    ``args.log_step`` batches, and writes checkpoints under
    ``args.model_path``.
    """
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing; normalization constants are the ImageNet
    # statistics expected by the pretrained ResNet encoder.
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer; only the decoder plus the encoder's new
    # linear/bn layers are trained -- the CNN backbone stays frozen.
    criterion = nn.CrossEntropyLoss()
    params = (list(decoder.parameters())
              + list(encoder.linear.parameters())
              + list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            # Packed targets drop the padding positions from the loss.
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch + 1, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints every args.save_step batches.
            # FIX: the original tested ``epoch % args.save_step == 0``
            # inside the batch loop (Python 2 ``print`` syntax too), which
            # re-saved on *every* batch of a matching epoch; use the
            # step-based condition the sibling trainers in this file use.
            if (i + 1) % args.save_step == 0:
                print('Save models\n')
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path,
                    'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
def main(args):
    """Train a CNN encoder + RNN decoder to map images to transform programs.

    Each example produced by ``generate_training_data`` pairs an image with
    a token sequence.  Sequences are bracketed with ``<start>``/``<end>``,
    right-padded to the batch maximum with ``'pad'``, converted to ids, and
    trained against with cross-entropy.  Loss/perplexity curves are saved
    as a plot at the end.

    NOTE(review): ``vocab`` is referenced but never defined in this
    function -- presumably a module-level global; verify before running.
    """
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Build the models; the encoder can be sequential or convolutional.
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer (every encoder/decoder parameter is trained).
    # Removed unused ``criterion2 = nn.NLLLoss()`` and ``softmax`` locals.
    criterion1 = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    total_training_steps = args.num_iters
    losses = []
    perplexity = []
    for epoch in range(args.num_epochs):
        for i in range(total_training_steps):
            prog_data = generate_training_data(args.batch_size)
            images = [im[0] for im in prog_data]
            # Renamed from ``transforms``: the original local shadowed the
            # torchvision ``transforms`` module name used in this file.
            token_seqs = [pair[1] for pair in prog_data]

            # Bracket every sequence with start/end tokens (plain loop
            # instead of side-effect list comprehensions).
            for seq in token_seqs:
                seq.insert(0, '<start>')
                seq.append('<end>')

            lengths = [len(seq) for seq in token_seqs]
            maximum_len = max(lengths)
            # Right-pad every sequence to the batch maximum.
            for seq in token_seqs:
                if len(seq) != maximum_len:
                    seq.extend(['pad'] * (maximum_len - len(seq)))
            padded_lengths = [len(seq) for seq in token_seqs]

            token_seqs = [[word_to_int(word) for word in seq]
                          for seq in token_seqs]
            token_seqs = torch.tensor(token_seqs, device=device)
            images = torch.tensor(images, device=device)
            images = images.unsqueeze(1)  # channel dim for EncoderCNN input
            padded_lengths = torch.tensor(padded_lengths, device=device)
            # Packed targets drop the padding positions from the loss.
            targets = pack_padded_sequence(token_seqs, padded_lengths,
                                           batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, token_seqs, padded_lengths)
            loss = criterion1(outputs, targets)
            losses.append(loss.item())
            perplexity.append(np.exp(loss.item()))
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f},Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i,
                              total_training_steps, loss.item(),
                              np.exp(loss.item())))
            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(),
                           os.path.join(args.model_path,
                                        'decoder-{}-{}.ckpt'.format(
                                            epoch + 1, i + 1)))
                torch.save(encoder.state_dict(),
                           os.path.join(args.model_path,
                                        'encoder-{}-{}.ckpt'.format(
                                            epoch + 1, i + 1)))

    # Plot loss and perplexity over all recorded iterations.
    y = losses
    z = perplexity
    x = np.arange(len(losses))
    plt.plot(x, y, label='Cross Entropy Loss')
    plt.plot(x, z, label='Perplexity')
    plt.xlabel('Iterations')
    plt.ylabel('Cross Entropy Loss and Perplexity')
    plt.title("Cross Entropy Loss and Model Perplexity During Training")
    plt.legend()
    plt.savefig('plots/plots_cnn/cnn4_gpu', dpi=100)
# --- One training step on a single randomly-sampled batch ---
# NOTE(review): this fragment runs at script level and relies on
# ``data_loader``, ``encoder``, ``decoder``, ``criterion``, ``optimizer``,
# ``vocab_size`` and ``device`` being defined earlier in the file.

# Randomly sample a caption length, and sample indices with that length.
indices = data_loader.dataset.get_train_indices()

# Create and assign a batch sampler to retrieve a batch with the sampled indices.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler

# Obtain the batch.
images, captions = next(iter(data_loader))
# print(images.shape)

# Move batch of images and captions to GPU if CUDA is available.
images = images.to(device)
captions = captions.to(device)

# Zero the gradients.
decoder.zero_grad()
encoder.zero_grad()

# Pass the inputs through the CNN-RNN model.
features = encoder(images)
outputs = decoder(features, captions)

# Calculate the batch loss: flatten the (batch, seq, vocab) logits against
# the flattened caption token ids.
loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

# Backward pass.
loss.backward()

# Update the parameters in the optimizer.
optimizer.step()
def main(args):
    """Train the CNN-encoder / RNN-decoder captioning pair described by ``args``.

    Sets up the preprocessing pipeline, vocabulary, and data loader, then
    optimizes the decoder plus the encoder's linear/bn layers with Adam,
    logging every ``args.log_step`` batches and writing checkpoints to
    ``args.model_path`` every ``args.save_step`` batches.
    """
    # Make sure the checkpoint directory exists.
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Augmentation + ImageNet normalization for the pretrained ResNet.
    preprocess = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Vocabulary wrapper pickled at preprocessing time.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    loader = get_loader(args.image_dir, args.caption_path, vocab, preprocess,
                        args.batch_size, shuffle=True,
                        num_workers=args.num_workers)

    # Models placed on the training device.
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Only the decoder and the encoder's fresh linear/bn layers are updated.
    loss_fn = nn.CrossEntropyLoss()
    trainable = (list(decoder.parameters())
                 + list(encoder.linear.parameters())
                 + list(encoder.bn.parameters()))
    optimizer = torch.optim.Adam(trainable, lr=args.learning_rate)

    steps_per_epoch = len(loader)
    for epoch in range(args.num_epochs):
        for step, (images, captions, lengths) in enumerate(loader):
            images = images.to(device)
            captions = captions.to(device)
            # Packed targets drop the padding positions from the loss.
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward / backward / update.
            outputs = decoder(encoder(images), captions, lengths)
            loss = loss_fn(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            if step % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, step, steps_per_epoch,
                              loss.item(), np.exp(loss.item())))

            if (step + 1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path,
                    'decoder-{}-{}.ckpt'.format(epoch + 1, step + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path,
                    'encoder-{}-{}.ckpt'.format(epoch + 1, step + 1)))