''' This code runs inference model on CIFAR-10 dataset using mxnet '''
###################################################################################################
import data_load
import os
import mxnet as mx
import logging
import time

logging.getLogger().setLevel(logging.DEBUG)  # logging to stdout

################################### Load data #####################################################
train_data, train_labels = data_load.train_load()
test_data, test_labels = data_load.test_load()

print('train_data shape: ', end='')
print(train_data.shape)
print('train_labels shape: ', end='')
print(train_labels.shape)
print('test_data shape: ', end='')
print(test_data.shape)
print('test_labels shape: ', end='')
print(test_labels.shape)

################################### Prepare data for mxnet ########################################
batch_size = 100
train_iter = mx.io.NDArrayIter(data=train_data, label=train_labels,
                               batch_size=batch_size, shuffle=True)
test_iter = mx.io.NDArrayIter(data=test_data, label=test_labels, batch_size=20)
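################################### Example: feeding the iterators to a model (illustrative) ######
# The script above stops after building `train_iter` and `test_iter`. As a minimal sketch only,
# the small MLP below shows how those iterators could drive MXNet's Module API; the network,
# optimizer settings and epoch count are assumptions for illustration, not this project's model.
data = mx.sym.Variable('data')                        # NDArrayIter's default data name
net = mx.sym.Flatten(data=data)                       # flatten 3x32x32 CIFAR-10 images
net = mx.sym.FullyConnected(data=net, num_hidden=256)
net = mx.sym.Activation(data=net, act_type='relu')
net = mx.sym.FullyConnected(data=net, num_hidden=10)  # 10 CIFAR-10 classes
net = mx.sym.SoftmaxOutput(data=net, name='softmax')  # pairs with the default 'softmax_label'

mod = mx.mod.Module(symbol=net, context=mx.cpu())
mod.fit(train_iter,                                   # iterators built above
        eval_data=test_iter,
        optimizer='sgd',
        optimizer_params={'learning_rate': 0.01},
        eval_metric='acc',
        batch_end_callback=mx.callback.Speedometer(batch_size, 100),
        num_epoch=10)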
import os
import pickle
import random

import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from tqdm import tqdm

# Project helpers (module paths inferred from the comments below); Encoder, Decoder and
# `device` are expected to come from the project's model/setup modules.
from data_load import train_load, val_load
from utils.general_tools import get_train_transform, get_val_trainsform, adjust_lr, coco_metrics
from utils.save_tools import (save_loss, save_generated_captions, save_metrics,
                              save_best_model, save_epoch_model)


def main(args):
    # ==============================
    # Create folders or files for saving
    # ==============================
    if not os.path.exists(args.root_folder):
        os.mkdir(args.root_folder)
    loss_path = args.loss_path
    mertics_path = args.mertics_path
    epoch_model_path = args.epoch_model_path
    best_model_path = args.best_model_path
    generated_captions_path = args.generated_captions_folder_path
    sentences_show_path = args.sentences_show_path

    # Transform the format of images
    # These functions are in utils.general_tools.py
    train_transform = get_train_transform()
    val_transform = get_val_trainsform()

    # Load vocabulary
    print("*** Load Vocabulary ***")
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Create data sets
    # These functions are in data_load.py
    train_data = train_load(root=args.train_image_dir,
                            json=args.train_caption_path,
                            vocab=vocab,
                            transform=train_transform,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=args.num_workers)
    val_data = val_load(root=args.val_image_dir,
                        json=args.val_caption_path,
                        transform=val_transform,
                        batch_size=1,
                        shuffle=False,
                        num_workers=args.num_workers)

    # Build model
    encoder = Encoder(args.hidden_dim, args.fine_tuning).to(device)
    decoder = Decoder(args.embedding_dim, args.hidden_dim, vocab, len(vocab),
                      args.max_seq_length).to(device)

    # Select loss function
    criterion = nn.CrossEntropyLoss().to(device)

    if args.fine_tuning:
        params = list(decoder.parameters()) + list(encoder.parameters())
        optimizer = torch.optim.Adam(params, lr=args.fine_tuning_lr)
    else:
        params = decoder.parameters()
        optimizer = torch.optim.Adam(params, lr=args.fine_tuning_lr)

    # Load pretrained model
    if args.resume:
        checkpoint = torch.load(best_model_path)
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        if not args.fine_tuning:
            optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch'] + 1
        best_score = checkpoint['best_score']
        best_epoch = checkpoint['best_epoch']
    else:
        # Fresh epoch counter and score
        start_epoch = 1
        best_score = 0
        best_epoch = 0

    for epoch in range(start_epoch, 10000):
        print("-" * 20)
        print("epoch:{}".format(epoch))

        # Shrink the learning rate every 4 epochs without a new best score
        if (epoch - best_epoch) > 0 and (epoch - best_epoch) % 4 == 0:
            # This function is in utils.general_tools.py
            adjust_lr(optimizer, args.shrink_factor)

        # Stop after more than 10 epochs without improvement
        if (epoch - best_epoch) > 10:
            print("*** Training complete ***")
            break

        # =============
        # Training
        # =============
        print("*** Training ***")
        decoder.train()
        encoder.train()
        total_step = len(train_data)
        epoch_loss = 0
        for (images, captions, lengths, img_ids) in tqdm(train_data):
            images = images.to(device)
            captions = captions.to(device)
            # Drop the <start> token: shorten lengths by 1 and shift the captions,
            # so the targets are the next tokens the decoder must predict
            lengths = list(np.array(lengths) - 1)
            targets = pack_padded_sequence(captions[:, 1:], lengths, batch_first=True)[0]
            features = encoder(images)
            predictions = decoder(features, captions, lengths)
            predictions = pack_padded_sequence(predictions, lengths, batch_first=True)[0]
            loss = criterion(predictions, targets)
            epoch_loss += loss.item()
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

        # Save loss information
        # This function is in utils.save_tools.py
        save_loss(round(epoch_loss / total_step, 3), epoch, loss_path)

        # =============
        # Evaluating
        # =============
        print("*** Evaluating ***")
        encoder.eval()
        decoder.eval()
        generated_captions = []
        for image, img_id in tqdm(val_data):
            image = image.to(device)
            img_id = img_id[0]
            features = encoder(image)
            sentence = decoder.generate(features)
            sentence = ' '.join(sentence)
            item = {'image_id': int(img_id), 'caption': sentence}
            generated_captions.append(item)
            j = random.randint(1, 100)  # (value unused in this excerpt)

        print('*** Computing metrics ***')

        # Save current generated captions
        # This function is in utils.save_tools.py
        captions_json_path = save_generated_captions(generated_captions, epoch,
                                                     generated_captions_path, args.fine_tuning)

        # Compute metric scores
        # This function is in utils.general_tools.py
        results = coco_metrics(args.val_caption_path, captions_json_path, epoch,
                               sentences_show_path)

        # Save metric results
        # This function is in utils.save_tools.py
        epoch_score = save_metrics(results, epoch, mertics_path)

        # Update the best score
        if best_score < epoch_score:
            best_score = epoch_score
            best_epoch = epoch
            save_best_model(encoder, decoder, optimizer, epoch, best_score, best_epoch,
                            best_model_path)
        print("*** Best score:{} Best epoch:{} ***".format(best_score, best_epoch))

        # Save every epoch model
        save_epoch_model(encoder, decoder, optimizer, epoch, best_score, best_epoch,
                         epoch_model_path, args.fine_tuning)
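# --------------------------------------------------------------------------------------------
# Illustration only (not part of the training pipeline above): a tiny, self-contained demo of
# the target packing used in the training loop, with made-up token ids. It shows why `lengths`
# is shortened by one and the captions are shifted so <start> never appears in the targets.
def _illustrate_target_packing():
    import torch
    from torch.nn.utils.rnn import pack_padded_sequence

    # Two padded toy captions: <start>=1, <end>=2, <pad>=0, word ids 3..6
    captions = torch.tensor([[1, 3, 4, 5, 2],   # true length 5
                             [1, 6, 2, 0, 0]])  # true length 3
    lengths = [5, 3]

    # Drop <start>: the model is scored against the *next* token at every step
    shifted = captions[:, 1:]                   # [[3, 4, 5, 2], [6, 2, 0, 0]]
    lengths = [l - 1 for l in lengths]          # [4, 2]

    targets = pack_padded_sequence(shifted, lengths, batch_first=True)[0]
    print(targets)                              # tensor([3, 6, 4, 2, 5, 2]) -- padding removed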
import os
import pickle

import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

# Project helpers (modules not shown in this excerpt): CNN, RNN, train_load, val_load, utils


def main(args):
    # Hyperparameters
    batch_size = args.batch_size
    num_workers = 1

    # Image preprocessing
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Load the COCO dataset
    TRAIN_IMAGES_PATH = 'data/train2014'
    TRAIN_CAPTION_FILE_PATH = 'data/annotations/captions_train2014.json'
    VOCAB_PATH = 'data/coco/annotations/vocab.pkl'
    with open(VOCAB_PATH, 'rb') as f:
        vocab = pickle.load(f)

    train_loader = train_load(root=TRAIN_IMAGES_PATH,
                              json=TRAIN_CAPTION_FILE_PATH,
                              vocab=vocab,
                              transform=transform,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers)

    VAL_IMAGES_PATH = 'data/val2014'
    VAL_CAPTION_FILE_PATH = 'data/annotations/captions_val2014.json'
    val_loader = val_load(path=VAL_IMAGES_PATH,
                          json=VAL_CAPTION_FILE_PATH,
                          vocab=vocab,
                          transform=transform,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=num_workers)

    losses_val = []
    losses_train = []

    # Build the models
    ngpu = 1
    initial_step = initial_epoch = 0
    embed_size = args.embed_size
    num_hiddens = args.num_hidden
    learning_rate = 1e-3
    num_epochs = 3
    log_step = args.log_step
    save_step = 500
    checkpoint_dir = args.checkpoint_dir

    encoder = CNN(embed_size)
    decoder = RNN(embed_size, num_hiddens, len(vocab), num_layers=1,
                  rec_unit=args.rec_unit)

    # Loss
    criterion = nn.CrossEntropyLoss()

    if args.checkpoint_file:
        encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
            args.checkpoint_file, args.sample)
        initial_step, initial_epoch, losses_train, losses_val = meta
        encoder.load_state_dict(encoder_state_dict)
        decoder.load_state_dict(decoder_state_dict)
    else:
        params = list(decoder.parameters()) + list(
            encoder.linear.parameters()) + list(encoder.batchnorm.parameters())
        optimizer = torch.optim.Adam(params, lr=learning_rate)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    if args.sample:
        return utils.sample(encoder, decoder, vocab, val_loader)

    # Train the models
    total_step = len(train_loader)
    try:
        for epoch in range(initial_epoch, num_epochs):
            for step, (images, captions, lengths) in enumerate(train_loader,
                                                               start=initial_step):
                # Set mini-batch dataset
                images = utils.to_var(images, volatile=True)
                captions = utils.to_var(captions)
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                decoder.zero_grad()
                encoder.zero_grad()

                if ngpu > 1:
                    # run on multiple GPUs
                    features = nn.parallel.data_parallel(encoder, images, range(ngpu))
                    outputs = nn.parallel.data_parallel(decoder, features, range(ngpu))
                else:
                    # run on a single GPU
                    features = encoder(images)
                    outputs = decoder(features, captions, lengths)

                train_loss = criterion(outputs, targets)
                losses_train.append(train_loss.data[0])
                train_loss.backward()
                optimizer.step()

                # Run the validation set and print a sample prediction
                if step % log_step == 0:
                    encoder.batchnorm.eval()

                    # run validation set
                    batch_loss_val = []
                    for val_step, (images, captions, lengths) in enumerate(val_loader):
                        images = utils.to_var(images, volatile=True)
                        captions = utils.to_var(captions, volatile=True)
                        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
                        features = encoder(images)
                        outputs = decoder(features, captions, lengths)
                        val_loss = criterion(outputs, targets)
                        batch_loss_val.append(val_loss.data[0])
                    losses_val.append(np.mean(batch_loss_val))

                    # predict
                    sampled_ids = decoder.sample(features)
                    sampled_ids = sampled_ids.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(sampled_ids, vocab)
                    print('Sample:', sentence)
                    true_ids = captions.cpu().data.numpy()[0]
                    sentence = utils.convert_back_to_text(true_ids, vocab)
                    print('Target:', sentence)

                    print('Epoch: {} - Step: {} - Train Loss: {} - Eval Loss: {}'
                          .format(epoch, step, losses_train[-1], losses_val[-1]))

                    encoder.batchnorm.train()

                # Save the models
                if (step + 1) % save_step == 0:
                    utils.save_models(encoder, decoder, optimizer, step, epoch,
                                      losses_train, losses_val, checkpoint_dir)
                    utils.dump_losses(losses_train, losses_val,
                                      os.path.join(checkpoint_dir, 'losses.pkl'))
    except KeyboardInterrupt:
        pass
    finally:
        # Do a final save
        utils.save_models(encoder, decoder, optimizer, step, epoch,
                          losses_train, losses_val, checkpoint_dir)
        utils.dump_losses(losses_train, losses_val,
                          os.path.join(checkpoint_dir, 'losses.pkl'))
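# --------------------------------------------------------------------------------------------
# The training loop above uses the pre-0.4 PyTorch Variable API (`utils.to_var(..., volatile=True)`
# and `loss.data[0]`), which fails or warns on PyTorch >= 0.4. The helper below is an illustrative
# sketch of the modern equivalents only; its name and structure are not part of this project.
def validation_loss(encoder, decoder, criterion, val_loader, device):
    """Mean validation loss without building autograd graphs (replaces volatile=True)."""
    import torch
    from torch.nn.utils.rnn import pack_padded_sequence

    encoder.eval()
    decoder.eval()
    losses = []
    with torch.no_grad():                        # replaces wrapping inputs with volatile=True
        for images, captions, lengths in val_loader:
            images = images.to(device)           # replaces utils.to_var(images)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            outputs = decoder(encoder(images), captions, lengths)
            losses.append(criterion(outputs, targets).item())  # replaces val_loss.data[0]
    encoder.train()
    decoder.train()
    return sum(losses) / max(len(losses), 1)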