# Common imports assumed by the snippets in this section; the model classes
# (EncoderCNN, DecoderRNN, DecoderRNNwithAttention, ...) and dataset helpers
# come from the project's own modules.
import argparse
import io
import os
import pickle

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pack_padded_sequence
from tqdm import tqdm


def do(args: argparse.Namespace):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu :', args.gpu)

    # preprocess
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])

    # vocab
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # model
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size,
                         args.num_layers).cuda()
    model_state = torch.load(args.checkpoint_path)
    encoder.load_state_dict(model_state['encoder'])
    decoder.load_state_dict(model_state['decoder'])
    print('loaded successfully at\tepoch:%d\tstep:%d'
          % (model_state['epoch'], model_state['step']))
    encoder.eval()
    decoder.eval()

    # image
    img = load_image(args.img_path, preprocess).cuda()
    outs = decoder.sample(encoder(img))
    outs = outs.cpu().numpy()
    print(outs)

    # caption: map sampled word ids back to words, stopping at '<end>'
    caption = []
    for word_id in outs:
        word = vocab.idx2word[word_id]
        caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(caption)
    print(sentence)
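# `load_image` is referenced above but not defined in this excerpt. A minimal
# sketch of what it presumably does, matching the call site (path plus
# transform, returning a single-image batch); this body is an illustration,
# not the project's actual helper:
from PIL import Image

def load_image(image_path, transform=None):
    image = Image.open(image_path).convert('RGB')
    if transform is not None:
        image = transform(image)
    return image.unsqueeze(0)  # add a batch dimension: [1, 3, H, W]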
def train(n_epochs, train_loader, valid_loader, save_location_path,
          embed_size, hidden_size, vocab_size):
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move to GPU, if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    params = list(decoder.parameters()) + list(encoder.embed.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # Track the best validation loss seen so far; the model is saved
    # whenever the current validation loss improves on it.
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs + 1):
        # Keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        encoder.train()
        decoder.train()
        for data in train_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            # .to(device) is not in-place; the result must be assigned back
            images = images.to(device)
            captions = captions.to(device)

            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions)
            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * images.size(0)

        encoder.eval()
        decoder.eval()
        with torch.no_grad():  # no gradients needed during validation
            for data in valid_loader:
                images, captions = data['image'], data['caption']
                images = images.type(torch.FloatTensor)
                images = images.to(device)
                captions = captions.to(device)

                features = encoder(images)
                outputs = decoder(features, captions)
                loss = criterion(outputs.contiguous().view(-1, vocab_size),
                                 captions.view(-1))
                valid_loss += loss.item() * images.size(0)

        # Average losses: the sums above are weighted by batch size,
        # so divide by the number of samples, not the number of batches
        train_loss = train_loss / len(train_loader.dataset)
        valid_loss = valid_loss / len(valid_loader.dataset)

        print(f"Epoch: {epoch} \tTraining Loss: {train_loss} "
              f"\tValidation Loss: {valid_loss}")

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(f"Validation loss decreased "
                  f"({valid_loss_min} --> {valid_loss}). Saving model ...")
            torch.save(encoder.state_dict(),
                       save_location_path + f'/encoder{n_epochs}.pt')
            torch.save(decoder.state_dict(),
                       save_location_path + f'/decoder{n_epochs}.pt')
            valid_loss_min = valid_loss
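# Reloading the checkpoints written by train() is the mirror image of the
# torch.save calls above. A sketch, assuming embed_size, hidden_size,
# vocab_size, save_location_path, and n_epochs match the values that were
# passed to train():
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
encoder.load_state_dict(torch.load(f'{save_location_path}/encoder{n_epochs}.pt'))
decoder.load_state_dict(torch.load(f'{save_location_path}/decoder{n_epochs}.pt'))
encoder.eval()
decoder.eval()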
    shuffle=False,
    num_workers=2,
)

# device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EncoderCNN().to(device)

#########################################################################
#
#  QUESTION 1.2 Extracting image features
#
#########################################################################
features = []

# TODO loop through all image data, extracting features and saving them
# no gradients needed
with torch.no_grad():
    model.eval()
    for data in tqdm(train_loader):
        data = data.to(device)
        features.append(model(data))

# convert features to a single PyTorch Tensor before saving;
# to check your results, features should be dimensions [len(train_set), 2048]
features = torch.cat(features).squeeze()
print(features.shape)

# save features
torch.save(features, "features.pt")
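# The saved tensor can later be loaded back without re-running the encoder.
# A short sketch of the round trip, assuming `train_set` is the dataset
# behind `train_loader`:
features = torch.load("features.pt")
assert features.shape == (len(train_set), 2048)  # matches the expected dimensions above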
def main():
    cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = get_parser().parse_args()

    NUM_WORKERS = 2
    CROP_SIZE = 256
    NUM_PIXELS = 64
    ENCODER_SIZE = 2048
    ALPHA = 1.  # attention regularization parameter
    learning_rate = args.lr
    start_epoch = 0
    max_BLEU = 0

    with open('vocab.p', 'rb') as f:
        vocab = pickle.load(f)

    train_transform = transforms.Compose([
        transforms.RandomCrop(CROP_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.444, 0.421, 0.385),
                             (0.285, 0.277, 0.286))])
    val_transform = transforms.Compose([
        transforms.CenterCrop(CROP_SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.444, 0.421, 0.385),
                             (0.285, 0.277, 0.286))])

    train_loader = torch.utils.data.DataLoader(
        dataset=Custom_Flickr30k('flickr30k-images/flickr30k-images',
                                 'flickr30k-captions/results_20130124.token',
                                 vocab, transform=train_transform, train=True),
        batch_size=args.batch_size, shuffle=True, num_workers=NUM_WORKERS,
        pin_memory=True, collate_fn=collate_fn)
    val_loader = torch.utils.data.DataLoader(
        dataset=Custom_Flickr30k('flickr30k-images/flickr30k-images',
                                 'flickr30k-captions/results_20130124.token',
                                 vocab, transform=val_transform, train=False),
        batch_size=args.batch_size, shuffle=False, num_workers=NUM_WORKERS,
        pin_memory=True, collate_fn=collate_fn)

    # Initialize models
    encoder = EncoderCNN().to(device)
    decoder = DecoderRNNwithAttention(vocab, args.hid_size, 1, args.attn_size,
                                      ENCODER_SIZE, NUM_PIXELS,
                                      dropout=args.drop).to(device)

    # Initialize optimization; only the decoder is trained in the first phase
    criterion = torch.nn.CrossEntropyLoss()
    # decoder.embed.weight.requires_grad = False
    params = list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            max_BLEU = checkpoint['max_BLEU']
            encoder.load_state_dict(checkpoint['encoder'])
            decoder.load_state_dict(checkpoint['decoder'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    XEntropy = AverageMeter()
    PPL = AverageMeter()

    # Write the CSV header for the results log
    if not args.resume:
        with open(f'{args.save}/results.txt', 'a') as f:
            f.write('Loss,PPL,BLEU \n')

    # Phase 1: decoder-only training
    for epoch in range(start_epoch, 30):
        print('Epoch {}'.format(epoch + 1))
        print('training...')
        for i, (images, captions, lengths) in enumerate(train_loader):
            if i % 10 == 0:
                print('[{}/{}]'.format(i, len(train_loader)))
                print(PPL.avg)

            # Batch to device; lengths stay on the CPU, where
            # pack_padded_sequence expects them
            images = images.to(device)
            captions = captions.to(device)

            encoder.train()
            decoder.train()
            features = encoder(images)
            predictions, attention_weights = decoder(features, captions, lengths)

            # Drop the <start>/<end> positions so predictions and targets align
            scores = pack_padded_sequence(predictions[:, :-1, :], lengths - 2,
                                          batch_first=True)
            targets = pack_padded_sequence(captions[:, 1:-1], lengths - 2,
                                           batch_first=True)
            loss = criterion(scores.data, targets.data)
            # doubly stochastic attention regularization
            loss += ALPHA * ((1. - attention_weights.sum(dim=1)) ** 2).mean()

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            XEntropy.update(loss.item(), len(lengths))
            PPL.update(np.exp(loss.item()), len(lengths))

        print('Train Perplexity = {}'.format(PPL.avg))

        # Decay the learning rate every 10 epochs
        if (epoch + 1) % 10 == 0:
            learning_rate /= 10
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        encoder.eval()
        decoder.eval()
        print('validating...')
        curr_BLEU = bleu_eval(encoder, decoder, val_loader,
                              args.batch_size, device)[0]
        is_best = curr_BLEU > max_BLEU
        max_BLEU = max(curr_BLEU, max_BLEU)
        save_checkpoint({
            'epoch': epoch + 1,
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'max_BLEU': max_BLEU,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save)
        print('Validation BLEU = {}'.format(curr_BLEU))

        # Append this epoch's metrics to the results log
        with open(f'{args.save}/results.txt', 'a') as f:
            f.write('{},{},{} \n'.format(XEntropy.avg, PPL.avg, curr_BLEU))

    # Phase 2: reload the best checkpoint and fine-tune encoder and decoder
    checkpoint = torch.load(f'{args.save}/model_best.pth.tar')
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])
    # decoder.embed.weight.requires_grad = True
    learning_rate = 0.001
    params = list(encoder.parameters()) + list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    for epoch in range(start_epoch, args.epoch):
        print('Epoch {}'.format(epoch + 1))
        print('training...')
        for i, (images, captions, lengths) in enumerate(train_loader):
            if i % 10 == 0:
                print('[{}/{}]'.format(i, len(train_loader)))
                print(PPL.avg)

            # Batch to device
            images = images.to(device)
            captions = captions.to(device)

            encoder.train()
            decoder.train()
            features = encoder(images)
            predictions, attention_weights = decoder(features, captions, lengths)

            scores = pack_padded_sequence(predictions[:, :-1, :], lengths - 2,
                                          batch_first=True)
            targets = pack_padded_sequence(captions[:, 1:-1], lengths - 2,
                                           batch_first=True)
            loss = criterion(scores.data, targets.data)
            loss += ALPHA * ((1. - attention_weights.sum(dim=1)) ** 2).mean()

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            XEntropy.update(loss.item(), len(lengths))
            PPL.update(np.exp(loss.item()), len(lengths))

        print('Train Perplexity = {}'.format(PPL.avg))

        # Decay the learning rate every 5 epochs during fine-tuning
        if (epoch + 1) % 5 == 0:
            learning_rate /= 10
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        encoder.eval()
        decoder.eval()
        print('validating...')
        curr_BLEU = bleu_eval(encoder, decoder, val_loader,
                              args.batch_size, device)[0]
        is_best = curr_BLEU > max_BLEU
        max_BLEU = max(curr_BLEU, max_BLEU)
        save_checkpoint({
            'epoch': epoch + 1,
            'encoder': encoder.state_dict(),
            'decoder': decoder.state_dict(),
            'max_BLEU': max_BLEU,
            'optimizer': optimizer.state_dict(),
        }, is_best, args.save)
        print('Validation BLEU = {}'.format(curr_BLEU))

        with open(f'{args.save}/results.txt', 'a') as f:
            f.write('{},{},{} \n'.format(XEntropy.avg, PPL.avg, curr_BLEU))
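# `AverageMeter` is used above but not defined in this excerpt. The usual
# definition (as in the PyTorch ImageNet example) matches the
# .update(val, n) / .avg usage here:
class AverageMeter:
    """Tracks the current value and running average of a scalar."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.
        self.sum = 0.
        self.count = 0
        self.avg = 0.

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count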
data_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406),  # using ImageNet norms
                         (0.229, 0.224, 0.225))
])

test_lines = read_lines(TOKEN_FILE_TEST)
test_image_ids, test_cleaned_captions = parse_lines(test_lines)

# load models; generate captions in eval mode so batchnorm statistics
# are not influenced
encoder = EncoderCNN().to(device)
decoder = torch.load("decoder.ckpt").to(device)
encoder.eval()
decoder.eval()

#########################################################################
#
#  QUESTION 2.1 Generating predictions on test data
#
#########################################################################
# TODO define decode_caption() function in utils.py
image_id_candidate_reference = {}  # type: dict[str, dict[str, list[str]]]

# reuse cached predictions if they exist
if os.path.exists("image_id_candidate_reference.pt"):
    image_id_candidate_reference = torch.load(
        "image_id_candidate_reference.pt")
else:
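# The TODO above asks for a decode_caption() helper in utils.py. A minimal
# sketch, mirroring the id-to-word loop in the first snippet of this section
# (vocab.idx2word and the '<end>' token are assumptions carried over from
# that vocabulary):
def decode_caption(word_ids, vocab):
    words = []
    for word_id in word_ids:
        word = vocab.idx2word[word_id]
        if word == '<end>':
            break
        words.append(word)
    return ' '.join(words)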
try:
    # Load Encoder weights from S3
    Obj2 = s3.get_object(Bucket=S3_BUCKET, Key=ENC_PATH)
    bytestream = io.BytesIO(Obj2['Body'].read())
    encoder_model = EncoderCNN(embed_size)
    encoder_model.load_state_dict(torch.load(bytestream, map_location=DEVICE))
    print('Encoder loaded')

    # Load Decoder
    Obj3 = s3.get_object(Bucket=S3_BUCKET, Key=DEC_PATH)
    bytestream = io.BytesIO(Obj3['Body'].read())
    decoder_model = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
    decoder_model.load_state_dict(torch.load(bytestream, map_location=DEVICE))
    print('Decoder loaded')

    # decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
    # decoder.load_state_dict(torch.load(os.path.join(model_save_path, 'decoderdata.pkl')))

    encoder_model.eval()
    decoder_model.eval()
except Exception as e:
    print('error in loading block')
    print(repr(e))
    raise  # re-raise without wrapping to preserve the original traceback


def transform_image(image_bytes):
    try:
        transform_test = transforms.Compose([
            transforms.Resize(224),      # smaller edge of image resized to 224
            transforms.RandomCrop(224),  # get 224x224 crop from random location
            transforms.RandomHorizontalFlip(