def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing: random translation only; the color-jitter and
    # flip augmentations below are left disabled.
    transform = transforms.Compose([
        # transforms.ColorJitter(contrast=0.3, saturation=0.3),
        # transforms.RandomChoice([transforms.RandomHorizontalFlip(),
        #                          transforms.RandomVerticalFlip()]),
        transforms.RandomAffine(0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize((0.8, 0.7, 0.8),
                             (1, 1, 1))
    ])
    
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    sasr_data_loader = SASR_Data_Loader(vocab, transform)
    sasr_data_loader.load_data(args.data_file, args.init_flag)
    frogger_data_loader = sasr_data_loader.data_loader(args.batch_size,
                                                       transform,
                                                       shuffle=True,
                                                       num_workers=args.num_workers)
    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    # Unlike the other examples below, the full ResNet backbone is also optimized.
    params = (list(decoder.parameters()) + list(encoder.linear.parameters())
              + list(encoder.bn.parameters()) + list(encoder.resnet.parameters()))
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(frogger_data_loader):
            images = to_var(images, volatile=True)
            # Skip batches of size 1: BatchNorm in the encoder cannot
            # normalize a single sample in training mode.
            if images.size(0) != 1:
                captions = to_var(captions)
                targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                decoder.zero_grad()
                encoder.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                          %(epoch, args.num_epochs, i, total_step, 
                            loss.data[0], np.exp(loss.data[0]))) 
                    
                # Save the models
                if (i+1) % args.save_step == 0:
                    torch.save(decoder.state_dict(), 
                               os.path.join(args.model_path, 
                                            'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                    torch.save(encoder.state_dict(), 
                               os.path.join(args.model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
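A note on the targets line that recurs throughout these examples: pack_padded_sequence flattens the padded captions in time-major order, producing exactly the flat target tensor that nn.CrossEntropyLoss expects against the decoder outputs. A minimal standalone sketch (illustrative values; assumes captions are sorted by descending length, as the loaders here provide):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[1, 4, 7, 2],     # length 4: <start> w w <end>
                         [1, 5, 2, 0]])    # length 3, one trailing pad
lengths = [4, 3]
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)  # tensor([1, 1, 4, 5, 7, 2, 2]) -- pads never reach the loss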
Example 2
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
        
        # Backward pass.
        loss.backward()
        
        # Update the parameters in the optimizer.
        optimizer.step()
            
        # Get training statistics.
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))
        
        # Print training statistics (on same line).
        print('\r' + stats, end="")
        sys.stdout.flush()
        
        # Print training statistics to file.
        f.write(stats + '\n')
        f.flush()
        
        # Print training statistics (on different line).
        if i_step % print_every == 0:
            print('\r' + stats)
            
    # Save the weights.
    if epoch % save_every == 0:
        torch.save(decoder.state_dict(), os.path.join('./models', 'decoder-%d.pkl' % epoch))
        torch.save(encoder.state_dict(), os.path.join('./models', 'encoder-%d.pkl' % epoch))

# Close the training log file.
f.close()
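The Perplexity column these logs report is simply exp(loss). A quick illustrative check (not part of the original script):

import numpy as np
loss = 2.3026             # cross-entropy in nats, ~= ln(10)
print(np.exp(loss))       # ~10.0: as uncertain as a uniform 10-way choice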
Example 4
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models (Gen)
    # TODO: put these in generator
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Build the models (Disc)
    discriminator = Discriminator(args.embed_size, args.hidden_size,
                                  len(vocab), args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        discriminator.cuda()

    # Loss and Optimizer (Gen)
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Loss and Optimizer (Disc)
    params_disc = list(discriminator.parameters())
    optimizer_disc = torch.optim.Adam(params_disc)

    # Train the Models
    total_step = len(data_loader)
    disc_losses = []
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, wrong_captions,
                wrong_lengths) in enumerate(data_loader):

            # TODO: train disc before gen

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            wrong_captions = to_var(wrong_captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)

            # Sample captions from the generator and record each sample's
            # length (up to and including the first '<end>' token).
            sampled_captions = decoder.sample(features)
            sampled_lengths = []

            for row in range(sampled_captions.size(0)):
                for index, word_id in enumerate(sampled_captions[row, :]):
                    word = vocab.idx2word[word_id.cpu().data.numpy()[0]]
                    if word == '<end>':
                        sampled_lengths.append(index + 1)
                        break
                    elif index == sampled_captions.size(1) - 1:
                        sampled_lengths.append(sampled_captions.size(1))
                        break
            # pack_padded_sequence expects lengths in descending order.
            sampled_lengths = np.array(sampled_lengths)
            sampled_lengths[::-1].sort()
            sampled_lengths = sampled_lengths.tolist()

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Train discriminator
            discriminator.zero_grad()
            rewards_real = discriminator(images, captions, lengths)
            rewards_fake = discriminator(images, sampled_captions,
                                         sampled_lengths)
            rewards_wrong = discriminator(images, wrong_captions,
                                          wrong_lengths)
            real_loss = -torch.mean(torch.log(rewards_real))
            fake_loss = -torch.mean(
                torch.clamp(torch.log(1 - rewards_fake), min=-1000))
            wrong_loss = -torch.mean(
                torch.clamp(torch.log(1 - rewards_wrong), min=-1000))
            loss_disc = real_loss + fake_loss + wrong_loss

            disc_losses.append(loss_disc.cpu().data.numpy()[0])
            loss_disc.backward()
            optimizer_disc.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models at the last iteration of each epoch
            # (jm: instead of every args.save_step steps)
            if (i + 1) % total_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    discriminator.state_dict(),
                    os.path.join(
                        args.model_path,
                        'discriminator-%d-%d.pkl' % (epoch + 1, i + 1)))

                # plot at the end of every epoch
                plt.plot(disc_losses, label='disc loss')
                plt.savefig('disc_losses.png')
                plt.clf()
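The torch.clamp(torch.log(...), min=-1000) guards above are numerical-stability fixes: assuming the discriminator's rewards are sigmoid outputs in (0, 1), a reward that saturates at exactly 1.0 would otherwise make the log term -inf and its gradient undefined. An illustrative check:

import torch
r = torch.tensor([0.5, 0.999999, 1.0])
print(torch.log(1 - r))                          # last entry is -inf
print(torch.clamp(torch.log(1 - r), min=-1000))  # -inf clamped to -1000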
Example 5
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).to(device)
    
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step, loss.item(), np.exp(loss.item()))) 
                
            # Save the model checkpoints
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))
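This script moves models to a module-level device that the excerpt does not define; the conventional definition (an assumption, matching the .to(device) calls above) is:

import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')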
Example 6
def main(args):
    random.seed()
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder_img = EncoderCNN(args.hidden_size)
    encoder_capt = EncoderRNN(args.embed_size, args.hidden_size, len(vocab),
                              args.num_layers)
    mlp = MLPNN(args.hidden_size + args.hidden_size)

    encoder_img_e = EncoderCNN(args.hidden_size)
    encoder_capt_e = EncoderRNN(args.embed_size, args.hidden_size, len(vocab),
                                args.num_layers)

    # load the reward model
    encoder_img_e.load_state_dict(torch.load(args.encoder_path_e_img))
    encoder_capt_e.load_state_dict(torch.load(args.encoder_path_e_capt))

    if torch.cuda.is_available():
        encoder_img.cuda()
        encoder_capt.cuda()
        mlp.cuda()
        encoder_img_e.cuda()
        encoder_capt_e.cuda()

    # Loss and Optimizer
    criterion = nn.MSELoss()
    params = list(encoder_capt.parameters()) + list(
        encoder_img.linear.parameters()) + list(encoder_img.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)

            features = encoder_img_e(images)
            outputs = encoder_capt_e(captions, lengths)
            scores = torch.mm(features, outputs.transpose(1, 0))
            diagonal = scores.diag()

            # Use the diagonal matching scores as regression targets,
            # detached so no gradient flows back into the reward model.
            rvals = diagonal.detach()  # one scalar per batch element
            # Forward, Backward and Optimize
            encoder_capt.zero_grad()
            encoder_img.zero_grad()
            mlp.zero_grad()

            img_features = encoder_img(images)
            # Randomly truncate each caption to a partial prefix
            n = captions[0].size(0)
            t = n * torch.rand(captions.size(0), device=torch.device("cuda"))
            t = t.type(torch.long)
            for k in range(captions.size(0)):
                if t[k] < lengths[k]:
                    captions[k][t[k]] = 2
                captions[k][t[k] + 1:n] = torch.zeros(
                    n - int(t[k]) - 1, device=torch.device("cuda"))
            lengths = t + 1
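            # Illustrative note (not in the original code): assuming vocab
            # index 2 is '<end>' and 0 is '<pad>', a row such as
            #     [<start>, w1, w2, w3, <end>, <pad>]   with t[k] = 2
            # becomes
            #     [<start>, w1, <end>, 0, 0, 0]
            # so the network is trained on randomly shortened prefixes.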

            lengths, indices = torch.sort(torch.tensor(lengths),
                                          descending=True)
            captions.index_copy_(0, indices, captions)
            img_features.index_copy_(0, indices, img_features)
            rvals.index_copy_(0, indices, rvals)

            cap_features = encoder_capt(captions, lengths)
            outputs = mlp(img_features, cap_features)

            loss = criterion(outputs, rvals)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    encoder_capt.state_dict(),
                    os.path.join(
                        args.model_path,
                        'encoder-capt-%d-%d-v.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder_img.state_dict(),
                    os.path.join(
                        args.model_path,
                        'encoder-img-%d-%d-v.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    mlp.state_dict(),
                    os.path.join(args.model_path,
                                 'mlp-%d-%d-v.pkl' % (epoch + 1, i + 1)))
Example 7
def main(cfg):
    # Create the model directory
    if not os.path.exists(hydra.utils.to_absolute_path(cfg.train.model_path)):
        os.makedirs(hydra.utils.to_absolute_path(cfg.train.model_path))

    # Image preprocessing and normalization
    transform = transforms.Compose([
        transforms.RandomCrop(cfg.image.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])

    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # Build the data loader
    data_loader = get_loader(hydra.utils.to_absolute_path(cfg.train.image_dir),
                             hydra.utils.to_absolute_path(cfg.train.caption_path), vocab, transform,
                             cfg.train.batch_size, shuffle=True,
                             num_workers=cfg.train.num_workers)

    # Build the models
    encoder = EncoderCNN(cfg.train.embed_size).to(device)
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size, len(vocab), cfg.train.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=cfg.train.learning_rate)

    # train
    total_step = len(data_loader)

    for epoch in range(cfg.train.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            if i % cfg.train.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, cfg.train.num_epochs, i, total_step, loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % cfg.train.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    hydra.utils.to_absolute_path(cfg.train.model_path), 'decoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))

                torch.save(encoder.state_dict(), os.path.join(
                    hydra.utils.to_absolute_path(cfg.train.model_path), 'encoder-{}-{}.ckpt'.format(epoch + 1, i + 1)))
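The cfg object and hydra.utils.to_absolute_path calls imply this main is launched through Hydra. A minimal sketch of the wiring (the config directory and file name are assumptions, not shown in the excerpt):

import hydra

@hydra.main(config_path='conf', config_name='config')
def run(cfg):
    main(cfg)

if __name__ == '__main__':
    run()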
Example 8
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, args.num_epochs, i, total_step, 
                        loss.data[0], np.exp(loss.data[0]))) 
                
            # Save the models
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
Example 9
def main():

    min_train_loss = 100.0
    # Create model directory
    if not os.path.exists(args['model_path']):
        os.makedirs(args['model_path'])

    fp_loss = open(
        args['model_path'] +
        'training_loss_resnet50_finetune_attention_lstm_node08.txt', 'w+')

    # Image preprocessing
    transform = transforms.Compose([
        transforms.RandomCrop(args['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.9638, 0.9638, 0.9638),
                             (0.1861, 0.1861, 0.1861))
    ])
    # Load vocabulary wrapper.
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args['image_dir'],
                             args['caption_path'],
                             vocab,
                             transform,
                             args['batch_size'],
                             shuffle=True,
                             num_workers=args['num_workers'])

    # Build the models
    encoder = EncoderCNN(args['embed_size'])
    decoder = DecoderRNN(args['embed_size'],
                         args['hidden_size'],
                         len(vocab),
                         args['num_layers'],
                         max_seq_length=50)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params,
                                 lr=args['learning_rate'])  # original optimizer

    # Train the Models
    total_step = len(data_loader)
    total = 1
    for epoch in range(args['num_epochs']):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.cuda()
            captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features, cnn_features = encoder(images)
            outputs = decoder(features, cnn_features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Log "<step>\t<loss>" for every batch.
            fp_loss.write(str(total) + "\t" + str(loss.item()) + "\n")

            total += 1

            # Print log info
            if i % args['log_step'] == 0:
                print('Epoch [%d/%d], Step [%d/%d], training-loss: %.4f' %
                      (epoch, args['num_epochs'], i, total_step, loss.item()))

            if min_train_loss > loss.item():
                min_train_loss = loss.item()
                torch.save(
                    decoder.state_dict(),
                    os.path.join(
                        args['model_path'],
                        'decoder_resnet50_finetune_attention_lstm_node08.pkl'))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(
                        args['model_path'],
                        'encoder_resnet50_finetune_attention_lstm_node08.pkl'))

    fp_loss.close()
Example 10
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Read rationalization data
    rationalizations = []
    max_length = 0
    lengths = []
    bad_worker_ids = [
        'A2CNSIECB9UP05', 'A23782O23HSPLA', 'A2F9ZBSR6AXXND', 'A3GI86L18Z71XY',
        'AIXTI8PKSX1D2', 'A2QWHXMFQI18GQ', 'A3SB7QYI84HYJT', 'A2Q2A7AB6MMFLI',
        'A2P1KI42CJVNIA', 'A1IJXPKZTJV809', 'A2WZ0RZMKQ2WGJ', 'A3EKETMVGU2PM9',
        'A1OCEC1TBE3CWA', 'AE1RYK54MH11G', 'A2ADEPVGNNXNPA', 'A15QGLWS8CNJFU',
        'A18O3DEA5Z4MJD', 'AAAL4RENVAPML', 'A3TZBZ92CQKQLG', 'ABO9F0JD9NN54',
        'A8F6JFG0WSELT', 'ARN9ET3E608LJ', 'A2TCYNRAZWK8CC', 'A32BK0E1IPDUAF',
        'ANNV3E6CIVCW4'
    ]
    with open('./Log/Rationalizations.txt') as f:
        for line in f:
            line = line.lower()
            line = re.sub(r"[^a-z ']+", " ", line)
            words = line.split()
            length = len(words)
            lengths.append(length)
            if length > max_length:
                max_length = length
            for index, word in enumerate(words):
                words[index] = vocab.word2idx[word]
            rationalizations.append(words)
    rationalizations = [np.array(xi) for xi in rationalizations]

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    frogger_data_loader = get_images('./data/FroggerDataset/', args.batch_size,
                                     transform)

    # Train the Models
    total_step = len(frogger_data_loader)
    for epoch in range(args.num_epochs):
        for i, x in enumerate(frogger_data_loader):
            images = to_var(x[0], volatile=True)
            # Pad this mini-batch of rationalizations to a common length,
            # sorted by descending length for pack_padded_sequence.
            captions = []
            max_length = max(lengths[i:i + 2])
            rats = rationalizations[i:i + 2]
            rats.sort(key=lambda s: len(s))
            rats.reverse()
            for index, r in enumerate(rats):
                r = np.lib.pad(r, (0, max_length - len(r)), 'constant')
                captions.append(r)
            captions = to_var(torch.from_numpy(np.asarray(captions)))

            new_lengths = lengths[i:i + 2]
            new_lengths.sort()
            new_lengths.reverse()
            targets = pack_padded_sequence(captions,
                                           new_lengths,
                                           batch_first=True)[0]
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, new_lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
Example 11
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    #################### YOUR CODE BEGINS HERE ##################################
    # TODO [YOU] - Perform image preprocessing and data augmentation by defining
    # transform (one possible completion is sketched after this example).
    transform = None

    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    #################### YOUR CODE ENDS HERE ####################################

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss, optimizer and training pipeline
    #################### YOUR CODE BEGINS HERE ##################################
    criterion = None  #  Define the appropriate loss function
    params = None  #  Get the full list of parameters to be optimized
    optimizer = None  #  Define the optimizer. Make sure to specify the parameters to be optimized over

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = None  # Get the features from the encoder
            outputs = None  # Pass these features into the decoder, along with any other required arguments

            loss = None  # Compute the loss from the output and targets

            # TODO [YOU]
            # 1. zero out the gradients for encoder, decoder
            # 2. perform backward pass
            # 3. perform a gradient step

            #################### YOUR CODE ENDS HERE ####################################

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
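One possible way to fill in the TODOs above, mirroring the non-skeleton examples in this collection (a sketch that drops into the marked regions and relies on the skeleton's names; not the canonical solution):

# Possible transform, loss, and optimizer, patterned on the other examples
# (args.crop_size is assumed to exist, as it does in those scripts):
transform = transforms.Compose([
    transforms.RandomCrop(args.crop_size),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters()) + list(encoder.linear.parameters()) \
         + list(encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=args.learning_rate)

# Possible body of the training step:
features = encoder(images)
outputs = decoder(features, captions, lengths)
loss = criterion(outputs, targets)
decoder.zero_grad()
encoder.zero_grad()
loss.backward()
optimizer.step()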
Example 12
def train_main(args):
    if not os.path.exists(args.base_dir + "model/"):
        os.mkdir(args.base_dir + "model/")

    transform = transforms.Compose([
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(args.base_dir + "vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)

    # Build and load the dataset
    loader = get_loader(args.base_dir,
                        args.part,
                        vocab,
                        transform,
                        args.batch_size,
                        shuffle=True,
                        num_workers=args.num_workers)
    # Randomly display one image with its caption
    # plotting(loader, args)

    # Instantiate the encoder and decoder
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size,
                         vocab_size,
                         args.hidden_size,
                         args.num_layers,
                         max_seq=20)

    num_captions = 5
    num_examples = len(loader)
    loss_func = nn.CrossEntropyLoss()
    # Note: encoder.parameters() already includes the BatchNorm parameters,
    # so they are not added a second time.
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = Adam(params, 0.001)

    for epoch in range(args.num_epoch):
        for i, (images, captions, lengths) in enumerate(loader):
            for j in range(num_captions):
                caption = captions[:, j, :]
                length = torch.Tensor(lengths)[:, j]
                length, _ = torch.sort(length, dim=0, descending=True)
                targets = pack_padded_sequence(caption,
                                               length,
                                               batch_first=True)[0]

                # Forward pass, backward pass and optimization
                features = encoder(images)
                outputs = decoder(features, caption, length)
                loss = loss_func(outputs, targets)

                decoder.zero_grad()
                encoder.zero_grad()
                loss.backward()

                optimizer.step()
            if i % 10 == 0:
                print(
                    "Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}"
                    .format(epoch + 1, args.num_epoch, i, num_examples,
                            loss.item(), np.exp(loss.item())))
        torch.save(
            decoder.state_dict(),
            os.path.join(args.model_path,
                         'decoder-epoch-{}.ckpt'.format(epoch + 1)))
        torch.save(
            encoder.state_dict(),
            os.path.join(args.model_path,
                         'encoder-epoch-{}.ckpt'.format(epoch + 1)))
Example 13
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # if using policy gradient
    if (args.use_policy):
        # Build the models
        encoder = EncoderCNN(args.embed_size)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers)
        est_encoder = EncoderCNN(args.embed_size)
        estimator = Estimator(args.embed_size, len(vocab), args.hidden_size,
                              args.num_layers)

        # if using pretrained model
        if (args.use_pretrained):
            encoder.load_state_dict(torch.load(args.pretrained_encoder))
            decoder.load_state_dict(torch.load(args.pretrained_decoder))
            est_encoder.load_state_dict(torch.load(
                args.pretrained_est_encoder))
            estimator.load_state_dict(torch.load(args.pretrained_estimator))

        if torch.cuda.is_available():
            encoder.cuda()
            decoder.cuda()
            est_encoder.cuda()
            estimator.cuda()

        # loss and optimizer
        BCE_loss = nn.BCELoss()
        label_real = to_var(torch.ones(args.batch_size, 1))
        label_fake = to_var(torch.zeros(args.batch_size, 1))

        cap_params = list(decoder.parameters()) + list(
            encoder.linear.parameters()) + list(encoder.bn.parameters())
        est_params = list(est_encoder.linear.parameters()) + list(
            est_encoder.bn.parameters()) + list(estimator.parameters())

        cap_optimizer = torch.optim.Adam(cap_params, lr=args.learning_rate)
        est_optimizer = torch.optim.Adam(est_params, lr=args.learning_rate)

        # training
        total_step = len(data_loader)
        for epoch in range(args.num_epochs):
            for i, (images, captions, lengths) in enumerate(data_loader):
                # leave last batch out
                if (i == total_step - 1):
                    print('leaving last batch out because not enough data...')
                    continue

                # Set mini-batch dataset
                # set images volatile because we don't want to calculate gradient of CNN
                images = to_var(images, volatile=True)
                captions = to_var(captions)

                # Forward, Backward and Optimize
                decoder.zero_grad()
                encoder.zero_grad()
                est_encoder.zero_grad()
                estimator.zero_grad()

                features = encoder(images)

                # outputs is a list of captions
                outputs, log_probs = decoder(features, captions, lengths, True)

                # cut off the backward pass between estimator and decoder
                outputs = Variable(outputs.data)
                est_features = est_encoder(images)

                # get the rewards of the generated captions and real captions
                rewards_fake = estimator(est_features, outputs)
                rewards_real = estimator(est_features, captions)

                # backprop the loss for estimator
                est_loss_real = BCE_loss(rewards_real, label_real)
                est_loss_fake = BCE_loss(rewards_fake, label_fake)

                est_loss = est_loss_real + est_loss_fake
                est_loss.backward()
                est_optimizer.step()

                # backprop the loss for encoder and decoder of the caption generator
                rewards_fake = Variable(rewards_fake.data)
                cap_loss = []
                for r in range(rewards_fake.shape[0]):
                    for l in range(log_probs.shape[1]):
                        cap_loss.append(-log_probs[r][l] * rewards_fake[r])

                cap_loss = torch.cat(cap_loss).sum()
                cap_loss.backward()
                cap_optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print(
                        'Epoch [%d/%d], Step [%d/%d], Estimator Loss: %.4f, Generator Loss: %.4f'
                        % (epoch, args.num_epochs, i, total_step,
                           est_loss.data[0], cap_loss.data[0]))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(args.model_path,
                                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(args.model_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        est_encoder.state_dict(),
                        os.path.join(
                            args.model_path,
                            'est_encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        estimator.state_dict(),
                        os.path.join(
                            args.model_path,
                            'estimator-%d-%d.pkl' % (epoch + 1, i + 1)))

    # if using strict matching
    else:
        # Build the models
        encoder = EncoderCNN(args.embed_size)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers)
        if torch.cuda.is_available():
            encoder.cuda()
            decoder.cuda()

        params = list(decoder.parameters()) + list(
            encoder.linear.parameters()) + list(encoder.bn.parameters())
        optimizer = torch.optim.Adam(params, lr=args.learning_rate)

        # Loss and Optimizer
        criterion = nn.CrossEntropyLoss()

        # training
        total_step = len(data_loader)
        for epoch in range(args.num_epochs):
            for i, (images, captions, lengths) in enumerate(data_loader):
                # Set mini-batch dataset
                images = to_var(images, volatile=True)
                captions = to_var(captions)

                # Forward, Backward and Optimize
                decoder.zero_grad()
                encoder.zero_grad()

                features = encoder(images)
                # pack_padded_sequence will pack a padded sequence (in time step order)
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                outputs = decoder(features, captions, lengths, False)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                # Print log info
                if i % args.log_step == 0:
                    print(
                        'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                        % (epoch, args.num_epochs, i, total_step, loss.data[0],
                           np.exp(loss.data[0])))

                # Save the models
                if (i + 1) % args.save_step == 0:
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(args.model_path,
                                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(args.model_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
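The generator objective above is a REINFORCE-style, reward-weighted negative log-likelihood. A standalone sketch of the same loss shape (illustrative numbers; not the training code itself):

import torch

# 2 sampled captions, 3 decoding steps each; per-token probabilities.
log_probs = torch.log(torch.tensor([[0.5, 0.4, 0.9],
                                    [0.3, 0.8, 0.6]]))
rewards = torch.tensor([0.9, 0.1])   # one discriminator reward per caption
cap_loss = -(log_probs * rewards.unsqueeze(1)).sum()
print(cap_loss)  # minimizing this raises probabilities of well-rewarded captions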
Example 14
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
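To resume from the checkpoints this script writes, the usual counterpart (a sketch; the checkpoint file names are illustrative) is:

# Rebuild the models with the same constructor arguments, then restore
# the saved state dicts (checkpoint names here are illustrative).
encoder = EncoderCNN(args.embed_size).to(device)
decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                     args.num_layers).to(device)
encoder.load_state_dict(torch.load(os.path.join(args.model_path,
                                                'encoder-5-1000.ckpt')))
decoder.load_state_dict(torch.load(os.path.join(args.model_path,
                                                'decoder-5-1000.ckpt')))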
Example 15
def train(
        num_epochs: int,
        lr: float,
        batch_size: int,
        vocab_threshold: int,
        vocab_from_file: bool,
        embed_size: int,
        hidden_size: int,
        save_every: int,
        print_every: int,
        log_file: str
) -> None:
    """
    Train the captioning network with the required parameters.
    The training logs are saved in log_file.

    num_epochs:         Number of epochs to train the model.
    batch_size:         Mini-batch size for training.
    vocab_threshold:    Minimum word count threshold for vocabulary initialisation. A word that
                        appears fewer than vocab_threshold times in the dataset is discarded and
                        does not appear in the vocabulary dictionary. The smaller the threshold,
                        the bigger the vocabulary.
    vocab_from_file:    Whether to load the vocabulary from a pre-initialized file.
    embed_size:         Dimensionality of image and word embeddings.
    hidden_size:        Number of features in hidden state of the RNN decoder.
    save_every:         Number of epochs between each checkpoint saving.
    print_every:        Number of batches for printing average loss.
    log_file:           Name of the training log file. Saves loss and perplexity.

    """

    transform_train = transforms.Compose([
        transforms.Resize(256),                          # smaller edge of image resized to 256
        transforms.RandomCrop(224),                      # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
        transforms.ToTensor(),                           # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                             (0.229, 0.224, 0.225))])

    # Build data loader.
    data_loader = get_loader(transform=transform_train,
                             mode='train',
                             batch_size=batch_size,
                             vocab_threshold=vocab_threshold,
                             vocab_from_file=vocab_from_file)

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder.
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move models to GPU if CUDA is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)

    # Define the loss function.
    criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

    # Parameters to update. We do not re-train the CNN here.
    params = list(encoder.embed.parameters()) + list(decoder.parameters())

    # TODO: add learning rate scheduler
    # Optimizer for minimum search.
    optimizer = optim.Adam(params, lr=lr)

    # Set the total number of training steps per epoch.
    total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

    # Open the training log file.
    f = open(log_file, 'w')

    for epoch in range(1, num_epochs + 1):
        for i_step in range(1, total_step + 1):

            # Randomly sample a caption length, and sample indices with that length.
            indices = data_loader.dataset.get_train_indices()
            # Create and assign a batch sampler to retrieve a batch with the sampled indices.
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch.
            images, captions = next(iter(data_loader))

            # Move batch of images and captions to GPU if CUDA is available.
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)

            # Calculate the batch loss.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (
                epoch, num_epochs, i_step, total_step, loss.item(),
                np.exp(loss.item()))

            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(decoder.state_dict(), os.path.join('./models', f"{device}_{hidden_size}_decoder-{epoch}.pkl"))
            torch.save(encoder.state_dict(), os.path.join('./models', f"{device}_{hidden_size}_encoder-{epoch}.pkl"))

    # Close the training log file.
    f.close()
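The loop above leans on data_loader.dataset.get_train_indices() to return a batch of indices whose captions all share one randomly sampled length, so no padding is needed within a batch. The helper below is a minimal sketch of that idea, assuming the dataset exposes a caption_lengths list; it is an illustration, not the project's actual implementation.

import random
import numpy as np

def get_train_indices(caption_lengths, batch_size):
    # Sample the target length from a randomly chosen training example,
    # so frequent lengths are picked proportionally more often.
    target_length = random.choice(caption_lengths)
    # Collect every index whose caption has exactly that length.
    candidates = [i for i, l in enumerate(caption_lengths) if l == target_length]
    # Draw batch_size indices from that bucket (with replacement,
    # in case the bucket is smaller than the batch).
    return list(np.random.choice(candidates, size=batch_size))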
Example no. 16
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features, cnn_features = encoder(images)
            outputs = decoder(features, cnn_features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, args.num_epochs, i, total_step, 
                        loss.data[0], np.exp(loss.data[0]))) 
                
            # Save the models
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
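The main(args) functions in these examples are usually driven by an argparse block like the sketch below. The flag names mirror the attributes the function actually reads (args.model_path, args.crop_size, args.vocab_path, and so on); the default values are illustrative assumptions, not the original configuration.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='./models/')
    parser.add_argument('--crop_size', type=int, default=224)
    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl')
    parser.add_argument('--image_dir', type=str, default='./data/train_images/')
    parser.add_argument('--caption_path', type=str, default='./data/captions.json')
    parser.add_argument('--log_step', type=int, default=10)
    parser.add_argument('--save_step', type=int, default=1000)
    parser.add_argument('--embed_size', type=int, default=256)
    parser.add_argument('--hidden_size', type=int, default=512)
    parser.add_argument('--num_layers', type=int, default=1)
    parser.add_argument('--num_epochs', type=int, default=5)
    parser.add_argument('--batch_size', type=int, default=128)
    parser.add_argument('--num_workers', type=int, default=2)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    args = parser.parse_args()
    main(args)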
Example no. 17
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    #val_loader = get_loader('./data/val_resized2014/', './data/annotations/captions_val2014.json',
    #                         vocab, transform, 1, False, 1)

    start_epoch = 0

    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = VRNN(args.embed_size, args.hidden_size, len(vocab),
                   args.latent_size, args.num_layers)

    if args.restart:
        encoder_state, decoder_state = 'new', 'new'

    if encoder_state == '':
        encoder_state = 'new'
    if decoder_state == '':
        decoder_state = 'new'

    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    try:
        # Recover the starting epoch from a checkpoint name like 'decoder-4-1000.pkl'.
        start_epoch = int(float(decoder_state.split('-')[1]))
    except (IndexError, ValueError):
        pass

    if encoder_state != 'new':
        encoder.load_state_dict(torch.load(encoder_state))
    if decoder_state != 'new':
        decoder.load_state_dict(torch.load(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    """ Make logfile and log output """
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Using encoder: new\nUsing decoder: new\n\n")

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Optimizer
    cross_entropy = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_loss_det = []
    batch_kl = []
    batch_ml = []
    batch_acc = []

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):

            # get lengths excluding <start> symbol
            lengths = [l - 1 for l in lengths]

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)

            # sanity check: every caption in the batch must be long enough
            # to reach the latent step
            assert min(lengths) > args.z_step + 2

            # get targets from captions (excluding <start> tokens)
            #targets = pack_padded_sequence(captions[:,1:], lengths, batch_first=True)[0]
            targets_var = captions[:, args.z_step + 1]
            targets_det = pack_padded_sequence(
                captions[:, args.z_step + 2:],
                [l - args.z_step - 1 for l in lengths],
                batch_first=True)[0]

            # Get prior and approximate distributions
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            prior, q_z, q_x, det_x = decoder(features,
                                             captions,
                                             lengths,
                                             z_step=args.z_step)

            # Calculate KL Divergence
            kl = torch.mean(kl_divergence(*q_z + prior))

            # Get marginal likelihood from log likelihood of the correct symbol
            index = (torch.cuda.LongTensor(range(q_x.shape[0])), targets_var)
            ml = torch.mean(q_x[index])

            # Get Cross-Entropy loss for deterministic decoder
            ce = cross_entropy(det_x, targets_det)

            elbo = ml - kl
            loss_var = -elbo

            loss_det = ce

            loss = loss_var + loss_det

            batch_loss.append(loss.data[0])
            batch_loss_det.append(loss_det.data[0])
            batch_kl.append(kl.data[0])
            batch_ml.append(ml.data[0])

            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

                with open(args.model_path + args.logfile, 'a') as f:
                    f.write(
                        'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f\n'
                        % (epoch, num_epochs, i, total_step, loss.data[0],
                           np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                if args.train_encoder:
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(args.model_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                # pickle writes bytes, so the files must be opened in binary mode
                with open(args.model_path + 'training_loss.pkl', 'wb') as f:
                    pickle.dump(batch_loss, f)
                with open(args.model_path + 'training_val.pkl', 'wb') as f:
                    pickle.dump(batch_acc, f)

    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
Example no. 18
        # Get training statistics.
        stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))
        
        # Print training statistics (on same line).
        print('\r' + stats, end="")
        sys.stdout.flush()
        
        # Print training statistics to file.
        f.write(stats + '\n')
        f.flush()
        
        # Print training statistics (on different line).
        if i_step % print_every == 0:
            print('\r' + stats)
        epoch_loss += loss.item()
    epoch_loss /= total_step

    # Save the weights.
    if save_every == -1:
        # Only save the best one so far!
        if epoch_loss <= smallest_loss:
            torch.save(decoder.state_dict(), os.path.join('./models', "{:02d}-decoder-{:.4f}.pkl".format(epoch, epoch_loss)))
            torch.save(encoder.state_dict(), os.path.join('./models', "{:02d}-encoder-{:.4f}.pkl".format(epoch, epoch_loss)))
            smallest_loss = epoch_loss
    elif epoch % save_every == 0:
        torch.save(decoder.state_dict(), os.path.join('./models', "{:02d}-decoder-{:.4f}.pkl".format(epoch, epoch_loss)))
        torch.save(encoder.state_dict(), os.path.join('./models', "{:02d}-encoder-{:.4f}.pkl".format(epoch, epoch_loss)))

# Close the training log file.
f.close()
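This fragment is cut from the middle of a training function: the per-step body runs inside an epoch loop, and smallest_loss must be initialized before that loop for the best-checkpoint branch to work. A minimal sketch of the assumed scaffolding, with names chosen to match the fragment:

smallest_loss = float('inf')  # best average epoch loss seen so far
for epoch in range(1, num_epochs + 1):
    epoch_loss = 0.0
    for i_step in range(1, total_step + 1):
        # ... sample a batch, forward pass, loss.backward(), optimizer.step() ...
        # The fragment above runs here and accumulates epoch_loss.
        pass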
Example no. 19
def main(args):
    checkpoint = True

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    init_folder = 'results'

    if not os.path.exists(init_folder):
        os.makedirs(init_folder)
    # else:
    #     shutil.rmtree(init_folder)
    #     os.makedirs(init_folder)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    if args.with_glove == 'True':
        # Get glove pickles
        glove_path = args.glove_path
        vectors = bcolz.open(f'{glove_path}/6B.{args.embed_size}.dat')[:]
        words = pickle.load(
            open(f'{glove_path}/6B.{args.embed_size}_words.pkl', 'rb'))
        word2idx = pickle.load(
            open(f'{glove_path}/6B.{args.embed_size}_idx.pkl', 'rb'))
        glove = {w: vectors[word2idx[w]] for w in words}

        # Get weights matrix
        weights_matrix = np.zeros((len(vocab), args.embed_size))
        words_found = 0

        # Fill the weights matrix: copy the GloVe vector for each vocabulary
        # word, falling back to a random vector for out-of-vocabulary words.
        for i in range(len(vocab)):
            try:
                word = vocab.idx2word[i]
                weights_matrix[i] = glove[word]
                words_found += 1
            except KeyError:
                weights_matrix[i] = np.random.normal(scale=0.6,
                                                     size=(args.embed_size, ))

        # Build the models
        encoder = EncoderCNN(args.embed_size).to(device)
        decoder = DecoderRNNGlove(args.hidden_size, weights_matrix,
                                  args.num_layers).to(device)
    else:
        # Build models normally
        encoder = EncoderCNN(args.embed_size).to(device)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers).to(device)

    # Build data loader
    train_data_loader = get_loader(args.image_dir,
                                   args.caption_path,
                                   vocab,
                                   transform,
                                   args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)

    val_data_loader = get_loader(args.val_image_dir,
                                 args.val_caption_path,
                                 vocab,
                                 transform,
                                 args.batch_size,
                                 shuffle=True,
                                 num_workers=args.num_workers)

    if not args.reset_training:
        if isfile(os.path.join(args.model_path, 'best_encoder.ckpt')):
            encoder.load_state_dict(
                torch.load(os.path.join(args.model_path, 'best_encoder.ckpt')))
            print('Encoder weights loaded!')
        else:
            print(
                'Weights file for encoder does not exist. Encoder will be initialized with default values.'
            )

        if isfile(os.path.join(args.model_path, 'best_decoder.ckpt')):
            decoder.load_state_dict(
                torch.load(os.path.join(args.model_path, 'best_decoder.ckpt')))
            print('Decoder weights loaded!')
        else:
            print(
                'Weights file for decoder does not exist. Decoder will be initialized with default values.'
            )

        if isfile(os.path.join(args.model_path, 'last_best_bleu4.npy')):
            temp = np.load(os.path.join(args.model_path,
                                        'last_best_bleu4.npy'),
                           allow_pickle=True).item()
            best_bleu4 = temp['best_bleu4']
            train_encoder = temp['train_encoder']
            print(
                f'Previous best bleu4 score: {best_bleu4}, training_encoder: {train_encoder}'
            )
        else:
            best_bleu4 = 0
            train_encoder = False
    else:
        best_bleu4 = 0
        train_encoder = False

    best_epoch = 0

    # Loss and optimizer. Give each optimizer its own parameter group so the
    # two Adam instances do not both step the same weights.
    criterion = nn.CrossEntropyLoss()
    encoder_params = list(encoder.linear.parameters()) + list(
        encoder.bn.parameters())
    decoder_params = list(decoder.parameters())
    encoder_optimizer = torch.optim.Adam(encoder_params,
                                         lr=args.encoder_learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder_params,
                                         lr=args.decoder_learning_rate)

    train_losses = []
    val_losses = []
    bleu1_scores = []
    bleu2_scores = []
    bleu3_scores = []
    bleu4_scores = []
    cider_scores = []
    rouge_scores = []
    for epoch in range(1, args.num_epochs + 1):

        train_loss = train(train_data_loader, encoder, decoder, criterion,
                           encoder_optimizer, decoder_optimizer, epoch,
                           train_encoder)
        score_dict, val_loss = validate(val_data_loader, encoder, decoder,
                                        criterion, vocab, epoch)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        bleu1_scores.append(score_dict['Bleu_1'])
        bleu2_scores.append(score_dict['Bleu_2'])
        bleu3_scores.append(score_dict['Bleu_3'])
        bleu4_scores.append(score_dict['Bleu_4'])
        cider_scores.append(score_dict['CIDEr'])
        rouge_scores.append(score_dict['ROUGE_L'])

        # Check if there was an improvement
        bleu4_score = score_dict['Bleu_4']
        print(f'Last best score {best_bleu4}, at epoch {best_epoch}')
        if bleu4_score > best_bleu4:
            best_bleu4 = bleu4_score
            best_epoch = epoch
            print(f'New best score {best_bleu4}, at epoch {best_epoch}')
            torch.save(decoder.state_dict(),
                       os.path.join(args.model_path, 'best_decoder.ckpt'))
            torch.save(encoder.state_dict(),
                       os.path.join(args.model_path, 'best_encoder.ckpt'))
            np.save(os.path.join(args.model_path, 'last_best_bleu4.npy'), {
                'best_bleu4': best_bleu4,
                'train_encoder': train_encoder
            })

        else:
            if train_encoder:
                train_encoder = False
                print(
                    'No improvement in Bleu4 score. Switching from training Encoder to Decoder'
                )
            else:
                train_encoder = True
                print(
                    'No improvement in Bleu4 score. Switching from training Decoder to Encoder'
                )

            np.save(os.path.join(args.model_path, 'last_best_bleu4.npy'), {
                'best_bleu4': best_bleu4,
                'train_encoder': train_encoder
            })

#########################################################################################

    plot_loss_graph(args.num_epochs, train_losses, val_losses, init_folder)
    plot_score_graph(args.num_epochs, bleu1_scores, bleu2_scores, bleu3_scores,
                     bleu4_scores, cider_scores, rouge_scores, init_folder)
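Example no. 19 builds weights_matrix row by row from GloVe vectors but leaves the internals of DecoderRNNGlove to another file. A common way to wire such a matrix into the decoder's embedding layer, sketched here as an assumption about what that class does rather than its actual code:

import torch
import torch.nn as nn

# Load a pretrained (vocab_size x embed_size) matrix into an embedding
# layer; freeze=False lets training fine-tune the vectors.
weights = torch.tensor(weights_matrix, dtype=torch.float)
embed = nn.Embedding.from_pretrained(weights, freeze=False)
# embed(token_ids) now returns GloVe vectors for in-vocabulary words and
# the random fallback rows for out-of-vocabulary ones.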