Example #1
def main(args):
  
  vocab = load_vocab()
  
  encoder = CNNEncoder()
  decoder = DecoderRNN(512, 512, len(vocab))
  
  encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
      args.checkpoint_file, False)
  encoder.load_state_dict(encoder_state_dict)
  decoder.load_state_dict(decoder_state_dict)
  
  # Inference: switch to eval mode so batchnorm/dropout are deterministic
  encoder.eval()
  decoder.eval()

  if torch.cuda.is_available():
    encoder.cuda()
    decoder.cuda()
    
  # Inference-time preprocessing: no random augmentation at test time
  transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
        ])

  # OpenCV loads images as BGR; convert to RGB before building the PIL image
  inp = cv2.imread(args.image_path)
  inp = cv2.cvtColor(inp, cv2.COLOR_BGR2RGB)
  inp = transform(Image.fromarray(inp)).unsqueeze(0)
  inp = utils.to_var(inp, volatile=True)
  
  features = encoder(inp)
  sampled_ids = decoder.sample(features)
  
  sampled_ids = sampled_ids.cpu().data.numpy()[0]
  sentence = utils.convert_back_to_text(sampled_ids, vocab)
  
  print('Caption:', sentence)
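
Note that volatile=True belongs to the pre-0.4 Variable API; on PyTorch 0.4+ the equivalent is to run the forward pass under torch.no_grad(). A minimal sketch using the same objects:

# Modern replacement for volatile Variables (PyTorch >= 0.4)
with torch.no_grad():
    features = encoder(inp)
    sampled_ids = decoder.sample(features)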
Example #2
def main(args):

    print("Process %s, running on %s: starting (%s)" % (
        os.getpid(), os.name, time.asctime()))

    encoder = EncoderCNN()
    decoder = DecoderRNN()
    if torch.cuda.is_available() and args.gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    encoder_trainables = [p for p in encoder.parameters() if p.requires_grad]
    decoder_trainables = [p for p in decoder.parameters() if p.requires_grad]

    params = encoder_trainables + decoder_trainables

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
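        # Standard ImageNet channel means/stds expected by torchvision's
        # pretrained backbones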
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    data_loader = trainloader(transform=transform)
    optimizer = torch.optim.SGD(params=params, lr=args.lr, momentum=0.9)
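
The requires_grad filter above only matters if some parameters were frozen earlier, which this truncated snippet does not show. A minimal sketch of how the pretrained CNN backbone might be frozen (an assumption, not part of the original):

# Hypothetical freezing step that would give the requires_grad filter
# something to exclude: fix the CNN weights, train only the decoder.
for p in encoder.parameters():
    p.requires_grad = False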
Example #3

def main(args):
    # hyperparameters
    batch_size = args.batch_size
    num_workers = 1

    # Image Preprocessing
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # load COCO dataset
    IMAGES_PATH = 'data/train2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_train2014.json'

    vocab = load_vocab()
    train_loader = get_coco_data_loader(path=IMAGES_PATH,
                                        json=CAPTION_FILE_PATH,
                                        vocab=vocab,
                                        transform=transform,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=num_workers)

    IMAGES_PATH = 'data/val2014'
    CAPTION_FILE_PATH = 'data/annotations/captions_val2014.json'
    val_loader = get_coco_data_loader(path=IMAGES_PATH,
                                      json=CAPTION_FILE_PATH,
                                      vocab=vocab,
                                      transform=transform,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=num_workers)

    losses_val = []
    losses_train = []

    # Build the models
    ngpu = 1
    initial_step = initial_epoch = 0
    embed_size = args.embed_size
    num_hiddens = args.num_hidden
    learning_rate = 5e-4
    num_epochs = 2
    log_step = args.log_step
    save_step = 500
    checkpoint_dir = args.checkpoint_dir

    encoder = CNNEncoder()
    decoder = DecoderRNN(embed_size, num_hiddens, len(vocab))

    # Loss
    criterion = nn.CrossEntropyLoss()

    if args.checkpoint_file:
        encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
            args.checkpoint_file, args.sample)
        initial_step, initial_epoch, losses_train, losses_val = meta
        encoder.load_state_dict(encoder_state_dict)
        decoder.load_state_dict(decoder_state_dict)
    else:
        params = list(decoder.parameters()) + list(
            encoder.batchnorm.parameters())
        optimizer = torch.optim.Adam(params, lr=learning_rate)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    if args.sample:
        return utils.sample(encoder, decoder, vocab, val_loader)

    # Train the Models
    total_step = len(train_loader)
    try:
        for epoch in range(initial_epoch, num_epochs):

            for step, (images, captions,
                       lengths) in enumerate(train_loader, start=initial_step):

                # Set mini-batch dataset; inputs must not be volatile during
                # training, or no gradients will flow through the encoder
                images = utils.to_var(images)
                captions = utils.to_var(captions)
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                # Forward, Backward and Optimize
                decoder.zero_grad()
                encoder.zero_grad()
                if ngpu > 1:
                    # run on multiple GPU
                    features = nn.parallel.data_parallel(
                        encoder, images, range(ngpu))
                    outputs, alphas = nn.parallel.data_parallel(
                        decoder, (features, captions, lengths), range(ngpu))
                else:
                    # run on single GPU
                    features = encoder(images)
                    outputs, alphas = decoder(features, captions, lengths)

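                # Doubly stochastic attention regularization (cf. Show,
                # Attend and Tell): encourage the attention weights at each
                # spatial location to sum to ~1 across decoding steps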
                train_loss = criterion(outputs, targets)
                train_loss += ((1. - alphas.sum(dim=1))**2).mean()
                losses_train.append(train_loss.data)
                train_loss.backward()
                optimizer.step()

                print('Epoch: {} - Step: {} - Train Loss: {}'.format(
                    epoch, step, losses_train[-1]))
                # Run validation set and predict
                if step % log_step == 0:
                    encoder.batchnorm.eval()
                    # run validation set
                    batch_loss_val = []
                    for val_step, (images, captions,
                                   lengths) in enumerate(val_loader):
                        images = utils.to_var(images, volatile=True)
                        captions = utils.to_var(captions, volatile=True)

                        targets = pack_padded_sequence(captions,
                                                       lengths,
                                                       batch_first=True)[0]
                        features = encoder(images)
                        outputs, alphas = decoder(features, captions, lengths)
                        val_loss = criterion(outputs, targets)
                        val_loss += ((1. - alphas.sum(dim=1))**2).mean()
                        batch_loss_val.append(val_loss.data)
                        if val_step % 50 == 0:
                            print('Epoch: {} - Step: {} - Mini Eval Loss: {}'.
                                  format(epoch, val_step, val_loss))
                            sampled_ids = decoder.sample(features)
                            sampled_ids = sampled_ids.cpu().data.numpy()[0]
                            sentence = utils.convert_back_to_text(
                                sampled_ids, vocab)
                            print('Sample:', sentence)

                            true_ids = captions.cpu().data.numpy()[0]
                            sentence = utils.convert_back_to_text(
                                true_ids, vocab)
                            print('Target:', sentence)

                    losses_val.append(np.mean(batch_loss_val))
                    # predict

                    print('Epoch: {} - Step: {} - Eval Loss: {}'.format(
                        epoch, step, losses_val[-1]))
                    encoder.batchnorm.train()

                # Save the models
                if (step + 1) % save_step == 0:
                    utils.save_models(encoder, decoder, optimizer, step, epoch,
                                      losses_train, losses_val, checkpoint_dir)
                    utils.dump_losses(
                        losses_train, losses_val,
                        os.path.join(checkpoint_dir, 'losses.pkl'))

    except KeyboardInterrupt:
        pass
    finally:
        # Do final save
        utils.save_models(encoder, decoder, optimizer, step, epoch,
                          losses_train, losses_val, checkpoint_dir)
        utils.dump_losses(losses_train, losses_val,
                          os.path.join(checkpoint_dir, 'losses.pkl'))
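
The pack_padded_sequence(...)[0] idiom used above flattens a padded caption batch into a single tensor of valid tokens, so CrossEntropyLoss never scores padding. A self-contained illustration with made-up values:

import torch
from torch.nn.utils.rnn import pack_padded_sequence

captions = torch.tensor([[1, 2, 3, 4],
                         [5, 6, 0, 0]])  # batch of 2, padded to length 4
lengths = [4, 2]
targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
print(targets)  # tensor([1, 5, 2, 6, 3, 4]) -- time-major, padding dropped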
Example #4
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

    return decoded_words  #, decoder_attentions[:di + 1]


def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')


input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
noise = torch.Tensor(list(range(output_lang.n_words)))
print(random.choice(pairs))

hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size)
decoder1 = DecoderRNN(hidden_size, output_lang.n_words, 1)

if use_cuda:
    encoder1 = encoder1.cuda()
    decoder1 = decoder1.cuda()

trainIters(encoder1, decoder1, 25000, print_every=50)

evaluateRandomly(encoder1, decoder1, 20)
Example #5
# Build Dataset Loader
train_loader = get_loader(train_image_path,
                          train_json_path,
                          vocab,
                          transform,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=2)
total_step = len(train_loader)

# Build Models
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
encoder.cuda()
decoder.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

# Train the Decoder
for epoch in range(num_epochs):
    for i, (images, captions, lengths) in enumerate(train_loader):
        # Set mini-batch dataset
        images = Variable(images).cuda()
        captions = Variable(captions).cuda()
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        # Forward, Backward and Optimize
        decoder.zero_grad()
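
The snippet is truncated after decoder.zero_grad(). In this common tutorial pattern the step usually continues as sketched below (an assumption about the missing lines, taking the decoder to return scores aligned with the packed targets):

        # Sketch of the usual remainder of the step (not in the original)
        features = encoder(images)
        outputs = decoder(features, captions, lengths)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()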