Code Example #1
def main():
    args = parse_arguments()
    if args.enable_cuda and torch.cuda.is_available():
        args.device = torch.device('cuda')
    else:
        args.device = torch.device('cpu')

    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])])

    coco_train = coco.CocoCaptions(TRAIN_DIR_IMGS,
                                   TRAIN_DIR_CAPTIONS,
                                   transform=preprocess)

    data_loader = torch.utils.data.DataLoader(dataset=coco_train,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=2)

    model = EncoderCNN()
    model = nn.DataParallel(model.eval().to(args.device))

    vectors, captions = [], []
    for img_batch, capt_batch in tqdm(data_loader):
        capt_batch = list(zip(*capt_batch))
        # Inference only: no gradients are needed, and the embeddings are
        # moved back to the CPU so they can be converted to NumPy later.
        with torch.no_grad():
            vec_batch = model(img_batch.to(args.device))

        captions.extend(capt_batch)
        vectors.extend(vec.cpu() for vec in vec_batch)

    # Lowercase every caption; note this does not split them into tokens.
    captions_tokenized = [[caption.lower() for caption in caption_list]
                          for caption_list in captions]

    # Recreate DATA_DIR from scratch so stale outputs are removed.
    if os.path.exists(DATA_DIR):
        shutil.rmtree(DATA_DIR)
    os.mkdir(DATA_DIR)

    np.save(os.path.join(DATA_DIR, "image_codes.npy"), np.asarray(vectors))
    with open(os.path.join(DATA_DIR, "captions_tokenized.json"), "w") as file_capt:
        json.dump(captions_tokenized, file_capt)
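
The example above assumes a parse_arguments() helper that is not shown. A minimal sketch of what it might look like, covering only the flags this snippet actually reads (--enable_cuda and --batch_size; any other options are assumptions):

import argparse

def parse_arguments():
    # Only the options read by the snippet above are defined here.
    parser = argparse.ArgumentParser()
    parser.add_argument('--enable_cuda', action='store_true',
                        help='use CUDA when torch.cuda.is_available()')
    parser.add_argument('--batch_size', type=int, default=32)
    return parser.parse_args()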
Code Example #2
        # Point the existing loader at a fresh set of indices so the next
        # batch is drawn from them.
        new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
        train_data_loader.batch_sampler.sampler = new_sampler

        # Obtain the batch.
        images, captions = next(iter(train_data_loader))

        # Move batch of images and captions to GPU if CUDA is available.
        images = images.to(device)
        captions = captions.to(device)

        # Zero the gradients.
        decoder.zero_grad()
        encoder.zero_grad()

        # Set the encoder and decoder to training mode.
        encoder.train()
        decoder.train()

        # Pass the inputs through the CNN-RNN model.
        features = encoder(images)
        outputs = decoder(features, captions)

        # Calculate the batch loss.
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

        # Backward pass.
        loss.backward()

        # Update the parameters in the optimizer.
        optimizer.step()
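
The fragment starts mid-loop and assumes an indices list computed earlier in the same iteration. A common pattern behind this kind of sampler swap in captioning training loops is to sample indices whose captions all share one length, so the batch stacks into a tensor without padding. A minimal sketch under that assumption (caption_lengths is a hypothetical per-example length list, not part of the source):

import numpy as np

def sample_indices_by_length(caption_lengths, batch_size):
    # Pick one caption length at random, then draw batch_size indices
    # whose captions all have that length.
    sel_length = np.random.choice(caption_lengths)
    matching = np.where(np.asarray(caption_lengths) == sel_length)[0]
    return list(np.random.choice(matching, size=batch_size))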
Code Example #3
    data_file="test_no_dup_with_category_3more_name.json",
    use_mean_img=False,
    neg_samples=False)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    collate_fn=lstm_collate_fn,
)

encoder_cnn = EncoderCNN(emb_size)
encoder_cnn.load_state_dict(torch.load("./encoder_cnn.pth"))
print("Successfully load trained weights...")
encoder_cnn = encoder_cnn.to(device)
encoder_cnn.eval()

test_features = {}
for batch_num, input_data in enumerate(test_loader, 1):
    print("#{}\r".format(batch_num), end="")
    lengths, images, names, offsets, set_ids, labels, is_compat = input_data

    image_seqs = images.to(device)
    with torch.no_grad():
        emb_seqs = encoder_cnn(image_seqs)

    # Flatten the per-outfit item labels into a single list for this batch.
    batch_ids = []
    for set_id, items in zip(set_ids, labels):
        for item in items:
            batch_ids.append(item)
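
The excerpt ends before test_features is filled. A plausible continuation pairs each collected item id with its embedding row and then saves the lookup table; the flattened shape of emb_seqs and the output filename are assumptions, not part of the source:

    # Hypothetical continuation of the batch loop above: one embedding
    # row per flattened item id.
    for item_id, emb in zip(batch_ids, emb_seqs.reshape(-1, emb_seqs.size(-1))):
        test_features[item_id] = emb.cpu().numpy()

import pickle

# Persist the extracted features once all batches have been processed.
with open("test_features.pkl", "wb") as f:
    pickle.dump(test_features, f)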
Code Example #4
def main(args):

    #setup tensorboard
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        #if args.name in cc.get_experiment_names():
        try:
            cc.remove_experiment(args.name)
        except Exception:
            print("experiment didn't exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write(json.dumps(vars(args)))

    # Image preprocessing

    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose(
        [transforms.ToPILImage(),
         transforms.Scale(20),
         transforms.ToTensor()])

    # Load vocabulary wrapper.
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open((full_model_path + "/vocab.pkl"), 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir,
                                      vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    #params = list(decoder.parameters()) #+ list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()
            # Set mini-batch dataset
            image_ts = to_var(images)  # volatile would block backprop through the encoder
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            count = images.size()[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Token-level accuracy over the packed target sequence.
            total = targets.size(0)
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity", np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       accuracy, np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
            if False and i % int(train_size / 10) == 0:  # periodic eval (disabled)
                encoder.eval()
                #decoder.eval()
                correct = 0
                for ti, (timages, tcaptions,
                         tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions,
                                                    tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    #correct = (ttargets.eq(toutputs[0].long())).sum()

                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % (accuracy))
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)

    torch.save(
        decoder.state_dict(),
        os.path.join(full_model_path,
                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    torch.save(
        encoder.state_dict(),
        os.path.join(full_model_path,
                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    end_time = time.time()
    print("finished training, runtime: %d", [(end_time - start_time)])
Code Example #5
File: train.py  Project: yqyqyq123/captioning
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    transform_val = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    val_loader = get_loader(args.val_dir,
                            args.val_caption_path,
                            vocab,
                            transform_val,
                            args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers)
    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    encoder.freeze_bottom()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)
    #     decoder = BahdanauAttnDecoderRNN(args.hidden_size, args.embed_size, len(vocab)).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    accs, b1s, b2s, b3s, b4s = [], [], [], [], []
    for epoch in range(args.num_epochs):
        decoder.train()
        encoder.train()
        losses = []
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            losses.append(loss.item())
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch + 1, args.num_epochs, i, total_step,
                            loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-{}-{}.ckpt'.format(epoch + 1,
                                                             i + 1)))


#         acc, b1, b2, b3, b4 = evaluate(val_loader, encoder, decoder, vocab)
#         accs.append(acc)
#         b1s.append(b1)
#         b2s.append(b2)
#         b3s.append(b3)
#         b4s.append(b4)
        avg_loss = sum(losses) / total_step

        print('Epoch {} Average Training Loss: {:.4f}'.format(
            epoch + 1, avg_loss))

        with open('stem_freeze_freq1000.txt', 'a') as file:
            file.write("Epoch {} \n".format(epoch + 1))
            file.write('Average Loss: {} \n'.format(avg_loss))
            # These metrics come from the evaluate() call above and stay
            # disabled while that call is commented out.
            # file.write('Average Accuracy: {} \n'.format(acc))
            # file.write('Average BLEU gram1: {} \n'.format(b1))
            # file.write('Average BLEU gram2: {} \n'.format(b2))
            # file.write('Average BLEU gram3: {} \n'.format(b3))
            # file.write('Average BLEU gram4: {} \n'.format(b4))
            file.write('\n')

    # Plot the accuracy/BLEU curves only when evaluate() populated them.
    if accs:
        plt.title("Accuracy vs BLEU score")
        plt.plot(np.arange(1, args.num_epochs + 1), accs, label='accuracy')
        plt.plot(np.arange(1, args.num_epochs + 1), b1s, label='BLEU 1')
        plt.plot(np.arange(1, args.num_epochs + 1), b2s, label='BLEU 2')
        plt.plot(np.arange(1, args.num_epochs + 1), b3s, label='BLEU 3')
        plt.plot(np.arange(1, args.num_epochs + 1), b4s, label='BLEU 4')
        plt.xlabel("epochs")
        plt.xticks(np.arange(1, args.num_epochs + 1))
        plt.legend(loc='upper left')
        plt.savefig('accuracy_BLEU.png')
        plt.clf()
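
encoder.freeze_bottom() is not shown in this excerpt. Since the optimizer above only trains encoder.linear and encoder.bn, a minimal sketch of an encoder exposing such a method, assuming a pretrained ResNet backbone (the backbone choice and attribute names are assumptions, not confirmed by the source):

import torch.nn as nn
import torchvision.models as models

class EncoderCNN(nn.Module):
    # Sketch of an encoder matching the calls in the example: a ResNet
    # backbone with a trainable linear + batch-norm head.
    def __init__(self, embed_size):
        super().__init__()
        resnet = models.resnet152(pretrained=True)
        self.resnet = nn.Sequential(*list(resnet.children())[:-1])
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        features = self.resnet(images)
        features = features.reshape(features.size(0), -1)
        return self.bn(self.linear(features))

    def freeze_bottom(self):
        # Freeze the convolutional backbone so only the linear/BN head
        # receives gradients, matching the optimizer's parameter list.
        for param in self.resnet.parameters():
            param.requires_grad = False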