Example #1
def main():
    # load vocabulary
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # build model
    encoder = EncoderCNN(300)
    decoder = FactoredLSTM(300, 512, 512, len(vocab))

    encoder.load_state_dict(torch.load('pretrained_models/encoder-15.pkl'))
    decoder.load_state_dict(torch.load('pretrained_models/decoder-15.pkl'))

    # prepare images
    transform = transforms.Compose([
        Rescale((224, 224)),
        transforms.ToTensor()
        ])
    img_names, img_list = load_sample_images('sample_images/', transform)
    image = to_var(img_list[30], volatile=True)

    # if torch.cuda.is_available():
    #     encoder = encoder.cuda()
    #     decoder = decoder.cuda()

    # forward
    features = encoder(image)
    output = decoder.sample(features, mode="factual")

    caption = [vocab.i2w[x] for x in output]
    print(img_names[30])
    print(caption)
Example #2
def train_attention_captioner():
    print("Training The Attention Capitoner ... ")
    # Create model directory
    if not os.path.exists(path_trained_model):
        os.makedirs(path_trained_model)

    # Image preprocessing: resize the input image, then normalize for the pretrained ResNet
    transform = transforms.Compose([
        transforms.Resize((input_resnet_size, input_resnet_size),
                          interpolation=Image.ANTIALIAS),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Loading pickle dictionary
    with open(dict_path, 'rb') as file:
        dictionary = pickle.load(file)

    # Build data loader
    data_loader = get_loader(imgs_path,
                             data_caps,
                             dictionary,
                             transform,
                             BATCH_SIZE,
                             shuffle=True,
                             num_workers=2)

    # Building the Models
    encoder = EncoderCNN(word_embedding_size).to(device)
    attn_decoder = AttnDecoderRNN(word_embedding_size, len(dictionary[0])).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(attn_decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=LEARN_RATE)

    word2idx = dictionary[0]
    # Initializing the decoder input and hidden state
    decoder_input = torch.tensor([[word2idx['START']]]).to(device)
    decoder_hidden = torch.zeros(word_embedding_size).to(device)

    total_steps = len(data_loader)
    for epoch in range(NUM_EPOCHS):
        for i, (images, captions, lengths) in enumerate(data_loader):

            print(images.size(), captions.size(), len(lengths))

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            features = encoder(images)
            decoder_output, decoder_hidden, attn_weights = attn_decoder(
                decoder_input, decoder_hidden, features)
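Example #2 is cut off right after the first decoder call. The sketch below shows how the loss computation and optimizer step that normally complete such an iteration might look; it is not part of the original snippet and assumes `decoder_output` is already aligned with the packed `targets`.

            # Hedged sketch, not from the original snippet: compute the loss
            # and take an optimizer step, assuming decoder_output lines up
            # with the packed targets built above.
            loss = criterion(decoder_output, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()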
Example #3
	def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
		super(PolicyNet, self).__init__()
		self.embed_size = embed_size
		self.vocab_size = vocab_size
		self.hidden_size = hidden_size
		self.vocab = vocab
		self.CNNp = EncoderCNN(embed_size)
		self.RNNp = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
Example #4
def do(args: argparse.Namespace):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu:', args.gpu)
    if not os.path.exists(args.save_model_path):
        os.mkdir(args.save_model_path)
    # preprocess
    preprocess = transforms.Compose([
        transforms.RandomCrop(args.random_crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # dataset
    coco_loader = get_dataloader(root=args.dataset_path, json_path=args.json_path, vocab=vocab, batch_size=args.batch_size, num_workers=args.num_workers,
                                 transform=preprocess, shuffle=False)
    # models
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size, args.num_layers).cuda()
    loss_cls = nn.CrossEntropyLoss().cuda()
    params = list(encoder.fc.parameters()) + list(encoder.bn1d.parameters()) + list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    # resume
    if args.resume:
        model_states = torch.load(os.path.join(args.save_model_path, 'model.ckpt'))
        print('checkpoint epoch: %d\tstep: %d' % (model_states['epoch'], model_states['step']))
        encoder.load_state_dict(model_states['encoder'])
        decoder.load_state_dict(model_states['decoder'])
        print('load successfully')
    # train
    total_step = len(coco_loader)
    print('total step in each epoch : ', total_step)
    encoder.fc.train(mode=True)
    encoder.bn1d.train(mode=True)
    encoder.encoder.eval()
    decoder.train(mode=True)
    input('ready')
    for cur_epoch in range(args.num_epochs):
        for cur_step, (image, caption, length) in enumerate(coco_loader):
            image = image.cuda()
            caption = caption.cuda()
            target = pack_padded_sequence(caption, length, batch_first=True)[0]
            out = decoder(encoder(image), caption, length)
            loss = loss_cls(out, target)
            encoder.zero_grad()
            decoder.zero_grad()
            loss.backward()
            optimizer.step()
            if (cur_step + 1) % args.print_step == 0:
                print('Epoch : %d/%d\tStep : %d/%d\tLoss : %.8f\tPerplexity : %.8f' % (
                    cur_epoch + 1, args.num_epochs, cur_step + 1, total_step, loss.item(), np.exp(loss.item())))
            if (cur_step + 1) % args.save_model_step == 0:
                torch.save({'epoch': cur_epoch + 1, 'step': cur_step + 1, 'encoder': encoder.state_dict(), 'decoder': decoder.state_dict()},
                           os.path.join(args.save_model_path, 'model.ckpt'))
                print('model saved at E:%d\tS:%d' % (cur_epoch + 1, cur_step + 1))
Example #5
def main():
    with open("data/vocab.pkl", 'rb') as f:
        vocab = pickle.load(f)

    img_path = "data/flickr7k_images"
    cap_path = "data/factual_train.txt"
    styled_path = "data/humor/funny_train.txt"
    data_loader = get_data_loader(img_path, cap_path, vocab, 3)
    styled_data_loader = get_styled_data_loader(styled_path, vocab, 3)

    encoder = EncoderCNN(30)
    decoder = FactoredLSTM(30, 40, 40, len(vocab))

    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # for i, (images, captions, lengths) in enumerate(data_loader):
    for i, (captions, lengths) in enumerate(styled_data_loader):
        # images = Variable(images, volatile=True)
        captions = Variable(captions.long())

        if torch.cuda.is_available():
            # images = images.cuda()
            captions = captions.cuda()

        # features = encoder(images)

        outputs = decoder(captions, features=None, mode="humorous")
        print(lengths - 1)
        print(outputs)
        print(captions[:, 1:])

        loss = masked_cross_entropy(outputs, captions[:, 1:].contiguous(),
                                    lengths - 1)

        print(loss)

        break
Example #6
	def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
		super(ValueNet, self).__init__()
		self.embed_size = embed_size
		self.vocab_size = vocab_size
		self.hidden_size = hidden_size
		self.vocab = vocab
		self.CNNv = EncoderCNN(embed_size)
		self.RNNv = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
		self.fc1 = nn.utils.weight_norm(nn.Linear(embed_size*2 , embed_size))
		self.fc2 = nn.utils.weight_norm(nn.Linear(embed_size, embed_size))
		self.fc3 = nn.utils.weight_norm(nn.Linear(embed_size, 1))
		self.relu = nn.LeakyReLU(0.2, inplace = True)
		self.norm1 = nn.LayerNorm(embed_size)
		self.norm2 = nn.LayerNorm(embed_size)
Example #7
def main(args):

    print("Process %s, running on %s: starting (%s)" % (
        os.getpid(), os.name, time.asctime()))

    encoder = EncoderCNN()
    decoder = DecoderRNN()
    if torch.cuda.is_available() and args.gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    encoder_trainables = [p for p in encoder.parameters() if p.requires_grad]
    decoder_trainables = [p for p in decoder.parameters() if p.requires_grad]

    params = encoder_trainables + decoder_trainables

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    data_loader = trainloader(transform=transform)
    optimizer = torch.optim.SGD(params=params, lr=args.lr, momentum=0.9)
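Example #7 stops once the SGD optimizer is built. A hypothetical continuation of the training loop in the style of the other examples on this page; the (images, captions, lengths) batch format, `args.num_epochs`, the decoder's forward signature, and the availability of `nn` and `pack_padded_sequence` are all assumptions here.

    # Hedged sketch, not from the original: a standard encoder-decoder
    # training loop; batch format and decoder signature are assumed.
    criterion = nn.CrossEntropyLoss()
    for epoch in range(args.num_epochs):
        for images, captions, lengths in data_loader:
            if torch.cuda.is_available() and args.gpu:
                images, captions = images.cuda(), captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            outputs = decoder(encoder(images), captions, lengths)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()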
Example #8
def do(args: argparse.Namespace):
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu :', args.gpu)
    # preprocess
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    # vocab
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # model
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size,
                         args.num_layers).cuda()
    model_state = torch.load(args.checkpoint_path)
    encoder.load_state_dict(model_state['encoder'])
    decoder.load_state_dict(model_state['decoder'])
    print('load successfully at\tepoch:%d\tstep:%d' %
          (model_state['epoch'], model_state['step']))
    encoder.eval()
    decoder.eval()
    # image
    img = load_image(args.img_path, preprocess).cuda()
    outs = decoder.sample(encoder(img))
    outs = outs.cpu().numpy()
    print(outs)
    # caption
    caption = []
    for word_id in outs:
        word = vocab.idx2word[word_id]
        caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(caption)
    print(sentence)
Example #9
    return image


# Image preprocessing
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
])

# Load vocabulary wrapper
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

# Build models
encoder = EncoderCNN(
    embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
encoder = encoder.to(device)
decoder = decoder.to(device)

# Load the trained model parameters
encoder.load_state_dict(torch.load(encoder_path))
decoder.load_state_dict(torch.load(decoder_path))

# Prepare an image
image = load_image(image_path, transform)
image_tensor = image.to(device)

# Generate a caption from the image
feature = encoder(image_tensor)
sampled_ids = decoder.sample(feature)
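The snippet ends after sampling word ids. Turning them back into words follows the same decode loop used in Examples #8 and #16; a minimal sketch, assuming the usual `vocab.idx2word` mapping and an '<end>' token:

# Hedged sketch, not from the original: decode the sampled ids into words.
sampled_ids = sampled_ids[0].cpu().numpy()
sampled_caption = []
for word_id in sampled_ids:
    word = vocab.idx2word[word_id]
    sampled_caption.append(word)
    if word == '<end>':
        break
print(' '.join(sampled_caption))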
Example #10
else:

    data_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # using ImageNet norms
            (0.229, 0.224, 0.225))
    ])

    test_lines = read_lines(TOKEN_FILE_TEST)
    test_image_ids, test_cleaned_captions = parse_lines(test_lines)

    # load models
    encoder = EncoderCNN().to(device)
    decoder = torch.load("decoder.ckpt").to(device)
    encoder.eval()
    decoder.eval()  # generate caption, eval mode to not influence batchnorm

    #########################################################################
    #
    #        QUESTION 2.1 Generating predictions on test data
    #
    #########################################################################

    # TODO define decode_caption() function in utils.py
    image_id_candidate_reference = {}  # type: dict[str, dict[str, list[str]]]
    import os
    if os.path.exists("image_id_candidate_reference.pt"):
        image_id_candidate_reference = torch.load(
Example #11
def script(args):
    transform = transforms.Compose([
        transforms.Resize(args.img_size),
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])

    train_loader, vocab = get_loader(args.root_dir, args.train_tsv_path,
                                     args.image_path, transform,
                                     args.batch_size, args.shuffle,
                                     args.num_workers)

    vocab_size = len(vocab)
    print("vocab_size: ", vocab_size)

    val_loader, _ = get_loader(args.root_dir, args.val_tsv_path,
                               args.image_path, transform, args.batch_size,
                               args.shuffle, args.num_workers, vocab)

    encoderCNN = EncoderCNN().to(args.device)

    sentLSTM = SentenceLSTM(encoderCNN.enc_dim, args.sent_hidden_dim,
                            args.att_dim, args.sent_input_dim,
                            args.word_input_dim,
                            args.int_stop_dim).to(args.device)

    wordLSTM = WordLSTM(args.word_input_dim, args.word_hidden_dim, vocab_size,
                        args.num_layers).to(args.device)

    criterion_stop = nn.CrossEntropyLoss().to(args.device)
    criterion_words = nn.CrossEntropyLoss().to(args.device)

    params_cnn = list(encoderCNN.parameters())
    params_lstm = list(sentLSTM.parameters()) + list(wordLSTM.parameters())

    optim_cnn = torch.optim.Adam(params=params_cnn, lr=args.learning_rate_cnn)
    optim_lstm = torch.optim.Adam(params=params_lstm,
                                  lr=args.learning_rate_lstm)

    total_step = len(train_loader)

    evaluate(args, val_loader, encoderCNN, sentLSTM, wordLSTM, vocab)

    for epoch in range(args.num_epochs):
        encoderCNN.train()
        sentLSTM.train()
        wordLSTM.train()

        for i, (images, captions, prob) in enumerate(train_loader):
            optim_cnn.zero_grad()
            optim_lstm.zero_grad()

            batch_size = images.shape[0]
            images = images.to(args.device)
            captions = captions.to(args.device)
            prob = prob.to(args.device)

            vis_enc_output = encoderCNN(images)

            topics, ps = sentLSTM(vis_enc_output, captions, args.device)

            loss_sent = criterion_stop(ps.view(-1, 2), prob.view(-1))

            loss_word = torch.tensor([0.0]).to(args.device)

            for j in range(captions.shape[1]):
                word_outputs = wordLSTM(topics[:, j, :], captions[:, j, :])

                loss_word += criterion_words(
                    word_outputs.contiguous().view(-1, vocab_size),
                    captions[:, j, :].contiguous().view(-1))

            loss = args.lambda_sent * loss_sent + args.lambda_word * loss_word

            loss.backward()
            optim_cnn.step()
            optim_lstm.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.num_epochs, i, total_step, loss.item()))

            ## Save the model checkpoints
            # if (i+1) % args.save_step == 0:
            #     torch.save(decoder.state_dict(), os.path.join(
            #         args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
            #     torch.save(encoder.state_dict(), os.path.join(
            #         args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))

        evaluate(args, val_loader, encoderCNN, sentLSTM, wordLSTM, vocab)
Example #12
# Load Vocabulary Wrapper
with open('./data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

# Build Dataset Loader
train_loader = get_loader(train_image_path,
                          train_json_path,
                          vocab,
                          transform,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=2)
total_step = len(train_loader)

# Build Models
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
encoder.cuda()
decoder.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)

# Train the Decoder
for epoch in range(num_epochs):
    for i, (images, captions, lengths) in enumerate(train_loader):
        # Set mini-batch dataset
        images = Variable(images).cuda()
        captions = Variable(captions).cuda()
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
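Example #12 is cut off after packing the targets. The rest of the iteration typically mirrors Example #15; a hedged sketch of the missing forward, backward and optimize step:

        # Hedged sketch, not from the original snippet: forward, backward
        # and optimize, mirroring Example #15.
        features = encoder(images)
        outputs = decoder(features, captions, lengths)
        loss = criterion(outputs, targets)
        decoder.zero_grad()
        encoder.zero_grad()
        loss.backward()
        optimizer.step()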
Example #13
def main():

    cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    args = get_parser().parse_args()

    NUM_WORKERS = 4
    CROP_SIZE = 256
    NUM_PIXELS = 64
    ENCODER_SIZE = 2048
    learning_rate = args.lr
    start_epoch = 0

    max_BLEU = 0

    vocab = pickle.load(open('vocab.p', 'rb'))

    train_transform = transforms.Compose([
        transforms.RandomCrop(CROP_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.444, 0.421, 0.385), (0.285, 0.277, 0.286))
    ])

    val_transform = transforms.Compose([
        transforms.CenterCrop(CROP_SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.444, 0.421, 0.385), (0.285, 0.277, 0.286))
    ])

    train_loader = torch.utils.data.DataLoader(dataset=Custom_Flickr30k(
        '../flickr30k-images',
        '../flickr30k-captions/results_20130124.token',
        vocab,
        transform=train_transform,
        train=True),
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=NUM_WORKERS,
                                               collate_fn=collate_fn)

    val_loader = torch.utils.data.DataLoader(dataset=Custom_Flickr30k(
        '../flickr30k-images',
        '../flickr30k-captions/results_20130124.token',
        vocab,
        transform=val_transform,
        train=False),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=NUM_WORKERS,
                                             collate_fn=collate_fn)

    # Initialize models
    encoder = EncoderCNN(args.fine_tune).to(device)
    decoder = DecoderRNNwithAttention(len(vocab),
                                      args.embed_size,
                                      args.hid_size,
                                      1,
                                      args.attn_size,
                                      ENCODER_SIZE,
                                      NUM_PIXELS,
                                      dropout=args.drop).to(device)

    # Initialize optimization
    criterion = torch.nn.CrossEntropyLoss()
    if args.fine_tune:
        params = list(encoder.parameters()) + list(decoder.parameters())
    else:
        params = list(decoder.parameters()) + list(
            encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            max_BLEU = checkpoint['max_BLEU']
            encoder.load_state_dict(checkpoint['encoder'])
            decoder.load_state_dict(checkpoint['decoder'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    XEntropy = AverageMeter()
    PPL = AverageMeter()

    # Save
    if not args.resume:
        file = open(f'{args.save}/results.txt', 'a')
        file.write('Loss,PPL,BLEU \n')
        file.close()

    for epoch in range(start_epoch, args.epoch):
        print('Epoch {}'.format(epoch + 1))
        print('training...')
        for i, (images, captions, lengths) in enumerate(train_loader):
            # Batch to device
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            encoder.train()
            decoder.train()

            features = encoder(images)
            predictions, attention_weights = decoder(features, captions,
                                                     lengths)

            scores = pack_padded_sequence(predictions[:, :-1, :],
                                          torch.tensor(lengths) - 2,
                                          batch_first=True).cpu()
            targets = pack_padded_sequence(captions[:, 1:-1],
                                           torch.tensor(lengths) - 2,
                                           batch_first=True).cpu()

            loss = criterion(scores.data, targets.data)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            XEntropy.update(loss.item(), len(lengths))
            PPL.update(np.exp(loss.item()), len(lengths))
        print('Train Perplexity = {}'.format(PPL.avg))

        if epoch % 50 == 0:
            learning_rate /= 5
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        print('validating...')
        curr_BLEU = bleu_eval(encoder, decoder, val_loader, args.batch_size,
                              device)
        is_best = curr_BLEU > max_BLEU
        max_BLEU = max(curr_BLEU, max_BLEU)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'encoder': encoder.state_dict(),
                'decoder': decoder.state_dict(),
                'max_BLEU': max_BLEU,
                'optimizer': optimizer.state_dict(),
            }, is_best, args.save)

        print('Validation BLEU = {}'.format(curr_BLEU))

        # Save
        file = open(f'{args.save}/results.txt', 'a')
        file.write('{},{},{} \n'.format(XEntropy.avg, PPL.avg, curr_BLEU))
        file.close()
Example #14
def main(args):

    #defining torch configurations
    #torch.manual_seed(args.seed)
    #torch.cuda.manual_seed(args.seed)
    #torch.backends.cudnn.benchmark = True

    #extract weights from the weight matrices
    weights = np.load(args.file_name)

    # CUDA for PyTorch
    #if cuda:
    device = 3
    torch.cuda.set_device(device)

    #device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
    #use_cuda = torch.cuda.is_available()
    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #defining dictionary and VQAFeatureDataset
    #transforms for pretrained network(transform for resnet now)
    train_transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    validate_transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    train_dataset = VQADataset(image_root_dir=args.img_root_dir,
                               dictionary=dictionary,
                               dataroot=args.data_root_dir,
                               choice='train',
                               transform_set=train_transform)
    # eval_dataset = VQADataset(image_root_dir=args.img_root_dir,dictionary=dictionary,dataroot=args.data_root_dir,choice='val',transform_set=validate_transform)

    #model definition
    print('Loading the models')
    image_encoder = EncoderCNN(embed_size=args.img_feats).to(device)
    question_encoder = EncoderLSTM(hidden_size=args.num_hid,
                                   weights_matrix=weights,
                                   fc_size=args.q_embed,
                                   max_seq_length=args.max_sequence_length,
                                   batch_size=args.batch_size).to(device)
    fusion_network = FusionModule(qnetwork=question_encoder,
                                  img_network=image_encoder,
                                  fuse_embed_size=args.fuse_embed,
                                  input_fc_size=args.img_feats,
                                  class_size=args.num_class).to(device)
    #print(list(fusion_network.parameters()))
    print(fusion_network)
    #input()

    #Dataloader initialization
    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              shuffle=True,
                              num_workers=12)
    # eval_loader =  DataLoader(eval_dataset, args.batch_size, shuffle=True, num_workers=1)

    # Loss and optimizer
    criterion = nn.NLLLoss()
    #params=lis
    #params = list(image_encoder.linear.parameters())+list(image_encoder.bn.parameters())+list(question_encoder.parameters()) + list(fusion_network.parameters())
    optimizer = torch.optim.Adam(fusion_network.parameters(),
                                 lr=args.learning_rate)

    # Train the models
    total_step = len(train_loader)
    step = 0

    #Training starts
    #print('Training Starting ......................')

    def evaluate_val(model, train_loader, criterion, device):
        loss = 0
        accuracy = 0
        with torch.no_grad():
            for image_sample, question_token, labels in iter(train_loader):
                image_sample, question_token, labels = image_sample.to(
                    device), question_token.to(device), labels.to(device)
                output = model.forward(question_token, image_sample)
                loss += criterion(output, labels).item()
                ps = torch.exp(output)
                equality = (labels.data == ps.max(dim=1)[1])
                accuracy += equality.type(torch.FloatTensor).mean()
        return loss, accuracy

    file_train = open('train_loss_log.txt', 'a+')
    loss_save = []

    for epoch in range(args.epochs):

        running_loss = 0.0
        running_corrects = 0
        step = 0
        for data in tqdm(train_loader):
            image_samp, question_toks, labels = data
            image_samp = image_samp.to(device)
            question_toks = question_toks.to(device)
            labels = labels.to(device)

            class_outputs = fusion_network(question_toks, image_samp)
            _, preds = torch.max(class_outputs, 1)
            loss = criterion(class_outputs, labels)
            #question_encoder.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            #print('Enter some key')
            #input()
            # statistics
            running_loss += loss.item() * image_samp.size(0)
            running_corrects += torch.sum(preds == labels.data)
            if (step % 300 == 0):
                #optimizer.zero_grad()
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, args.epochs, step, total_step, loss.item()))
            step = step + 1
        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects.double() / len(train_dataset)
        print(epoch_loss)
        #loss_save.append(val_loss)

        val_loss, accuracy = evaluate_val(fusion_network, train_loader,
                                          criterion, device)
        string = 'Epoch {}:{} loss: {} \t'.format(epoch, args.epochs,
                                                  running_loss)
        string += 'Accuracy : {}'.format(accuracy)
        file_train.write(string)
        print('{} Loss: {:.4f} Acc: {:.4f}'.format('train', epoch_loss,
                                                   epoch_acc))
    file_train.close()
Example #15
def train_captioner():
    print("Training The Capitoner ... ")
    # Create model directory
    if not os.path.exists(path_trained_model):
        os.makedirs(path_trained_model)

    # Image preprocessing: resize the input image, then normalize for the pretrained ResNet
    transform = transforms.Compose([
        transforms.Resize((input_resnet_size, input_resnet_size),
                          interpolation=Image.ANTIALIAS),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load the dictionary (pickled binary)
    with open(dict_path, 'rb') as file:
        dictionary = pickle.load(file)

    # Build data loader
    data_loader = get_loader(imgs_path,
                             data_caps,
                             dictionary,
                             transform,
                             BATCH_SIZE,
                             shuffle=True,
                             num_workers=2)

    # Build the models
    encoder = EncoderCNN(word_embedding_size).to(device)
    decoder = DecoderRNN(word_embedding_size, lstm_output_size,
                         len(dictionary[0]), num_layers).to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=LEARN_RATE)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(NUM_EPOCHS):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % 20 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch, NUM_EPOCHS, i, total_step, loss.item()))

        # Save model after each epoch ...
        torch.save(
            decoder.state_dict(),
            os.path.join(path_trained_model,
                         'captioner{}.ckpt'.format(epoch + 1)))
        torch.save(
            encoder.state_dict(),
            os.path.join(path_trained_model,
                         'feature-extractor-{}.ckpt'.format(epoch + 1)))
Example #16
def test_captioner(show_images=False):
    # Load vocabulary wrapper
    with open(dict_path, 'rb') as file:
        dictionary = pickle.load(file)

    # Build models
    encoder = EncoderCNN(word_embedding_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(word_embedding_size, lstm_output_size,
                         len(dictionary[0]), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder_model_path = os.path.join(
        path_trained_model,
        feature_gen_path + str(NUM_EPOCHS) + model_extension)
    decoder_model_path = os.path.join(
        path_trained_model,
        caption_gen_path + str(NUM_EPOCHS) + model_extension)
    encoder.load_state_dict(torch.load(encoder_model_path))
    print("Feature Extractor Model Loaded Successfully")
    decoder.load_state_dict(torch.load(decoder_model_path))
    print("Caption Generator Loaded Successfully")

    # Open Caption Saver File
    output_file = open(captions_save_path, 'w')

    for iter, data_img in enumerate(data_imgs):
        img_path = os.path.join(imgs_path, data_img['file_name'])
        ### Change ###
        inp_img = Image.open(img_path)
        # The model expects a batch dimension, so unsqueeze before encoding ...
        image_tensor = transform(inp_img).unsqueeze(0).to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        idx2word = dictionary[1]
        # Convert word_ids to words
        sampled_caption = []
        for idx in sampled_ids:
            word = idx2word[idx]
            sampled_caption.append(word)
            if word == 'END':
                break
        sentence = ' '.join(sampled_caption)

        # Writing the Caption to File
        output_file.write(sentence + "\n")

        # Print out the image and the generated caption
        print("Caption: ", sentence)

        if show_images:
            image = cv.imread(img_path, cv.IMREAD_COLOR)
            window_name = "Sample Image with Caption as Overlay"
            cv.imshow(window_name, image)
            cv.displayOverlay(window_name, sentence)
            cv.waitKey(0)

        if iter == 10:
            return
Example #17
vocab_path = 'vocab.pkl'
# Load vocabulary wrapper
with open(vocab_path, 'rb') as f:
    vocab = pickle.load(f)

data_loader = get_loader(image_dir,
                         caption_path,
                         vocab,
                         transform,
                         batch_size,
                         shuffle=True,
                         num_workers=num_workers)

#Build models
encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab),
                     num_layers).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(
    encoder.bn.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)

# Train the models
total_step = len(data_loader)
for epoch in range(num_epochs):
    for i, (images, captions, lengths) in enumerate(data_loader):

        # Set mini-batch dataset
Example #18
dataset_train = Flickr8k_Images(
    image_ids=image_ids,
    transform=data_transform,
)

train_loader = torch.utils.data.DataLoader(
    dataset_train,
    batch_size=64,
    shuffle=False,
    num_workers=2,
)

# device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EncoderCNN().to(device)

#########################################################################
#
#        QUESTION 1.2 Extracting image features
#
#########################################################################
features = []

# TODO loop through all image data, extracting features and saving them
# no gradients needed
with torch.no_grad():
    model.eval()
    for data in tqdm(train_loader):
        data = data.to(device)
        features.append(model(data))
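The TODO above also asks for the extracted features to be saved. A minimal sketch of that last step; the output filename is an arbitrary placeholder, not from the original:

# Hedged sketch, not from the original: stack the per-batch features and
# save them to disk ('features.pt' is an arbitrary placeholder name).
features = torch.cat(features, dim=0).cpu()
torch.save(features, 'features.pt')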
Example #19
    embed_size = decoder_input_params['embed_size']
    hidden_size = decoder_input_params['hidden_size']
    vocab_size = decoder_input_params['vocab_size']
    num_layers = decoder_input_params['num_layers']

    ## Load Vocab
    Obj = s3.get_object(Bucket=S3_BUCKET, Key=VOCAB_PATH)
    bytestream = io.BytesIO(Obj['Body'].read())
    decoder_vocab = pickle.load(bytestream)
    print('decoder_vocab loaded')

    # Load Encoder
    Obj2 = s3.get_object(Bucket=S3_BUCKET, Key=ENC_PATH)
    bytestream = io.BytesIO(Obj2['Body'].read())
    encoder_model = EncoderCNN(embed_size)
    encoder_model.load_state_dict(torch.load(bytestream, map_location=DEVICE))
    print('Encoder loaded')

    # Load Decoder
    Obj3 = s3.get_object(Bucket=S3_BUCKET, Key=DEC_PATH)
    bytestream = io.BytesIO(Obj3['Body'].read())
    decoder_model = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
    decoder_model.load_state_dict(torch.load(bytestream, map_location=DEVICE))
    print('Decoder loaded')

    # decoder = DecoderRNN( embed_size , hidden_size , vocab_size , num_layers )
    # decoder.load_state_dict( torch.load(   os.path.join( model_save_path , 'decoderdata.pkl' )   ) )
    encoder_model.eval()
    decoder_model.eval()
Example #20
def train(n_epochs, train_loader, valid_loader, save_location_path, embed_size,
          hidden_size, vocab_size):

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move to GPU, if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    params = list(decoder.parameters()) + list(encoder.embed.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # Start with an infinitely large minimum validation loss so the first
    # epoch's loss always triggers a checkpoint save
    valid_loss_min = np.Inf

    for epoch in range(1, n_epochs + 1):

        # Keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        encoder.train()
        decoder.train()
        for data in train_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            images = images.to(device)
            captions = captions.to(device)

            decoder.zero_grad()
            encoder.zero_grad()

            features = encoder(images)
            outputs = decoder(features, captions)

            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.size(0)

        encoder.eval()
        decoder.eval()
        for data in valid_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            images = images.to(device)
            captions = captions.to(device)

            features = encoder(images)
            outputs = decoder(features, captions)

            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))

            valid_loss += loss.item() * images.size(0)

        # Average losses over the epoch (outside the validation loop)
        train_loss = train_loss / len(train_loader)
        valid_loss = valid_loss / len(valid_loader)

        print(
            f"Epoch: {epoch} \tTraining Loss: {train_loss} \tValidation Loss: {valid_loss}"
        )

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(
                f"Validation loss decreased ({valid_loss_min} --> {valid_loss}).  Saving model ..."
            )
            torch.save(encoder.state_dict(),
                       save_location_path + f'/encoder{n_epochs}.pt')
            torch.save(decoder.state_dict(),
                       save_location_path + f'/decoder{n_epochs}.pt')
            valid_loss_min = valid_loss
Example #21
def main(args):
    model_path = args.model_path
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # load vocabulary
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    img_path = args.img_path
    factual_cap_path = args.factual_caption_path
    humorous_cap_path = args.humorous_caption_path

    # import data_loader
    data_loader = get_data_loader(img_path,
                                  factual_cap_path,
                                  vocab,
                                  args.caption_batch_size,
                                  shuffle=True)
    styled_data_loader = get_styled_data_loader(humorous_cap_path,
                                                vocab,
                                                args.language_batch_size,
                                                shuffle=True)

    # import models
    emb_dim = args.emb_dim
    hidden_dim = args.hidden_dim
    factored_dim = args.factored_dim
    vocab_size = len(vocab)
    encoder = EncoderCNN(emb_dim)
    decoder = FactoredLSTM(emb_dim, hidden_dim, factored_dim, vocab_size)

    if torch.cuda.is_available():
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    # loss and optimizer
    criterion = masked_cross_entropy
    cap_params = list(decoder.parameters()) + list(encoder.A.parameters())
    lang_params = list(decoder.parameters())
    optimizer_cap = torch.optim.Adam(cap_params, lr=args.lr_caption)
    optimizer_lang = torch.optim.Adam(lang_params, lr=args.lr_language)

    # train
    total_cap_step = len(data_loader)
    total_lang_step = len(styled_data_loader)
    epoch_num = args.epoch_num
    for epoch in range(epoch_num):
        # caption
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = to_var(images, volatile=True)
            captions = to_var(captions.long())

            # forward, backward and optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(captions, features, mode="factual")
            loss = criterion(outputs[:, 1:, :].contiguous(),
                             captions[:, 1:].contiguous(), lengths - 1)
            loss.backward()
            optimizer_cap.step()

            # print log
            if i % args.log_step_caption == 0:
                print("Epoch [%d/%d], CAP, Step [%d/%d], Loss: %.4f" %
                      (epoch + 1, epoch_num, i, total_cap_step,
                       loss.data.mean()))

        eval_outputs(outputs, vocab)

        # language
        for i, (captions, lengths) in enumerate(styled_data_loader):
            captions = to_var(captions.long())

            # forward, backward and optimize
            decoder.zero_grad()
            outputs = decoder(captions, mode='humorous')
            loss = criterion(outputs, captions[:, 1:].contiguous(),
                             lengths - 1)
            loss.backward()
            optimizer_lang.step()

            # print log
            if i % args.log_step_language == 0:
                print("Epoch [%d/%d], LANG, Step [%d/%d], Loss: %.4f" %
                      (epoch + 1, epoch_num, i, total_lang_step,
                       loss.data.mean()))

        # save models
        torch.save(decoder.state_dict(),
                   os.path.join(model_path, 'decoder-%d.pkl' % (epoch + 1, )))

        torch.save(encoder.state_dict(),
                   os.path.join(model_path, 'encoder-%d.pkl' % (epoch + 1, )))