Example #1
def do(args: argparse.Namespace):
    # restrict visible GPUs before any CUDA work happens
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu :', args.gpu)
    # preprocess
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406),
                             std=(0.229, 0.224, 0.225))
    ])
    # vocab
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # model
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size,
                         args.num_layers).cuda()
    model_state = torch.load(args.checkpoint_path)
    encoder.load_state_dict(model_state['encoder'])
    decoder.load_state_dict(model_state['decoder'])
    print('loaded checkpoint\tepoch:%d\tstep:%d' %
          (model_state['epoch'], model_state['step']))
    encoder.eval()
    decoder.eval()
    # image
    img = load_image(args.img_path, preprocess).cuda()
    # no gradients needed at inference time
    with torch.no_grad():
        outs = decoder.sample(encoder(img))
    outs = outs.cpu().numpy()
    print(outs)
    # caption
    caption = []
    for word_id in outs:
        word = vocab.idx2word[word_id]
        caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(caption)
    print(sentence)
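Example #1 calls a load_image() helper that is not shown. A minimal sketch, assuming PIL and the preprocess pipeline above (the name and call signature come from the snippet; the body is an assumption):

from PIL import Image

def load_image(image_path, transform=None):
    # open the image, force 3 channels, and apply the preprocessing pipeline
    image = Image.open(image_path).convert('RGB')
    if transform is not None:
        image = transform(image)
    # add a batch dimension: (3, H, W) -> (1, 3, H, W)
    return image.unsqueeze(0)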
Example #2
def train(n_epochs, train_loader, valid_loader, save_location_path, embed_size,
          hidden_size, vocab_size):

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move to GPU, if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    params = list(decoder.parameters()) + list(encoder.embed.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # Initialize to infinity so the first validation loss always
    # triggers a checkpoint save
    valid_loss_min = float("inf")

    for epoch in range(1, n_epochs + 1):

        # Keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        encoder.train()
        decoder.train()
        for data in train_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            # .to() is not in-place; reassign the returned tensors
            images = images.to(device)
            captions = captions.to(device)

            decoder.zero_grad()
            encoder.zero_grad()

            features = encoder(images)
            outputs = decoder(features, captions)

            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * images.size(0)

        encoder.eval()
        decoder.eval()
        # no gradients needed during validation
        with torch.no_grad():
            for data in valid_loader:
                images, captions = data['image'], data['caption']
                images = images.type(torch.FloatTensor)
                images = images.to(device)
                captions = captions.to(device)

                features = encoder(images)
                outputs = decoder(features, captions)

                loss = criterion(outputs.contiguous().view(-1, vocab_size),
                                 captions.view(-1))

                valid_loss += loss.item() * images.size(0)

        # Average losses over the datasets (once per epoch, not per batch)
        train_loss = train_loss / len(train_loader.dataset)
        valid_loss = valid_loss / len(valid_loader.dataset)

        print(
            f"Epoch: {epoch} \tTraining Loss: {train_loss} \tValidation Loss: {valid_loss}"
        )

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(
                f"Validation loss decreased ({valid_loss_min} --> {valid_loss}).  Saving model ..."
            )
            torch.save(encoder.state_dict(),
                       f'{save_location_path}/encoder{n_epochs}.pt')
            torch.save(decoder.state_dict(),
                       f'{save_location_path}/decoder{n_epochs}.pt')
            valid_loss_min = valid_loss
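None of these examples show EncoderCNN itself. A common pattern it likely follows (an assumption, not the exact class from any of these projects) is a frozen pretrained ResNet whose classifier is replaced by a trainable embedding layer, which would explain the encoder.embed.parameters() passed to the optimizer above:

import torch.nn as nn
import torchvision.models as models

class EncoderCNN(nn.Module):
    # hypothetical sketch: frozen pretrained ResNet-50 backbone with a
    # trainable linear embedding head (the usual captioning-encoder pattern)
    def __init__(self, embed_size):
        super().__init__()
        resnet = models.resnet50(pretrained=True)
        for param in resnet.parameters():
            param.requires_grad_(False)          # freeze the backbone
        modules = list(resnet.children())[:-1]   # drop the final fc layer
        self.resnet = nn.Sequential(*modules)
        self.embed = nn.Linear(resnet.fc.in_features, embed_size)

    def forward(self, images):
        features = self.resnet(images)                  # (B, 2048, 1, 1)
        features = features.view(features.size(0), -1)  # (B, 2048)
        return self.embed(features)                     # (B, embed_size)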
Example #3
    shuffle=False,
    num_workers=2,
)

# device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EncoderCNN().to(device)

#########################################################################
#
#        QUESTION 1.2 Extracting image features
#
#########################################################################
features = []

# TODO loop through all image data, extracting features and saving them
# no gradients needed
with torch.no_grad():
    model.eval()
    for data in tqdm(train_loader):
        data = data.to(device)
        # move each batch of features back to the CPU so GPU memory
        # is not exhausted accumulating the whole dataset
        features.append(model(data).cpu())
    features = torch.cat(features).squeeze()

# to check your results, features should be dimensions [len(train_set), 2048]
# convert features to a PyTorch Tensor before saving
print(features.shape)

# save features
torch.save(features, "features.pt")
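The saved file can be sanity-checked against the expected shape from the comment above (a usage sketch; train_set is the dataset behind train_loader):

# reload and sanity-check the saved features
feats = torch.load("features.pt")
print(feats.shape)  # expected: torch.Size([len(train_set), 2048])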
Example #4
def main():

    cudnn.benchmark = True
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    args = get_parser().parse_args()

    NUM_WORKERS = 2
    CROP_SIZE = 256
    NUM_PIXELS = 64
    ENCODER_SIZE = 2048
    ALPHA = 1.  # attention regularization parameter
    learning_rate = args.lr
    start_epoch = 0

    max_BLEU = 0

    with open('vocab.p', 'rb') as f:
        vocab = pickle.load(f)

    train_transform = transforms.Compose([
            transforms.RandomCrop(CROP_SIZE),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.444, 0.421, 0.385),
                                 (0.285, 0.277, 0.286))])

    val_transform = transforms.Compose([
            transforms.CenterCrop(CROP_SIZE),
            transforms.ToTensor(),
            transforms.Normalize((0.444, 0.421, 0.385),
                                 (0.285, 0.277, 0.286))])

    train_loader = torch.utils.data.DataLoader(
            dataset=Custom_Flickr30k('flickr30k-images/flickr30k-images',
                                     'flickr30k-captions/results_20130124.token',
                                     vocab, transform=train_transform, train=True),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=NUM_WORKERS, pin_memory=True,
            collate_fn=collate_fn)

    val_loader = torch.utils.data.DataLoader(
            dataset=Custom_Flickr30k('flickr30k-images/flickr30k-images',
                                     'flickr30k-captions/results_20130124.token',
                                     vocab, transform=val_transform, train=False),
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=NUM_WORKERS, pin_memory=True,
            collate_fn=collate_fn)

    # Initialize models
    encoder = EncoderCNN().to(device)
    decoder = DecoderRNNwithAttention(vocab, args.hid_size, 1, args.attn_size,
                                      ENCODER_SIZE, NUM_PIXELS,
                                      dropout=args.drop).to(device)

    # Initialize optimization
    criterion = torch.nn.CrossEntropyLoss()
    #decoder.embed.weight.requires_grad = False
    params = list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            max_BLEU = checkpoint['max_BLEU']
            encoder.load_state_dict(checkpoint['encoder'])
            decoder.load_state_dict(checkpoint['decoder'])
            optimizer.load_state_dict(checkpoint['optimizer'])
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    XEntropy = AverageMeter()
    PPL = AverageMeter()

    # Save
    if not args.resume:
        with open(f'{args.save}/results.txt', 'a') as f:
            f.write('Loss,PPL,BLEU\n')

    for epoch in range(start_epoch, 30):

        print('Epoch {}'.format(epoch+1))
        print('training...')
        for i, (images, captions, lengths) in enumerate(train_loader):

            if i % 10 == 0:
                print('[{}/{}]'.format(i, len(train_loader)))
                print(PPL.avg)

            # Batch to device
            images = images.to(device)
            captions = captions.to(device)
            # lengths stay on the CPU; pack_padded_sequence expects CPU lengths

            encoder.train()
            decoder.train()

            features = encoder(images)
            predictions, attention_weights = decoder(features, captions, lengths)

            scores = pack_padded_sequence(predictions[:, :-1, :], lengths - 2, batch_first=True)
            targets = pack_padded_sequence(captions[:, 1:-1], lengths - 2, batch_first=True)

            loss = criterion(scores.data, targets.data)
            loss += ALPHA * ((1. - attention_weights.sum(dim=1)) ** 2).mean()

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            XEntropy.update(loss.item(), len(lengths))
            PPL.update(np.exp(loss.item()), len(lengths))
        print('Train Perplexity = {}'.format(PPL.avg))

        # decay the learning rate every 10 epochs
        if (epoch + 1) % 10 == 0:
            learning_rate /= 10
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        encoder.eval()
        decoder.eval()
        print('validating...')
        curr_BLEU = bleu_eval(encoder, decoder, val_loader, args.batch_size, device)[0]
        is_best = curr_BLEU > max_BLEU
        max_BLEU = max(curr_BLEU, max_BLEU)
        save_checkpoint({
            'epoch': epoch + 1, 'encoder': encoder.state_dict(), 'decoder': decoder.state_dict(),
            'max_BLEU': max_BLEU, 'optimizer' : optimizer.state_dict(),
        }, is_best, args.save)

        print('Validation BLEU = {}'.format(curr_BLEU))

        # Save
        with open(f'{args.save}/results.txt', 'a') as f:
            f.write('{},{},{}\n'.format(XEntropy.avg, PPL.avg, curr_BLEU))

    checkpoint = torch.load(f'{args.save}/model_best.pth.tar')
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])
    #decoder.embed.weight.requires_grad = True
    learning_rate = 0.001
    params = list(encoder.parameters()) + list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    for epoch in range(start_epoch, args.epoch):
        print('Epoch {}'.format(epoch+1))
        print('training...')
        for i, (images, captions, lengths) in enumerate(train_loader):

            if i % 10 == 0:
                print('[{}/{}]'.format(i, len(train_loader)))
                print(PPL.avg)

            # Batch to device
            images = images.to(device)
            captions = captions.to(device)
            # lengths stay on the CPU; pack_padded_sequence expects CPU lengths

            encoder.train()
            decoder.train()

            features = encoder(images)
            predictions, attention_weights = decoder(features, captions, lengths)

            scores = pack_padded_sequence(predictions[:, :-1, :], lengths - 2, batch_first=True)
            targets = pack_padded_sequence(captions[:, 1:-1], lengths - 2, batch_first=True)

            loss = criterion(scores.data, targets.data)
            loss += ALPHA * ((1. - attention_weights.sum(dim=1)) ** 2).mean()

            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            XEntropy.update(loss.item(), len(lengths))
            PPL.update(np.exp(loss.item()), len(lengths))
        print('Train Perplexity = {}'.format(PPL.avg))

        # decay the learning rate every 5 epochs
        if (epoch + 1) % 5 == 0:
            learning_rate /= 10
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

        encoder.eval()
        decoder.eval()
        print('validating...')
        curr_BLEU = bleu_eval(encoder, decoder, val_loader, args.batch_size, device)[0]
        is_best = curr_BLEU > max_BLEU
        max_BLEU = max(curr_BLEU, max_BLEU)
        save_checkpoint({
            'epoch': epoch + 1, 'encoder': encoder.state_dict(), 'decoder': decoder.state_dict(),
            'max_BLEU': max_BLEU, 'optimizer' : optimizer.state_dict(),
        }, is_best, args.save)

        print('Validation BLEU = {}'.format(curr_BLEU))

        # Save
        with open(f'{args.save}/results.txt', 'a') as f:
            f.write('{},{},{}\n'.format(XEntropy.avg, PPL.avg, curr_BLEU))
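Example #4 leans on an AverageMeter helper that is not shown; the update(val, n)/avg interface at the call sites matches the common running-average utility, sketched here as an assumption:

class AverageMeter:
    # keeps a running average; update(val, n) records n samples with mean val
    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count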
Example #5
    data_transform = transforms.Compose([
        transforms.Resize(224),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # using ImageNet norms
            (0.229, 0.224, 0.225))
    ])

    test_lines = read_lines(TOKEN_FILE_TEST)
    test_image_ids, test_cleaned_captions = parse_lines(test_lines)

    # load models
    encoder = EncoderCNN().to(device)
    decoder = torch.load("decoder.ckpt").to(device)
    encoder.eval()
    decoder.eval()  # generate caption, eval mode to not influence batchnorm

    #########################################################################
    #
    #        QUESTION 2.1 Generating predictions on test data
    #
    #########################################################################

    # TODO define decode_caption() function in utils.py
    image_id_candidate_reference = {}  # type: dict[str, dict[str, list[str]]]
    import os
    if os.path.exists("image_id_candidate_reference.pt"):
        image_id_candidate_reference = torch.load(
            "image_id_candidate_reference.pt")
    else:
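The TODO above references a decode_caption() helper to be defined in utils.py. A minimal sketch, assuming a vocab with an idx2word mapping as in Example #1 (the body is an assumption, not the assignment's reference solution):

def decode_caption(word_ids, vocab):
    # map predicted token ids back to words, stopping at the <end> token
    words = []
    for word_id in word_ids:
        word = vocab.idx2word[int(word_id)]
        if word == '<end>':
            break
        if word not in ('<start>', '<pad>'):
            words.append(word)
    return ' '.join(words)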
Example #6
    # Load Encoder
    Obj2 = s3.get_object(Bucket=S3_BUCKET, Key=ENC_PATH)
    bytestream = io.BytesIO(Obj2['Body'].read())
    encoder_model = EncoderCNN(embed_size)
    encoder_model.load_state_dict(torch.load(bytestream, map_location=DEVICE))
    print('Encoder loaded')

    # Load Decoder
    Obj3 = s3.get_object(Bucket=S3_BUCKET, Key=DEC_PATH)
    bytestream = io.BytesIO(Obj3['Body'].read())
    decoder_model = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers)
    decoder_model.load_state_dict(torch.load(bytestream, map_location=DEVICE))
    print('Decoder loaded')

    # decoder = DecoderRNN( embed_size , hidden_size , vocab_size , num_layers )
    # decoder.load_state_dict( torch.load(   os.path.join( model_save_path , 'decoderdata.pkl' )   ) )
    encoder_model.eval()
    decoder_model.eval()

except Exception as e:
    print('error in loading block')
    print(repr(e))
    raise  # re-raise, preserving the original traceback


def transform_image(image_bytes):
    try:
        transform_test = transforms.Compose([
            transforms.Resize(224),  # smaller edge of image resized to 224
            # note: random crop/flip are training-style augmentations,
            # unusual in a test-time transform
            transforms.RandomCrop(224),  # get 224x224 crop from random location
            transforms.RandomHorizontalFlip(