Example 1
import os
import pickle

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

# Assumed local modules (not shown in this snippet): the dataset/collate
# helpers and the encoder/decoder definitions.
from data_loader import CocoDataset, coco_batch
from model import Encoder, Decoder

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def main():
    ##### arguments #####
    PATH = os.getcwd()
    image_dir = './data/resized2014/'
    caption_path = './data/annotations/captions_train2014.json'
    vocab_path = './data/vocab.pkl'
    model_path = './model'
    crop_size = 224
    batch_size = 128
    num_workers = 4
    learning_rate = 0.001

    # Decoder
    embed_size = 512
    hidden_size = 512
    num_layers = 3  # number of lstm layers
    num_epochs = 10
    start_epoch = 0
    save_step = 3000

    # Create the model directory
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Image preprocessing; the mean/std below are the standard ImageNet
    # statistics expected by torchvision's pretrained models.
    transform = transforms.Compose([
        transforms.RandomCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    coco = CocoDataset(image_dir, caption_path, vocab, transform)
    dataLoader = torch.utils.data.DataLoader(coco,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=num_workers,
                                             collate_fn=coco_batch)

    # Declare the encoder decoder
    encoder = Encoder(embed_size=embed_size).to(device)
    decoder = Decoder(embed_size=embed_size,
                      hidden_size=hidden_size,
                      vocab_size=len(vocab),
                      num_layers=num_layers).to(device)

    encoder.train()
    decoder.train()
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    # For the encoder, only the final fc layer is trained.
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    # Train the models
    total_step = len(dataLoader)
    for epoch in range(num_epochs):
        for i, (images, captions, lengths) in enumerate(dataLoader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()

            # Cap Adam's internal step counter so its bias-correction terms
            # stop changing after ~1024 updates.
            for group in optimizer.param_groups:
                for p in group['params']:
                    state = optimizer.state[p]
                    if ('step' in state and state['step'] >= 1024):
                        state['step'] = 1000

            loss.backward()
            optimizer.step()

            # Print log info
            if i % 100 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1 + start_epoch, num_epochs + start_epoch, i,
                    total_step, loss.item()))

            # Save the model checkpoints
            if (i + 1) % save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(
                        model_path,
                        'decoder-{}-{}.ckpt'.format(epoch + 1 + start_epoch,
                                                    i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(
                        model_path,
                        'encoder-{}-{}.ckpt'.format(epoch + 1 + start_epoch,
                                                    i + 1)))

        print('epoch ', epoch + 1, 'loss: ', loss.item())
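
This snippet (and Example 2 below) passes `collate_fn=coco_batch` to the DataLoader and then packs the padded captions with `pack_padded_sequence`. A minimal sketch of such a collate function, assuming each dataset item is an (image, caption-id-tensor) pair; the real `coco_batch` is not shown in these snippets:

import torch

def coco_batch(data):
    # Sort by caption length, longest first, since pack_padded_sequence
    # expects sequences in decreasing length order by default.
    data.sort(key=lambda x: len(x[1]), reverse=True)
    images, captions = zip(*data)
    images = torch.stack(images, 0)              # (batch, 3, crop, crop)
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths), dtype=torch.long)
    for i, cap in enumerate(captions):
        targets[i, :lengths[i]] = cap[:lengths[i]]
    return images, targets, lengths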
Example 2
import os
import pickle

import numpy as np
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

# Assumed local modules (not shown in this snippet).
from data_loader import CocoDataset, coco_batch
from model import Encoder, Decoder


def main():
    ##### arguments #####
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    PATH = os.getcwd()
    image_dir = '/u/training/tra379/final_project/resized_2014/'
    caption_path = '/u/training/tra379/final_project/captions_train2014.json'
    vocab_path = '/u/training/tra379/final_project/data/vocab_self_new.pkl'
    model_path = '/u/training/tra379/scratch/model_layer_10'
    crop_size = 224
    batch_size = 128
    num_workers = 4
    learning_rate = 0.001

    # Decoder
    embed_size = 512
    hidden_size = 512
    num_epochs = 5
    log_step = 100
    save_step = 1000
    ######################

    # Create the model directory
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([ 
        transforms.RandomCrop(crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    coco = CocoDataset(image_dir, caption_path, vocab, transform)
    dataLoader = torch.utils.data.DataLoader(coco,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             num_workers=num_workers,
                                             collate_fn=coco_batch)

    num_layers = 10  # number of LSTM layers
    print("Number of layers: ", num_layers)
    # Declare the encoder decoder
    encoder = Encoder(embed_size=embed_size).to(device)
    decoder = Decoder(embed_size=embed_size,
                      hidden_size=hidden_size,
                      vocab_size=len(vocab),
                      num_layers=num_layers,
                      stateful=False).to(device)
    #encoder.load_state_dict(torch.load('/u/training/tra379/final_project/models_self/encoder-2-2000.ckpt'))
    #decoder.load_state_dict(torch.load('/u/training/tra379/final_project/models_self/decoder-2-2000.ckpt'))
    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    # For the encoder, only the final fc layer is trained.
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    # Train the models
    total_step = len(dataLoader)
    for epoch in range(num_epochs):
        for i, (images, captions, lengths) in enumerate(dataLoader):

            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            for group in optimizer.param_groups:
                for p in group['params']:
                    state = optimizer.state[p]
                    if 'step' in state and state['step'] >= 1024:
                        state['step'] = 1000
            optimizer.step()

            # Print log info
            if i % log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch + 1, num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i+1) % save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    model_path, 'decoder-{}-{}-{}.ckpt'.format(num_layers, epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    model_path, 'encoder-{}-{}-{}.ckpt'.format(num_layers, epoch+1, i+1)))
    torch.save(decoder, os.path.join(model_path, 'decoder_final-{}.model'.format(num_layers)))
    torch.save(encoder, os.path.join(model_path, 'encoder_final-{}.model'.format(num_layers)))
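
Examples 1 and 2 assume `Encoder`/`Decoder` classes from a local module. A minimal sketch consistent with how they are called (the ResNet depth and layer names are assumptions, and Example 2's `stateful` flag is omitted):

import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence

class Encoder(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        # Pretrained ResNet with its final fc layer replaced, so the
        # optimizer can reach it as encoder.resnet.fc.
        self.resnet = models.resnet50(pretrained=True)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

    def forward(self, images):
        return self.resnet(images)

class Decoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions, lengths):
        # Prepend the image feature as the first input step, then pack so
        # the outputs line up with the packed targets in the training loop.
        embeddings = torch.cat((features.unsqueeze(1),
                                self.embed(captions)), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        return self.linear(hiddens[0])           # (sum(lengths), vocab_size)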
Example 3
    parser.add_argument(
        '--depth',
        help='Resnet depth, must be one of 18, 34, 50, 101, 152',
        type=int,
        default=50)
    parser.add_argument('--epochs',
                        help='Number of epochs',
                        type=int,
                        default=100)

    args = parser.parse_args()

    dataset_train = CocoDataset(args.coco_path,
                                args.coco_name,
                                transform=transforms.Compose(
                                    [Normalizer(),
                                     Augmenter(),
                                     Resizer()]))

    sampler = AspectRatioSampler(dataset_train, batch_size=5, drop_last=False)

    data_loader = DataLoader(dataset_train,
                             collate_fn=collater,
                             batch_sampler=sampler)

    # Create Model Instance
    model = resnet18(80).cuda()

    for i in range(20):

        # Note: constructing a new Adam optimizer on each iteration resets
        # its moment estimates; it is normally created once, before the loop.
        optimizer = optim.Adam(model.parameters(), lr=1e-5)
Example 4
def main_worker(gpu, ngpus_per_node, cfg):
    if cfg.gpu is not None:
        print("Use GPU: {} for training".format(cfg.gpu))

    if cfg.distributed:
        print('Initializing distributed process group')
        if cfg.dist_url == "env://" and cfg.rank == -1:
            cfg.rank = int(os.environ["RANK"])
        dist.init_process_group(backend=cfg.dist_backend,
                                init_method=cfg.dist_url,
                                world_size=cfg.world_size,
                                rank=cfg.rank)

    # Data
    print('==> Preparing data..')
    # Load vocabulary wrapper for image caption
    with open(cfg.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # Image preprocessing, normalization for the pretrained resnet
    # cifar cls, use resized 36x36 image
    if cfg.task == 'cifar_cls':
        transform = transforms.Compose([
            transforms.RandomCrop(cfg.crop_size, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    # imagenet cls, 224x224
    # same as MoCo v1's aug: the same as InstDisc https://arxiv.org/abs/1805.01978
    if cfg.task == 'imagenet_cls':
        transform = transforms.Compose([
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    # coco det, 1333x800
    # same as MoCo v1's aug: the same as InstDisc https://arxiv.org/abs/1805.01978
    if cfg.task == 'coco_det':
        transform = transforms.Compose([
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    # COCO caption dataset
    coco = CocoDataset(root=cfg.image_dir,
                       json=cfg.caption_path,
                       vocab=vocab,
                       transform=transform)
    # Build data loader for image caption training
    if cfg.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(coco)
    else:
        train_sampler = None

    # Data loader for COCO dataset
    # This will return (images, captions, lengths) for each iteration.
    # images: a tensor of shape (batch_size, 3, 224, 224).
    # captions: a tensor of shape (batch_size, padded_length).
    # lengths: a list indicating valid length for each caption. length is (batch_size).
    data_loader = torch.utils.data.DataLoader(dataset=coco,
                                              batch_size=cfg.batch_size,
                                              shuffle=(train_sampler is None),
                                              num_workers=cfg.num_workers,
                                              collate_fn=collate_fn,
                                              pin_memory=True,
                                              sampler=train_sampler)

    # Build the Decoder models
    decoder = DecoderRNN(cfg.model['embed_size'], cfg.model['hidden_size'],
                         len(vocab), cfg.model['num_layers'])

    if cfg.model['net'] == 'densenet121':
        linear_ic = nn.Linear(1024, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = DenseNet121()

    if cfg.model['net'] == 'densenet169':
        linear_ic = nn.Linear(4096, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = DenseNet169()

    if cfg.model['net'] == 'resnet34':
        linear_ic = nn.Linear(512, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet34()

    if cfg.model['net'] == 'resnet50':
        linear_ic = nn.Linear(2048, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet50()

    if cfg.model['net'] == 'resnet101':
        linear_ic = nn.Linear(2048, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet101()

    print('cfg.distributed:', cfg.distributed)
    if cfg.distributed:
        linear_ic.cuda()
        bn_ic.cuda()
        net.cuda()
        decoder.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        linear_ic = torch.nn.parallel.DistributedDataParallel(linear_ic)
        bn_ic = torch.nn.parallel.DistributedDataParallel(bn_ic)
        net = torch.nn.parallel.DistributedDataParallel(net)
        decoder = torch.nn.parallel.DistributedDataParallel(decoder)
    else:
        torch.cuda.set_device(cfg.gpu)
        linear_ic.cuda(cfg.gpu)
        bn_ic.cuda(cfg.gpu)
        net.cuda(cfg.gpu)
        decoder.cuda(cfg.gpu)

    criterion = nn.CrossEntropyLoss()
    # Optimizer for image classification
    # optimizer = optim.Adam(list(net.parameters()), lr=cfg.lr)

    optimizer_ic = optim.Adam(
        list(net.parameters()) + list(linear_ic.parameters()) +
        list(decoder.parameters()) + list(bn_ic.parameters()),
        lr=cfg.lr)  #0.0001
    scheduler = MultiStepLR(optimizer_ic, milestones=[60, 120, 160], gamma=0.1)

    if cfg.loading:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        # assert os.path.isdir(cfg.checkpoint), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(cfg.checkpoint)
        net.load_state_dict(checkpoint)
        # best_acc = checkpoint['acc']
        start_epoch = int(cfg.checkpoint.split('/')[-1].split('-')[1])
    else:
        start_epoch = 0

    #scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_ic, T_max=200)
    log_dir = 'log/' + cfg.config.split('/')[1][:-3]
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    writer = SummaryWriter(log_dir=log_dir)
    # Start training
    for epoch in range(start_epoch, cfg.num_epochs):
        if cfg.distributed:
            train_sampler.set_epoch(epoch)
        net = train_ic(epoch,
                       cfg,
                       net=net,
                       decoder=decoder,
                       linear=linear_ic,
                       bn=bn_ic,
                       optimizer_ic=optimizer_ic,
                       criterion=criterion,
                       data_loader=data_loader,
                       writer=writer)
        scheduler.step()
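
`main_worker(gpu, ngpus_per_node, cfg)` follows PyTorch's standard multiprocessing launcher signature. A minimal sketch of how such a worker is usually spawned, one process per GPU (the launcher itself is not part of this snippet):

import torch
import torch.multiprocessing as mp

def main(cfg):
    ngpus_per_node = torch.cuda.device_count()
    if cfg.distributed:
        # world_size becomes the total process count across all nodes.
        cfg.world_size = ngpus_per_node * cfg.world_size
        # Each spawned process receives its GPU index as the first argument.
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, cfg))
    else:
        main_worker(cfg.gpu, ngpus_per_node, cfg)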
Example 5
# Load Data
train_ids = get_anns(get_ids("./TrainImageIds.csv"), train_caption_path)
test_ids = get_anns(get_ids("./TestImageIds.csv"), test_caption_path)

# Initialize the Vocabulary class
vocab = Vocabulary(train_caption_path, vocab_path)

RANDOM_SEED = 42
VALIDATION_SPLIT = .1
BATCH_SIZE = 128

# Load dataset
train_dataset = CocoDataset(root="./data/images/train/",
                            json=train_caption_path,
                            ids=train_ids,
                            vocab=vocab,
                            transform=train_transformer)

test_dataset = CocoDataset(root="./data/images/test/",
                           json=test_caption_path,
                           ids=test_ids,
                           vocab=vocab,
                           transform=test_transformer,
                           test=True)

# Use a random sampler to split into training and validation
train_sampler, valid_sampler = train_val_sampler(
    train_dataset,
    random_seed=RANDOM_SEED,
    validation_split=VALIDATION_SPLIT)
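
A minimal sketch of what `train_val_sampler` is assumed to do: shuffle the dataset indices with a fixed seed and wrap the two splits in `SubsetRandomSampler`s:

import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler

def train_val_sampler(dataset, random_seed, validation_split):
    # Deterministically shuffle indices, then split off the validation set.
    indices = np.arange(len(dataset))
    np.random.RandomState(random_seed).shuffle(indices)
    split = int(validation_split * len(dataset))
    return (SubsetRandomSampler(indices[split:]),   # training
            SubsetRandomSampler(indices[:split]))   # validation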
Example 6
import os
from PIL import Image
import pickle
from data_loader import CocoDataset
from build_vocab import Vocabulary

with open('./data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

coco = CocoDataset(root='./data/val_resized2014',
                   json='./data/annotations/captions_val2014.json',
                   vocab=vocab,
                   transform=None)
output_dir = './application/static/candidate/'
for i in range(20, 40):
    img = coco[i][0]
    img.save(os.path.join(output_dir, str(i) + ".jpg"), img.format)
    with open(output_dir + str(i) + '.txt', 'w') as f:
        # Skip the <start> and <end> tokens when decoding ids back to words.
        caption = ' '.join([vocab.idx2word[idx] for idx in coco[i][1][1:-1]])
        print(caption)
        f.write(caption)