import os
import pickle

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
from torchvision import transforms

# Encoder, Decoder, CocoDataset and coco_batch come from the project's own
# modules; the exact module names are not shown in this excerpt.


def main():
    # Create model directory
    ##### arguments #####
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    PATH = os.getcwd()
    image_dir = './data/resized2014/'
    caption_path = './data/annotations/captions_train2014.json'
    vocab_path = './data/vocab.pkl'
    model_path = './model'
    crop_size = 224
    batch_size = 128
    num_workers = 4
    learning_rate = 0.001
    # Decoder
    embed_size = 512
    hidden_size = 512
    num_layers = 3  # number of LSTM layers
    num_epochs = 10
    start_epoch = 0
    save_step = 3000

    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    coco = CocoDataset(image_dir, caption_path, vocab, transform)
    dataLoader = torch.utils.data.DataLoader(coco, batch_size, shuffle=True,
                                             num_workers=num_workers, collate_fn=coco_batch)

    # Declare the encoder and decoder
    encoder = Encoder(embed_size=embed_size).to(device)
    decoder = Decoder(embed_size=embed_size, hidden_size=hidden_size,
                      vocab_size=len(vocab), num_layers=num_layers).to(device)
    encoder.train()
    decoder.train()

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    # For the encoder, only the last fc layer is trained
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    # Train the models
    total_step = len(dataLoader)
    for epoch in range(num_epochs):
        for i, (images, captions, lengths) in enumerate(dataLoader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()

            # Workaround kept from the original script: cap Adam's per-parameter
            # step counter (reportedly avoids an overflow issue with older
            # PyTorch versions on the training hardware).
            for group in optimizer.param_groups:
                for p in group['params']:
                    state = optimizer.state[p]
                    if 'step' in state and state['step'] >= 1024:
                        state['step'] = 1000

            loss.backward(retain_graph=True)
            optimizer.step()

            # Print log info
            if i % 100 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
                    epoch + 1 + start_epoch, num_epochs + start_epoch, i, total_step, loss.item()))

            # Save the model checkpoints
            if (i + 1) % save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    model_path, 'decoder-{}-{}.ckpt'.format(epoch + 1 + start_epoch, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    model_path, 'encoder-{}-{}.ckpt'.format(epoch + 1 + start_epoch, i + 1)))

        print('epoch ', epoch + 1, 'loss: ', loss.item())
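# The DataLoader above relies on collate_fn=coco_batch, which is defined
# elsewhere in the project. A minimal sketch of the usual captioning collate
# function -- sort the batch by caption length, stack the images, zero-pad the
# captions -- written as an assumption about what coco_batch does, not copied
# from the project's implementation:
import torch

def coco_batch(batch):
    """Build a mini-batch of (images, padded captions, caption lengths)."""
    # Sort by caption length (descending), as pack_padded_sequence expects.
    batch.sort(key=lambda pair: len(pair[1]), reverse=True)
    images, captions = zip(*batch)
    images = torch.stack(images, 0)                  # (B, 3, crop_size, crop_size)
    lengths = [len(cap) for cap in captions]
    targets = torch.zeros(len(captions), max(lengths)).long()
    for i, cap in enumerate(captions):
        targets[i, :lengths[i]] = cap[:lengths[i]]   # zero-pad to the longest caption
    return images, targets, lengths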
def main():
    # Create model directory
    ##### arguments #####
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    PATH = os.getcwd()
    image_dir = '/u/training/tra379/final_project/resized_2014/'
    caption_path = '/u/training/tra379/final_project/captions_train2014.json'
    vocab_path = '/u/training/tra379/final_project/data/vocab_self_new.pkl'
    model_path = '/u/training/tra379/scratch/model_layer_10'
    crop_size = 224
    batch_size = 128
    num_workers = 4
    learning_rate = 0.001
    # Decoder
    embed_size = 512
    hidden_size = 512
    num_epochs = 5
    log_step = 100
    save_step = 1000
    ######################

    if not os.path.exists(model_path):
        os.makedirs(model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    coco = CocoDataset(image_dir, caption_path, vocab, transform)
    dataLoader = torch.utils.data.DataLoader(coco, batch_size, shuffle=True,
                                             num_workers=num_workers, collate_fn=coco_batch)

    num_layers = 10
    print("Number of layers: ", num_layers)

    # Declare the encoder and decoder
    encoder = Encoder(embed_size=embed_size).to(device)
    decoder = Decoder(embed_size=embed_size, hidden_size=hidden_size, vocab_size=len(vocab),
                      num_layers=num_layers, stateful=False).to(device)
    # encoder.load_state_dict(torch.load('/u/training/tra379/final_project/models_self/encoder-2-2000.ckpt'))
    # decoder.load_state_dict(torch.load('/u/training/tra379/final_project/models_self/decoder-2-2000.ckpt'))

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    # For the encoder, only the last fc layer is trained
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=learning_rate)

    # Train the models
    total_step = len(dataLoader)
    for epoch in range(num_epochs):
        for i, (images, captions, lengths) in enumerate(dataLoader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()

            # Same Adam step-counter workaround as in the script above.
            for group in optimizer.param_groups:
                for p in group['params']:
                    state = optimizer.state[p]
                    if 'step' in state and state['step'] >= 1024:
                        state['step'] = 1000
            optimizer.step()

            # Print log info
            if i % log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, num_epochs, i, total_step, loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i + 1) % save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    model_path, 'decoder-{}-{}-{}.ckpt'.format(num_layers, epoch + 1, i + 1)))
                torch.save(encoder.state_dict(), os.path.join(
                    model_path, 'encoder-{}-{}-{}.ckpt'.format(num_layers, epoch + 1, i + 1)))

    torch.save(decoder, os.path.join(model_path, 'decoder_final-{}.model'.format(num_layers)))
    torch.save(encoder, os.path.join(model_path, 'encoder_final-{}.model'.format(num_layers)))
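# Both training scripts above build Encoder and Decoder classes that are not
# shown. Since the optimizer only picks up encoder.resnet.fc.parameters(), the
# encoder presumably wraps a pretrained ResNet whose final fc layer is replaced
# by an embedding projection. A minimal sketch under that assumption (the
# stateful flag used above is omitted, and resnet152 is an arbitrary choice):
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence

class Encoder(nn.Module):
    def __init__(self, embed_size):
        super(Encoder, self).__init__()
        self.resnet = models.resnet152(pretrained=True)
        for param in self.resnet.parameters():
            param.requires_grad = False
        # Replace the classification head; this new layer is the only encoder
        # part passed to the optimizer in the scripts above.
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, embed_size)

    def forward(self, images):
        return self.resnet(images)

class Decoder(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers):
        super(Decoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions, lengths):
        # Prepend the image feature as the first "token", then run the packed
        # LSTM so the output lines up with pack_padded_sequence(captions, lengths)[0].
        embeddings = torch.cat((features.unsqueeze(1), self.embed(captions)), 1)
        packed = pack_padded_sequence(embeddings, lengths, batch_first=True)
        hiddens, _ = self.lstm(packed)
        return self.linear(hiddens[0])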
parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152',
                    type=int, default=50)
parser.add_argument('--epochs', help='Number of epochs', type=int, default=100)

args = parser.parse_args()

dataset_train = CocoDataset(args.coco_path, args.coco_name,
                            transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))
sampler = AspectRatioSampler(dataset_train, batch_size=5, drop_last=False)
data_loader = DataLoader(dataset_train, collate_fn=collater, batch_sampler=sampler)

# Create Model Instance
model = resnet18(80).cuda()

for i in range(20):
    optimizer = optim.Adam(model.parameters(), lr=1e-5)
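# The excerpt above starts in the middle of the argument parsing and later reads
# args.coco_path and args.coco_name, so the parser is presumably created with
# those options earlier in the file. A minimal sketch of that missing setup; the
# help strings and types are assumptions, not taken from the source:
import argparse

parser = argparse.ArgumentParser(description='Training script on COCO (sketch)')
parser.add_argument('--coco_path', help='Path to the COCO dataset root', type=str)
parser.add_argument('--coco_name', help='Dataset split name, e.g. train2014', type=str)
# ...the --depth and --epochs arguments shown above would follow here.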
def main_worker(gpu, ngpus_per_node, cfg):
    if cfg.gpu is not None:
        print("Use GPU: {} for training".format(cfg.gpu))

    if cfg.distributed:
        print('init distributed process')
        if cfg.dist_url == "env://" and cfg.rank == -1:
            cfg.rank = int(os.environ["RANK"])
        dist.init_process_group(backend=cfg.dist_backend, init_method=cfg.dist_url,
                                world_size=cfg.world_size, rank=cfg.rank)

    # Data
    print('==> Preparing data..')

    # Load vocabulary wrapper for image captioning
    with open(cfg.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing, normalization for the pretrained resnet
    # cifar cls: use resized 36x36 images
    if cfg.task == 'cifar_cls':
        transform = transforms.Compose([
            transforms.RandomCrop(cfg.crop_size, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
    # imagenet cls: 224x224
    # same as MoCo v1's aug, which follows InstDisc https://arxiv.org/abs/1805.01978
    if cfg.task == 'imagenet_cls':
        transform = transforms.Compose([
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])
    # coco det: 1333x800
    # same as MoCo v1's aug, which follows InstDisc https://arxiv.org/abs/1805.01978
    if cfg.task == 'coco_det':
        transform = transforms.Compose([
            transforms.RandomGrayscale(p=0.2),
            transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

    # COCO caption dataset
    coco = CocoDataset(root=cfg.image_dir, json=cfg.caption_path, vocab=vocab, transform=transform)

    # Build data loader for image caption training
    if cfg.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(coco)
    else:
        train_sampler = None

    # Data loader for the COCO dataset.
    # Each iteration returns (images, captions, lengths):
    #   images: a tensor of shape (batch_size, 3, 224, 224)
    #   captions: a tensor of shape (batch_size, padded_length)
    #   lengths: a list of valid lengths, one per caption (length batch_size)
    data_loader = torch.utils.data.DataLoader(dataset=coco, batch_size=cfg.batch_size,
                                              shuffle=(train_sampler is None),
                                              num_workers=cfg.num_workers, collate_fn=collate_fn,
                                              pin_memory=True, sampler=train_sampler)

    # Build the Decoder model
    decoder = DecoderRNN(cfg.model['embed_size'], cfg.model['hidden_size'],
                         len(vocab), cfg.model['num_layers'])

    if cfg.model['net'] == 'densenet121':
        linear_ic = nn.Linear(1024, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = DenseNet121()
    if cfg.model['net'] == 'densenet169':
        linear_ic = nn.Linear(4096, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = DenseNet169()
    if cfg.model['net'] == 'resnet34':
        linear_ic = nn.Linear(512, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet34()
    if cfg.model['net'] == 'resnet50':
        linear_ic = nn.Linear(2048, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet50()
    if cfg.model['net'] == 'resnet101':
        linear_ic = nn.Linear(2048, 256)
        bn_ic = nn.BatchNorm1d(256, momentum=0.01)
        net = ResNet101()

    print('cfg.distributed:', cfg.distributed)
    if cfg.distributed:
        linear_ic.cuda()
        bn_ic.cuda()
        net.cuda()
        decoder.cuda()
        # DistributedDataParallel will divide and allocate batch_size to all
        # available GPUs if device_ids are not set
        linear_ic = torch.nn.parallel.DistributedDataParallel(linear_ic)
        bn_ic = torch.nn.parallel.DistributedDataParallel(bn_ic)
        net = torch.nn.parallel.DistributedDataParallel(net)
        decoder = torch.nn.parallel.DistributedDataParallel(decoder)
    else:
        torch.cuda.set_device(cfg.gpu)
        linear_ic.cuda(cfg.gpu)
        bn_ic.cuda(cfg.gpu)
        net.cuda(cfg.gpu)
        decoder.cuda(cfg.gpu)

    criterion = nn.CrossEntropyLoss()

    # Optimizer for the image-captioning model
    # optimizer = optim.Adam(list(net.parameters()), lr=cfg.lr)
    optimizer_ic = optim.Adam(
        list(net.parameters()) + list(linear_ic.parameters()) +
        list(decoder.parameters()) + list(bn_ic.parameters()),
        lr=cfg.lr)  # 0.0001
    scheduler = MultiStepLR(optimizer_ic, milestones=[60, 120, 160], gamma=0.1)

    if cfg.loading:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        # assert os.path.isdir(cfg.checkpoint), 'Error: no checkpoint directory found!'
        checkpoint = torch.load(cfg.checkpoint)
        net.load_state_dict(checkpoint)
        # best_acc = checkpoint['acc']
        start_epoch = int(cfg.checkpoint.split('/')[-1].split('-')[1])
    else:
        start_epoch = 0

    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer_ic, T_max=200)
    log_dir = 'log/' + cfg.config.split('/')[1][:-3]
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    writer = SummaryWriter(log_dir=log_dir)

    # Start training
    for epoch in range(start_epoch, cfg.num_epochs):
        if cfg.distributed:
            train_sampler.set_epoch(epoch)
        net = train_ic(epoch, cfg, net=net, decoder=decoder, linear=linear_ic, bn=bn_ic,
                       optimizer_ic=optimizer_ic, criterion=criterion,
                       data_loader=data_loader, writer=writer)
        scheduler.step()
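# In the distributed branch above, the models are wrapped without device_ids, so
# DDP spreads each process's batch over all visible GPUs. The common
# one-GPU-per-process variant pins the device first and passes device_ids
# explicitly -- a sketch under that assumption, not code from this repository:
if cfg.distributed and cfg.gpu is not None:
    torch.cuda.set_device(cfg.gpu)
    net.cuda(cfg.gpu)
    decoder.cuda(cfg.gpu)
    net = torch.nn.parallel.DistributedDataParallel(net, device_ids=[cfg.gpu])
    decoder = torch.nn.parallel.DistributedDataParallel(decoder, device_ids=[cfg.gpu])
    # The per-process batch size and worker count are usually divided as well:
    # cfg.batch_size = int(cfg.batch_size / ngpus_per_node)
    # cfg.num_workers = int((cfg.num_workers + ngpus_per_node - 1) / ngpus_per_node)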
# Load Data
train_ids = get_anns(get_ids("./TrainImageIds.csv"), train_caption_path)
test_ids = get_anns(get_ids("./TestImageIds.csv"), test_caption_path)

# Initialize the Vocabulary class
vocab = Vocabulary(train_caption_path, vocab_path)

RANDOM_SEED = 42
VALIDATION_SPLIT = .1
BATCH_SIZE = 128

# Load dataset
train_dataset = CocoDataset(root="./data/images/train/", json=train_caption_path,
                            ids=train_ids, vocab=vocab, transform=train_transformer)
test_dataset = CocoDataset(root="./data/images/test/", json=test_caption_path,
                           ids=test_ids, vocab=vocab, transform=test_transformer, test=True)

# Use a random sampler to split into training and validation
train_sampler, valid_sampler = train_val_sampler(
    train_dataset,
    random_seed=RANDOM_SEED,
    validation_split=VALIDATION_SPLIT,
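# The call above is cut off in this excerpt, and train_val_sampler itself is not
# shown. A minimal sketch of what such a helper typically does -- splitting the
# dataset indices into train/validation SubsetRandomSamplers -- under the
# assumption that it follows the standard PyTorch pattern (signature assumed):
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler

def train_val_sampler(dataset, random_seed=42, validation_split=0.1, shuffle=True):
    # Shuffle the dataset indices reproducibly, then carve off a validation slice.
    indices = list(range(len(dataset)))
    split = int(np.floor(validation_split * len(dataset)))
    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]
    return SubsetRandomSampler(train_indices), SubsetRandomSampler(val_indices)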
import os
from PIL import Image
import pickle

from data_loader import CocoDataset
from build_vocab import Vocabulary

with open('./data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

coco = CocoDataset(root='./data/val_resized2014',
                   json='./data/annotations/captions_val2014.json',
                   vocab=vocab, transform=None)

output_dir = './application/static/candidate/'
for i in range(20, 40):
    img = coco[i][0]
    img.save(os.path.join(output_dir, str(i) + ".jpg"), img.format)
    with open(output_dir + str(i) + '.txt', 'w') as f:
        # Skip the first and last token ids (typically <start> and <end>) when
        # turning the caption indices back into words.
        caption = ' '.join([vocab.idx2word[id] for id in coco[i][1][1:-1]])
        print(caption)
        f.write(caption)
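# Several scripts above unpickle a Vocabulary object and use vocab.idx2word and
# len(vocab), but build_vocab.py itself is not shown (and one script constructs
# it as Vocabulary(train_caption_path, vocab_path), so constructors differ
# between these projects). A minimal sketch of the usual word <-> index wrapper;
# the method names other than idx2word are assumptions:
class Vocabulary(object):
    """Simple bidirectional word <-> index mapping."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # Assign the next free index to an unseen word.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1

    def __call__(self, word):
        # Unknown words fall back to the <unk> token's index.
        if word not in self.word2idx:
            return self.word2idx['<unk>']
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)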