Example #1
def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    test_datas = t.load('test_results2.pth')
    imgs = t.load('test_imgs.pth')

    # Caption model
    model = CaptionModel(opt, None, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    model.cuda()

    results = []
    for ii, (img_feat, img_id) in tqdm.tqdm(enumerate(zip(test_datas, imgs))):
        sentences = model.generate(img_feat)
        item = {
            'image_id': img_id.replace('.jpg', ''),
            'caption': sentences[0].replace('</EOS>', '')
        }
        results.append(item)
        if ii % 1000 == 0:
            print(sentences[0])
    import json
    with open('submit.json', 'w') as f:
        json.dump(results, f)
Example #2
def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Scale(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(), normalize
    ])
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with resnet50
    resnet50 = tv.models.resnet50(True).eval()
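    # Drop the classification head and substitute an identity, so the
    # forward pass returns the pooled 2048-d feature vector instead of logits.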
    del resnet50.fc
    resnet50.fc = lambda x: x
    if opt.use_gpu:
        resnet50.cuda()
        img = img.cuda()
    img_feats = resnet50(Variable(img, volatile=True))

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    if opt.use_gpu:
        model.cuda()

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
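These generate functions pass keyword arguments straight onto the Config object, so any option can be overridden per call. A minimal usage sketch, assuming the attribute names seen above (the image and checkpoint paths are hypothetical placeholders):

    # Caption one image on CPU; both paths are placeholders.
    generate(test_img='example.jpg',
             model_ckpt='caption_0914_1947',
             use_gpu=False)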
Example #3
def generate(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    # Data preprocessing
    data = t.load(opt.caption_data_path, map_location=lambda s, l: s)
    word2ix, ix2word = data['word2ix'], data['ix2word']

    IMAGENET_MEAN = [0.485, 0.456, 0.406]
    IMAGENET_STD = [0.229, 0.224, 0.225]
    normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    transforms = tv.transforms.Compose([
        tv.transforms.Scale(opt.scale_size),
        tv.transforms.CenterCrop(opt.img_size),
        tv.transforms.ToTensor(),
        normalize
    ])
    img = Image.open(opt.test_img)
    img = transforms(img).unsqueeze(0)

    # Extract image features with resnet50
    resnet50 = tv.models.resnet50(True).eval()
    del resnet50.fc
    resnet50.fc = lambda x: x
    if opt.use_gpu:
        resnet50.cuda()
        img = img.cuda()
    img_feats = resnet50(Variable(img, volatile=True))

    # Caption model
    model = CaptionModel(opt, word2ix, ix2word)
    model = model.load(opt.model_ckpt).eval()
    if opt.use_gpu:
        model.cuda()

    results = model.generate(img_feats.data[0])
    print('\r\n'.join(results))
Example #4
def main():
    global args
    args = parser.parse_args()
    if args.save == '':
        args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    save_path = os.path.join(args.results_dir, args.save)
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    setup_logging(os.path.join(save_path, 'log.txt'))
    checkpoint_file = os.path.join(save_path, 'checkpoint_epoch_%s.pth.tar')

    logging.debug("run arguments: %s", args)
    logging.info("using pretrained cnn %s", args.cnn)
    cnn = resnet.__dict__[args.cnn](pretrained=True)

    vocab = build_vocab()
    model = CaptionModel(cnn, vocab,
                         embedding_size=args.embedding_size,
                         rnn_size=args.rnn_size,
                         num_layers=args.num_layers,
                         share_embedding_weights=args.share_weights)

    train_data = get_iterator(get_coco_data(vocab, train=True),
                              batch_size=args.batch_size,
                              max_length=args.max_length,
                              shuffle=True,
                              num_workers=args.workers)
    val_data = get_iterator(get_coco_data(vocab, train=False),
                            batch_size=args.eval_batch_size,
                            max_length=args.max_length,
                            shuffle=False,
                            num_workers=args.workers)

    if 'cuda' in args.type:
        cudnn.benchmark = True
        model.cuda()

    optimizer = select_optimizer(
        args.optimizer, params=model.parameters(), lr=args.lr)
    regime = lambda e: {'lr': args.lr * (args.lr_decay ** e),
                        'momentum': args.momentum,
                        'weight_decay': args.weight_decay}
    model.finetune_cnn(False)
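    # The regime above decays the learning rate exponentially with the epoch index;
    # adjust_optimizer applies it at the start of every epoch. The CNN starts frozen,
    # and finetune_cnn(True) is enabled once epoch >= args.finetune_epoch (see below).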

    def forward(model, data, training=True, optimizer=None):
        use_cuda = 'cuda' in args.type
        loss = nn.CrossEntropyLoss()
        perplexity = AverageMeter()
        batch_time = AverageMeter()
        data_time = AverageMeter()

        if training:
            model.train()
        else:
            model.eval()

        end = time.time()
        for i, (imgs, (captions, lengths)) in enumerate(data):
            data_time.update(time.time() - end)
            if use_cuda:
                imgs = imgs.cuda()
                captions = captions.cuda(non_blocking=True)  # 'async' is reserved in Python 3.7+
            imgs = Variable(imgs, volatile=not training)
            captions = Variable(captions, volatile=not training)
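            # Teacher forcing: the input drops the final time step, while the
            # target is the full caption packed to strip away padding.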
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]

            pred, _ = model(imgs, input_captions, lengths)
            err = loss(pred, target_captions)
            perplexity.update(math.exp(err.data[0]))

            if training:
                optimizer.zero_grad()
                err.backward()
                clip_grad_norm(model.rnn.parameters(), args.grad_clip)
                optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.print_freq == 0:
                logging.info('{phase} - Epoch: [{0}][{1}/{2}]\t'
                             'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                             'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                             'Perplexity {perp.val:.4f} ({perp.avg:.4f})'.format(
                                 epoch, i, len(data),
                                 phase='TRAINING' if training else 'EVALUATING',
                                 batch_time=batch_time,
                                 data_time=data_time, perp=perplexity))

        return perplexity.avg

    for epoch in range(args.start_epoch, args.epochs):
        if epoch >= args.finetune_epoch:
            model.finetune_cnn(True)
        optimizer = adjust_optimizer(
            optimizer, epoch, regime)
        # Train
        train_perp = forward(
            model, train_data, training=True, optimizer=optimizer)
        # Evaluate
        val_perp = forward(model, val_data, training=False)

        logging.info('\n Epoch: {0}\t'
                     'Training Perplexity {train_perp:.4f} \t'
                     'Validation Perplexity {val_perp:.4f} \n'
                     .format(epoch + 1, train_perp=train_perp, val_perp=val_perp))
        model.save_checkpoint(checkpoint_file % (epoch + 1))
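Example #4 depends on an AverageMeter helper that is not shown in the snippet; below is a minimal sketch of the conventional implementation it appears to assume (not part of the original source):

    class AverageMeter(object):
        """Tracks the most recent value and a running average."""

        def __init__(self):
            self.val, self.avg, self.sum, self.count = 0.0, 0.0, 0.0, 0

        def update(self, val, n=1):
            # val is the measured value for a batch of n samples.
            self.val = val
            self.sum += val * n
            self.count += n
            self.avg = self.sum / self.count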
Example #5
def train(**kwargs):
    opt = Config()
    for k, v in kwargs.items():
        setattr(opt, k, v)

    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # cnn = tv.models.resnet50(True)
    model = CaptionModel(opt, None, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)

    optimizer = model.get_optimizer(opt.lr1)
    criterion = t.nn.CrossEntropyLoss()

    model.cuda()
    criterion.cuda()

    loss_meter = meter.AverageValueMeter()
    perplexity = meter.AverageValueMeter()

    for epoch in range(opt.epoch):

        loss_meter.reset()
        perplexity.reset()
        for ii, (imgs, (captions, lengths),
                 indexes) in tqdm.tqdm(enumerate(dataloader)):
            optimizer.zero_grad()
            imgs = imgs.cuda()
            captions = captions.cuda()

            imgs = Variable(imgs)
            captions = Variable(captions)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]

            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            # clip_grad_norm(model.rnn.parameters(),opt.grad_clip)
            optimizer.step()
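            # Track both the raw cross-entropy and its exponential (perplexity).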
            loss_meter.add(loss.data[0])
            perplexity.add(t.exp(loss.data)[0])

            # Visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss', loss_meter.value()[0])
                vis.plot('perplexity', perplexity.value()[0])

                # Visualize the original image

                raw_img = _data['train']['ix2id'][indexes[0]]
                img_path = '/data/image/ai_cha/caption/ai_challenger_caption_train_20170902/caption_train_images_20170902/' + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)
                vis.img('raw', raw_img)

                # raw_img = (imgs.data[0]*0.25+0.45).clamp(max=1,min=0)
                # vis.img('raw',raw_img)

                # Visualize the human-written caption
                raw_caption = captions.data[:, 0]
                raw_caption = ''.join(
                    [_data['ix2word'][int(ix)] for ix in raw_caption])
                vis.text(raw_caption, u'raw_caption')

                # Visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')
        if (epoch + 1) % 100 == 0:
            model.save()
Example #6
    test_loader = DataLoader(test_opt)

    opt.vocab = train_loader.get_vocab()
    opt.vocab_size = train_loader.get_vocab_size()
    opt.seq_length = train_loader.get_seq_length()
    opt.feat_dims = train_loader.get_feat_dims()
    opt.history_file = opt.model_file.replace('.pth', '_history.json', 1)

    logger.info('Building model...')
    model = CaptionModel(opt)

    xe_criterion = CrossEntropyCriterion()
    rl_criterion = RewardCriterion()

    if torch.cuda.is_available():
        model.cuda()
        xe_criterion.cuda()
        rl_criterion.cuda()

    logger.info('Start training...')
    start = datetime.now()

    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate)
    infos = train(model,
                  xe_criterion,
                  optimizer,
                  train_loader,
                  val_loader,
                  opt,
                  rl_criterion=rl_criterion)
    logger.info('Best val %s score: %f. Best iter: %d. Best epoch: %d',
Example #7
def train(**kwargs):
    opt = Config()
    opt.caption_data_path = 'caption.pth'  # raw data
    opt.test_img = ''  # input image
    # opt.model_ckpt = 'caption_0914_1947'  # pretrained model

    # Data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # Model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()

    # Statistics
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths),
                 indexes) in tqdm.tqdm(enumerate(dataloader)):
            # Train
            optimizer.zero_grad()
            if opt.use_gpu:
                imgs = imgs.cuda()
                captions = captions.cuda()
            imgs = Variable(imgs)
            captions = Variable(captions)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.data[0])
            '''
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss', loss_meter.value()[0])

                # Visualize the original image + the human-written caption
                raw_img = _data['ix2id'][indexes[0]]
                img_path = opt.img_path + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)

                raw_caption = captions.data[:, 0]
                raw_caption = ''.join([_data['ix2word'][int(ix)] for ix in raw_caption])
                vis.text(raw_caption, u'raw_caption')
                vis.img('raw', raw_img, caption=raw_caption)

                # Visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')
            '''
        model.save()
Example #8
def train(**kwargs):
    opt = Config()
    opt.caption_data_path = 'caption.pth'  # raw data
    opt.test_img = ''  # input image
    # opt.model_ckpt = 'caption_0914_1947'  # pretrained model

    # Data
    vis = Visualizer(env=opt.env)
    dataloader = get_dataloader(opt)
    _data = dataloader.dataset._data
    word2ix, ix2word = _data['word2ix'], _data['ix2word']

    # Model
    model = CaptionModel(opt, word2ix, ix2word)
    if opt.model_ckpt:
        model.load(opt.model_ckpt)
    optimizer = model.get_optimizer(opt.lr)
    criterion = t.nn.CrossEntropyLoss()
    if opt.use_gpu:
        model.cuda()
        criterion.cuda()

    # Statistics
    loss_meter = meter.AverageValueMeter()

    for epoch in range(opt.epoch):
        loss_meter.reset()
        for ii, (imgs, (captions, lengths),
                 indexes) in tqdm.tqdm(enumerate(dataloader)):
            # Train
            optimizer.zero_grad()
            if opt.use_gpu:
                imgs = imgs.cuda()
                captions = captions.cuda()
            imgs = Variable(imgs)
            captions = Variable(captions)
            input_captions = captions[:-1]
            target_captions = pack_padded_sequence(captions, lengths)[0]
            score, _ = model(imgs, input_captions, lengths)
            loss = criterion(score, target_captions)
            loss.backward()
            optimizer.step()
            loss_meter.add(loss.data[0])

            # Visualization
            if (ii + 1) % opt.plot_every == 0:
                if os.path.exists(opt.debug_file):
                    ipdb.set_trace()

                vis.plot('loss', loss_meter.value()[0])

                # Visualize the original image + the human-written caption
                raw_img = _data['ix2id'][indexes[0]]
                img_path = opt.img_path + raw_img
                raw_img = Image.open(img_path).convert('RGB')
                raw_img = tv.transforms.ToTensor()(raw_img)

                raw_caption = captions.data[:, 0]
                raw_caption = ''.join([_data['ix2word'][int(ix)] for ix in raw_caption])
                vis.text(raw_caption, u'raw_caption')
                vis.img('raw', raw_img, caption=raw_caption)

                # Visualize the caption generated by the network
                results = model.generate(imgs.data[0])
                vis.text('</br>'.join(results), u'caption')
        model.save()