Example 1
def main(loader, vocab, opt, model=None):
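    # With no model passed in, build MultimodalAtt from the options, wrap it in
    # DataParallel, and evaluate every .pth checkpoint found under
    # opt['checkpoint_path']; otherwise reuse the model handed over by train.py
    # and return its evaluation scores.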
    if model is None:
        vocab_size = len(vocab)
        model = MultimodalAtt(vocab_size, opt['max_len'], opt['dim_hidden'],
                              opt['dim_word'])

        model = nn.DataParallel(model)

        if opt['beam']:
            bw = opt['beam_size']
            print(f'Using beam search with beam width = {bw}')
        model_path = opt['checkpoint_path']
        for i in os.listdir(model_path):
            if i.endswith('.pth'):
                print(i)
                path = os.path.join(model_path, i)
                model.load_state_dict(torch.load(path))
                crit = NLUtils.LanguageModelCriterion()

                eval(model, crit, loader, vocab, opt)
    else:
        # Running from inside train.py
        crit = NLUtils.LanguageModelCriterion()
        scores = eval(model, crit, loader, vocab, opt)
        return scores
Example 2
def main(opt):
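    # Evaluate every checkpoint in opt['model_directory'] on the validation split.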
    dataset = VideoAudioDataset(opt, 'val')
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'], opt['max_len'], opt['dim_hidden'],
                          opt['dim_word'], dim_vid=opt['dim_vid'],
                          n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          rnn_dropout_p=opt['rnn_dropout_p']).cuda()
    model = nn.DataParallel(model)
    crit = NLUtils.LanguageModelCriterion()
    for model_path in tqdm(glob.glob(os.path.join(opt['model_directory'], '*.pth'))):
        model.load_state_dict(torch.load(model_path))
        eval(model, crit, dataset, dataset.get_vocab(), opt, model_path)
Example 3
def main(opt):
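    # Evaluate a single checkpoint (opt['model_path']) on the test split.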
    dataset = VideoAudioDataset(opt, 'test')
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'],
                          opt['max_len'],
                          opt['dim_hidden'],
                          opt['dim_word'],
                          dim_vid=opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt['rnn_dropout_p']).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt['model_path']))
    crit = NLUtils.LanguageModelCriterion()

    eval(model, crit, dataset, dataset.get_vocab(), opt)
Example 4
def main():
    # video_path = input('Enter the path to the video: ')
    video_path = '../video9295.mp4'
    # model_path = input('enter the model path: ')
    model_path = 'save/vanilla/model_2190.pth'
    wav_path = vToA(video_path)
    audio_mfcc = split_audio(wav_path)
    audio_mfcc = torch.from_numpy(audio_mfcc).type(torch.FloatTensor).unsqueeze(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    image_feats = extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    print('generating caption...')
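    # The constructor arguments below (vocab size, max caption length, hidden and
    # word embedding sizes) are hard-coded to match the checkpoint being loaded.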
    model = MultimodalAtt(16860, 28, 1024, 512, rnn_dropout_p=0)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
Example 5
def main():
    # video_path = input('Enter the path to the video: ')
    #video_path = '/home/cxu-serve/p1/ytian21/dat/msrvtt_2017/train-video/video9295.mp4'
    # model_path = input('enter the model path: ')
    model_path = 'save_new/model_225.pth'
    #wav_path = vToA(video_path)
    #wav_path = '/home/cxu-serve/p1/rohan27/research/audiocaps/code2/audios_msrvtt/video9295.wav'
    #audio_mfcc = split_audio(wav_path)
    #audio_mfcc = torch.from_numpy(audio_mfcc).type(torch.FloatTensor).unsqueeze(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    #image_feats = extract_image_feats(video_path)
    #image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    print('generating caption...')
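    # Instead of extracting features from the raw video, load precomputed conv4
    # and fc2 features for the chosen video id from disk.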
    feat_dir = './audios_msrvtt/features'
    video_id = 9292
    c4_dir = os.path.join(feat_dir, 'conv4', f'video{video_id}.npy')
    fc2_dir = os.path.join(feat_dir, 'fc2', f'video{video_id}.npy')
    c4_feat = torch.from_numpy(np.load(c4_dir)).type(torch.FloatTensor)
    fc2_feat = torch.from_numpy(np.load(fc2_dir)).type(torch.FloatTensor)
    c4_feat.unsqueeze_(0)
    fc2_feat.unsqueeze_(0)
    model = MultimodalAtt(16860, 28, 512, 300, rnn_dropout_p=0)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    opt = dict()
    with torch.no_grad():
        _, seq_preds = model(c4_feat, fc2_feat, mode='inference', opt=opt)
    vocab = json.load(open('data/info_new.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
Example 6
def main(opt):
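    # Build the training pipeline: dataset and loader, MultimodalAtt model,
    # language-model criterion, Adam optimizer with step LR decay, then train().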
    dataset = VideoAudioDataset(opt, 'train')
    loader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'],
                          opt['max_len'],
                          opt['dim_hidden'],
                          opt['dim_word'],
                          dim_vid=opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt['rnn_dropout_p'])
    model = model.cuda()
    crit = LanguageModelCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"],
                           amsgrad=True)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(loader, model, crit, optimizer, exp_lr_scheduler, opt)