def main(loader, vocab, opt, model=None):
    if model is None:
        # Standalone evaluation: build the model and score every checkpoint on disk.
        vocab_size = len(vocab)
        model = MultimodalAtt(vocab_size, opt['max_len'], opt['dim_hidden'], opt['dim_word'])
        model = nn.DataParallel(model)
        if opt['beam']:
            bw = opt['beam_size']
            print(f'Using beam search with beam width = {bw}')
        model_path = opt['checkpoint_path']
        for i in os.listdir(model_path):
            if i.endswith('.pth'):
                print(i)
                path = os.path.join(model_path, i)
                model.load_state_dict(torch.load(path))
                crit = NLUtils.LanguageModelCriterion()
                eval(model, crit, loader, vocab, opt)
    else:
        # Running from inside train.py: score the model that was passed in.
        crit = NLUtils.LanguageModelCriterion()
        scores = eval(model, crit, loader, vocab, opt)
        return scores
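# Illustrative only: a minimal sketch of the opt dictionary the standalone branch of
# the evaluation entry point above expects. The key names come from the lookups in
# that function; the example values are assumptions, not the project's real defaults
# (those come from the repo's option parsing, which is not shown here).
example_eval_opt = {
    'max_len': 28,               # maximum caption length
    'dim_hidden': 512,           # RNN hidden size
    'dim_word': 300,             # word embedding size
    'beam': True,                # enable beam search decoding
    'beam_size': 5,              # beam width (assumed value)
    'checkpoint_path': 'save/',  # directory scanned for *.pth checkpoints
}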
def main(opt):
    dataset = VideoAudioDataset(opt, 'val')
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'], opt['max_len'], opt['dim_hidden'], opt['dim_word'],
                          dim_vid=opt['dim_vid'], n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'], rnn_dropout_p=opt['rnn_dropout_p']).cuda()
    model = nn.DataParallel(model)
    crit = NLUtils.LanguageModelCriterion()
    for model_path in tqdm(glob.glob(os.path.join(opt['model_directory'], '*.pth'))):
        model.load_state_dict(torch.load(model_path))
        eval(model, crit, dataset, dataset.get_vocab(), opt, model_path)
def main(opt):
    dataset = VideoAudioDataset(opt, 'test')
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'], opt['max_len'], opt['dim_hidden'], opt['dim_word'],
                          dim_vid=opt['dim_vid'], n_layers=opt['num_layers'],
                          rnn_dropout_p=opt['rnn_dropout_p']).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt['model_path']))
    crit = NLUtils.LanguageModelCriterion()
    eval(model, crit, dataset, dataset.get_vocab(), opt)
def main():
    # video_path = input('Enter the path to the video: ')
    video_path = '../video9295.mp4'
    # model_path = input('Enter the model path: ')
    model_path = 'save/vanilla/model_2190.pth'
    # Extract the audio track, compute MFCC features, and add a batch dimension.
    wav_path = vToA(video_path)
    audio_mfcc = split_audio(wav_path)
    audio_mfcc = torch.from_numpy(audio_mfcc).type(torch.FloatTensor).unsqueeze(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # Extract frame-level image features and add a batch dimension.
    image_feats = extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    print('generating caption...')
    model = MultimodalAtt(16860, 28, 1024, 512, rnn_dropout_p=0)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats, audio_mfcc, mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
def main():
    # video_path = input('Enter the path to the video: ')
    # video_path = '/home/cxu-serve/p1/ytian21/dat/msrvtt_2017/train-video/video9295.mp4'
    # model_path = input('Enter the model path: ')
    model_path = 'save_new/model_225.pth'
    # wav_path = vToA(video_path)
    # wav_path = '/home/cxu-serve/p1/rohan27/research/audiocaps/code2/audios_msrvtt/video9295.wav'
    # audio_mfcc = split_audio(wav_path)
    # audio_mfcc = torch.from_numpy(audio_mfcc).type(torch.FloatTensor).unsqueeze(0)
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # image_feats = extract_image_feats(video_path)
    # image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)
    print('generating caption...')
    # Load precomputed conv4/fc2 features for one video and add a batch dimension.
    feat_dir = './audios_msrvtt/features'
    video_id = 9292
    c4_dir = os.path.join(feat_dir, 'conv4', f'video{video_id}.npy')
    fc2_dir = os.path.join(feat_dir, 'fc2', f'video{video_id}.npy')
    c4_feat = torch.from_numpy(np.load(c4_dir)).type(torch.FloatTensor)
    fc2_feat = torch.from_numpy(np.load(fc2_dir)).type(torch.FloatTensor)
    c4_feat.unsqueeze_(0)
    fc2_feat.unsqueeze_(0)
    model = MultimodalAtt(16860, 28, 512, 300, rnn_dropout_p=0)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    opt = dict()
    with torch.no_grad():
        _, seq_preds = model(c4_feat, fc2_feat, mode='inference', opt=opt)
    vocab = json.load(open('data/info_new.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
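# Sketch only, not part of the original script: the same precomputed-feature demo
# generalized to several videos. It reuses the conv4/fc2 .npy layout, model call,
# and decoding from main() above; `caption_videos` and `video_ids` are hypothetical
# names, and `model` and `vocab` are assumed to be prepared exactly as in main().
def caption_videos(model, vocab, feat_dir, video_ids):
    sents = []
    with torch.no_grad():
        for vid in video_ids:
            # Load precomputed features for this video and add a batch dimension.
            c4 = torch.from_numpy(np.load(os.path.join(feat_dir, 'conv4', f'video{vid}.npy'))).type(torch.FloatTensor).unsqueeze(0)
            fc2 = torch.from_numpy(np.load(os.path.join(feat_dir, 'fc2', f'video{vid}.npy'))).type(torch.FloatTensor).unsqueeze(0)
            _, seq_preds = model(c4, fc2, mode='inference', opt=dict())
            sents.extend(NLUtils.decode_sequence(vocab, seq_preds))
    return sents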
def main(opt):
    dataset = VideoAudioDataset(opt, 'train')
    loader = DataLoader(dataset, batch_size=opt['batch_size'], shuffle=True)
    opt['vocab_size'] = dataset.get_vocab_size()
    model = MultimodalAtt(opt['vocab_size'], opt['max_len'], opt['dim_hidden'], opt['dim_word'],
                          dim_vid=opt['dim_vid'], n_layers=opt['num_layers'],
                          rnn_dropout_p=opt['rnn_dropout_p'])
    model = model.cuda()
    crit = LanguageModelCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt['learning_rate'],
                           weight_decay=opt['weight_decay'], amsgrad=True)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer,
                                                 step_size=opt['learning_rate_decay_every'],
                                                 gamma=opt['learning_rate_decay_rate'])
    train(loader, model, crit, optimizer, exp_lr_scheduler, opt)
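# Illustrative only: the option keys read by the training entry point above, filled
# with placeholder values. The real values come from the repo's option parser, which
# is not shown here, and VideoAudioDataset/train() may read additional keys.
example_train_opt = {
    'batch_size': 64,
    'max_len': 28,
    'dim_hidden': 512,
    'dim_word': 300,
    'dim_vid': 2048,                   # assumed video-feature dimensionality
    'num_layers': 1,
    'rnn_dropout_p': 0.5,
    'learning_rate': 4e-4,
    'weight_decay': 0,
    'learning_rate_decay_every': 200,  # StepLR step size
    'learning_rate_decay_rate': 0.8,   # StepLR gamma
}
# main(example_train_opt)  # 'vocab_size' is filled in by main() itself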