# Shared imports for the variants below (torch/nn are certain; VideoDataset,
# the S2VT* model classes, and utils come from the surrounding repo, and
# their exact module paths vary between variants).
import torch
import torch.nn as nn


def main(opt):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    if opt["model"] == "S2VTModel":
        dataset = VideoDataset(opt, "test")
    elif opt["model"] == "S2VTACTModel":
        dataset = VideoActDataset(opt, "test")
    else:
        raise ValueError('Currently not supported: {}'.format(opt["model"]))
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == "S2VTModel":
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTACTModel":
        model = S2VTACTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTAttModel":
        raise ValueError('Currently not supported: {}'.format(opt["model"]))
    # model = nn.DataParallel(model)
    # Set up the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), device, opt)

def main(opt):
    dataset = VideoDataset(opt, 'val', 'chinese')
    # hard-coded: dataset.get_vocab_size() + chinDataset.get_vocab_size()
    opt["vocab_size"] = 13491
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    # Set up the model (map_location allows loading a GPU checkpoint on CPU)
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=torch.device('cpu')))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt): dataset = VideoDataset(opt, "test") opt["vocab_size"] = dataset.get_vocab_size() opt["seq_length"] = dataset.max_len if opt["model"] == 'S2VTModel': model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], rnn_dropout_p=opt["rnn_dropout_p"]).cuda() elif opt["model"] == "S2VTAttModel": encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=opt["bidirectional"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"]) model = S2VTAttModel(encoder, decoder).cuda() model = nn.DataParallel(model) # Setup the model model.load_state_dict(torch.load(opt["saved_model"])) crit = utils.LanguageModelCriterion() get_caption(model, crit, dataset, dataset.get_vocab(), opt)

def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return
    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)
    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)
    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)
            for sent in sents:
                print(sent)
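
# create_batches() is called above but never defined in this snippet. A
# minimal sketch of a plausible implementation, assuming it chunks the
# (N, H, W, C) frame array from skvideo.io.vread into fixed-size batches and
# applies the PIL conversion plus the convnet-specific transform to each
# frame; the batch size and the torch.stack layout are assumptions.
def create_batches(frames, load_img_fn, tf_img_fn, batch_size=32):
    batches = []
    for start in range(0, len(frames), batch_size):
        chunk = frames[start:start + batch_size]
        # PIL conversion, then the resize/normalize transform for the convnet
        imgs = [tf_img_fn(load_img_fn(frame)) for frame in chunk]
        batches.append(torch.stack(imgs))
    return batches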

def main(opt):
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                             opt.dim_word, rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    else:
        # original fell through with `model` unbound for other values
        raise ValueError('Unsupported model: {}'.format(opt.model))
    model = nn.DataParallel(model)
    # Set up the model
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt): dataset = VideoDataset(opt, "test") opt["vocab_size"] = dataset.get_vocab_size() opt["seq_length"] = dataset.max_len encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]),input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],input_dropout_p=opt["input_dropout_p"],rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"])) model = EncoderDecoderModel(encoder, decoder).cuda() model = nn.DataParallel(model) model.load_state_dict(torch.load(opt["saved_model"])) crit = utils.LanguageModelCriterion() test(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt): dataset = VideoDataset(opt, "test") opt["vocab_size"] = dataset.get_vocab_size() opt["seq_length"] = dataset.max_len if opt['beam_size'] != 1: assert opt["batch_size"] == 1 if opt["model"] == 'S2VTModel': model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"]) elif opt["model"] == "S2VTAttModel": encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"]) model = S2VTAttModel(encoder, decoder) else: return if torch.cuda.device_count() > 1: print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count())) model = nn.DataParallel(model) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model.to(device) # Setup the model model.load_state_dict(torch.load(opt["saved_model"])) crit = utils.LanguageModelCriterion() test(model, crit, dataset, dataset.get_vocab(), opt)

def cross_validate(model, crit, opt):
    dataset = VideoDataset(opt, 'val')
    print(len(dataset))
    _, _, seq_probs, seq_preds, labels, masks = test(
        model, crit, dataset, dataset.get_vocab(), opt)

# Fragment: tail of a results-writing helper; fp, video_id, results, and
# output_path are defined earlier in the original file.
for line in fp:
    video_id.append(line.strip())
fp.close()

f = open(output_path, 'w')
for i in range(len(results)):
    f.write(video_id[i] + ',' + results[i] + '\n')
f.close()

# Module-level setup for caption generation
dim_vid = 4096
dim_hidden = 512
dim_word = 512
dataset = VideoDataset('generate', folder_path)
vocab_size = dataset.get_vocab_size()
seq_length = 25
encoder = Encoder(dim_vid, dim_hidden)
decoder = Decoder(vocab_size, seq_length, dim_hidden, dim_word,
                  rnn_dropout_p=0.2)
model = Model(encoder, decoder).cuda()
model = nn.DataParallel(model)
model.load_state_dict(torch.load('./good_model.pth'))
model.eval()
test(model, dataset, dataset.get_vocab())