Example #1
def main(opt):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if opt["model"] == "S2VTModel":
        dataset = VideoDataset(opt, "test")
    elif opt["model"] == "S2VTACTModel":
        dataset = VideoActDataset(opt, "test")
    else:
        raise ValueError('Model not supported: {}'.format(opt["model"]))

    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == "S2VTModel":
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTACTModel":
        model = S2VTACTModel(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTAttModel":
        raise ValueError('Model not supported: {}'.format(opt["model"]))
    # model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=device))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), device, opt)
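
For reference, here is a minimal sketch of the opt dictionary this main expects, inferred from the key accesses above. The values are hypothetical placeholders, not the project's actual defaults, and VideoDataset will typically require additional keys (feature and caption paths) that depend on the dataset implementation.

# Hypothetical configuration; keys follow the accesses in main above.
opt = {
    "model": "S2VTModel",
    "saved_model": "checkpoints/s2vt.pth",  # assumed path, not from the source
    "max_len": 28,
    "dim_hidden": 512,
    "dim_word": 512,
    "rnn_dropout_p": 0.5,
}
main(opt)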
Example #2
def main(opt):
    dataset = VideoDataset(opt, 'val', 'chinese')
    opt["vocab_size"] = 13491  #dataset.get_vocab_size() + chinDataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"],
                         opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"],
                         opt["max_len"],
                         opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    # Setup the model
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=torch.device('cpu')))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
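
Unlike Example #1, this variant passes map_location=torch.device('cpu') to torch.load, so a checkpoint trained on a GPU can be evaluated on a CPU-only machine. A device-agnostic sketch of the same loading step (mine, not from the original source):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
state = torch.load(opt["saved_model"], map_location=device)  # remap tensors to the available device
model.load_state_dict(state)
model.to(device)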
Example #3
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    else:
        raise ValueError('Model not supported: {}'.format(opt["model"]))
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    get_caption(model, crit, dataset, dataset.get_vocab(), opt)
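
Because this example wraps the model in nn.DataParallel before calling load_state_dict, the checkpoint's parameter names must carry the 'module.' prefix that DataParallel adds. If the weights were saved from an unwrapped model, a common workaround is to rewrite the keys first; the helper below is a sketch of mine, not part of the original code.

def adapt_keys(state_dict, wrapped):
    # Add or strip the 'module.' prefix so checkpoint keys match the model.
    if wrapped:
        return {k if k.startswith('module.') else 'module.' + k: v
                for k, v in state_dict.items()}
    return {k[len('module.'):] if k.startswith('module.') else k: v
            for k, v in state_dict.items()}

state = torch.load(opt["saved_model"])
model.load_state_dict(adapt_keys(state, wrapped=True))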
Example #4
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        print('Model not supported: {}'.format(opt["model"]))
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)

            for sent in sents:
                print(sent)
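
The create_batches helper is not shown in this example. The sketch below is a plausible reconstruction built from the pieces that are visible (skvideo.io.vread frames, PIL.Image.fromarray as load_img_fn, and the pretrainedmodels transform as tf_img_fn); the body is my assumption, not the original implementation.

import torch

def create_batches(frames, load_img_fn, tf_img_fn, batch_size=32):
    # frames: (num_frames, H, W, 3) uint8 array from skvideo.io.vread.
    # Convert each frame to a PIL image, apply the CNN input transform,
    # and split the stacked tensor into fixed-size batches.
    tensors = [tf_img_fn(load_img_fn(frame)) for frame in frames]
    stacked = torch.stack(tensors)    # (num_frames, C, H, W)
    return stacked.split(batch_size)  # tuple of per-batch tensors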
Example #5
def main(opt):
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             opt.dim_word,
                             rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
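
Note that this is the only variant so far that calls model.eval() before testing. Evaluation mode disables dropout, which matters here because both branches build the model with nonzero dropout probabilities; without it, captions would be generated under stochastic activations.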
Example #6
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"],
                         opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"],
                         opt["max_len"],
                         opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
Example #7
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        print('Model not supported: {}'.format(opt["model"]))
        return

    if torch.cuda.device_count() > 1:
        print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Setup the model
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=device))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Example #8
def cross_validate(model, crit, opt):
    dataset = VideoDataset(opt, 'val')
    print(len(dataset))
    _, _, seq_probs, seq_preds, labels, masks = test(model, crit, dataset,
                                                     dataset.get_vocab(), opt)
Example #9
    for line in fp:
        video_id.append(line.strip())
    fp.close()

    with open(output_path, 'w') as f:
        for vid, result in zip(video_id, results):
            f.write(vid + ',' + result + '\n')


dim_vid = 4096
dim_hidden = 512
dim_word = 512

dataset = VideoDataset('generate', folder_path)
vocab_size = dataset.get_vocab_size()
seq_length = 25

encoder = Encoder(dim_vid, dim_hidden)
decoder = Decoder(vocab_size,
                  seq_length,
                  dim_hidden,
                  dim_word,
                  rnn_dropout_p=0.2)
model = Model(encoder, decoder).cuda()

model = nn.DataParallel(model)
model.load_state_dict(torch.load('./good_model.pth'))
model.eval()
test(model, dataset, dataset.get_vocab())