Example 1
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Load the trained weights
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    get_caption(model, crit, dataset, dataset.get_vocab(), opt)
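All of these examples read their configuration from an opt object, a plain dict in most of them and an argparse-style namespace in a few. For orientation, here is a minimal sketch of the dict variant; the key names are taken from the lookups in these examples, while every value is an illustrative assumption rather than a default from the source:

# Illustrative opt dict; keys mirror the accesses in the examples,
# values are placeholder assumptions.
opt = {
    "model": "S2VTAttModel",     # or "S2VTModel"
    "dim_vid": 2048,             # video feature dimension
    "dim_hidden": 512,           # RNN hidden size
    "dim_word": 512,             # word embedding size
    "max_len": 28,               # maximum caption length
    "rnn_type": "gru",           # or "lstm"
    "num_layers": 1,
    "bidirectional": 0,
    "input_dropout_p": 0.2,
    "rnn_dropout_p": 0.5,
    "batch_size": 128,
    "saved_model": "save/model.pth",   # hypothetical checkpoint path
}
# "vocab_size" and "seq_length" are filled in from the dataset at runtime.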
Example 2
def main(opt):
    dataset = VideoDataset(opt, 'val', 'chinese')
    opt["vocab_size"] = 13491  #dataset.get_vocab_size() + chinDataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"],
                         opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"],
                         opt["max_len"],
                         opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    # Load the trained weights
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=torch.device('cpu')))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Example 3
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)

            for sent in sents:
                print(sent)
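create_batches above is defined elsewhere in the source. As a labelled assumption about its role, a helper of the following shape would turn the (T, H, W, C) frame array returned by skvideo.io.vread into a model-ready batch by applying the PIL conversion and the pretrainedmodels image transform frame by frame; the even-sampling policy and output shape here are guesses, not the source's implementation:

import torch

def create_batches_sketch(frames, load_img_fn, tf_img_fn, num_frames=40):
    # Sample num_frames frames evenly across the clip (assumed policy).
    idx = torch.linspace(0, len(frames) - 1, steps=num_frames).long().tolist()
    # load_img_fn: ndarray -> PIL.Image; tf_img_fn: PIL.Image -> CHW tensor.
    imgs = [tf_img_fn(load_img_fn(frames[i])) for i in idx]
    return torch.stack(imgs).unsqueeze(0)  # (1, num_frames, C, H, W)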
Example 4
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    #model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'], rnn_dropout_p=opt["rnn_dropout_p"])
    #model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(
        model.parameters(),
        lr=opt["learning_rate"],
        weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
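train itself is defined elsewhere in the source; what this example pins down is the optimizer/scheduler pairing. As a self-contained sketch of how a StepLR built like the one above is typically driven (the linear model and loop are stand-ins, not the source's train):

import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(8, 8)                       # stand-in for S2VTAttModel
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.5)

for epoch in range(5):                        # stand-in epoch loop
    loss = model(torch.randn(4, 8)).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()                          # halves the LR every 2 epochs
    print(epoch, optimizer.param_groups[0]["lr"])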
Example 5
def main(opt):
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             opt.dim_word,
                             rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Load the trained weights
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Example 6
def main(opt):
    video_path = opt["video_path"]

    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    image_feats = extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)

    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]),
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
    # 16860 is a hard-coded vocabulary size.
    decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                            input_dropout_p=opt["input_dropout_p"],
                            rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder).cuda()

    model.load_state_dict(torch.load(opt["saved_model"]))
    model.eval()
    # Build the inference-time options in a fresh dict instead of
    # shadowing the opt argument this function received.
    infer_opt = dict()
    infer_opt['child_sum'] = True
    infer_opt['temporal_attention'] = True
    infer_opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats.cuda(), mode='inference', opt=infer_opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
Example 7
def main(opt):
    opt_test = opt  # note: an alias, not a copy; changes to opt_test also affect opt
    test_dataset = VideoDataset(opt_test, 'test')
    opt_test["vocab_size"] = test_dataset.get_vocab_size()
    opt_test["seq_length"] = test_dataset.max_len
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = Two_Lstm(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, second_lstm, decoder)
    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit,
          opt_test, test_dataset)
Example 8
def main(opt):

    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            num_workers=8,
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    model.load_state_dict(
        torch.load(
            "C:\\Users\\Shumpu\\VideoCaptioningAttack\\video_caption_pytorch\\save\\vgg16_model_460.pth"
        ))
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Example 9
    def main(self, opt):
        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
        video_path = self.ent1.get().replace("/", "\\")
        image_feats = self.extract_image_feats(video_path)
        image_feats = torch.from_numpy(image_feats).type(
            torch.FloatTensor).unsqueeze(0)

        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=bool(opt["bidirectional"]),
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # 16860 is a hard-coded vocabulary size.
        decoder = DecoderRNN(16860,
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=bool(opt["bidirectional"]))
        model = S2VTAttModel(encoder, decoder).cuda()
        model.load_state_dict(torch.load("data/save/model_500.pth"))
        model.eval()
        # Build the inference-time options in a fresh dict instead of
        # shadowing the opt argument this method received.
        infer_opt = dict()
        infer_opt['child_sum'] = True
        infer_opt['temporal_attention'] = True
        infer_opt['multimodel_attention'] = True
        with torch.no_grad():
            _, seq_preds = model(image_feats.cuda(), mode='inference', opt=infer_opt)
        vocab = json.load(open('data/info.json'))['ix_to_word']
        self.sent = NLUtils.decode_sequence(vocab, seq_preds)
        hasil = self.translator.translate(self.sent[0], dest='id')
        print(self.sent[0])
        self.hasilPred.configure(text=self.sent[0])
        self.hasiltrans.configure(text=hasil.text)
        # coba = self.sent[0]
        self.textToSpeech(self.sent[0], hasil.text)
        del seq_preds
        torch.cuda.empty_cache()
Example 10
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    if torch.cuda.device_count() > 1:
        print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Load the trained weights
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
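One pitfall worth flagging here (and in Examples 1 and 5, which always wrap): nn.DataParallel prefixes every parameter name with module., so a checkpoint saved from a bare model will not load into a wrapped one, and vice versa, unless the keys are adjusted. A minimal sketch of the usual key-normalising workaround, reusing the opt["saved_model"] path from above:

import torch

state = torch.load(opt["saved_model"], map_location="cpu")
# Strip any leading "module." so the keys match a bare (unwrapped) model.
state = {k[len("module."):] if k.startswith("module.") else k: v
         for k, v in state.items()}
target = model.module if hasattr(model, "module") else model
target.load_state_dict(state)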
Example 11
def main(opt):
    dataset_test = VideoDataset(opt, 'test')
    dataloader_test = DataLoader(dataset_test,
                                 batch_size=opt["batch_size"],
                                 shuffle=False)
    opt["obj_vocab_size"] = dataset_test.get_obj_vocab_size()
    opt["rel_vocab_size"] = dataset_test.get_rel_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["obj_vocab_size"],
                             opt["rel_vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    model = model.cuda()
    model.load_state_dict(torch.load(opt['ckpt_path']))
    crit = utils.ObjRelCriterion()
    test(model, crit, opt, dataloader_test)
Example 12
def main(opt):
    train_dataset = VideoDataset(opt, 'train')
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=opt.batch_size,
                                  shuffle=True)
    opt.vocab_size = train_dataset.get_vocab_size()
    opt.seq_length = train_dataset.seq_length
    val_dataset = VideoDataset(opt, 'val')
    val_dataloader = DataLoader(val_dataset, batch_size=120, shuffle=True)
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             opt.dim_word,
                             rnn_dropout_p=opt.rnn_dropout_p)
        model = S2VTAttModel(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt.learning_rate_decay_every,
        gamma=opt.learning_rate_decay_rate)
    if not os.path.isdir(opt.checkpoint_path):
        os.mkdir(opt.checkpoint_path)
    train(train_dataloader, val_dataloader, model, crit, optimizer,
          exp_lr_scheduler, opt, rl_crit)
Example 13
def main(opt):
    dataset = VideoDataset(opt, 'train')

    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    elif opt["model"] == "CTCmodel":
        # input_dim, hidden_dim, output_dim, num_layers, biFlag, dropout = 0.5
        # model = CTCmodel(opt["dim_vid"],opt["dim_hidden"],opt["vocab_size"]+1)
        model = CTCmodel(opt['vocab_size'], opt['dim_hidden'])
    elif opt["model"] == "CTC_Hieratical_LSTM":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = two_lstm(
            opt["dim_hidden"] * 2,
            opt['vocab_size'],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        model = CTC_Hieratical_LSTM(encoder, second_lstm, opt['vocab_size'],
                                    opt['dim_word'], opt['dim_hidden'],
                                    opt['duration'], opt['video_duration'])

    # model = model.cuda()
    # crit = utils.LanguageModelCriterion()
    # rl_crit = utils.RewardCriterion()
    ctc_loss = nn.CTCLoss(reduction='mean')
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    if opt['lr_schluder'] == 'StepLR':
        lr_scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=opt["learning_rate_decay_every"],
            gamma=opt["learning_rate_decay_rate"])
    elif opt['lr_schluder'] == 'ReduceLROnPlateau':
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.1,
            patience=opt['patience'],
            verbose=True,
            threshold_mode='rel',
            threshold=opt['threshold'],
            cooldown=0,
            min_lr=opt['min_lr'],
            eps=1e-8)
    else:
        raise NotImplementedError('Only ReduceLROnPlateau and StepLR are implemented')
    opt['check_bool'] = False  # hard-coded switch; set True to resume from a checkpoint
    if opt['check_bool']:
        check_path = os.path.join(opt['check_path'], 'model_10.pth')
        model.load_state_dict(torch.load(check_path))
        opt['root_model_path'] = opt['check_path']
        print('have loaded model info from:', check_path)
        # TODO: resume training from this checkpoint
        val(model, ctc_loss, opt)
    else:
        # Compute the timestamp once so the JSON path and the model
        # directory cannot disagree across a second boundary.
        timestamp = time.strftime("%Y-%m-%d-%H-%M-%S",
                                  time.localtime(time.time()))
        opt_json = os.path.join(opt["checkpoint_path"], timestamp,
                                'opt_info.json')
        root_model_path = os.path.join(opt['checkpoint_path'], timestamp)
        opt['root_model_path'] = root_model_path
        if not os.path.isdir(opt["checkpoint_path"]):
            os.mkdir(opt["checkpoint_path"])
        if not os.path.isdir(root_model_path):
            os.mkdir(root_model_path)
        with open(opt_json, 'w') as f:
            json.dump(opt, f)
        print('saved opt details to %s' % opt_json)
    train(dataloader, model, ctc_loss, optimizer, lr_scheduler, opt)
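A detail this example sets up but never shows: the two schedulers it can build are stepped differently inside the training loop. StepLR.step() takes no argument, while ReduceLROnPlateau.step(metric) must be fed the monitored quantity, presumably the validation CTC loss here, to detect a plateau. A minimal sketch of the dispatch (val_loss is a placeholder name, not from the source):

# Inside the epoch loop (sketch); val_loss is whatever val(...) reports.
if opt['lr_schluder'] == 'ReduceLROnPlateau':
    lr_scheduler.step(val_loss)   # plateau detection needs the metric
else:
    lr_scheduler.step()           # StepLR follows a fixed epoch schedule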
Example 14
    def __init__(self, opt):
        # tutorials/09 - Image Captioning
        # Build Models
        self.grad_clip = opt.grad_clip
        self.img_enc = EncoderImage(opt.data_name, opt.img_dim, opt.embed_size,
                                    opt.finetune, opt.cnn_type,
                                    use_abs=opt.use_abs,
                                    no_imgnorm=opt.no_imgnorm)
        self.txt_enc = EncoderText(opt.vocab_size, opt.word_dim,
                                   opt.embed_size, opt.num_layers,
                                   use_abs=opt.use_abs)
        if torch.cuda.is_available():
            self.img_enc.cuda()
            self.txt_enc.cuda()
            cudnn.benchmark = True

        ##### captioning elements
        self.encoder = EncoderRNN(
            opt.dim_vid,
            opt.dim_hidden,
            bidirectional=opt.bidirectional,
            input_dropout_p=opt.input_dropout_p,
            rnn_cell=opt.rnn_type,
            rnn_dropout_p=opt.rnn_dropout_p)
        self.decoder = DecoderRNN(
            opt.vocab_size,
            opt.max_len,
            opt.dim_hidden,
            opt.dim_word,
            input_dropout_p=opt.input_dropout_p,
            rnn_cell=opt.rnn_type,
            rnn_dropout_p=opt.rnn_dropout_p,
            bidirectional=opt.bidirectional)
        
        self.caption_model = S2VTAttModel(self.encoder, self.decoder)
        
        self.crit = utils.LanguageModelCriterion()
        self.rl_crit = utils.RewardCriterion()

        if torch.cuda.is_available():
            self.caption_model.cuda()


        # Loss and Optimizer
        self.criterion = ContrastiveLoss(margin=opt.margin,
                                         measure=opt.measure,
                                         max_violation=opt.max_violation)
        params = list(self.txt_enc.parameters())
        params += list(self.img_enc.parameters())
        params += list(self.decoder.parameters())
        params += list(self.encoder.parameters())
        params += list(self.caption_model.parameters())

        if opt.finetune:
            params += list(self.img_enc.cnn.parameters())
        self.params = params

        self.optimizer = torch.optim.Adam(params, lr=opt.learning_rate)

        self.Eiters = 0
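self.grad_clip is stored in this excerpt but never applied; in trainers of this shape it is usually consumed between backward() and the optimizer step. A self-contained sketch, assuming grad_clip is interpreted as a max gradient norm:

import torch
import torch.nn as nn

model = nn.Linear(8, 2)                         # stand-in module
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)
loss = model(torch.randn(4, 8)).pow(2).mean()   # stand-in loss
loss.backward()
# clip_grad_norm_ rescales gradients in place so their total norm
# does not exceed max_norm (playing the role of opt.grad_clip here).
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
optimizer.step()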
Example 15
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # # audio encoder
        # encoder_voice = EncoderRNN(
        #     opt["dim_voice"],
        #     opt["dim_hidden"],
        #     bidirectional=opt["bidirectional"],
        #     input_dropout_p=opt["input_dropout_p"],
        #     rnn_cell=opt['rnn_type'],
        #     rnn_dropout_p=opt["rnn_dropout_p"])
        # sign-language encoder
        if opt['with_hand'] == 1:
            encoder_hand = EncoderRNN(opt["dim_hand"],
                                      opt["dim_hand_hidden"],
                                      bidirectional=opt["bidirectional"],
                                      input_dropout_p=opt["input_dropout_p"],
                                      rnn_cell=opt['rnn_type'],
                                      rnn_dropout_p=opt["rnn_dropout_p"])
            decoder = DecoderRNN(opt["vocab_size"],
                                 opt["max_len"],
                                 opt["dim_hidden"] + opt["dim_hand_hidden"],
                                 opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, encoder_hand, decoder)
        else:
            decoder = DecoderRNN(opt["vocab_size"],
                                 opt["max_len"],
                                 opt["dim_hidden"],
                                 opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, None, decoder)
        # model = S2VTAttModel(encoder, encoder_voice, encoder_hand, decoder)

    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    # print(dataloader)
    # print(crit)
    # print(optimizer)

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
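The decoder in the with_hand branch above is sized dim_hidden + dim_hand_hidden, which suggests the two encoder outputs are concatenated along the feature dimension before decoding. A minimal sketch of that fusion; the tensor shapes are assumptions, not values from the source:

import torch

# Assumed shapes: (batch, frames, dim_hidden) and (batch, frames, dim_hand_hidden).
vid_out = torch.randn(2, 40, 512)
hand_out = torch.randn(2, 40, 256)
fused = torch.cat([vid_out, hand_out], dim=2)  # (2, 40, 768)
# fused would then feed a decoder built with dim_hidden 512 + 256 = 768.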