Code example #1
File: eval.py Project: htt98/AI_View
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    get_caption(model, crit, dataset, dataset.get_vocab(), opt)
Code example #2
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)

            for sent in sents:
                print(sent)
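
Note: create_batches is not defined in this snippet. A minimal sketch of what it plausibly does, given how load_img_fn and tf_img_fn are built above (the chunking and output shape are assumptions, not the project's actual code):

import torch

def create_batches(frames, load_img_fn, tf_img_fn, batch_size=32):
    # Hypothetical: turn each ndarray frame into a PIL image, apply the
    # pretrainedmodels transform, then stack into [N, C, H, W] chunks.
    imgs = [tf_img_fn(load_img_fn(f)) for f in frames]
    return [torch.stack(imgs[i:i + batch_size]) for i in range(0, len(imgs), batch_size)]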
Code example #3
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True, num_workers=0, pin_memory=True)
    global dataset_val
    global dataloader_val
    dataset_val = VideoDataset(opt, 'val')
    dataloader_val = DataLoader(dataset_val, batch_size=opt["batch_size"], shuffle=True, num_workers=0, pin_memory=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder)
    model = model.cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load('data/save_vatex_batch_noc3d/model_500.pth'))
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()  # needed by train() below; missing in the original snippet
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"], weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=opt["learning_rate_decay_every"], gamma=opt["learning_rate_decay_rate"])
    print("Data Loaded")
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Code example #4
File: eval.py Project: Blues5/video-caption-pytorch
def main(opt):
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             opt.dim_word,
                             rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Code example #5
File: eval.py Project: stillarrow/S2VT_ACT
def main(opt):
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    if opt["model"] == "S2VTModel":
        dataset = VideoDataset(opt, "test")
    elif opt["model"] == "S2VTACTModel":
        dataset = VideoActDataset(opt, "test")
    else:
        print('Currently not supported: {}'.format(opt["model"]))
        raise ValueError

    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == "S2VTModel":
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTACTModel":
        model = S2VTACTModel(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTAttModel":
        print('Currently Not Supported: {}'.format(opt["model"]))
        raise ValueError
    # model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), device, opt)
Code example #6
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    #model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'], rnn_dropout_p=opt["rnn_dropout_p"])
    #model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(
        model.parameters(),
        lr=opt["learning_rate"],
        weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Code example #7
def main(opt):
    dataset = VideoDataset(opt, 'val', 'chinese')
    opt["vocab_size"] = 13491  #dataset.get_vocab_size() + chinDataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"],
                         opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"],
                         opt["max_len"],
                         opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    # Setup the model
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=torch.device('cpu')))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Code example #8
File: train.py Project: stillarrow/S2VT_ACT
def main(opt):
    # DataLoader
    if opt["model"] == 'S2VTModel':
        dataset = VideoDataset(opt, 'train')
    elif opt["model"] == 'S2VTACTModel':
        dataset = VideoActDataset(opt, 'train')
    else:
        print('Currently Not Support this model: {}'.format(opt["model"]))
        raise ValueError
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    if opt["model"] == 'S2VTModel':
        print(opt)
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])

    elif opt["model"] == 'S2VTACTModel':
        print(opt)
        model = S2VTACTModel(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             opt['dim_vid'],
                             rnn_cell=opt['rnn_type'],
                             n_layers=opt['num_layers'],
                             rnn_dropout_p=opt["rnn_dropout_p"])

    elif opt["model"] == "S2VTAttModel":
        print('Currently not supported.')
        raise ValueError
    # Load model
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')
    model = model.to(device)
    # Criterions #
    LMCriterion = utils.LanguageModelCriterion()
    # rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, optimizer, exp_lr_scheduler, opt, device,
          LMCriterion)
Code example #9
File: run.py Project: asuith/VideoCaptioning
def get_loader(opt, mode, print_info=False, specific=-1, target_ratio=-1):
    dataset = VideoDataset(opt, mode, print_info, specific=specific, target_ratio=target_ratio)
    if opt.get('splits_redefine_path', ''):
        dataset.set_splits_by_json_path(opt['splits_redefine_path'])
    return DataLoader(
        dataset,
        batch_size=opt["batch_size"],
        shuffle=(mode == 'train')
    )
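
A minimal usage sketch for this helper (the option keys shown are assumed from the snippet, not the full project config):

opt = {'batch_size': 32, 'splits_redefine_path': ''}
train_loader = get_loader(opt, 'train', print_info=True)
val_loader = get_loader(opt, 'val')  # shuffle stays False for non-train modes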
Code example #10
    def __init__(self,
                 ckp_path,
                 epoch_nums,
                 batch_size,
                 lr,
                 lr_step_size=10,
                 resnet_model='resnet50',
                 num_classes=num_classes_train,
                 pre_model_rgb=None):
        # get params
        self.ckp_path = ckp_path
        self.epoch_nums = epoch_nums
        self.batch_size = batch_size
        self.lr = lr
        self.lr_step_size = lr_step_size
        self.resnet_model = resnet_model
        self.num_classes = num_classes

        self.pre_model_rgb = pre_model_rgb

        # create ckp_path if it does not exist
        if not os.path.exists(self.ckp_path):
            os.makedirs(self.ckp_path)

        # load model
        self.mymodel = model_rgb(batch_size=self.batch_size,
                                 num_classes=self.num_classes)
        self.mymodel.train()

        if pre_model_rgb:
            checkpoints = torch.load(pre_model_rgb,
                                     map_location=lambda storage, loc: storage)
            self.mymodel.load_state_dict(checkpoints)
            print('submodel_rgb loaded from {}'.format(pre_model_rgb))

        #self.mymodel = torch.nn.DataParallel(self.mymodel)
        #self.mymodel.cuda()
        print('model loaded.')

        # define video_dataloader
        self.myDataset = VideoDataset(KINETICS_FRAME_DIR,
                                      TRAIN_LIST,
                                      mode='train')
        self.myDataloader = DataLoader(self.myDataset,
                                       batch_size=self.batch_size,
                                       shuffle=True,
                                       num_workers=0)

        self.testDataset = VideoDataset(KINETICS_FRAME_DIR,
                                        TEST_LIST,
                                        mode='test')
        self.testDataloader = DataLoader(self.testDataset,
                                         batch_size=self.batch_size,
                                         shuffle=False,
                                         num_workers=0)
Code example #11
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]),input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],input_dropout_p=opt["input_dropout_p"],rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
Code example #12
def main(opt):

    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            num_workers=8,
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    model.load_state_dict(
        torch.load(
            "C:\\Users\\Shumpu\\VideoCaptioningAttack\\video_caption_pytorch\\save\\vgg16_model_460.pth"
        ))
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Code example #13
def eval():
    opt = Opt()

    # prepare data
    validset = VideoDataset(opt.caption_file, opt.feats_path, mode='test')
    valid_loader = torch.utils.data.DataLoader(validset, batch_size=opt.batch_size, shuffle=False)
    word2ix = validset.word2ix
    ix2word = validset.ix2word
    vocab_size = len(word2ix)

    # load model
    model = torch.load(opt.model_path).to(device)

    ###
    ### start test
    ###

    model.eval()
    pred_dict = {}
    for index, (feats, targets, IDs, masks) in enumerate(tqdm(valid_loader, desc="test")):
        # get predictions (no loss is computed here)
        with torch.no_grad():
            preds = model(feats, mode='test')  # preds [B, L]
        # save result
        for ID, pred in zip(IDs, preds):
            word_preds = [ix2word[str(i.item())] for i in pred]
            if '<eos>' in word_preds:
                word_preds = word_preds[:word_preds.index('<eos>')]
            pred_dict[ID] = ' '.join(word_preds)

    return pred_dict
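
A short follow-up showing one way to persist the returned predictions (the file name and json dump are illustrative, not part of the source):

import json

if __name__ == '__main__':
    pred_dict = eval()  # the eval() defined above, which shadows the builtin
    with open('predictions.json', 'w') as f:
        json.dump(pred_dict, f, ensure_ascii=False, indent=2)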
Code example #14
def get_loader(opt, mode, print_info=False, specific=-1, **kwargs):
    dataset = VideoDataset(opt, mode, print_info, specific=specific, **kwargs)
    batch_size = kwargs.get('batch_size', opt['batch_size'])
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=(mode == 'train')
    )
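
Unlike example #9, this variant forwards arbitrary kwargs to VideoDataset; because batch_size is read with kwargs.get() rather than pop(), the override also reaches the dataset constructor. A usage sketch (the override value is illustrative):

loader = get_loader(opt, 'test', batch_size=1)  # e.g. batch_size=1 when beam search requires it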
Code example #15
def main(opt):
    train_dataset = VideoDataset(opt, 'train')
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=opt.batch_size,
                                  shuffle=True)
    opt.vocab_size = train_dataset.vocab_size
    opt.seq_length = train_dataset.seq_length
    val_dataset = VideoDataset(opt, 'val')
    val_dataloader = DataLoader(val_dataset,
                                batch_size=opt.batch_size,
                                shuffle=True)
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "Vid2seq":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             use_attention=True,
                             rnn_dropout_p=opt.rnn_dropout_p)
        model = Vid2seq(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt.learning_rate_decay_every,
        gamma=opt.learning_rate_decay_rate)
    if not os.path.isdir(opt.checkpoint_path):
        os.mkdir(opt.checkpoint_path)
    train(train_dataloader, val_dataloader, model, crit, optimizer,
          exp_lr_scheduler, opt, rl_crit)
Code example #16
File: eval.py Project: sadari1/VideoCaptioningAttack
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    if torch.cuda.device_count() > 1:
        print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Code example #17
def main(opt):
    dataset_test = VideoDataset(opt, 'test')
    dataloader_test = DataLoader(dataset_test,
                                 batch_size=opt["batch_size"],
                                 shuffle=False)
    opt["obj_vocab_size"] = dataset_test.get_obj_vocab_size()
    opt["rel_vocab_size"] = dataset_test.get_rel_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["obj_vocab_size"],
                             opt["rel_vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    model = model.cuda()
    model.load_state_dict(torch.load(opt['ckpt_path']))
    crit = utils.ObjRelCriterion()
    test(model, crit, opt, dataloader_test)
Code example #18
File: train.py Project: jackson1895/beautifulday
def main(opt):
    opt_test = opt  # note: this is an alias, not a copy; changes to opt_test also modify opt
    test_dataset = VideoDataset(opt_test, 'test')
    opt_test["vocab_size"] = test_dataset.get_vocab_size()
    opt_test["seq_length"] = test_dataset.max_len
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = Two_Lstm(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, second_lstm, decoder)
    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit,
          opt_test, test_dataset)
Code example #19
def train(opt, EncoderRNN, DecoderCNN, Convcap, itow):
    '''
    Training: initialize the models and
    pass the args through the Convcap model.
    output: /checkpoint/model.pth ----- trained model
    '''
    t_start = time.time()
    train_data = VideoDataset(opt, 'train')
    # DataLoader
    collate_fn = PadSequence()
    train_loader = DataLoader(train_data, batch_size=opt["batch_size"], collate_fn=collate_fn,
                              num_workers=opt['num_workers'], shuffle=True)
    print('[DEBUG] Loading train data ... %f secs' % (time.time() - t_start))
    for c3d_feat, lengths, labels, ids, word_embed, masks, gts in train_loader:
        print("came here")
Code example #20
def main():
    # Build the list of paths to the image directories
    root_path = './data/kinetics_videos/'
    video_list = make_datapath_list(root_path)

    # Build the dictionaries mapping video class labels and IDs
    label_dicitionary_path = './video_download/kinetics_400_label_dicitionary.csv'
    label_id_dict, id_label_dict = get_label_id_dictionary(
        label_dicitionary_path)

    # Preprocessing settings
    resize, crop_size = 224, 224
    mean, std = [104, 117, 123], [1, 1, 1]
    video_transform = VideoTransform(resize, crop_size, mean, std)

    # Create the Dataset
    val_dataset = VideoDataset(video_list,
                               label_id_dict,
                               num_segments=16,
                               phase='val',
                               transform=video_transform,
                               img_template='image_{:05d}.jpg')

    # Create the DataLoader and load the data
    batch_size = 8
    val_dataloader = data.DataLoader(val_dataset,
                                     batch_size=batch_size,
                                     shuffle=False)
    batch_iterator = iter(val_dataloader)  # convert to an iterator
    imgs_transformeds, labels, label_ids, dir_path = next(batch_iterator)

    # Instantiate the model, switch it to inference mode, then load the pretrained weights
    net = ECO_Lite()
    net.eval()
    net.load_state_dict(torch.load('./models/pretrained.pth'))

    # Run inference with ECO
    with torch.set_grad_enabled(False):
        outputs = net(imgs_transformeds)

    # Show the prediction
    idx = 0
    show_eco_inference_result(dir_path, outputs, id_label_dict, idx)
Code example #21
def train(opt, EncoderRNN, DecoderCNN, Convcap, itow):
    '''
    Training: initialize the models and
    pass the args through the Convcap model.
    output: /checkpoint/model.pth ----- trained model
    '''
    t_start = time.time()
    train_data = VideoDataset(opt, 'train')
    # DataLoader
    train_loader = DataLoader(train_data, batch_size=opt["batch_size"],
                              num_workers=opt['num_workers'], shuffle=True)
    print('[DEBUG] Loading train data ... %f secs' % (time.time() - t_start))
    

    # initialize encoder, decoder, and the combined model
    encoder = EncoderRNN.EncoderRNN(opt['dim_vid'], opt['dim_hidden'],
                                    bidirectional=opt['bidirectional'], rnn_cell=opt['rnn_type']).cuda()
    decoder = DecoderCNN.DecoderCNN(train_data.get_vocab_size()).cuda()
    convcap = Convcap.Convcap(encoder, decoder).cuda()

    # initialize hyperparameters
    optimizer = optim.RMSprop(convcap.parameters(), lr=opt["learning_rate"])
    scheduler = lr_scheduler.StepLR(optimizer, step_size=opt["learning_rate_decay_every"],
                                    gamma=opt["learning_rate_decay_rate"])
    batchsize = opt['batch_size']
    cap_size = opt['max_len']
    nbatches = np.int_(np.floor((len(train_data) * 1.) / batchsize))
    bestscore = .0
    batchsize_cap = batchsize * 1
    max_tokens = opt['max_len']

    itr = 0
    loss_graph = []
    graph_x = []
    for epoch in range(opt['epochs']):
        loss_train = 0.
        for data in train_loader:
            print("iteration " + str(itr))
            itr += 1
            vid_feat = Variable(data['c3d_feats']).cuda()
            labels = Variable(data['labels'].type(torch.LongTensor)).cuda()
            mask = Variable(data['masks']).cpu()
            word_embed = Variable(data['word_embed']).cuda()
            cap = data['cap']
            optimizer.zero_grad()
            wordact = convcap(vid_feat, labels, word_embed, 'train')
            # drop the prediction at the last position and shift labels/mask by one
            wordact = wordact[:, :, :-1]
            labels = labels[:, 1:]
            mask = mask[:, 1:].contiguous()
            wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)
            wordclass_t = labels.contiguous().view(
                batchsize * (max_tokens - 1), 1)

            # cross-entropy only over the unmasked token positions
            maskids = torch.nonzero(mask.view(-1)).numpy().reshape(-1)
            loss = F.cross_entropy(wordact_t[maskids, ...],
                                   wordclass_t[maskids, ...].contiguous().view(maskids.shape[0]))

            # for visual inspection: print the argmax word at every position
            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            wordids = np.argmax(wordprobs, axis=1)
            for i in wordids:
                print(itow[i])
            print(cap)

            if itr % 500 == 0:
                graph_x.append(itr)
                loss_graph.append(loss.item())  # store a float, not the graph-holding tensor

            loss_train = loss_train + loss.item()
            loss.backward()
            optimizer.step()
            print("loss " + str(loss_train))

        scheduler.step()  # step the LR schedule once per epoch, after the optimizer updates
        loss_train = (loss_train * 1.) / nbatches
        print('[DEBUG] Training epoch %d has loss %f' % (epoch, loss_train))

        modelfn = osp.join(opt['checkpoint_path'], 'model_j_19_' + str(itr) + '.pth')
        torch.save({
                'epoch': epoch,
                'state_dict': convcap.state_dict(),
                'optimizer': optimizer.state_dict(),
                'loss': loss_train
            }, modelfn)
        print('time for epoch %f' % (time.time() - t_start))
    plt.plot(graph_x, loss_graph, 'ro')
    plt.show()
Code example #22
File: train.py Project: Kamino666/S2VT-video-caption
def train():
    opt = Opt()
    # write log
    save_opt(opt)

    # prepare data
    trainset = VideoDataset(opt.caption_file, opt.feats_path)
    train_loader = torch.utils.data.DataLoader(trainset,
                                               batch_size=opt.batch_size,
                                               shuffle=True)
    testset = VideoDataset(opt.caption_file, opt.feats_path, mode='valid')
    test_loader = torch.utils.data.DataLoader(testset,
                                              batch_size=opt.batch_size,
                                              shuffle=False)
    word2ix = trainset.word2ix
    ix2word = trainset.ix2word
    vocab_size = len(word2ix)

    # build model
    # model = S2VT_Att(
    #     vocab_size,
    #     opt.feat_dim,
    #     dim_hid=opt.dim_hidden,
    #     dim_embed=opt.dim_embed,
    #     length=opt.train_length,
    #     feat_dropout=opt.feat_dropout,
    #     rnn_dropout=opt.rnn_dropout,
    #     out_dropout=opt.out_dropout,
    #     num_layers=opt.num_layers,
    #     bidirectional=opt.bidirectional,
    #     rnn_type=opt.rnn_type,
    #     sos_ix=word2ix['<sos>'],
    #     eos_ix=word2ix['<eos>'],
    # ).to(device)
    model = Att_Baseline(vocab_size,
                         opt.feat_dim,
                         length=opt.train_length,
                         dim_hid=opt.dim_hidden,
                         dim_embed=opt.dim_embed,
                         feat_dropout=opt.feat_dropout,
                         out_dropout=opt.out_dropout,
                         sos_ix=3,
                         eos_ix=4).to(device)
    # model.load_glove_weights('./data/glove.6B.300d.txt', 300, trainset.ix2word)
    optimizer = optim.Adam(
        model.parameters(),
        lr=opt.lr,
        # weight_decay=opt.weight_decay
    )
    # dynamic learning rate
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, verbose=True, patience=opt.learning_rate_patience)
    early_stopping = EarlyStopping(patience=opt.early_stopping_patience,
                                   verbose=True,
                                   path=os.path.join(
                                       opt.save_path,
                                       opt.start_time + 'stop.pth'))
    criterion = MaskCriterion()

    ###
    ### start training
    ###

    try:
        for epoch in range(opt.EPOCHS):
            # ****************************
            #            train
            # ****************************
            train_running_loss = 0.0
            loss_count = 0
            for index, (feats, targets, IDs, masks) in enumerate(
                    tqdm(train_loader, desc="epoch:{}".format(epoch))):
                optimizer.zero_grad()
                model.train()

                # probs [B, L, vocab_size]
                probs = model(feats, targets=targets[:, :-1], mode='train')

                loss = criterion(probs, targets, masks)

                loss.backward()
                optimizer.step()

                train_running_loss += loss.item()
                loss_count += 1

            train_running_loss /= loss_count
            writer.add_scalar('train_loss',
                              train_running_loss,
                              global_step=epoch)

            # ****************************
            #           validate
            # ****************************
            valid_running_loss = 0.0
            loss_count = 0
            for index, (feats, targets, IDs, masks) in enumerate(test_loader):
                model.eval()

                with torch.no_grad():
                    probs = model(feats, targets=targets[:, :-1], mode='train')
                    loss = criterion(probs, targets, masks)

                valid_running_loss += loss.item()
                loss_count += 1

            valid_running_loss /= loss_count
            writer.add_scalar('valid_loss',
                              valid_running_loss,
                              global_step=epoch)
            writer.add_scalar('lr',
                              optimizer.state_dict()['param_groups'][0]['lr'],
                              global_step=epoch)
            if epoch % opt.histogram_freq == 0:
                for i, (name, param) in enumerate(model.named_parameters()):
                    writer.add_histogram(name, param, epoch)

            print("train loss:{} valid loss: {}".format(
                train_running_loss, valid_running_loss))
            lr_scheduler.step(valid_running_loss)

            # early stop
            early_stopping(valid_running_loss, model)
            if early_stopping.early_stop:
                print("Early stopping")
                break

            # save checkpoint
            if epoch % opt.save_freq == 0:
                print('epoch:{}, saving checkpoint'.format(epoch))
                torch.save(
                    model,
                    os.path.join(opt.save_path,
                                 opt.start_time + str(epoch) + '.pth'))

    except KeyboardInterrupt as e:
        print(e)
        print("Training interruption, save tensorboard log...")
        writer.close()
    # save model
    torch.save(model, os.path.join(opt.save_path,
                                   opt.start_time + 'final.pth'))
Code example #23
def main():

    timestamp = str(time.asctime(time.localtime(time.time())))

    print('__pyTorch VERSION:', torch.__version__)
    print('__CUDA VERSION:', torch.version.cuda)

    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__Devices')
    use_cuda = False
    # use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda else "cpu")
    # Training settings

    print("DEVICE ", device)

    max_epochs = 100

    N_CL = 47
    dim = (224, 224)

    seq_length = 300
    n_channels = 3

    test_params = {'batch_size': 1, 'shuffle': False, 'num_workers': 2}

    train_params = {'batch_size': 1, 'shuffle': True, 'num_workers': 4}

    parser = argparse.ArgumentParser(description='PyTorch video classification example')
    parser.add_argument('--modality',
                        type=str,
                        default='full_image',
                        metavar='rc',
                        help='hands or full image')
    parser.add_argument('--batch-size',
                        type=int,
                        default=1,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=max_epochs,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.00001,
                        metavar='LR',
                        help='learning rate (default: 0.00001)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--dataloader',
                        type=str,
                        default='rand_crop',
                        metavar='rc',
                        help='data augmentation')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=100,
        metavar='N',
        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model',
                        action='store_true',
                        default=True,
                        help='For Saving the current Model')
    parser.add_argument('--seq-length',
                        type=int,
                        default=seq_length,
                        metavar='num',
                        help='sequence length')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=1024,
                        metavar='num',
                        help='lstm units')
    parser.add_argument('--optim',
                        type=int,
                        default=1,
                        metavar='optim number',
                        help='optimizer sgd or adam')
    parser.add_argument('--n_layers',
                        type=int,
                        default=2,
                        metavar='num',
                        help='number of LSTM layers')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.7,
                        metavar='num',
                        help='dropout probability')
    parser.add_argument('--bidirectional',
                        action='store_true',
                        default=True,
                        help='use a bidirectional LSTM')

    args = parser.parse_args()

    # Helper: Save the model.

    train_partition, train_labels = load_csv_file(train_filepath)
    test_partition, test_labels = load_csv_file(test_filepath)
    classes = count_classes(train_labels)
    id2w = list(range(len(classes)))
    id2w = dict(zip(id2w, classes))
    #print(id2w)

    print('Number of Classes {} \n \n  '.format(len(classes)))

    # DATA GENERATORS

    training_set = VideoDataset(train_prefix, train_partition, train_labels,
                                classes, seq_length, dim, False, False)
    training_generator = data.DataLoader(training_set, **train_params)

    validation_set = VideoDataset(test_prefix, test_partition, test_labels,
                                  classes, seq_length, dim, False, False)
    validation_generator = data.DataLoader(validation_set, **test_params)

    # MODEL INITIALIZATION
    model = RNNModel(args.hidden_size, args.n_layers, args.dropout,
                     len(classes)).to(device)
    # model.load_state_dict(
    #     torch.load('/home/hatzis/Desktop/stim_ctc/checkpoints/stim_ctc_firt_try/best_wer46.35898383147232.pth', map_location=device)[
    #         'model_dict'])

    if (args.optim == 1):
        opt_name = 'ADAMlr '
        print(" use optim ", opt_name, args.lr)
        optimizer = optim.Adam(model.parameters(), lr=args.lr)

    elif (args.optim == 2):
        opt_name = 'SGD lr '
        print(" use optim", opt_name, args.lr)
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=0.8,
                              weight_decay=0.0000001)

    scheduler = StepLR(optimizer, step_size=20, gamma=0.5)

    # TRAINING LOOP

    for epoch in range(1, max_epochs):

        #scheduler.step()

        train(args, model, device, training_generator, optimizer, epoch,
              timestamp, ngrams, id2w)

        print("!!!!!!!!   VALIDATION   !!!!!!!!!!!!!!!!!!")
        validate(args, model, device, validation_generator, optimizer, epoch,
                 timestamp, ngrams, id2w)
Code example #24
File: train.py Project: salmon7ish/Video-Captioning
def cross_validate(model, crit, opt):
    dataset = VideoDataset(opt, 'val')
    print(len(dataset))
    _, _, seq_probs, seq_preds, labels, masks = test(model, crit, dataset,
                                                     dataset.get_vocab(), opt)
Code example #25
import numpy as np
import csv
import time
import argparse
import opt
from dataloader import VideoDataset
import json
import os
import os.path as osp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torch.utils.data import DataLoader
from model import EncoderRNN, DecoderCNN, Convcap
# NOTE: `opt` must behave like a dict of options here (e.g. opt = vars(opts.parse_opt())
# as in example #28); indexing the bare imported module would fail.
train_data = VideoDataset(opt, 'train')
train_loader = DataLoader(train_data,
                          batch_size=opt["batch_size"],
                          num_workers=3,
                          shuffle=False)
for data in train_loader:

    vid_feat = data['c3d_feats']
    labels = data['labels']
    mask = data['masks']
    word_embed = data['word_embed']
    vid_id = data['video_id']
Code example #26
    def prepare_data(self):
        self.train_dataset = VideoDataset(self.hparams.train_dir,
                                          transform=self.hparams.augmentation)
        self.test_dataset = ImageDataset(self.hparams.test_dir)
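
For context, a sketch of how a prepare_data hook like this is typically wired into dataloaders; the class name, hparams fields, and train_dataloader body are assumptions, not from the original project:

import pytorch_lightning as pl
from torch.utils.data import DataLoader

class VideoModule(pl.LightningModule):
    def prepare_data(self):
        # as above: build the datasets from hyperparameters
        self.train_dataset = VideoDataset(self.hparams.train_dir,
                                          transform=self.hparams.augmentation)
        self.test_dataset = ImageDataset(self.hparams.test_dir)

    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                          batch_size=self.hparams.batch_size, shuffle=True)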
Code example #27
def test(opt, EncoderRNN, DecoderCNN, Convcap, itow, wtoi, modelfn=None):
    '''
    input : options, models, checkpoint/model.pth
    output : scores
    '''
    t_start = time.time()

    test_data = VideoDataset(opt, 'test')
    test_loader = DataLoader(test_data, batch_size=opt["batch_size"], num_workers=30, shuffle=False)
    print('[DEBUG] Loading test data ... %f secs' % (time.time() - t_start))

    batchsize =opt['batch_size']
    cap_size= opt['max_len']
    nbatches = np.int_(np.floor((len(test_data)*1.)/batchsize))
    bestscore = .0
    batchsize_cap = batchsize*1
    max_tokens= opt['max_len']

    if modelfn is not None:
        encoder = EncoderRNN.EncoderRNN(opt['dim_vid'], opt['dim_hidden'],
                                        bidirectional=opt['bidirectional'], rnn_cell=opt['rnn_type']).cuda()
        decoder = DecoderCNN.DecoderCNN(test_data.get_vocab_size()).cuda()
        convcap = Convcap.Convcap(encoder, decoder).cuda()
        print('[DEBUG] Loading checkpoint %s' % modelfn)
        checkpoint = torch.load(modelfn)
        convcap.load_state_dict(checkpoint['state_dict'])

    convcap.train(False)
    pred_captions = []
    itr=0
    for data in test_loader:
        print("iteration " + str(itr))
        print("\ngt\n")
        for i in data['labels'].data[0]:
            print(itow[int(i)])

        itr += 1
        vid_feat=Variable(data['c3d_feats']).cuda()
        labels=Variable(data['labels'].type(torch.LongTensor)).cuda()
        mask = Variable(data['masks']).cpu()
        word_embed=Variable(data['word_embed']).cuda()
        vid_id=data['video_ids']
        #print(vid_id[0])
        wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
        wordclass_feed[:,0] =wtoi['<sos>'] #1 #index of <sos>
        # print('wordclass_feed shape')
        # print(wordclass_feed.shape)
        outcaps = np.empty((batchsize, 0)).tolist()
        x_outcaps=np.empty((batchsize, 0)).tolist()
        for j in range(max_tokens - 1):
            wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()
            wordact = convcap(vid_feat, wordclass, word_embed, 'test')
            x = convcap(vid_feat, labels, word_embed, 'test')
            x = x[:, :, :-1]
            x_t = x.permute(0, 2, 1).contiguous().view(batchsize * (max_tokens - 1), -1)
            x_prob = F.softmax(x_t, dim=1).cpu().data.numpy()

            wordact = wordact[:, :, :-1]
            wordact_t = wordact.permute(0, 2, 1).contiguous().view(batchsize * (max_tokens - 1), -1)
            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            x_id = np.argmax(x_prob, axis=1)
            wordids = np.argmax(wordprobs, axis=1)
            probs = np.max(wordprobs, axis=1)
            
            for k in range(batchsize):
                word = itow[wordids[j + k * (max_tokens - 1)]]
                x_word = itow[x_id[j + k * (max_tokens - 1)]]
                outcaps[k].append(word)
                x_outcaps[k].append(x_word)
                if j < max_tokens - 1:
                    wordclass_feed[k, j + 1] = wordids[j + k * (max_tokens - 1)]
        for j in range(batchsize):
            num_words = len(outcaps[j])
            x_n_words = len(x_outcaps[j])
            if 'eos' in x_outcaps[j]:
                x_n_words = x_outcaps[j].index('eos')
            x_outcap = ' '.join(x_outcaps[j][:x_n_words])  # truncate at 'eos' (the original sliced by num_words)
            if 'eos' in outcaps[j]:
                num_words = outcaps[j].index('eos')
            outcap = ' '.join(outcaps[j][:num_words])
            pred_captions.append({'vid_id': vid_id[0][5:], 'caption': outcap})
        print("------------------------------------------------------------------------------")
        print("videoID \t" + str(vid_id))
        print("caption \n")
        print(x_outcap)
        print("------------------------------------------------------------------------------")

    scores = language_eval(pred_captions, '/home/sanjay/Documents/Video_convcap/output', 'test')

    return scores
Code example #28
import opts
from dataloader import VideoDataset
from torch.utils.data import DataLoader

if __name__ == '__main__':
    opt = opts.parse_opt()
    opt = vars(opt)
    dataset = VideoDataset(opt, 'train')
    loader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)

    # data['fc_feats'] = torch.from_numpy(fc_feat).type(torch.FloatTensor)
    # data['labels'] = label
    # data['masks'] = mask
    # data['gts'] = gts
    # data['video_ids'] = vid_id
    for data in loader:
        for key in ('fc_feats', 'labels', 'masks', 'video_ids'):
            print("{} shape: {}".format(key, data[key].shape))
        break
Code example #29
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # # audio encoder
        # encoder_voice = EncoderRNN(
        #     opt["dim_voice"],
        #     opt["dim_hidden"],
        #     bidirectional=opt["bidirectional"],
        #     input_dropout_p=opt["input_dropout_p"],
        #     rnn_cell=opt['rnn_type'],
        #     rnn_dropout_p=opt["rnn_dropout_p"])
        # sign-language encoder
        if opt['with_hand'] == 1:
            encoder_hand = EncoderRNN(opt["dim_hand"],
                                      opt["dim_hand_hidden"],
                                      bidirectional=opt["bidirectional"],
                                      input_dropout_p=opt["input_dropout_p"],
                                      rnn_cell=opt['rnn_type'],
                                      rnn_dropout_p=opt["rnn_dropout_p"])
            decoder = DecoderRNN(opt["vocab_size"],
                                 opt["max_len"],
                                 opt["dim_hidden"] + opt["dim_hand_hidden"],
                                 opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, encoder_hand, decoder)
        else:
            decoder = DecoderRNN(opt["vocab_size"],
                                 opt["max_len"],
                                 opt["dim_hidden"],
                                 opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, None, decoder)
        # model = S2VTAttModel(encoder, encoder_voice, encoder_hand, decoder)

    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    # print(dataloader)
    # print(crit)
    # print(optimizer)

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Code example #30
    device = 'cuda:0'

    transform = transforms.Compose(
        [
            transforms.Resize(args.size),
            transforms.CenterCrop(args.size),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
        ]
    )

    #dataset = datasets.ImageFolder(args.path, transform=transform)
    #loader = DataLoader(dataset, batch_size=128, shuffle=True, num_workers=4)

    train_data = VideoDataset(args.path, split='train', clip_len=3, preprocess=False,transform=transform)
    loader = DataLoader(train_data, batch_size=args.bs, shuffle=True, num_workers=4)
    
    test_data = VideoDataset(args.path, split='val', clip_len=3, clip_jump=1,preprocess=False,transform=transform)
    test_loader = DataLoader(test_data, batch_size=args.bs, shuffle=True, num_workers=4)
    #raw_input('Enter')
    model = DQLR()
    if args.pretrained is not None:
        print('Loading pretrained weights...')
        pre_w = torch.load(args.pretrained)
        # copy matching weights into one state dict (mutating a fresh
        # model.state_dict() on every iteration would be a no-op), and
        # duplicate decoder weights for the dec1/dec2 branches
        state = model.state_dict()
        for key in pre_w.keys():
            state[key] = pre_w[key]
            if 'dec' in key:
                key2 = key.replace('dec', 'dec1')
                state[key2] = pre_w[key]
                key2 = key.replace('dec', 'dec2')