Example #1
def main(opt):
    opt_test = opt
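    # NOTE: opt_test aliases opt (same dict), so the writes below update opt too.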
    test_dataset = VideoDataset(opt_test, 'test')
    opt_test["vocab_size"] = test_dataset.get_vocab_size()
    opt_test["seq_length"] = test_dataset.max_len
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = Two_Lstm(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, second_lstm, decoder)
    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit,
          opt_test, test_dataset)
Example #2
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
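    # NOTE: DataParallel prefixes parameter names with "module.", so the saved
    # checkpoint must also have been produced by a DataParallel-wrapped model.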
    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    get_caption(model, crit, dataset, dataset.get_vocab(), opt)
Example #3
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    #model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'], rnn_dropout_p=opt["rnn_dropout_p"])
    #model = model.cuda()
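    # NOTE: with the .cuda() call commented out above, training runs on the CPU.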
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(
        model.parameters(),
        lr=opt["learning_rate"],
        weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Example #4
def main(opt):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if opt["model"] == "S2VTModel":
        dataset = VideoDataset(opt, "test")
    elif opt["model"] == "S2VTACTModel":
        dataset = VideoActDataset(opt, "test")
    else:
        raise ValueError('Model not supported: {}'.format(opt["model"]))

    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == "S2VTModel":
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTACTModel":
        model = S2VTACTModel(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             rnn_dropout_p=opt["rnn_dropout_p"]).to(device)
    elif opt["model"] == "S2VTAttModel":
        print('Currently Not Supported: {}'.format(opt["model"]))
        raise ValueError
    # model = nn.DataParallel(model)
    # Setup the model
    # map_location keeps the CPU-only fallback above working.
    model.load_state_dict(torch.load(opt["saved_model"], map_location=device))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), device, opt)
Example #5
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
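            # Read all frames, batch them through the CNN, then decode a caption.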
            frames = skvideo.io.vread(video_path)
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)

            for sent in sents:
                print(sent)
Example #6
def main(opt):
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             opt.dim_word,
                             rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
Example #7
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"],
                            shuffle=True, num_workers=0, pin_memory=True)
    global dataset_val
    global dataloader_val
    dataset_val = VideoDataset(opt, 'val')
    dataloader_val = DataLoader(dataset_val, batch_size=opt["batch_size"],
                                shuffle=True, num_workers=0, pin_memory=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder)
    model = model.cuda()
    model = nn.DataParallel(model)
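    # Resume training from a previously saved checkpoint.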
    model.load_state_dict(torch.load('data/save_vatex_batch_noc3d/model_500.pth'))
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()  # needed by train(); it was missing here
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    print("Data Loaded")
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Example #8
def main(opt):
    # DataLoader
    if opt["model"] == 'S2VTModel':
        dataset = VideoDataset(opt, 'train')
    elif opt["model"] == 'S2VTACTModel':
        dataset = VideoActDataset(opt, 'train')
    else:
        raise ValueError('Model not supported: {}'.format(opt["model"]))
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    if opt["model"] == 'S2VTModel':
        print(opt)
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])

    elif opt["model"] == 'S2VTACTModel':
        print(opt)
        model = S2VTACTModel(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             opt['dim_vid'],
                             rnn_cell=opt['rnn_type'],
                             n_layers=opt['num_layers'],
                             rnn_dropout_p=opt["rnn_dropout_p"])

    elif opt["model"] == "S2VTAttModel":
        print('Currently not supported.')
        raise ValueError
    # Load model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # Criterions #
    LMCriterion = utils.LanguageModelCriterion()
    # rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, optimizer, exp_lr_scheduler, opt, device,
          LMCriterion)
Example #9
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]),input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],input_dropout_p=opt["input_dropout_p"],rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
Example #10
def main(opt):

    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            num_workers=8,
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    # Resume from a previously saved checkpoint before training.
    model.load_state_dict(
        torch.load(
            "C:\\Users\\Shumpu\\VideoCaptioningAttack\\video_caption_pytorch\\save\\vgg16_model_460.pth"
        ))
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
Example #11
def main(opt):
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    if torch.cuda.device_count() > 1:
        print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
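    # NOTE: if DataParallel was applied above, the checkpoint keys must carry
    # the "module." prefix for load_state_dict to succeed.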

    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()

    test(model, crit, dataset, dataset.get_vocab(), opt)
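Example #12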
def main(opt):
    train_dataset = VideoDataset(opt, 'train')
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=opt.batch_size,
                                  shuffle=True)
    opt.vocab_size = train_dataset.get_vocab_size()
    opt.seq_length = train_dataset.seq_length
    val_dataset = VideoDataset(opt, 'val')
    val_dataloader = DataLoader(val_dataset, batch_size=120, shuffle=True)
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size,
                          opt.seq_length,
                          opt.dim_hidden,
                          opt.dim_word,
                          rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size,
                             opt.seq_length,
                             opt.dim_hidden,
                             opt.dim_word,
                             rnn_dropout_p=opt.rnn_dropout_p)
        model = S2VTAttModel(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt.learning_rate_decay_every,
        gamma=opt.learning_rate_decay_rate)
    if not os.path.isdir(opt.checkpoint_path):
        os.mkdir(opt.checkpoint_path)
    train(train_dataloader, val_dataloader, model, crit, optimizer,
          exp_lr_scheduler, opt, rl_crit)
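Example #13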
def train(opt, EncoderRNN, DecoderCNN, Convcap, itow):
    '''
    Training: initialize the models and pass the inputs through the
    Convcap model.
    Output: checkpoint/model.pth (the trained model)
    '''
    t_start = time.time()
    train_data = VideoDataset(opt, 'train')
    # DataLoader
    train_loader = DataLoader(train_data, batch_size=opt["batch_size"],
                              num_workers=opt['num_workers'], shuffle=True)
    print('[DEBUG] Loading train data ... %f secs' % (time.time() - t_start))
    

    # Initialize the encoder, decoder, and combined Convcap model.
    encoder = EncoderRNN.EncoderRNN(opt['dim_vid'], opt['dim_hidden'],
                                    bidirectional=opt['bidirectional'],
                                    rnn_cell=opt['rnn_type']).cuda()
    decoder = DecoderCNN.DecoderCNN(train_data.get_vocab_size()).cuda()
    convcap = Convcap.Convcap(encoder, decoder).cuda()


    # Hyperparameters, optimizer, and learning-rate schedule.
    optimizer = optim.RMSprop(convcap.parameters(), lr=opt["learning_rate"])
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=opt["learning_rate_decay_every"],
                                    gamma=opt["learning_rate_decay_rate"])
    batchsize = opt['batch_size']
    nbatches = len(train_data) // batchsize
    max_tokens = opt['max_len']

    itr = 0
    loss_graph = []
    graph_x = []
    for epoch in range(opt['epochs']):
        loss_train = 0.

        for data in train_loader:
            itr += 1
            vid_feat = Variable(data['c3d_feats']).cuda()
            labels = Variable(data['labels'].type(torch.LongTensor)).cuda()
            mask = Variable(data['masks']).cpu()
            word_embed = Variable(data['word_embed']).cuda()
            cap = data['cap']

            optimizer.zero_grad()
            wordact = convcap(vid_feat, labels, word_embed, 'train')
            # Drop the final time step of the predictions and the <sos> column
            # of the labels so positions align for next-word prediction.
            wordact = wordact[:, :, :-1]
            labels = labels[:, 1:]
            mask = mask[:, 1:].contiguous()
            wordact_t = wordact.permute(0, 2, 1).contiguous().view(
                batchsize * (max_tokens - 1), -1)
            wordclass_t = labels.contiguous().view(
                batchsize * (max_tokens - 1), 1)

            # Compute cross-entropy only over unmasked (real) tokens.
            maskids = torch.nonzero(mask.view(-1)).numpy().reshape(-1)
            loss = F.cross_entropy(
                wordact_t[maskids, ...],
                wordclass_t[maskids, ...].contiguous().view(maskids.shape[0]))
            # Debug visualization: print the argmax word at each position.
            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            wordids = np.argmax(wordprobs, axis=1)
            for i in wordids:
                print(itow[i])
            print(cap)

            if itr % 500 == 0:
                graph_x.append(itr)
                loss_graph.append(loss.item())

            loss_train = loss_train + loss.item()
            loss.backward()
            optimizer.step()
            print("running loss " + str(loss_train))
            
        
        loss_train = loss_train / nbatches
        print('[DEBUG] Training epoch %d has loss %f' % (epoch, loss_train))
        # Step the LR schedule once per epoch, after the optimizer updates.
        scheduler.step()

        modelfn = osp.join(opt['checkpoint_path'], 'model_j_19_' + str(itr) + '.pth')
        torch.save({
            'epoch': epoch,
            'state_dict': convcap.state_dict(),
            'optimizer': optimizer.state_dict(),
            'loss': loss_train
        }, modelfn)
        print('time for epoch %f' % (time.time() - t_start))
    plt.plot(graph_x, loss_graph, 'ro')
    plt.show()
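Example #14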
def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # # Audio encoder (disabled)
        # encoder_voice = EncoderRNN(
        #     opt["dim_voice"],
        #     opt["dim_hidden"],
        #     bidirectional=opt["bidirectional"],
        #     input_dropout_p=opt["input_dropout_p"],
        #     rnn_cell=opt['rnn_type'],
        #     rnn_dropout_p=opt["rnn_dropout_p"])
        # Sign-language (hand) encoder
        if opt['with_hand'] == 1:
            encoder_hand = EncoderRNN(opt["dim_hand"],
                                      opt["dim_hand_hidden"],
                                      bidirectional=opt["bidirectional"],
                                      input_dropout_p=opt["input_dropout_p"],
                                      rnn_cell=opt['rnn_type'],
                                      rnn_dropout_p=opt["rnn_dropout_p"])
            decoder = DecoderRNN(opt["vocab_size"],
                                 opt["max_len"],
                                 opt["dim_hidden"] + opt["dim_hand_hidden"],
                                 opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, encoder_hand, decoder)
        else:
            decoder = DecoderRNN(opt["vocab_size"],
                                 opt["max_len"],
                                 opt["dim_hidden"],
                                 opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, None, decoder)
        # model = S2VTAttModel(encoder, encoder_voice, encoder_hand, decoder)

    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
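Example #15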
def test(opt, EncoderRNN, DecoderCNN, Convcap, itow, wtoi, modelfn=None):
    '''
    Input: options, model classes, and a saved checkpoint (model.pth)
    Output: evaluation scores
    '''
    t_start = time.time()
    
    test_data = VideoDataset(opt, 'test')
    test_loader = DataLoader(test_data, batch_size=opt["batch_size"],
                             num_workers=30, shuffle=False)
    print('[DEBUG] Loading test data ... %f secs' % (time.time() - t_start))

    batchsize = opt['batch_size']
    max_tokens = opt['max_len']

    if modelfn is not None:
        encoder = EncoderRNN.EncoderRNN(opt['dim_vid'], opt['dim_hidden'],
                                        bidirectional=opt['bidirectional'],
                                        rnn_cell=opt['rnn_type']).cuda()
        decoder = DecoderCNN.DecoderCNN(test_data.get_vocab_size()).cuda()
        convcap = Convcap.Convcap(encoder, decoder).cuda()
        print('[DEBUG] Loading checkpoint %s' % modelfn)
        checkpoint = torch.load(modelfn)
        convcap.load_state_dict(checkpoint['state_dict'])

    convcap.train(False)
    pred_captions = []
    itr = 0
    for data in test_loader:
        itr += 1
        # Print the ground-truth tokens for this batch.
        print("gt:")
        for i in data['labels'].data[0]:
            print(itow[int(i)])

        vid_feat = Variable(data['c3d_feats']).cuda()
        labels = Variable(data['labels'].type(torch.LongTensor)).cuda()
        mask = Variable(data['masks']).cpu()
        word_embed = Variable(data['word_embed']).cuda()
        vid_id = data['video_ids']
        # Seed every row of the feed with the <sos> token index.
        wordclass_feed = np.zeros((batchsize, max_tokens), dtype='int64')
        wordclass_feed[:, 0] = wtoi['<sos>']
        outcaps = np.empty((batchsize, 0)).tolist()
        x_outcaps=np.empty((batchsize, 0)).tolist()
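        # Greedy decoding: at each step, feed the argmax word back in as input.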
        for j in range(max_tokens - 1):
            wordclass = Variable(torch.from_numpy(wordclass_feed)).cuda()
            wordact = convcap(vid_feat, wordclass, word_embed, 'test')
            # Teacher-forced pass with the ground-truth labels, for comparison.
            x = convcap(vid_feat, labels, word_embed, 'test')
            x = x[:, :, :-1]
            x_t = x.permute(0, 2, 1).contiguous().view(batchsize * (max_tokens - 1), -1)
            x_prob = F.softmax(x_t, dim=1).cpu().data.numpy()

            wordact = wordact[:, :, :-1]
            wordact_t = wordact.permute(0, 2, 1).contiguous().view(batchsize * (max_tokens - 1), -1)
            wordprobs = F.softmax(wordact_t, dim=1).cpu().data.numpy()
            x_id = np.argmax(x_prob, axis=1)
            wordids = np.argmax(wordprobs, axis=1)
            
            for k in range(batchsize):
                word = itow[wordids[j + k * (max_tokens - 1)]]
                x_word = itow[x_id[j + k * (max_tokens - 1)]]
                outcaps[k].append(word)
                x_outcaps[k].append(x_word)
                if j < max_tokens - 1:
                    wordclass_feed[k, j + 1] = wordids[j + k * (max_tokens - 1)]
        for j in range(batchsize):
            num_words = len(outcaps[j])
            x_n_words = len(x_outcaps[j])
            if 'eos' in x_outcaps[j]:
                x_n_words = x_outcaps[j].index('eos')
            x_outcap = ' '.join(x_outcaps[j][:x_n_words])
            if 'eos' in outcaps[j]:
                num_words = outcaps[j].index('eos')
            outcap = ' '.join(outcaps[j][:num_words])
            pred_captions.append({'vid_id': vid_id[0][5:], 'caption': outcap})
        print("------------------------------------------------------------------------------")
        print("videoID \t" + str(vid_id))
        print("caption:")
        print(x_outcap)
        print("------------------------------------------------------------------------------")
        
        
    scores = language_eval(pred_captions, '/home/sanjay/Documents/Video_convcap/output', 'test')

    return scores
Example #16
    for line in fp:
        video_id.append(line.strip())
    fp.close()

    f = open(output_path, 'w')
    for i in range(len(results)):
        f.write(video_id[i] + ',' + results[i] + '\n')
    f.close()


dim_vid = 4096
dim_hidden = 512
dim_word = 512

dataset = VideoDataset('generate', folder_path)
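# NOTE: unlike the earlier examples, this VideoDataset call takes the mode
# first and a folder path second.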
vocab_size = dataset.get_vocab_size()
seq_length = 25

encoder = Encoder(dim_vid, dim_hidden)
decoder = Decoder(vocab_size,
                  seq_length,
                  dim_hidden,
                  dim_word,
                  rnn_dropout_p=0.2)
model = Model(encoder, decoder).cuda()

model = nn.DataParallel(model)
model.load_state_dict(torch.load('./good_model.pth'))
model.eval()
test(model, dataset, dataset.get_vocab())