def main(opt): dataset = VideoDataset(opt, "test") opt["vocab_size"] = dataset.get_vocab_size() opt["seq_length"] = dataset.max_len if opt["model"] == 'S2VTModel': model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], rnn_dropout_p=opt["rnn_dropout_p"]).cuda() elif opt["model"] == "S2VTAttModel": encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=opt["bidirectional"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"]) model = S2VTAttModel(encoder, decoder).cuda() model = nn.DataParallel(model) # Setup the model model.load_state_dict(torch.load(opt["saved_model"])) crit = utils.LanguageModelCriterion() get_caption(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt):
    dataset = VideoDataset(opt, 'val', 'chinese')
    # hard-coded combined vocabulary size
    opt["vocab_size"] = 13491  # dataset.get_vocab_size() + chinDataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    # Set up the model; the checkpoint is loaded onto the CPU
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=torch.device('cpu')))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)

def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        # beam search is only supported with a batch size of 1
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return
    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)
    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)
    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)
            for sent in sents:
                print(sent)

def main(opt): dataset = VideoDataset(opt, 'train') dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True) opt["vocab_size"] = dataset.get_vocab_size() encoder = EncoderRNN( opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]), input_dropout_p=opt["input_dropout_p"], rnn_cell=opt['rnn_type'], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN( opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], input_dropout_p=opt["input_dropout_p"], rnn_cell=opt['rnn_type'], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"])) model = S2VTAttModel(encoder, decoder) #model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'], rnn_dropout_p=opt["rnn_dropout_p"]) #model = model.cuda() crit = utils.LanguageModelCriterion() rl_crit = utils.RewardCriterion() optimizer = optim.Adam( model.parameters(), lr=opt["learning_rate"], weight_decay=opt["weight_decay"]) exp_lr_scheduler = optim.lr_scheduler.StepLR( optimizer, step_size=opt["learning_rate_decay_every"], gamma=opt["learning_rate_decay_rate"]) train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
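# A minimal sketch of how this training entry point might be invoked. The
# option names mirror the keys read inside main(); every concrete value below
# is an illustrative assumption, and VideoDataset will typically need extra
# path options (feature directory, captions/info file) that are not shown here.
if __name__ == '__main__':
    opt = {
        "batch_size": 128,
        "dim_vid": 2048,                      # size of the pre-extracted video features
        "dim_hidden": 512,
        "dim_word": 512,
        "max_len": 28,                        # maximum caption length
        "bidirectional": 1,
        "input_dropout_p": 0.2,
        "rnn_type": "gru",
        "rnn_dropout_p": 0.5,
        "learning_rate": 4e-4,
        "weight_decay": 0,
        "learning_rate_decay_every": 200,
        "learning_rate_decay_rate": 0.8,
    }
    main(opt)
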
def main(opt):
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                          opt.dim_word, rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                             opt.dim_word, rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    model = nn.DataParallel(model)
    # Set up the model
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)

def main(opt): video_path = opt["video_path"] os.environ['CUDA_VISIBLE_DEVICES'] = '0' image_feats = extract_image_feats(video_path) image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0) encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], bidirectional=bool(opt["bidirectional"]), input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=bool(opt["bidirectional"])) model = S2VTAttModel(encoder, decoder).cuda() model.load_state_dict(torch.load(opt["saved_model"])) model.eval() opt = dict() opt['child_sum'] = True opt['temporal_attention'] = True opt['multimodel_attention'] = True with torch.no_grad(): _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt) vocab = json.load(open('data/info.json'))['ix_to_word'] sent = NLUtils.decode_sequence(vocab, seq_preds) print(sent)
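# A hypothetical command-line wrapper for the single-video entry point above,
# covering only the option keys that main() actually reads; the argument names
# and default values are illustrative, not taken from the original project.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--video_path', type=str, required=True)
    parser.add_argument('--saved_model', type=str, required=True)
    parser.add_argument('--dim_vid', type=int, default=2048)
    parser.add_argument('--dim_hidden', type=int, default=512)
    parser.add_argument('--dim_word', type=int, default=512)
    parser.add_argument('--max_len', type=int, default=28)
    parser.add_argument('--bidirectional', type=int, default=0)
    parser.add_argument('--input_dropout_p', type=float, default=0.2)
    parser.add_argument('--rnn_dropout_p', type=float, default=0.5)
    main(vars(parser.parse_args()))
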
def main(opt):
    opt_test = opt  # note: an alias of opt, not a copy
    test_dataset = VideoDataset(opt_test, 'test')
    opt_test["vocab_size"] = test_dataset.get_vocab_size()
    opt_test["seq_length"] = test_dataset.max_len
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = Two_Lstm(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
                             # bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, second_lstm, decoder)
    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit,
          opt_test, test_dataset)

def main(opt): dataset = VideoDataset(opt, 'train') dataloader = DataLoader(dataset, batch_size=opt["batch_size"], num_workers=8, shuffle=True) opt["vocab_size"] = dataset.get_vocab_size() if opt["model"] == 'S2VTModel': model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'], bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"]).cuda() elif opt["model"] == "S2VTAttModel": encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'], bidirectional=opt["bidirectional"], input_dropout_p=opt["input_dropout_p"], rnn_cell=opt['rnn_type'], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], n_layers=opt['num_layers'], input_dropout_p=opt["input_dropout_p"], rnn_cell=opt['rnn_type'], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"]) model = S2VTAttModel(encoder, decoder).cuda() crit = utils.LanguageModelCriterion() rl_crit = utils.RewardCriterion() optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"], weight_decay=opt["weight_decay"]) exp_lr_scheduler = optim.lr_scheduler.StepLR( optimizer, step_size=opt["learning_rate_decay_every"], gamma=opt["learning_rate_decay_rate"]) model.load_state_dict( torch.load( "C:\\Users\\Shumpu\\VideoCaptioningAttack\\video_caption_pytorch\\save\\vgg16_model_460.pth" )) train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
def main(self, opt):
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # read the video path from the GUI entry field and normalize to Windows separators
    video_path = self.ent1.get().replace("/", "\\")
    image_feats = self.extract_image_feats(video_path)
    image_feats = torch.from_numpy(image_feats).type(
        torch.FloatTensor).unsqueeze(0)
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    # 16860 is the hard-coded vocabulary size expected by the saved checkpoint
    decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder).cuda()
    model.load_state_dict(torch.load("data/save/model_500.pth"))
    model.eval()
    # attention flags passed to the model at inference time (a fresh dict, so
    # the function argument is not shadowed)
    inference_opt = dict()
    inference_opt['child_sum'] = True
    inference_opt['temporal_attention'] = True
    inference_opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats.cuda(), mode='inference',
                             opt=inference_opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    self.sent = NLUtils.decode_sequence(vocab, seq_preds)
    # translate the predicted caption into Indonesian and display both versions
    hasil = self.translator.translate(self.sent[0], dest='id')
    print(self.sent[0])
    self.hasilPred.configure(text=self.sent[0])
    self.hasiltrans.configure(text=hasil.text)
    # coba = self.sent[0]
    self.textToSpeech(self.sent[0], hasil.text)
    del seq_preds
    torch.cuda.empty_cache()

def main(opt): dataset = VideoDataset(opt, "test") opt["vocab_size"] = dataset.get_vocab_size() opt["seq_length"] = dataset.max_len if opt['beam_size'] != 1: assert opt["batch_size"] == 1 if opt["model"] == 'S2VTModel': model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"]) elif opt["model"] == "S2VTAttModel": encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"]) model = S2VTAttModel(encoder, decoder) else: return if torch.cuda.device_count() > 1: print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count())) model = nn.DataParallel(model) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") model.to(device) # Setup the model model.load_state_dict(torch.load(opt["saved_model"])) crit = utils.LanguageModelCriterion() test(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt):
    dataset_test = VideoDataset(opt, 'test')
    dataloader_test = DataLoader(dataset_test, batch_size=opt["batch_size"],
                                 shuffle=False)
    opt["obj_vocab_size"] = dataset_test.get_obj_vocab_size()
    opt["rel_vocab_size"] = dataset_test.get_rel_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["obj_vocab_size"], opt["rel_vocab_size"],
                             opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    model = model.cuda()
    model.load_state_dict(torch.load(opt['ckpt_path']))
    crit = utils.ObjRelCriterion()
    test(model, crit, opt, dataloader_test)

def main(opt):
    train_dataset = VideoDataset(opt, 'train')
    train_dataloader = DataLoader(train_dataset, batch_size=opt.batch_size,
                                  shuffle=True)
    opt.vocab_size = train_dataset.get_vocab_size()
    opt.seq_length = train_dataset.seq_length
    val_dataset = VideoDataset(opt, 'val')
    val_dataloader = DataLoader(val_dataset, batch_size=120, shuffle=True)
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                          opt.dim_word, rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                             opt.dim_word, rnn_dropout_p=opt.rnn_dropout_p)
        model = S2VTAttModel(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt.learning_rate_decay_every,
        gamma=opt.learning_rate_decay_rate)
    if not os.path.isdir(opt.checkpoint_path):
        os.mkdir(opt.checkpoint_path)
    train(train_dataloader, val_dataloader, model, crit, optimizer,
          exp_lr_scheduler, opt, rl_crit)

def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    elif opt["model"] == "CTCmodel":
        # input_dim, hidden_dim, output_dim, num_layers, biFlag, dropout=0.5
        # model = CTCmodel(opt["dim_vid"], opt["dim_hidden"], opt["vocab_size"] + 1)
        model = CTCmodel(opt['vocab_size'], opt['dim_hidden'])
    elif opt["model"] == "CTC_Hieratical_LSTM":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        second_lstm = two_lstm(
            opt["dim_hidden"] * 2,
            opt['vocab_size'],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        model = CTC_Hieratical_LSTM(encoder, second_lstm, opt['vocab_size'],
                                    opt['dim_word'], opt['dim_hidden'],
                                    opt['duration'], opt['video_duration'])
    # model = model.cuda()
    # crit = utils.LanguageModelCriterion()
    # rl_crit = utils.RewardCriterion()
    ctc_loss = nn.CTCLoss(reduction='mean')
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    if opt['lr_schluder'] == 'StepLR':
        lr_scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=opt["learning_rate_decay_every"],
            gamma=opt["learning_rate_decay_rate"])
    elif opt['lr_schluder'] == 'ReduceLROnPlateau':
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.1, patience=opt['patience'],
            verbose=True, threshold_mode='rel', threshold=opt['threshold'],
            cooldown=0, min_lr=opt['min_lr'], eps=1e-8)
    else:
        raise NotImplementedError('Only ReduceLROnPlateau and StepLR are implemented')
    opt['check_bool'] = False
    if opt['check_bool']:
        check_path = os.path.join(opt['check_path'], 'model_10.pth')
        model.load_state_dict(torch.load(check_path))
        opt['root_model_path'] = opt['check_path']
        print('loaded model weights from:', check_path)
        # TODO: resume training from this checkpoint
        val(model, ctc_loss, opt)
    else:
        # use a single timestamp so the opt JSON and the model directory match
        timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time()))
        root_model_path = os.path.join(opt['checkpoint_path'], timestamp)
        opt_json = os.path.join(root_model_path, 'opt_info.json')
        opt['root_model_path'] = root_model_path
        if not os.path.isdir(opt["checkpoint_path"]):
            os.mkdir(opt["checkpoint_path"])
        if not os.path.isdir(root_model_path):
            os.mkdir(root_model_path)
        with open(opt_json, 'w') as f:
            json.dump(opt, f)
        print('saved opt details to %s' % opt_json)
        train(dataloader, model, ctc_loss, optimizer, lr_scheduler, opt)

def __init__(self, opt):
    # tutorials/09 - Image Captioning
    # Build the models
    self.grad_clip = opt.grad_clip
    self.img_enc = EncoderImage(opt.data_name, opt.img_dim, opt.embed_size,
                                opt.finetune, opt.cnn_type,
                                use_abs=opt.use_abs,
                                no_imgnorm=opt.no_imgnorm)
    self.txt_enc = EncoderText(opt.vocab_size, opt.word_dim, opt.embed_size,
                               opt.num_layers, use_abs=opt.use_abs)
    if torch.cuda.is_available():
        self.img_enc.cuda()
        self.txt_enc.cuda()
        cudnn.benchmark = True
    # captioning elements
    self.encoder = EncoderRNN(
        opt.dim_vid,
        opt.dim_hidden,
        bidirectional=opt.bidirectional,
        input_dropout_p=opt.input_dropout_p,
        rnn_cell=opt.rnn_type,
        rnn_dropout_p=opt.rnn_dropout_p)
    self.decoder = DecoderRNN(
        opt.vocab_size,
        opt.max_len,
        opt.dim_hidden,
        opt.dim_word,
        input_dropout_p=opt.input_dropout_p,
        rnn_cell=opt.rnn_type,
        rnn_dropout_p=opt.rnn_dropout_p,
        bidirectional=opt.bidirectional)
    self.caption_model = S2VTAttModel(self.encoder, self.decoder)
    self.crit = utils.LanguageModelCriterion()
    self.rl_crit = utils.RewardCriterion()
    if torch.cuda.is_available():
        self.caption_model.cuda()
    # Loss and optimizer
    self.criterion = ContrastiveLoss(margin=opt.margin,
                                     measure=opt.measure,
                                     max_violation=opt.max_violation)
    params = list(self.txt_enc.parameters())
    params += list(self.img_enc.parameters())
    params += list(self.decoder.parameters())
    params += list(self.encoder.parameters())
    # note: caption_model wraps the same encoder/decoder modules, so these
    # parameters overlap with the ones already added above
    params += list(self.caption_model.parameters())
    if opt.finetune:
        params += list(self.img_enc.cnn.parameters())
    self.params = params
    self.optimizer = torch.optim.Adam(params, lr=opt.learning_rate)
    self.Eiters = 0

def main(opt):
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # # audio encoder
        # encoder_voice = EncoderRNN(
        #     opt["dim_voice"],
        #     opt["dim_hidden"],
        #     bidirectional=opt["bidirectional"],
        #     input_dropout_p=opt["input_dropout_p"],
        #     rnn_cell=opt['rnn_type'],
        #     rnn_dropout_p=opt["rnn_dropout_p"])
        # sign-language (hand) encoder
        if opt['with_hand'] == 1:
            encoder_hand = EncoderRNN(opt["dim_hand"], opt["dim_hand_hidden"],
                                      bidirectional=opt["bidirectional"],
                                      input_dropout_p=opt["input_dropout_p"],
                                      rnn_cell=opt['rnn_type'],
                                      rnn_dropout_p=opt["rnn_dropout_p"])
            decoder = DecoderRNN(opt["vocab_size"], opt["max_len"],
                                 opt["dim_hidden"] + opt["dim_hand_hidden"],
                                 opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, encoder_hand, decoder)
        else:
            decoder = DecoderRNN(opt["vocab_size"], opt["max_len"],
                                 opt["dim_hidden"], opt["dim_word"],
                                 input_dropout_p=opt["input_dropout_p"],
                                 rnn_cell=opt['rnn_type'],
                                 rnn_dropout_p=opt["rnn_dropout_p"],
                                 bidirectional=opt["bidirectional"])
            model = S2VTAttModel(encoder, None, decoder)
        # model = S2VTAttModel(encoder, encoder_voice, encoder_hand, decoder)
    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    # print(dataloader)
    # print(crit)
    # print(optimizer)
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)