def main(args):
    """Train the English-to-Chinese encoder-decoder model.

    Loads the parallel corpora, builds both vocabularies from the training
    split, encodes the sentences as word indices, batches them and runs
    ``args.num_epochs`` epochs of Adam updates.

    Args:
        args: Namespace providing ``train_file``, ``dev_file``,
            ``batch_size``, ``learning_rate`` and ``num_epochs``. The fields
            ``num_train``, ``num_dev``, ``en_total_words`` and
            ``cn_total_words`` are written back onto it.
    """
    # Load sentences (English and Chinese).
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    # Vocabularies are built from the training split only.
    en_dict, en_total_words = utils.build_dict(train_en)
    cn_dict, cn_total_words = utils.build_dict(train_cn)
    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    # Encode the words into numbers.
    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)

    # Convert the train and dev data into numpy matrices
    # (batch_size * seq_length).
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)
    # NOTE(review): the dev batches are prepared but never evaluated here.
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)  # noqa: F841

    model = models.EncoderDecoderModel()
    crit = utils.LanguageModelCriterion()
    # nn.Module exposes ``parameters()``; the former ``parameter()`` call
    # raised AttributeError before training could start.
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    for _ in range(args.num_epochs):
        for mb_x, mb_x_mask, mb_y, mb_y_mask in train_data:
            # Convert numpy ndarrays to PyTorch Variables.
            batch_size = mb_x.shape[0]
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            hidden = model.init_hidden(batch_size)

            # The decoder input is the target without its last token and the
            # expected output is the target without its first token.
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:])).long()

            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

            # Calculate the loss and update the model.
            loss = crit(mb_pred, mb_out, mb_out_mask)
            optimizer.zero_grad()  # zero the previous gradient
            loss.backward()        # calculate gradient
            optimizer.step()       # gradient descent
def main(args):
    """Translate and score the test set with a saved model.

    Loads the pickled vocabularies and the serialized model, encodes and
    batches ``args.test_file``, runs ``translate`` on it and prints the
    per-word loss, the accuracy and the word count reported by ``eval``.

    Args:
        args: Namespace providing ``vocab_file``, ``model_file``,
            ``test_file``, ``batch_size`` and ``use_cuda``. The fields
            ``en_total_words``, ``cn_total_words`` and ``num_test`` are
            written back onto it.

    Raises:
        SystemExit: With status -1 when the vocabulary or model file is
            missing.
    """
    if not os.path.isfile(args.vocab_file):
        print("vocab file does not exist!")
        raise SystemExit(-1)
    # NOTE(review): pickle executes arbitrary code on load; only point
    # ``vocab_file`` at trusted files.
    with open(args.vocab_file, "rb") as vocab_fh:
        en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(vocab_fh)

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words
    # Index -> word lookups.
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}

    if not os.path.isfile(args.model_file):
        print("model file does not exist!")
        raise SystemExit(-1)
    model = torch.load(args.model_file)
    if args.use_cuda:
        model = model.cuda()

    crit = utils.LanguageModelCriterion()

    test_en, test_cn = utils.load_data(args.test_file)
    args.num_test = len(test_en)
    test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict)
    test_data = utils.gen_examples(test_en, test_cn, args.batch_size)

    translate(model, test_data, en_dict, inv_en_dict, cn_dict, inv_cn_dict)

    # ``eval`` is the project's evaluation routine, not the builtin.
    correct_count, loss, num_words = eval(model, test_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("test loss %s" % (loss))
    print("test accuracy %f" % (acc))
    print("test total number of words %f" % (num_words))
def main(opt):
    """Run a windowed targeted attack on a video captioner and re-check it.

    Steps:
      1. Build the captioning model selected by ``opt["model"]`` and wrap it
         with a convolutional front end (``ConvS2VT``).
      2. Caption the clean video.
      3. Attack the video window by window (``BATCH_SIZE`` frames at a time)
         towards a fixed target caption.
      4. Write the adversarial frames to ``<video>_adversarialWINDOW.avi``.
      5. Read the file back and print the captions produced for the
         in-memory and the re-loaded frames.

    Args:
        opt: Option dictionary. Reads ``beam_size``, ``batch_size``,
            ``model``, the model hyper-parameters and ``videos``; writes
            ``vocab_size`` and ``seq_length``.

    Returns:
        None. Returns early without doing anything when ``opt["model"]`` is
        not one of the two supported model names.
    """
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # Attack configuration.
    # Earlier settings: lr 0.005 and dimensions 224, c was 100.
    # Best was 0.06 lr, c = 1 for show and fool.
    config = {
        "c": 10000,
        "learning_rate": 0.2,
        "num_iterations": 1000,
        "input_shape": (224, 224),
        "num_frames": 288,
        "dimensions": 224,
        "k": 0.1,
        # Alternative: "showandfool"
        "attack_algorithm": "carliniwagner",
    }

    # Alternatives that were tried: 'nasnetalarge', 'resnet152'.
    convnet = 'vgg16'
    full_decoder = ConvS2VT(convnet, model, opt)

    # Layer freezing experiment.
    # Top 10 contributing layers:
    #   conv.cell_stem_1.comb_iter_0_right.separable_1.depthwise_conv2d.weight
    #   conv.cell_stem_1.comb_iter_2_right.separable_2.depthwise_conv2d.weight
    #   conv.cell_stem_1.comb_iter_1_right.separable_1.depthwise_conv2d.weight
    #   conv.cell_16.comb_iter_4_left.separable_1.depthwise_conv2d.weight
    #   conv.cell_17.comb_iter_4_left.separable_1.depthwise_conv2d.weight
    #   conv.cell_16.comb_iter_4_left.separable_1.pointwise_conv2d.weight
    #   conv.cell_13.comb_iter_4_left.bn_sep_1.weight
    #   conv.reduction_cell_0.conv_prev_1x1.bn.weight
    #   conv.cell_17.comb_iter_4_left.separable_2.depthwise_conv2d.weight
    #   conv.cell_13.comb_iter_0_left.bn_sep_1.weight
    with open("top_layers.txt", "r") as top:
        top_layers = top.readlines()
    print(top_layers)

    # Set the gradients on the layers you don't want to contribute to 0.
    # NOTE(review): the list read from top_layers.txt is discarded here, so
    # every parameter is treated as "not a top layer". Confirm whether this
    # override is a deliberate switch or a leftover.
    top_layers = []
    for name, parameters in full_decoder.named_parameters():
        keep = any(name in entry for entry in top_layers)
        if not keep:
            # The attribute is ``requires_grad``; the previous spelling
            # ``require_grad`` only created an unused attribute and froze
            # nothing.
            parameters.requires_grad = False
            if parameters.grad is not None:
                print(name)
                parameters.grad.data.zero_()

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    # Captions of all other test/val videos. Only needed when a random target
    # is drawn with ``np.random.choice(viable_target_captions)``.
    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        viable_target_captions.extend(
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions'])

    # Target captions. Original set of five:
    #   <sos> A person is typing into a laptop computer <eos>
    #   <sos> A boy is kicking a soccer ball into the goal <eos>
    #   <sos> Someone is frying fish <eos>
    #   <sos> A dog is running with a ball <eos>
    #   <sos> The cat approaches on grass <eos>
    captions = {
        1: '<sos> A woman is talking <eos>',
        2: '<sos> A boy is kicking a soccer ball into the goal <eos>',
        3: '<sos> A man is frying fish <eos>',
        4: '<sos> A dog is running with a ball <eos>',
        5: '<sos> A cat is walking on grass <eos>',
    }

    # Video 1 ('RSx5G0_xH48_12_17.avi') doesn't work.
    # 2 was too high res or something: X6uJyuD_Zso_3_17.avi was replaced with
    # nc8hwLaOyZU_1_19.avi.
    # 5 ('ceOXCFUmxzA_100_110.avi') ran out of memory and was replaced with
    # 'X7sQq-Iu1gQ_12_22'.
    videos = {
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi',
    }

    # NOTE(review): this hard-coded path overrides opt['videos'][0] above.
    video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\' + videos[
        2]

    # Just switch the number to get a target caption.
    target_caption = captions[1]

    # Should use the original caption function we use in the attack because
    # the scaling is slightly different.
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        original_caption = sents[0]

    # Number of frames to attack. The clip length divided by 96 gives 3
    # frames; it is hard-coded here.
    length = 3
    print("Total number of frames: {}".format(length))

    adv_frames = []
    iteration = 1
    frame_counter = 0
    total_iterations = np.ceil(length / BATCH_SIZE)

    # The attacked model is full_decoder.
    optimizer = ['Adam', (0.9, 0.999)]
    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    while frame_counter < length:
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1

        # NOTE(review): the final partial window ends at ``length`` while a
        # full window ends at ``start + BATCH_SIZE - 1``; confirm which end
        # convention S2VT_Attack expects.
        is_tail = length - frame_counter < BATCH_SIZE
        if is_tail:
            window = [frame_counter, length]
            # The tail branch advances the counter before reporting it.
            frame_counter = length
        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]

        print("Using frames {}".format(window))
        print("Frame counter at: {}\nTotal length is: {}\n".format(
            frame_counter, length))

        attack_package = S2VT_Attack(model=full_decoder,
                                     video_path=video_path,
                                     target=target_caption,
                                     dataset=dataset,
                                     config=config,
                                     optimizer=optimizer,
                                     crit=crit,
                                     seq_decoder=seq_decoder,
                                     window=window)
        carlini = Attack(attack_package=attack_package)
        finished_frames = carlini.execute(functional=True)
        adv_frames.append(finished_frames.detach().cpu().numpy())

        if not is_tail:
            frame_counter = frame_counter + BATCH_SIZE

    # Derive "<dir>/<name>_adversarialWINDOW.avi" from the input path.
    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    print("\nSaving to: {}".format(adv_path))
    adv_frames = np.concatenate(adv_frames, axis=0)

    # huffyuv is lossless. r10k is really good. Other candidates that were
    # tried: libx264 (h.264, with -crf 0 / -preset ultrafast), r210.
    writer = skvideo.io.FFmpegWriter(
        adv_path,
        outputdict={
            '-c:v': 'huffyuv',
        })
    try:
        for f in adv_frames:
            writer.writeFrame(f)
    finally:
        # Always release the ffmpeg process, even if a frame write fails.
        writer.close()

    # Recorded measurements per codec:
    #   ffv1     0.215807946043995
    #   huffyuv  0.21578424050191813
    #   libx264  0.2341074901578537
    #   r210     -0.7831487262059795, -0.7833399258537526
    #   gif      0.6889478809555243
    #   png      0.2158991440582696 0.21616862708842177
    #   qtrle    0.21581286337807626
    #   flashsv  0.21610510459932186 0.21600030673323545
    #   ffvhuff  0.21620682250167533
    #   r10k     similar to r210
    #   rawvideo 0.21595001

    with torch.no_grad():
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)
        frames = np.float32(frames)
        plt.imshow(frames[0] / 255.)
        plt.show()

        # What was lost by the save/load round trip; kept on disk for
        # inspection and added back onto the decoded frames.
        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        frames = frames + difference

        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
# Move the network to the GPU and wrap it in DataParallel over every visible
# CUDA device.
if use_cuda:
    net.cuda()
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
# Optionally resume from "<opt.start_from>/model.pth".
if vars(opt).get('start_from', None) is not None:
    state_dict = torch.load(os.path.join(opt.start_from, 'model.pth'))
    if not use_cuda:
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            # Drops the first 7 characters of every key. Presumably this is
            # the "module." prefix that DataParallel adds when the checkpoint
            # was saved from a wrapped model; TODO confirm against the saver.
            name = k[7:]
            new_state_dict[name] = v
        state_dict = new_state_dict
    net.load_state_dict(state_dict)
criterion = utils.LanguageModelCriterion()


def test():
    """Iterate over ``dataloader`` with ``net`` in eval mode.

    For each batch the ``fc`` and ``att`` tensors are stacked
    ``opt.seq_per_img`` times and the two leading dimensions are merged; the
    two leading dimensions of ``labels`` are merged as well.

    NOTE(review): the visible body stops after computing ``origin_labels``;
    ``min_loss`` and the module-level ``criterion`` are not used in the part
    shown here, so the remainder of this function may be missing.
    """
    net.eval()
    # Progress bar over (batch index, batch) pairs.
    loader = tqdm(enumerate(dataloader), total=len(dataloader), ascii=True)
    min_loss = 1e9
    for batch_idx, (fc, att, labels, data_info) in loader:
        if use_cuda:
            fc, att, labels = fc.cuda(), att.cuda(), labels.cuda()
        fc, att, labels = Variable(fc, requires_grad=False), Variable(att, requires_grad=False), Variable(labels, requires_grad=False)
        # Repeat each feature tensor seq_per_img times along a new leading
        # axis, then flatten that axis into the batch axis.
        fc = torch.stack([fc]*opt.seq_per_img).view(-1, *fc.shape[1:])
        att = torch.stack([att]*opt.seq_per_img).view(-1, *att.shape[1:])
        origin_labels = labels.view(-1, *labels.shape[2:])
def main(args):
    """Train the translation model, caching the vocabulary on disk.

    The vocabularies are loaded from ``args.vocab_file`` when it exists and
    are otherwise built from the training split and pickled there. The model
    is resumed from ``args.model_file`` when present, else created from
    ``args.model``. After every epoch the word-averaged training loss is
    printed.

    Args:
        args: Namespace providing ``train_file``, ``dev_file``,
            ``vocab_file``, ``model_file``, ``model``, ``batch_size``,
            ``use_cuda``, ``learning_rate``, ``optimizer`` (name of a class in
            ``torch.optim``) and ``num_epochs``. The fields ``num_train``,
            ``num_dev``, ``en_total_words`` and ``cn_total_words`` are written
            back onto it.

    Raises:
        ValueError: If no checkpoint exists and ``args.model`` is not a
            supported model name.
    """
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    if os.path.isfile(args.vocab_file):
        # NOTE(review): pickle executes arbitrary code on load; only point
        # ``vocab_file`` at trusted files.
        with open(args.vocab_file, "rb") as vocab_fh:
            en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(
                vocab_fh)
    else:
        en_dict, en_total_words = utils.build_dict(train_en)
        cn_dict, cn_total_words = utils.build_dict(train_cn)
        with open(args.vocab_file, "wb") as vocab_fh:
            pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words],
                        vocab_fh)

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)

    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)
    # NOTE(review): the dev batches are prepared but never evaluated here.
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)  # noqa: F841

    if os.path.isfile(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "EncoderDecoderModel":
        model = EncoderDecoderModel(args)
    else:
        # Previously ``model`` stayed unbound and failed later with NameError.
        raise ValueError("unsupported model: %r" % (args.model,))

    if args.use_cuda:
        model = model.cuda()

    crit = utils.LanguageModelCriterion()
    optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                               lr=args.learning_rate)

    for epoch in range(args.num_epochs):
        np.random.shuffle(train_data)

        total_train_loss = 0.
        total_num_words = 0.
        for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in tqdm(enumerate(train_data)):
            batch_size = mb_x.shape[0]
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            hidden = model.init_hidden(batch_size)

            # The decoder input is the target without its last token and the
            # expected output is the target without its first token.
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

            if args.use_cuda:
                mb_x = mb_x.cuda()
                mb_x_mask = mb_x_mask.cuda()
                mb_input = mb_input.cuda()
                mb_out = mb_out.cuda()
                mb_out_mask = mb_out_mask.cuda()

            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)
            loss = crit(mb_pred, mb_out, mb_out_mask)

            # Weight the batch loss by the number of unmasked target words so
            # the epoch figure is a per-word average.
            # NOTE(review): ``.data[0]`` is the pre-0.4 PyTorch way to read a
            # scalar; on newer releases use ``.item()``.
            num_words = torch.sum(mb_out_mask).data[0]
            total_train_loss += loss.data[0] * num_words
            total_num_words += num_words

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("training loss: %f" % (total_train_loss / total_num_words))
def main(opt):
    """Rank weights by how much their loss gradient changes under attack.

    A forward/backward pass towards a fixed target caption is run once on the
    first ``BATCH_SIZE`` clean frames and once on the frames read from
    ``<video>_adversarialWINDOW.avi``. For every parameter whose name contains
    ``'weight'`` the relative change ``||g_clean - g_adv|| / ||g_clean||`` is
    written to ``s2vt_weightoutput.txt`` as ``"<name>, <value>"`` lines.

    Args:
        opt: Option dictionary. Reads ``beam_size``, ``batch_size``,
            ``model``, the model hyper-parameters and ``videos``; writes
            ``vocab_size`` and ``seq_length``.

    Returns:
        None. Returns early without doing anything when ``opt["model"]`` is
        not one of the two supported model names.
    """

    def _target_label_and_mask():
        """Encode ``target_caption`` as a (1, max_len) label and mask."""
        tokens = target_caption.split(' ')
        if len(tokens) > dataset.max_len:
            tokens = tokens[:dataset.max_len]
            tokens[-1] = '<eos>'
        label = torch.zeros(dataset.max_len).long()
        for j, w in enumerate(tokens):
            label[j] = dataset.word_to_ix[w]

        mask = torch.zeros(dataset.max_len)
        padding = (label == 0).nonzero()
        # The mask covers everything up to and including the first zero
        # label. A caption that fills max_len has no zero label; indexing the
        # empty result used to raise IndexError, so mask the whole row then.
        end = int(padding[0]) + 1 if padding.numel() > 0 else dataset.max_len
        mask[:end] = 1
        return label.unsqueeze(0), mask.unsqueeze(0)

    def _target_gradients(net, frames):
        """Backpropagate the target-caption loss; return name -> gradient."""
        batch = exp_create_batches(frames_to_do=frames, batch_size=BATCH_SIZE)
        feats = net.conv_forward(batch.unsqueeze(0))
        seq_prob, _ = net.encoder_decoder_forward(feats, mode='inference')
        tlabel, tmask = _target_label_and_mask()
        cost = crit(seq_prob, tlabel[:, 1:].cuda(), tmask[:, 1:].cuda())
        net.zero_grad()
        cost.backward()
        # Clone each gradient: both ConvS2VT wrappers share ``model``, so a
        # stored reference would be overwritten in place by the next backward
        # pass and the comparison would always report zero change.
        return {
            name: None if p.grad is None else p.grad.detach().clone()
            for name, p in net.named_parameters()
        }

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # Only "num_frames" is read in this function; the other entries record
    # the settings that were in use.
    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    # Captions of all other test/val videos. Only needed when a random target
    # is drawn with ``np.random.choice(viable_target_captions)``.
    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        viable_target_captions.extend(
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions'])

    # Another target that was used: '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    # Should use the original caption function we use in the attack because
    # the scaling is slightly different.
    # NOTE(review): the clean caption computed here is not used afterwards.
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        original_caption = sents[0]  # noqa: F841

    # /96 gives 3 frames
    length = math.ceil(
        len(skvideo.io.vread(video_path, num_frames=config["num_frames"])) /
        96)
    print("Total number of frames: {}".format(length))

    crit = utils.LanguageModelCriterion()

    # Gradients on the clean frames.
    frames = skvideo.io.vread(video_path)[0:BATCH_SIZE]
    original = torch.tensor(frames).float().cuda()
    original_grads = _target_gradients(full_decoder, original)
    print(len(original_grads.keys()))

    # Gradients on the adversarial frames, using a fresh wrapper around the
    # same caption model.
    full_decoder = ConvS2VT(convnet, model, opt)

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    adv_frames = np.float32(skvideo.io.vread(adv_path))
    adv_frames = torch.tensor(adv_frames).float().cuda()
    adv_grads = _target_gradients(full_decoder, adv_frames)

    print('\n\n\n------')
    for key in adv_grads:
        if 'weight' in key:
            print(key)

    with open("s2vt_weightoutput.txt", "w") as output:
        for key, value in original_grads.items():
            if 'weight' not in key or value is None:
                continue
            adv_weight = adv_grads.get(key)
            if adv_weight is None:
                # No adversarial gradient to compare against.
                continue
            # Tensor norms work on CPU and CUDA tensors alike, whereas
            # np.linalg.norm cannot convert CUDA tensors.
            diff = value - adv_weight
            net_change = (diff.norm() / value.norm()).item()
            output.write("{}, {}\n".format(key, net_change))
def main(args):
    """Train the translation model with periodic dev evaluation.

    The dev set is evaluated once before training and then every
    ``args.eval_epoch`` epochs. The model is saved whenever the dev accuracy
    matches or beats the best so far; otherwise the learning rate is halved
    and the optimizer rebuilt. After training, the test and training sets are
    evaluated and reported.

    Args:
        args: Namespace providing ``train_file``, ``dev_file``, ``test_file``,
            ``vocab_file``, ``model_file``, ``model``, ``batch_size``,
            ``use_cuda``, ``learning_rate``, ``optimizer`` (name of a class in
            ``torch.optim``), ``num_epoches`` and ``eval_epoch``. The fields
            ``num_train``, ``num_dev``, ``num_test``, ``en_total_words`` and
            ``cn_total_words`` are written back onto it.

    Raises:
        ValueError: If no checkpoint exists and ``args.model`` is not a
            supported model name.
    """

    def _report(split, data, show_total=True):
        """Evaluate ``data``, print the metrics and return the accuracy."""
        # ``eval`` is the project's evaluation routine, not the builtin.
        correct_count, loss, num_words = eval(model, data, args, crit)
        loss = loss / num_words
        acc = correct_count / num_words
        print("%s loss %s" % (split, loss))
        print("%s accuracy %f" % (split, acc))
        if show_total:
            print("%s total number of words %f" % (split, num_words))
        return acc

    # Load sentences (English and Chinese words).
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    # Build (or load) the English and Chinese dictionaries.
    if os.path.isfile(args.vocab_file):
        # NOTE(review): pickle executes arbitrary code on load; only point
        # ``vocab_file`` at trusted files.
        with open(args.vocab_file, "rb") as vocab_fh:
            en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(
                vocab_fh)
    else:
        en_dict, en_total_words = utils.build_dict(train_en)
        cn_dict, cn_total_words = utils.build_dict(train_cn)
        with open(args.vocab_file, "wb") as vocab_fh:
            pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words],
                        vocab_fh)

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    # Encode train and dev sentences into indices and batch them.
    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)

    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    if os.path.isfile(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "EncoderDecoderModel":
        model = EncoderDecoderModel(args)
    else:
        # Previously ``model`` stayed unbound and failed later with NameError.
        raise ValueError("unsupported model: %r" % (args.model,))

    if args.use_cuda:
        model = model.cuda()

    crit = utils.LanguageModelCriterion()

    print("start evaluating on dev...")
    best_acc = _report("dev", dev_data)

    learning_rate = args.learning_rate
    optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                               lr=learning_rate)

    for epoch in range(args.num_epoches):
        np.random.shuffle(train_data)

        total_train_loss = 0.
        total_num_words = 0.
        for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in tqdm(enumerate(train_data)):
            batch_size = mb_x.shape[0]

            # Convert numpy ndarrays to PyTorch tensors and variables.
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            hidden = model.init_hidden(batch_size)

            # The decoder input is the target without its last token and the
            # expected output is the target without its first token.
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

            if args.use_cuda:
                mb_x = mb_x.cuda()
                mb_x_mask = mb_x_mask.cuda()
                mb_input = mb_input.cuda()
                mb_out = mb_out.cuda()
                mb_out_mask = mb_out_mask.cuda()

            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)
            loss = crit(mb_pred, mb_out, mb_out_mask)

            # Weight the batch loss by the number of unmasked target words so
            # the epoch figure is a per-word average.
            # NOTE(review): ``.data[0]`` is the pre-0.4 PyTorch way to read a
            # scalar; on newer releases use ``.item()``.
            num_words = torch.sum(mb_out_mask).data[0]
            total_train_loss += loss.data[0] * num_words
            total_num_words += num_words

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("training loss: %f" % (total_train_loss / total_num_words))

        # Evaluate every eval_epoch epochs.
        if (epoch + 1) % args.eval_epoch == 0:
            print("start evaluating on dev...")
            acc = _report("dev", dev_data)

            # Save the model if we have the best accuracy, else decay the LR.
            if acc >= best_acc:
                torch.save(model, args.model_file)
                best_acc = acc
                print("model saved...")
            else:
                learning_rate *= 0.5
                optimizer = getattr(optim, args.optimizer)(
                    model.parameters(), lr=learning_rate)

            print("best dev accuracy: %f" % best_acc)
            print("#" * 60)

    # Load and evaluate the test data.
    test_en, test_cn = utils.load_data(args.test_file)
    args.num_test = len(test_en)
    test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict)
    test_data = utils.gen_examples(test_en, test_cn, args.batch_size)
    _report("test", test_data)

    # Evaluate on train (the word count is not printed for this split).
    _report("train", train_data, show_total=False)
def main(args):
    """Train and evaluate the LSTM language model.

    The dev set is evaluated before training and every ``args.eval_epoch``
    epochs. The model is saved when the dev accuracy improves on the best so
    far, and the learning rate is halved when it drops below the previous
    evaluation. One statistics line per evaluation is written to
    ``args.log_file``, followed at the end by the 20 most common
    (ground truth, predicted) error pairs on the dev set.

    Args:
        args: Namespace providing ``train_file``, ``dev_file``, ``test_file``,
            ``vocab_file``, ``model_file``, ``log_file``, ``batch_size``,
            ``learning_rate``, ``optimizer`` (``"SGD"`` or ``"Adam"``),
            ``num_epoches`` and ``eval_epoch``. The fields ``num_train``,
            ``num_dev``, ``num_test`` and ``vocab_size`` are written back
            onto it.

    Raises:
        ValueError: If ``args.optimizer`` is neither ``"SGD"`` nor ``"Adam"``.
    """

    def _make_optimizer(lr):
        """Build the optimizer named by ``args.optimizer``."""
        if args.optimizer == "SGD":
            return optim.SGD(model.parameters(), lr=lr)
        if args.optimizer == "Adam":
            return optim.Adam(model.parameters(), lr=lr)
        # Previously ``optimizer`` stayed unbound and failed with NameError.
        raise ValueError("unsupported optimizer: %r" % (args.optimizer,))

    def _report(data, prefix):
        """Evaluate ``data``, print the metrics and return (acc, loss)."""
        # ``eval`` is the project's evaluation routine, not the builtin.
        correct_count, loss, num_words = eval(model, data, args, crit)
        loss = loss / num_words
        acc = correct_count / num_words
        print("%sloss %s" % (prefix, loss))
        print("%saccuracy %f" % (prefix, acc))
        print("%stotal number of words %f" % (prefix, num_words))
        return acc, loss

    train_sentences = utils.load_data(args.train_file)
    dev_sentences = utils.load_data(args.dev_file)
    args.num_train = len(train_sentences)
    args.num_dev = len(dev_sentences)

    word_dict, args.vocab_size = utils.load_dict(args.vocab_file)

    train_sentences = utils.encode(train_sentences, word_dict)
    train_sentences = utils.gen_examples(train_sentences, args.batch_size)

    dev_sentences = utils.encode(dev_sentences, word_dict)
    dev_sentences = utils.gen_examples(dev_sentences, args.batch_size)

    if os.path.exists(args.model_file):
        model = torch.load(args.model_file)
    else:
        model = LSTMModel(args)

    crit = utils.LanguageModelCriterion()

    print("start evaluating on dev...")
    acc, loss = _report(dev_sentences, "")
    best_acc = acc
    prev_acc = acc

    learning_rate = args.learning_rate
    optimizer = _make_optimizer(learning_rate)

    total_num_sentences = 0.
    total_time = 0.
    # The context manager closes the log even if training raises.
    with open(args.log_file, "w") as flog:
        for epoch in range(args.num_epoches):
            np.random.shuffle(train_sentences)
            total_train_loss = 0.
            total_num_words = 0.

            start = time.time()
            for idx, (mb_s, mb_mask) in tqdm(enumerate(train_sentences)):
                batch_size = mb_s.shape[0]
                total_num_sentences += batch_size

                # The input is the sentence without its last token and the
                # expected output is the sentence without its first token.
                mb_input = Variable(torch.from_numpy(mb_s[:, :-1])).long()
                mb_out = Variable(torch.from_numpy(mb_s[:, 1:])).long()
                mb_out_mask = Variable(torch.from_numpy(mb_mask[:, 1:]))

                hidden = model.init_hidden(batch_size)
                mb_pred, hidden = model(mb_input, hidden)
                loss = crit(mb_pred, mb_out, mb_out_mask)

                # Weight the batch loss by the number of unmasked words so
                # the epoch figure is a per-word average.
                # NOTE(review): ``.data[0]`` is the pre-0.4 PyTorch way to
                # read a scalar; on newer releases use ``.item()``.
                num_words = torch.sum(mb_out_mask).data[0]
                total_train_loss += loss.data[0] * num_words
                total_num_words += num_words

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_time += time.time() - start

            print("training loss: %f" % (total_train_loss / total_num_words))

            if (epoch + 1) % args.eval_epoch == 0:
                print("start evaluating on dev...")
                acc, loss = _report(dev_sentences, "dev ")

                if acc > best_acc:
                    torch.save(model, args.model_file)
                    best_acc = acc
                    print("model saved...")
                elif acc < prev_acc:
                    learning_rate *= 0.5
                    optimizer = _make_optimizer(learning_rate)

                prev_acc = acc
                print("best dev accuracy: %f" % best_acc)
                print("#" * 60)

                flog.write("%f\t%f\t%f\t%f\t%f\n" %
                           (total_time, total_num_sentences, best_acc, acc,
                            loss))

        # No time is accumulated when no epoch ran; skip the rate then.
        if total_time > 0:
            print("#sents/sec: %f" % (total_num_sentences / total_time))

        test_sentences = utils.load_data(args.test_file)
        args.num_test = len(test_sentences)
        test_sentences = utils.encode(test_sentences, word_dict)
        test_sentences = utils.gen_examples(test_sentences, args.batch_size)
        _report(test_sentences, "test ")

        # Collect the confusions on dev; keys are "<truth id>,<predicted id>".
        err = Counter()
        eval(model, dev_sentences, args, crit, err=err)
        # ``items()`` works on Python 2 and 3; ``iteritems()`` is Python 2 only.
        word_dict_rev = {v: k for k, v in word_dict.items()}
        for pair, count in err.most_common(20):
            truth_id, predicted_id = pair.split(",")[:2]
            pg = word_dict_rev[int(truth_id)]
            pp = word_dict_rev[int(predicted_id)]
            flog.write("ground truth: " + pg + ", predicted: " + pp +
                       ", number: " + str(count) + "\\\\\n")
def _select_optimizer(args, model, learning_rate):
    """Create the optimizer named by ``args.optimizer`` for ``model``.

    Raises:
        ValueError: if ``args.optimizer`` is neither "SGD" nor "Adam".
    """
    if args.optimizer == "SGD":
        return optim.SGD(model.parameters(), lr=learning_rate)
    if args.optimizer == "Adam":
        return optim.Adam(model.parameters(), lr=learning_rate)
    raise ValueError("unsupported optimizer: %r" % (args.optimizer,))


def main(args):
    """Train an LSTM language model, tracking dev accuracy, then test it.

    The function loads and encodes the train/dev sentences, builds (or
    reloads) the model, evaluates once on dev, and then trains for
    ``args.num_epoches`` epochs.  Every ``args.eval_epoch`` epochs the model
    is evaluated on dev: it is saved when dev accuracy improves on the best
    seen so far, and the learning rate is halved (with a fresh optimizer)
    when dev accuracy drops below the previous evaluation.  Finally the
    saved model is reloaded and evaluated on the test set.

    Args:
        args: parsed options.  Reads ``train_file``, ``dev_file``,
            ``test_file``, ``vocab_file``, ``model_file``, ``log_file``,
            ``batch_size``, ``model``, ``criterion``, ``optimizer``,
            ``learning_rate``, ``num_epoches``, ``eval_epoch`` and
            ``grad_clipping``; writes ``num_train``, ``num_dev``,
            ``num_test`` and ``vocab_size``.

    Raises:
        ValueError: if ``args.model`` (when no saved model exists),
            ``args.criterion`` or ``args.optimizer`` is not recognised.
    """
    train_sentences = utils.load_data(args.train_file)
    dev_sentences = utils.load_data(args.dev_file)
    args.num_train = len(train_sentences)
    args.num_dev = len(dev_sentences)

    word_dict, args.vocab_size = utils.load_dict(args.vocab_file)

    train_sentences = utils.encode(train_sentences, word_dict)
    train_sentences = utils.gen_examples(train_sentences, args.batch_size)
    dev_sentences = utils.encode(dev_sentences, word_dict)
    dev_sentences = utils.gen_examples(dev_sentences, args.batch_size)

    # A previously saved model takes precedence over building a new one.
    if os.path.exists(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "LSTMHingeModel":
        model = LSTMHingeModel(args)
    elif args.model == "LSTMHingeOutEmbModel":
        model = LSTMHingeOutEmbModel(args)
    elif args.model == "LSTMHingeOutEmbNegModel":
        model = LSTMHingeOutEmbNegModel(args)
    elif args.model == "LSTMModel":
        model = LSTMModel(args)
    else:
        raise ValueError("unknown model: %r" % (args.model,))

    if args.criterion == "HingeModelCriterion":
        crit = utils.HingeModelCriterion()
    elif args.criterion == "LanguageModelCriterion":
        crit = utils.LanguageModelCriterion()
    else:
        raise ValueError("unknown criterion: %r" % (args.criterion,))

    # Baseline dev evaluation: seeds best_acc / prev_acc for the checks below.
    print("start evaluating on dev...")
    correct_count, loss, num_words = eval(model, dev_sentences, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("dev loss %s" % (loss))
    print("dev accuracy %f" % (acc))
    print("dev total number of words %f" % (num_words))
    best_acc = acc
    prev_acc = acc

    learning_rate = args.learning_rate
    optimizer = _select_optimizer(args, model, learning_rate)

    total_num_sentences = 0.
    total_time = 0.

    # The context manager guarantees the log is flushed and closed even if
    # training raises part-way through.
    with open(args.log_file, "w") as flog:
        for epoch in range(args.num_epoches):
            np.random.shuffle(train_sentences)
            total_train_loss = 0.
            total_num_words = 0.
            start = time.time()

            for mb_s, mb_mask in tqdm(train_sentences):
                batch_size = mb_s.shape[0]
                total_num_sentences += batch_size

                # Inputs are every token but the last; targets are shifted
                # one position to the right.
                mb_input = Variable(torch.from_numpy(mb_s[:, :-1])).long()
                mb_out = Variable(torch.from_numpy(mb_s[:, 1:])).long()
                mb_out_mask = Variable(torch.from_numpy(mb_mask[:, 1:]))

                hidden = model.init_hidden(batch_size)

                if args.model == "LSTMHingeOutEmbNegModel":
                    # This model receives the targets as an extra input; the
                    # criterion is then given an all-zero target tensor.
                    mb_pred, hidden = model(mb_input, hidden, mb_out)
                    mb_out = Variable(
                        mb_pred.data.new(mb_pred.size(0),
                                         mb_pred.size(1)).zero_()).long()
                else:
                    mb_pred, hidden = model(mb_input, hidden)
                loss = crit(mb_pred, mb_out, mb_out_mask)

                # .item() replaces the old ``.data[0]`` idiom, which fails on
                # 0-dim tensors in current PyTorch.
                num_words = torch.sum(mb_out_mask).item()
                total_train_loss += loss.item() * num_words
                total_num_words += num_words

                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.grad_clipping)
                optimizer.step()

            end = time.time()
            total_time += (end - start)

            # Guard against an empty training set (no words seen).
            if total_num_words:
                mean_train_loss = total_train_loss / total_num_words
            else:
                mean_train_loss = float("nan")
            print("training loss: %f" % mean_train_loss)

            if (epoch + 1) % args.eval_epoch == 0:
                print("start evaluating on dev...")
                correct_count, loss, num_words = eval(model, dev_sentences,
                                                      args, crit)
                loss = loss / num_words
                acc = correct_count / num_words
                print("dev loss %s" % (loss))
                print("dev accuracy %f" % (acc))
                print("dev total number of words %f" % (num_words))

                if acc > best_acc:
                    torch.save(model, args.model_file)
                    best_acc = acc
                    print("model saved...")
                elif acc < prev_acc:
                    # Dev accuracy dropped since the last evaluation: halve
                    # the learning rate and start from a fresh optimizer.
                    learning_rate *= 0.5
                    optimizer = _select_optimizer(args, model, learning_rate)

                prev_acc = acc
                print("best dev accuracy: %f" % best_acc)
                print("#" * 60)

                flog.write("%f\t%f\t%f\t%f\t%f\n" %
                           (total_time, total_num_sentences, best_acc, acc,
                            loss))

    # NOTE(review): the original layout was lost; this train-set evaluation is
    # placed after the epoch loop (it uses the final, not the best, model).
    # Confirm it was not meant to run inside the per-epoch dev evaluation.
    correct_count, loss, num_words = eval(model, train_sentences, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("train loss %s" % (loss))
    print("train accuracy %f" % (acc))

    # total_time stays 0 when no epoch ran; avoid dividing by zero.
    if total_time > 0:
        print("#sents/sec: %f" % (total_num_sentences / total_time))

    # Test with the best saved model; if nothing was ever saved, fall back to
    # the in-memory model instead of crashing on a missing file.
    if os.path.exists(args.model_file):
        model = torch.load(args.model_file)

    test_sentences = utils.load_data(args.test_file)
    args.num_test = len(test_sentences)
    test_sentences = utils.encode(test_sentences, word_dict)
    test_sentences = utils.gen_examples(test_sentences, args.batch_size)
    correct_count, loss, num_words = eval(model, test_sentences, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("test loss %s" % (loss))
    print("test accuracy %f" % (acc))
def train(opt):
    """Train an ``AttModel`` captioner on CUDA, checkpointing the best CIDEr.

    Each iteration sets the learning rate (step decay by epoch), fetches a
    training batch and optimises either the language-model criterion or,
    once ``opt.self_critical_after`` epochs have passed, the reward
    criterion.  Every ``opt.checkpoint_every`` iterations the loss and
    learning rate are logged to the summary writer; every ``opt.save_every``
    iterations the model is evaluated on the 'val' split and saved to
    ``model-best.pth`` when its CIDEr score beats ``opt.best_cider_score``.
    The loop ends once ``epoch >= opt.max_epochs`` (when ``max_epochs`` is
    non-negative).

    Args:
        opt: option namespace.  Besides the settings named above it reads
            ``checkpoint_path``, ``learning_rate``, the
            ``learning_rate_decay_*`` fields, ``cached_tokens``,
            ``grad_clip`` and ``print_every``; it writes ``current_lr``,
            ``sample_max`` and ``best_cider_score``.
    """
    loader = DataLoader(opt)
    tb_summary_writer = tb.SummaryWriter(opt.checkpoint_path)

    # Resume scaffolding: both dicts start empty here, so every lookup below
    # falls back to its default value.
    infos = {}
    histories = {}
    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)
    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)

    model = AttModel(opt).cuda()
    dp_model = model  # no DataParallel wrapper; kept as a separate name
    dp_model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           opt.learning_rate, (0.9, 0.999),
                           1e-8,
                           weight_decay=0)

    start = time.time()
    while True:
        # Step-wise learning rate decay, driven by the epoch counter.
        if (epoch > opt.learning_rate_decay_start
                and opt.learning_rate_decay_start >= 0):
            frac = ((epoch - opt.learning_rate_decay_start)
                    // opt.learning_rate_decay_every)
            decay_factor = opt.learning_rate_decay_rate**frac
            opt.current_lr = opt.learning_rate * decay_factor
        else:
            opt.current_lr = opt.learning_rate

        # Switch to self-critical training after the configured epoch.
        # init_scorer runs on every iteration while this mode is active.
        if opt.self_critical_after >= 0 and epoch >= opt.self_critical_after:
            sc_flag = True
            init_scorer(opt.cached_tokens)
        else:
            sc_flag = False

        utils.set_lr(optimizer, opt.current_lr)

        data = loader.get_batch('train')
        batch_arrays = [
            data['fc_feats'], data['att_feats'], data['labels'],
            data['masks'], data['att_masks']
        ]
        # Entries may be None; only real arrays are moved to the GPU.
        batch_tensors = [
            arr if arr is None else torch.from_numpy(arr).cuda()
            for arr in batch_arrays
        ]
        fc_feats, att_feats, labels, masks, att_masks = batch_tensors

        optimizer.zero_grad()
        if not sc_flag:
            loss = crit(
                dp_model('forward', fc_feats, att_feats, labels, att_masks),
                labels[:, 1:], masks[:, 1:])
        else:
            # sample_max is off for the sampled rollout and switched back on
            # before the reward is computed (presumably so the baseline in
            # get_self_critical_reward is decoded greedily; confirm).
            opt.sample_max = False
            gen_result, sample_logprobs = dp_model('sample', fc_feats,
                                                   att_feats, labels,
                                                   att_masks)
            opt.sample_max = True
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                              att_masks, data, gen_result,
                                              opt)
            loss = rl_crit(sample_logprobs, gen_result.data,
                           torch.from_numpy(reward).float().cuda())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(dp_model.parameters(), opt.grad_clip)
        train_loss = loss.item()
        optimizer.step()

        if iteration % opt.print_every == 0:
            # Wait for queued GPU work so the timing below is meaningful.
            torch.cuda.synchronize()
            end = time.time()
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), train_loss = {:.3f}, avg_reward = {:.3f}, time/batch = {:.3f}" \
                    .format(iteration, epoch, train_loss, np.mean(reward[:, 0]), end - start))

        # NOTE(review): the original layout was lost; the timer is reset every
        # iteration here so the printed value matches its "time/batch" label.
        # Confirm it was not meant to reset only after a print.
        start = time.time()

        iteration += 1
        if data['bounds']['wrapped']:
            epoch += 1

        # Periodic scalar logging.
        if iteration % opt.checkpoint_every == 0:
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)
            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = (train_loss if not sc_flag else
                                       np.mean(reward[:, 0]))
            lr_history[iteration] = opt.current_lr

        # Periodic validation and best-model checkpointing.
        if iteration % opt.save_every == 0:
            val_loss, predictions, lang_stats = eval.eval_split(
                dp_model, crit, loader, 'val', opt)

            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)
            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
                    print('{} : {}'.format(k, v))
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            # lang_stats may be None (see the check above); only compare
            # CIDEr scores when language metrics were actually produced.
            if lang_stats is not None:
                current_score = lang_stats['CIDEr']
                if current_score > opt.best_cider_score:
                    print('New Best Cider Score: {}'.format(current_score))
                    opt.best_cider_score = current_score
                    checkpoint_path = os.path.join(opt.checkpoint_path,
                                                   'model-best.pth')
                    torch.save(model.state_dict(), checkpoint_path)
                    print('Save best model!')

        # Stop once the epoch budget is used up (negative means no limit).
        if epoch >= opt.max_epochs and opt.max_epochs >= 0:
            break
def main(args):
    """Train an encoder-decoder translation model and report its metrics.

    The function loads the English/Chinese sentence pairs, builds the
    vocabularies (or reloads them from ``args.vocab_file``), encodes the data
    into mini-batches, and trains for ``args.num_epoches`` epochs.  Every
    ``args.eval_epoch`` epochs the model is evaluated on dev: it is saved
    when dev accuracy is at least the best seen so far, otherwise the
    learning rate is halved and the optimizer rebuilt.  After training, the
    in-memory model is evaluated on the test set and on the training set.

    Args:
        args: parsed options.  Reads ``train_file``, ``dev_file``,
            ``test_file``, ``vocab_file``, ``model_file``, ``batch_size``,
            ``model``, ``use_cuda``, ``optimizer``, ``learning_rate``,
            ``num_epoches`` and ``eval_epoch``; writes ``num_train``,
            ``num_dev``, ``num_test``, ``en_total_words`` and
            ``cn_total_words``.

    Raises:
        ValueError: if no saved model exists and ``args.model`` is not
            "EncoderDecoderModel".
    """
    # 1. Load the parallel sentences.
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    # 2. Build the word dictionaries, or reuse the cached ones.
    # NOTE: pickle must only be used on trusted files.
    if os.path.isfile(args.vocab_file):
        with open(args.vocab_file, "rb") as vocab_fh:
            en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(
                vocab_fh)
    else:
        en_dict, en_total_words = utils.build_dict(train_en)
        cn_dict, cn_total_words = utils.build_dict(train_cn)
        with open(args.vocab_file, "wb") as vocab_fh:
            pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words],
                        vocab_fh)

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    # Encode the sentences: word -> integer id.
    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)

    # Group the encoded sentences into NumPy mini-batches.
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    # Initialise the model; a previously saved model takes precedence.
    if os.path.isfile(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "EncoderDecoderModel":
        model = EncoderDecoderModel(args)
    else:
        raise ValueError("unknown model: %r" % (args.model,))

    if args.use_cuda:
        model = model.cuda()

    # Loss function.
    crit = utils.LanguageModelCriterion()

    # Baseline dev evaluation: seeds best_acc for the checks below.
    print("start evaluating on dev...")
    correct_count, loss, num_words = eval(model, dev_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("dev loss %s" % (loss))
    print("dev accuracy %f" % (acc))
    print("dev total number of words %f" % (num_words))
    best_acc = acc

    learning_rate = args.learning_rate
    # args.optimizer names a class in torch.optim (e.g. "Adam").
    optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                               lr=learning_rate)

    for epoch in range(args.num_epoches):
        np.random.shuffle(train_data)

        total_train_loss = 0.
        total_num_words = 0.
        for mb_x, mb_x_mask, mb_y, mb_y_mask in tqdm(train_data):
            batch_size = mb_x.shape[0]

            # Convert the NumPy batches to torch tensors.
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            # Initial recurrent state for this batch.
            hidden = model.init_hidden(batch_size)
            # Decoder input: the target sentence without its last token.
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            # Decoder target: the target sentence shifted by one token.
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

            if args.use_cuda:
                mb_x = mb_x.cuda()
                mb_x_mask = mb_x_mask.cuda()
                mb_input = mb_input.cuda()
                mb_out = mb_out.cuda()
                mb_out_mask = mb_out_mask.cuda()

            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

            # Masked loss between the prediction and the shifted target.
            loss = crit(mb_pred, mb_out, mb_out_mask)
            # .item() replaces the old ``.data[0]`` idiom, which fails on
            # 0-dim tensors in current PyTorch.
            num_words = torch.sum(mb_out_mask).item()
            total_train_loss += loss.item() * num_words
            total_num_words += num_words

            # Update the model: clear old gradients, backpropagate, step.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Guard against an empty training set (no words seen).
        if total_num_words:
            mean_train_loss = total_train_loss / total_num_words
        else:
            mean_train_loss = float("nan")
        print("training loss: %f" % mean_train_loss)

        # Periodic evaluation on the dev set.
        if (epoch + 1) % args.eval_epoch == 0:
            print("start evaluating on dev...")
            correct_count, loss, num_words = eval(model, dev_data, args, crit)
            loss = loss / num_words
            acc = correct_count / num_words
            print("dev loss %s" % (loss))
            print("dev accuracy %f" % (acc))
            print("dev total number of words %f" % (num_words))

            if acc >= best_acc:
                # Keep the model with the best dev accuracy so far.
                torch.save(model, args.model_file)
                best_acc = acc
                print("model saved...")
            else:
                # No improvement: halve the learning rate and rebuild the
                # optimizer with it.
                learning_rate *= 0.5
                optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                                           lr=learning_rate)

            print("best dev accuracy: %f" % best_acc)
            print("#" * 60)

    # Evaluate on the test set.
    test_en, test_cn = utils.load_data(args.test_file)
    args.num_test = len(test_en)
    test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict)
    test_data = utils.gen_examples(test_en, test_cn, args.batch_size)

    correct_count, loss, num_words = eval(model, test_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("test loss %s" % (loss))
    print("test accuracy %f" % (acc))
    print("test total number of words %f" % (num_words))

    # Evaluate on the training set.
    correct_count, loss, num_words = eval(model, train_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("train loss %s" % (loss))
    print("train accuracy %f" % (acc))
def main(args, opt):
    """Build the configured video-captioning model and launch training.

    Vocabulary size and sequence length are taken from the 'inference'
    split of ``VideoDataset``; the model is then built according to
    ``opt["model"]`` and trained on the 'train' split via ``train``.
    The function returns without training when ``opt["model"]`` is neither
    "S2VTModel" nor "S2VTAttModel".

    Args:
        args: unused here; kept for the caller's signature.
        opt: option dictionary.  Reads the model, dimension, dropout,
            batch-size, beam-size, learning-rate and weight-decay entries
            used below; writes ``vocab_size`` and ``seq_length``.
    """
    import os  # local import: only needed for the debug dump below

    testpath = 'D:\\College\\Research\\2019 Video Captioning Attack Conference Paper\\youtube2text_preprocessed_for_arctic_capgen_vid\\youtube2text_iccv15\\dict_movieID_caption.pkl'
    # Debug dump of the raw caption dictionary.  The path is specific to one
    # machine, so it is only read when it exists instead of crashing
    # everywhere else.
    if os.path.isfile(testpath):
        with open(testpath, 'rb') as f:
            data = pickle.load(f, encoding='latin1')
        print(data)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1, \
            "beam_size != 1 requires batch_size == 1"

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    convnet = 'nasnetalarge'
    # NOTE(review): the result is not used below; the call is kept because
    # side effects of the ConvS2VT constructor cannot be ruled out from here.
    full_decoder = ConvS2VT(convnet, model, opt)

    # Training uses the 'train' split; the 'inference' split above was only
    # needed for the vocabulary size and sequence length.
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            num_workers=16,
                            shuffle=True)
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)