Example #1
def main(args):
    # preprocessing: encode words (English, Chinese) as integer indices (one-hot ids)

    # load sentences (English and Chinese)
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    en_dict, en_total_words = utils.build_dict(train_en)
    cn_dict, cn_total_words = utils.build_dict(train_cn)
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    # encode the words into numbers
    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)

    # convert the train and dev data into numpy matrices
    # batch_size * seq_length
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    model = models.EncoderDecoderModel()

    crit = utils.LanguageModelCriterion()
    learning_rate = args.learning_rate
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(args.num_epochs):
        for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in enumerate(train_data):
            # convert numpy ndarrays to PyTorch tensors
            # and wrap them in PyTorch Variables
            batch_size = mb_x.shape[0]

            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            hidden = model.init_hidden(batch_size)
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:])).long()

            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

            # calculate loss function
            loss = crit(mb_pred, mb_out, mb_out_mask)

            # update the model
            optimizer.zero_grad()  # zero the previous gradient
            loss.backward()  # calculate gradient
            optimizer.step()  # gradient descent
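
For reference, a minimal sketch of what a masked cross-entropy criterion such as utils.LanguageModelCriterion typically computes (an assumption for illustration; the utils module itself is not shown in these examples): the average negative log-likelihood of the gold words, with padding positions masked out.

import torch
import torch.nn as nn

class LanguageModelCriterion(nn.Module):
    def forward(self, pred, target, mask):
        # pred: (batch, seq_len, vocab) log-probabilities
        # target: (batch, seq_len) gold word indices
        # mask: (batch, seq_len), 1 for real tokens, 0 for padding
        target = target[:, :pred.size(1)]
        mask = mask[:, :pred.size(1)].float()
        # negative log-likelihood of the gold word at each position
        nll = -pred.gather(2, target.unsqueeze(2)).squeeze(2) * mask
        return torch.sum(nll) / torch.sum(mask)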
Example #2
def main(args):

	if os.path.isfile(args.vocab_file):
		en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(open(args.vocab_file, "rb"))
	else:
		print("vocab file does not exit!")
		exit(-1)

	args.en_total_words = en_total_words
	args.cn_total_words = cn_total_words
	inv_en_dict = {v: k for k, v in en_dict.items()}
	inv_cn_dict = {v: k for k, v in cn_dict.items()}

	

	if os.path.isfile(args.model_file):
		model = torch.load(args.model_file)
	else:
		print("model file does not exit!")
		exit(-1)

	if args.use_cuda:
		model = model.cuda()

	crit = utils.LanguageModelCriterion()

	test_en, test_cn = utils.load_data(args.test_file)
	args.num_test = len(test_en)
	test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict)
	test_data = utils.gen_examples(test_en, test_cn, args.batch_size)
	
	translate(model, test_data, en_dict, inv_en_dict, cn_dict, inv_cn_dict)

	correct_count, loss, num_words = eval(model, test_data, args, crit)
	loss = loss / num_words
	acc = correct_count / num_words
	print("test loss %s" % (loss) )
	print("test accuracy %f" % (acc))
	print("test total number of words %f" % (num_words))
Example #3
def main(opt):

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    #model, videopath, targetcap, dataset, config, optimizer, crit, window

    #config: batch_size, c, learning rate, num it,input shape

    config = {
        # earlier runs: lr 0.005, dimensions 224, c = 100;
        # best for Show-and-Fool: lr 0.06, c = 1
        "batch_size": BATCH_SIZE,
        "c": 10000,
        "learning_rate": 0.2,
        "num_iterations": 1000,
        "input_shape": (224, 224),
        "num_frames": 288,
        "dimensions": 224,
        "k": 0.1,
        # "attack_algorithm": "showandfool"
        "attack_algorithm": "carliniwagner"
    }
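
    # Hedged note (not in the original source): with "carliniwagner" selected,
    # these values plausibly parameterize a Carlini-Wagner-style objective,
    #     minimize ||delta||_2^2 + c * caption_loss(model(x + delta), target),
    # optimized with Adam at learning_rate for num_iterations steps per window;
    # the Attack / S2VT_Attack implementations themselves are not shown here.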

    convnet = 'vgg16'
    # convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    full_decoder = ConvS2VT(convnet, model, opt)
    '''
    Layer freezing experiment.
    
    Top 10 contributing layers: 
    conv.cell_stem_1.comb_iter_0_right.separable_1.depthwise_conv2d.weight
    conv.cell_stem_1.comb_iter_2_right.separable_2.depthwise_conv2d.weight
    conv.cell_stem_1.comb_iter_1_right.separable_1.depthwise_conv2d.weight
    conv.cell_16.comb_iter_4_left.separable_1.depthwise_conv2d.weight
    conv.cell_17.comb_iter_4_left.separable_1.depthwise_conv2d.weight
    conv.cell_16.comb_iter_4_left.separable_1.pointwise_conv2d.weight
    conv.cell_13.comb_iter_4_left.bn_sep_1.weight
    conv.reduction_cell_0.conv_prev_1x1.bn.weight
    conv.cell_17.comb_iter_4_left.separable_2.depthwise_conv2d.weight
    conv.cell_13.comb_iter_0_left.bn_sep_1.weight
    
    
    '''

    top = open("top_layers.txt", "r")
    top_layers = top.readlines()
    top.close()
    print(top_layers)

    # zero the gradients of the layers you don't want to contribute
    top_layers = []  # note: emptying the list freezes every layer below

    for name, parameters in full_decoder.named_parameters():
        reset = True
        for f in top_layers:
            if name in f:
                reset = False

        if reset:
            parameters.requires_grad = False
            if parameters.grad is not None:
                print(name)
                parameters.grad.data.zero_()

    # for name, parameters in full_decoder.named_parameters():
    #     for f in top_layers:
    #         if name not in f:
    #             print(name)
    #             parameters.requires_grad = False
    #             if parameters.grad is not None:
    #                 # parameters.data = 0
    #                 parameters.grad.data.zero_()
    #         else:
    #             # print(name)
    #             continue

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    #target_caption = np.random.choice(viable_target_captions)
    # 5 captions:
    '''
    <sos> A person is typing into a laptop computer <eos>
    <sos> A boy is kicking a soccer ball into the goal <eos>
    <sos> Someone is frying fish <eos>
    <sos> A dog is running with a ball <eos>
    <sos> The cat approaches on grass <eos>
    
    '''
    captions = {
        1: '<sos> A woman is talking <eos>',
        2: '<sos> A boy is kicking a soccer ball into the goal <eos>',
        3: '<sos> A man is frying fish <eos>',
        4: '<sos> A dog is running with a ball <eos>',
        5: '<sos> A cat is walking on grass <eos>'
    }

    #1 doesn't work
    videos = {

        #2 is too high res or something, replaced X6uJyuD_Zso_3_17.avi with nc8hwLaOyZU_1_19.avi
        #5,'ceOXCFUmxzA_100_110.avi' out of memory, replaced with 'X7sQq-Iu1gQ_12_22'
        #1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }
    #"D:\College\Research\December 2018 Video Captioning Attack\video captioner\YouTubeClips\AJJ-iQkbRNE_97_109.avi"
    # video_path = ''

    video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\' + videos[2]
    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    #Just switch the number to get a target caption.
    target_caption = captions[1]

    #Should use the original caption function we use in the attack because the scaling is slightly different
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    #video_path = 'D:\\College\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    #/96 gives 3 frames
    # length = math.ceil(len(skvideo.io.vread(video_path,num_frames=config["num_frames"]))/96)
    #12 frames
    length = 3
    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    #model is full_decoder

    optimizer = ['Adam', (0.9, 0.999)]

    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence
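    # decode_sequence maps predicted index sequences back into sentences using
    # vocab, stopping at the end token (behavior inferred from its uses below)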

    # model, videopath, targetcap, dataset, config, optimizer, crit, window

    while (frame_counter < length):
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1
        if length - frame_counter < BATCH_SIZE:
            window = [frame_counter, length]
            frame_counter = frame_counter + (length - frame_counter)
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))
            attack_package = S2VT_Attack(model=full_decoder,
                                         video_path=video_path,
                                         target=target_caption,
                                         dataset=dataset,
                                         config=config,
                                         optimizer=optimizer,
                                         crit=crit,
                                         seq_decoder=seq_decoder,
                                         window=window)
            carlini = Attack(attack_package=attack_package)
            finished_frames = carlini.execute(functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())

        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))

            attack_package = S2VT_Attack(model=full_decoder,
                                         video_path=video_path,
                                         target=target_caption,
                                         dataset=dataset,
                                         config=config,
                                         optimizer=optimizer,
                                         crit=crit,
                                         seq_decoder=seq_decoder,
                                         window=window)
            carlini = Attack(attack_package=attack_package)
            finished_frames = carlini.execute(functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())
            frame_counter = frame_counter + BATCH_SIZE

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    print("\nSaving to: {}".format(adv_path))
    # adv_frames_1 = np.concatenate(adv_frames, axis=0)
    # # batches = create_batches(adv_frames[0].astype(np.uint8), load_img_fn, tf_img_fn)
    # batches = exp_create_batches(adv_frames_1.astype(np.uint8), 3)
    # seq_prob, seq_preds = full_decoder(batches, mode='inference')
    # sents = utils.decode_sequence(vocab, seq_preds)

    # print("Adversarial Frames 1: {}".format(sents[0]))
    adv_frames = np.concatenate(adv_frames, axis=0)
    # batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
    # seq_prob, seq_preds = full_decoder(batches, mode='inference')
    # sents = utils.decode_sequence(vocab, seq_preds)
    #
    # print("Adversarial Frames 2: {}".format(sents[0]))

    outputfile = adv_path

    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            #huffyuv is lossless. r10k is really good

            # '-c:v': 'libx264', #libx264 # use the h.264 codec
            '-c:v': 'huffyuv',  #r210 huffyuv r10k
            # '-pix_fmt': 'rgb32',
            # '-crf': '0', # set the constant rate factor to 0, which is lossless
            # '-preset': 'ultrafast'  # ultrafast..veryslow: slower presets compress better, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)

    writer.close()

    # np_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarialWINDOW')
    # np.save(np_path, adv_frames)
    #ffv1 0.215807946043995
    #huffyuv 0.21578424050191813
    #libx264 0.2341074901578537
    #r210 -0.7831487262059795, -0.7833399258537526
    #gif 0.6889478809555243
    #png 0.2158991440582696 0.21616862708842177
    #qtrle  0.21581286337807626
    #flashsv 0.21610510459932186 0.21600030673323545
    #ffvhuff 0.21620682250167533
    #r10k similar to r210
    #rawvideo 0.21595001
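
    # Hedged sanity check (not in the original): confirm the write/read
    # round-trip through the chosen codec is lossless before re-captioning.
    roundtrip = np.float32(skvideo.io.vread(adv_path))
    print("max codec round-trip error:",
          np.abs(roundtrip - np.float32(adv_frames)).max())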

    with torch.no_grad():

        #getting a new model to see how it actually works now
        # full_decoder = ConvS2VT(convnet, model, opt)
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)

        frames = np.float32(frames)
        plt.imshow(frames[0] / 255.)
        plt.show()

        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        # np.load restores the array saved above with np.save

        exp = np.load('difference_tmp.npy')

        # numpy_frames = np.load(np_path+'.npy')
        # print("Are numpy frames == adv frames: ", np.array_equal(numpy_frames, adv_frames))
        # print("Is the saved array equal to loaded array for difference: ", np.array_equal(exp, difference))

        frames = frames + difference

        # batches = exp_create_batches(numpy_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        # numpy_caption = sents[0]
        #
        # print("Numpy Frames exp: {}".format(numpy_caption))
        #

        # numpy_frames_tensor = torch.tensor(numpy_frames)
        # numpy_frames_tensor = numpy_frames_tensor.float()
        # batches = exp_create_batches(numpy_frames_tensor, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        # numpy_caption_tensor = sents[0]
        #
        # print("Numpy Frames tensor: {}".format(numpy_caption_tensor))

        # numpy_frames = numpy_frames.astype(np.uint8)
        # batches = create_batches(numpy_frames, load_img_fn, tf_img_fn)
        #
        # # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        #
        # print("Numpy Frames originalscale: {}".format(sents[0]))
        # # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        # frames = frames.astype(np.uint8)
        # batches = create_batches(frames, load_img_fn, tf_img_fn)

        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
Example #4
if use_cuda:
    net.cuda()
    net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))

if vars(opt).get('start_from', None) is not None:
    state_dict = torch.load(os.path.join(opt.start_from, 'model.pth'))
    if not use_cuda:
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # strip the 'module.' prefix added by DataParallel
            new_state_dict[name] = v
        state_dict = new_state_dict
    net.load_state_dict(state_dict)

criterion = utils.LanguageModelCriterion()

def test():
    net.eval()

    loader = tqdm(enumerate(dataloader), total=len(dataloader), ascii=True)

    min_loss = 1e9

    for batch_idx, (fc, att, labels, data_info) in loader:
        if use_cuda:
            fc, att, labels = fc.cuda(), att.cuda(), labels.cuda()
        fc, att, labels = Variable(fc, requires_grad=False), Variable(att, requires_grad=False), Variable(labels, requires_grad=False)
        fc = torch.stack([fc]*opt.seq_per_img).view(-1, *fc.shape[1:])
        att = torch.stack([att]*opt.seq_per_img).view(-1, *att.shape[1:])
        origin_labels = labels.view(-1, *labels.shape[2:])
Example #5
def main(args):

    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    # code.interact(local=locals())

    if os.path.isfile(args.vocab_file):
        en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(
            open(args.vocab_file, "rb"))
    else:
        en_dict, en_total_words = utils.build_dict(train_en)
        cn_dict, cn_total_words = utils.build_dict(train_cn)
        pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words],
                    open(args.vocab_file, "wb"))

    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}

    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)

    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    if os.path.isfile(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "EncoderDecoderModel":
        model = EncoderDecoderModel(args)

    if args.use_cuda:
        model = model.cuda()

    crit = utils.LanguageModelCriterion()

    learning_rate = args.learning_rate
    optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                               lr=learning_rate)

    total_num_sentences = 0.
    total_time = 0.
    for epoch in range(args.num_epochs):
        np.random.shuffle(train_data)
        total_train_loss = 0.
        total_num_words = 0.
        for idx, (mb_x, mb_x_mask, mb_y,
                  mb_y_mask) in tqdm(enumerate(train_data)):

            batch_size = mb_x.shape[0]
            total_num_sentences += batch_size
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
            hidden = model.init_hidden(batch_size)
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

            if args.use_cuda:
                mb_x = mb_x.cuda()
                mb_x_mask = mb_x_mask.cuda()
                mb_input = mb_input.cuda()
                mb_out = mb_out.cuda()
                mb_out_mask = mb_out_mask.cuda()

            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

            loss = crit(mb_pred, mb_out, mb_out_mask)
            num_words = torch.sum(mb_out_mask).data[0]
            total_train_loss += loss.data[0] * num_words
            total_num_words += num_words

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print("training loss: %f" % (total_train_loss / total_num_words))
Example #6
def main(opt):
    def loss(seq_prob, crit):
        # tlabel / tmask come from produce_t_mask() in this enclosing scope
        loss = crit(seq_prob, tlabel[:, 1:].cuda(), tmask[:, 1:].cuda())
        return loss

    def produce_t_mask():
        mask = torch.zeros(dataset.max_len)
        captions = [target_caption.split(' ')]
        gts = torch.zeros(len(captions), dataset.max_len).long()
        for i, cap in enumerate(captions):
            if len(cap) > dataset.max_len:
                cap = cap[:dataset.max_len]
                cap[-1] = '<eos>'
            for j, w in enumerate(cap):
                gts[i, j] = dataset.word_to_ix[w]

        label = gts[0]
        # mask everything up to and including the first padding position,
        # i.e., through the <eos> token
        non_zero = (label == 0).nonzero()
        mask[:int(non_zero[0]) + 1] = 1

        return label.unsqueeze(0), mask.unsqueeze(0)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    #model, videopath, targetcap, dataset, config, optimizer, crit, window

    #config: batch_size, c, learning rate, num it,input shape

    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    #Random target caption
    # target_caption = np.random.choice(viable_target_captions)
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    #Should use the original caption function we use in the attack because the scaling is slightly different
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    #video_path = 'D:\\College\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    #/96 gives 3 frames
    length = math.ceil(
        len(skvideo.io.vread(video_path, num_frames=config["num_frames"])) /
        96)

    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    #model is full_decoder

    optimizer = optim.Adam(full_decoder.parameters(),
                           lr=0.005,
                           betas=(0.9, 0.999))

    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # model, videopath, targetcap, dataset, config, optimizer, crit, window

    frames = skvideo.io.vread(video_path)[0:BATCH_SIZE]
    original = torch.tensor(frames)
    original = (original.float()).cuda()

    batch = exp_create_batches(frames_to_do=original, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
        feats, mode='inference')

    tlabel, tmask = produce_t_mask()

    cost = loss(seq_prob, crit)

    optimizer.zero_grad()
    cost.backward()
    original_grads = {}
    for name, parameter in full_decoder.named_parameters():
        original_grads[name] = parameter.grad

    print(len(original_grads.keys()))
    # for key, value in original_grads.items():
    #     print(key)

    #Adversarial

    full_decoder = ConvS2VT(convnet, model, opt)

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    adv_frames = skvideo.io.vread(adv_path)
    adv_frames = np.float32(adv_frames)

    adv_frames = torch.tensor(adv_frames)
    adv_frames = (adv_frames.float()).cuda()

    batch = exp_create_batches(frames_to_do=adv_frames, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
        feats, mode='inference')

    tlabel, tmask = produce_t_mask()

    cost = loss(seq_prob, crit)

    optimizer = optim.Adam(full_decoder.parameters(),
                           lr=0.005,
                           betas=(0.9, 0.999))

    optimizer.zero_grad()
    cost.backward()
    adv_grads = {}
    for name, parameter in full_decoder.named_parameters():
        adv_grads[name] = parameter.grad

    # for key, value in adv_grads.items():
    #     print(key)

    print('\n\n\n------')
    for key, value in adv_grads.items():
        if 'weight' in key:
            print(key)

    output = open("s2vt_weightoutput.txt", "w")

    l2norm_layers = []
    for key, value in original_grads.items():
        if 'weight' in key:
            if (value is not None):
                adv_weight = adv_grads[key]
                # print(value, adv_weight)
                diff = value - adv_weight
                net_change = np.linalg.norm(diff) / np.linalg.norm(value)
                output.write("{}, {}\n".format(key, net_change))
                l2norm_layers.append([key, net_change])
    output.close()
Example #7
def main(args):

	# load sentences (English and Chinese words)
	train_en, train_cn = utils.load_data(args.train_file)
	dev_en, dev_cn = utils.load_data(args.dev_file)
	args.num_train = len(train_en)
	args.num_dev = len(dev_en)

	# build English and Chinese dictionary
	if os.path.isfile(args.vocab_file):
		en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(open(args.vocab_file, "rb"))
	else:
		en_dict, en_total_words = utils.build_dict(train_en)
		cn_dict, cn_total_words = utils.build_dict(train_cn)
		pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words], open(args.vocab_file, "wb"))

	args.en_total_words = en_total_words
	args.cn_total_words = cn_total_words
	# index to words dict
	inv_en_dict = {v: k for k, v in en_dict.items()}
	inv_cn_dict = {v: k for k, v in cn_dict.items()}

	# encode train and dev sentences into indices
	train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
	# convert to numpy tensors
	train_data = utils.gen_examples(train_en, train_cn, args.batch_size)

	dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)
	dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

	# code.interact(local=locals())

	if os.path.isfile(args.model_file):
		model = torch.load(args.model_file)
	elif args.model == "EncoderDecoderModel":
		model = EncoderDecoderModel(args)

	if args.use_cuda:
		model = model.cuda()

	crit = utils.LanguageModelCriterion()

	print("start evaluating on dev...")
	correct_count, loss, num_words = eval(model, dev_data, args, crit)

	loss = loss / num_words
	acc = correct_count / num_words
	print("dev loss %s" % (loss) )
	print("dev accuracy %f" % (acc))
	print("dev total number of words %f" % (num_words))
	best_acc = acc

	learning_rate = args.learning_rate
	optimizer = getattr(optim, args.optimizer)(model.parameters(), lr=learning_rate)
	
	total_num_sentences = 0.
	total_time = 0.
	for epoch in range(args.num_epoches):
		np.random.shuffle(train_data)
		total_train_loss = 0.
		total_num_words = 0.
		for idx, (mb_x, mb_x_mask, mb_y, mb_y_mask) in tqdm(enumerate(train_data)):

			batch_size = mb_x.shape[0]
			total_num_sentences += batch_size
			# convert numpy ndarray to PyTorch tensors and variables
			mb_x = Variable(torch.from_numpy(mb_x)).long()
			mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()
			hidden = model.init_hidden(batch_size)
			mb_input = Variable(torch.from_numpy(mb_y[:,:-1])).long()
			mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
			mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

			if args.use_cuda:
				mb_x = mb_x.cuda()
				mb_x_mask = mb_x_mask.cuda()
				mb_input = mb_input.cuda()
				mb_out = mb_out.cuda()
				mb_out_mask = mb_out_mask.cuda()
			
			mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

			loss = crit(mb_pred, mb_out, mb_out_mask)
			num_words = torch.sum(mb_out_mask).data[0]
			total_train_loss += loss.data[0] * num_words
			total_num_words += num_words
	
			optimizer.zero_grad()
			loss.backward()
			optimizer.step()
		print("training loss: %f" % (total_train_loss / total_num_words))

		# evaluate every eval_epoch
		if (epoch+1) % args.eval_epoch == 0:
			

			print("start evaluating on dev...")
	
			correct_count, loss, num_words = eval(model, dev_data, args, crit)

			loss = loss / num_words
			acc = correct_count / num_words
			print("dev loss %s" % (loss) )
			print("dev accuracy %f" % (acc))
			print("dev total number of words %f" % (num_words))

			# save model if we have the best accuracy
			if acc >= best_acc:
				torch.save(model, args.model_file)
				best_acc = acc

				print("model saved...")
			else:
				learning_rate *= 0.5
				optimizer = getattr(optim, args.optimizer)(model.parameters(), lr=learning_rate)

			print("best dev accuracy: %f" % best_acc)
			print("#" * 60)

	# load test data
	test_en, test_cn = utils.load_data(args.test_file)
	args.num_test = len(test_en)
	test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict)
	test_data = utils.gen_examples(test_en, test_cn, args.batch_size)

	# evaluate on test
	correct_count, loss, num_words = eval(model, test_data, args, crit)
	loss = loss / num_words
	acc = correct_count / num_words
	print("test loss %s" % (loss) )
	print("test accuracy %f" % (acc))
	print("test total number of words %f" % (num_words))

	# evaluate on train
	correct_count, loss, num_words = eval(model, train_data, args, crit)
	loss = loss / num_words
	acc = correct_count / num_words
	print("train loss %s" % (loss) )
	print("train accuracy %f" % (acc))
Example #8
def main(args):

    train_sentences = utils.load_data(args.train_file)
    dev_sentences = utils.load_data(args.dev_file)

    args.num_train = len(train_sentences)
    args.num_dev = len(dev_sentences)

    word_dict, args.vocab_size = utils.load_dict(args.vocab_file)
    # word_dict, args.vocab_size = utils.build_dict(train_sentences, max_words=args.vocab_size)
    # word_dict["UNK"] = 0

    # pickle.dump(word_dict, open(args.dict_file, "wb"))

    train_sentences = utils.encode(train_sentences, word_dict)
    train_sentences = utils.gen_examples(train_sentences, args.batch_size)

    dev_sentences = utils.encode(dev_sentences, word_dict)
    dev_sentences = utils.gen_examples(dev_sentences, args.batch_size)

    if os.path.exists(args.model_file):
        model = torch.load(args.model_file)
    else:
        model = LSTMModel(args)

    # if args.test_only:
    # 	print("start evaluating on test")
    # 	correct_count, loss = eval(model, all_test, args)
    # 	print("test accuracy %f" % (float(correct_count) / float(args.num_test)))
    # 	loss = loss / args.num_test
    # 	print("test loss %f" % loss)

    # 	correct_count, loss = eval(model, all_dev, args)
    # 	print("dev accuracy %f" % (float(correct_count) / float(args.num_dev)))
    # 	loss = loss / args.num_dev
    # 	print("dev loss %f" % loss)
    # 	return 0

    crit = utils.LanguageModelCriterion()

    print("start evaluating on dev...")

    correct_count, loss, num_words = eval(model, dev_sentences, args, crit)

    loss = loss / num_words
    acc = correct_count / num_words
    print("loss %s" % (loss))
    print("accuracy %f" % (acc))
    print("total number of words %f" % (num_words))
    best_acc = acc
    prev_acc = acc

    learning_rate = args.learning_rate
    if args.optimizer == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    elif args.optimizer == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # best_loss = loss

    flog = open(args.log_file, "w")
    total_num_sentences = 0.
    total_time = 0.
    for epoch in range(args.num_epoches):

        np.random.shuffle(train_sentences)
        num_batches = len(train_sentences)
        # bar = progressbar.ProgressBar(max_value= num_batches * args.eval_epoch, redirect_stdout=True)
        total_train_loss = 0.
        total_num_words = 0.
        start = time.time()
        for idx, (mb_s, mb_mask) in tqdm(enumerate(train_sentences)):

            batch_size = mb_s.shape[0]
            total_num_sentences += batch_size
            mb_input = Variable(torch.from_numpy(mb_s[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_s[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_mask[:, 1:]))
            hidden = model.init_hidden(batch_size)
            mb_pred, hidden = model(mb_input, hidden)

            loss = crit(mb_pred, mb_out, mb_out_mask)
            num_words = torch.sum(mb_out_mask).data[0]
            total_train_loss += loss.data[0] * num_words
            total_num_words += num_words

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # print(loss.data[0])
            # bar.update(num_batches * (epoch % args.eval_epoch) + idx +1)
        end = time.time()
        total_time += (end - start)

        # bar.finish()
        print("training loss: %f" % (total_train_loss / total_num_words))

        if (epoch + 1) % args.eval_epoch == 0:

            print("start evaluating on dev...")

            correct_count, loss, num_words = eval(model, dev_sentences, args,
                                                  crit)

            loss = loss / num_words
            acc = correct_count / num_words
            print("dev loss %s" % (loss))
            print("dev accuracy %f" % (acc))
            print("dev total number of words %f" % (num_words))

            if acc > best_acc:
                torch.save(model, args.model_file)
                best_acc = acc
                print("model saved...")
            elif acc < prev_acc:
                learning_rate *= 0.5
                if args.optimizer == "SGD":
                    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
                elif args.optimizer == "Adam":
                    optimizer = optim.Adam(model.parameters(),
                                           lr=learning_rate)
            prev_acc = acc

            print("best dev accuracy: %f" % best_acc)
            print("#" * 60)

            flog.write("%f\t%f\t%f\t%f\t%f\n" %
                       (total_time, total_num_sentences, best_acc, acc, loss))

    print("#sents/sec: %f" % (total_num_sentences / total_time))
    test_sentences = utils.load_data(args.test_file)
    args.num_test = len(test_sentences)
    test_sentences = utils.encode(test_sentences, word_dict)
    test_sentences = utils.gen_examples(test_sentences, args.batch_size)
    correct_count, loss, num_words = eval(model, test_sentences, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("test loss %s" % (loss))
    print("test accuracy %f" % (acc))
    print("test total number of words %f" % (num_words))

    err = Counter()
    correct_count, loss, num_words = eval(model,
                                          dev_sentences,
                                          args,
                                          crit,
                                          err=err)
    if err is not None:
        err = err.most_common()[:20]
        word_dict_rev = {v: k for k, v in word_dict.items()}
        for pair in err:
            p = pair[0].split(",")
            pg = word_dict_rev[int(p[0])]
            pp = word_dict_rev[int(p[1])]
            flog.write("ground truth: " + pg + ", predicted: " + pp +
                       ", number: " + str(pair[1]) + "\\\\\n")

    flog.close()
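
Aside: the manual "halve the learning rate when dev accuracy stops improving" logic above predates PyTorch's built-in schedulers; on newer PyTorch versions roughly the same policy can be written as:

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=0)
# after each dev evaluation:
# scheduler.step(acc)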
Example #9
def main(args):

    train_sentences = utils.load_data(args.train_file)
    dev_sentences = utils.load_data(args.dev_file)

    args.num_train = len(train_sentences)
    args.num_dev = len(dev_sentences)

    word_dict, args.vocab_size = utils.load_dict(args.vocab_file)
    # word_dict, args.vocab_size = utils.build_dict(train_sentences, max_words=args.vocab_size)
    # word_dict["UNK"] = 0

    # pickle.dump(word_dict, open(args.dict_file, "wb"))

    train_sentences = utils.encode(train_sentences, word_dict)
    train_sentences = utils.gen_examples(train_sentences, args.batch_size)

    dev_sentences = utils.encode(dev_sentences, word_dict)
    dev_sentences = utils.gen_examples(dev_sentences, args.batch_size)

    # code.interact(local=locals())

    att_dict = {}

    if os.path.exists(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "LSTMHingeModel":
        model = LSTMHingeModel(args)
    elif args.model == "LSTMHingeOutEmbModel":
        model = LSTMHingeOutEmbModel(args)
    elif args.model == "LSTMHingeOutEmbNegModel":
        model = LSTMHingeOutEmbNegModel(args)
    elif args.model == "LSTMModel":
        model = LSTMModel(args)

    if args.criterion == "HingeModelCriterion":
        crit = utils.HingeModelCriterion()
    elif args.criterion == "LanguageModelCriterion":
        crit = utils.LanguageModelCriterion()

    print("start evaluating on dev...")

    correct_count, loss, num_words = eval(model, dev_sentences, args, crit)

    loss = loss / num_words
    acc = correct_count / num_words
    print("dev loss %s" % (loss))
    print("dev accuracy %f" % (acc))
    print("dev total number of words %f" % (num_words))
    best_acc = acc
    prev_acc = acc

    learning_rate = args.learning_rate
    if args.optimizer == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    elif args.optimizer == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # best_loss = loss
    flog = open(args.log_file, "w")

    total_num_sentences = 0.
    total_time = 0.
    for epoch in range(args.num_epoches):

        np.random.shuffle(train_sentences)
        num_batches = len(train_sentences)
        total_train_loss = 0.
        total_num_words = 0.
        start = time.time()
        for idx, (mb_s, mb_mask) in tqdm(enumerate(train_sentences)):

            batch_size = mb_s.shape[0]
            total_num_sentences += batch_size
            mb_input = Variable(torch.from_numpy(mb_s[:, :-1])).long()
            mb_out = Variable(torch.from_numpy(mb_s[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_mask[:, 1:]))
            hidden = model.init_hidden(batch_size)
            if args.model == "LSTMHingeOutEmbNegModel":

                mb_pred, hidden = model(mb_input, hidden, mb_out)
                mb_out = Variable(
                    mb_pred.data.new(mb_pred.size(0),
                                     mb_pred.size(1)).zero_()).long()
                loss = crit(mb_pred, mb_out, mb_out_mask)
            else:
                mb_pred, hidden = model(mb_input, hidden)
                loss = crit(mb_pred, mb_out, mb_out_mask)
            num_words = torch.sum(mb_out_mask).data[0]
            total_train_loss += loss.data[0] * num_words
            # code.interact(local=locals())
            total_num_words += num_words

            optimizer.zero_grad()
            loss.backward()

            nn.utils.clip_grad_norm(model.parameters(), args.grad_clipping)
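            # (renamed to nn.utils.clip_grad_norm_ in PyTorch 0.4+)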
            optimizer.step()

        end = time.time()
        total_time += (end - start)

        print("training loss: %f" % (total_train_loss / total_num_words))

        if (epoch + 1) % args.eval_epoch == 0:

            print("start evaluating on dev...")

            correct_count, loss, num_words = eval(model, dev_sentences, args,
                                                  crit)

            loss = loss / num_words
            acc = correct_count / num_words
            print("dev loss %s" % (loss))
            print("dev accuracy %f" % (acc))
            print("dev total number of words %f" % (num_words))

            if acc > best_acc:
                torch.save(model, args.model_file)
                best_acc = acc
                # infos['epoch'] = epoch
                # infos['best_acc'] = best_acc
                # infos['vocab']

                print("model saved...")
            elif acc < prev_acc:
                learning_rate *= 0.5
                if args.optimizer == "SGD":
                    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
                elif args.optimizer == "Adam":
                    optimizer = optim.Adam(model.parameters(),
                                           lr=learning_rate)
            prev_acc = acc

            print("best dev accuracy: %f" % best_acc)
            print("#" * 60)

            flog.write("%f\t%f\t%f\t%f\t%f\n" %
                       (total_time, total_num_sentences, best_acc, acc, loss))

    correct_count, loss, num_words = eval(model, train_sentences, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("train loss %s" % (loss))
    print("train accuracy %f" % (acc))
    print("#sents/sec: %f" % (total_num_sentences / total_time))

    model = torch.load(args.model_file)
    test_sentences = utils.load_data(args.test_file)
    args.num_test = len(test_sentences)
    test_sentences = utils.encode(test_sentences, word_dict)
    test_sentences = utils.gen_examples(test_sentences, args.batch_size)
    correct_count, loss, num_words = eval(model, test_sentences, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("test loss %s" % (loss))
    print("test accuracy %f" % (acc))

    flog.close()
Example #10
def train(opt):
    loader = DataLoader(opt)
    tb_summary_writer = tb.SummaryWriter(opt.checkpoint_path)

    infos = {}
    histories = {}

    iteration = infos.get('iter', 0)
    epoch = infos.get('epoch', 0)

    val_result_history = histories.get('val_result_history', {})
    loss_history = histories.get('loss_history', {})
    lr_history = histories.get('lr_history', {})
    ss_prob_history = histories.get('ss_prob_history', {})

    loader.iterators = infos.get('iterators', loader.iterators)
    loader.split_ix = infos.get('split_ix', loader.split_ix)

    # model = FCModel(opt).cuda()
    model = AttModel(opt).cuda()
    #dp_model = torch.nn.DataParallel(model)
    dp_model = model
    dp_model.train()

    crit = utils.LanguageModelCriterion()
    rl_crit = RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           opt.learning_rate, (0.9, 0.999),
                           1e-8,
                           weight_decay=0)

    sc_flag = False

    start = time.time()
    while True:
        # sys.stdout.flush()
        # Learning rate decay
        if epoch > opt.learning_rate_decay_start and opt.learning_rate_decay_start >= 0:
            frac = (epoch - opt.learning_rate_decay_start
                    ) // opt.learning_rate_decay_every
            decay_factor = opt.learning_rate_decay_rate**frac
            opt.current_lr = opt.learning_rate * decay_factor
        else:
            opt.current_lr = opt.learning_rate

        # Start using SCST (self-critical sequence training) after the warm-up epochs
        if opt.self_critical_after >= 0 and epoch >= opt.self_critical_after:
            sc_flag = True
            init_scorer(opt.cached_tokens)
        else:
            sc_flag = False
        ##
        # sc_flag = True
        # init_scorer(opt.cached_tokens)
        ##
        utils.set_lr(optimizer, opt.current_lr)

        data = loader.get_batch('train')

        tmp = [
            data['fc_feats'], data['att_feats'], data['labels'], data['masks'],
            data['att_masks']
        ]
        tmp = [_ if _ is None else torch.from_numpy(_).cuda() for _ in tmp]
        fc_feats, att_feats, labels, masks, att_masks = tmp

        optimizer.zero_grad()
        if not sc_flag:
            loss = crit(
                dp_model('forward', fc_feats, att_feats, labels, att_masks),
                labels[:, 1:], masks[:, 1:])
            # loss = crit(dp_model('forward', fc_feats, att_feats, labels, att_masks), labels, masks)
            # loss = crit(dp_model(fc_feats, att_feats, labels, att_masks), labels[:, 1:], masks[:, 1:])
        else:
            # Sample captions; the greedy (argmax) baseline is computed
            # inside get_self_critical_reward
            opt.sample_max = False
            gen_result, sample_logprobs = dp_model('sample', fc_feats,
                                                   att_feats, labels,
                                                   att_masks)
            opt.sample_max = True
            reward = get_self_critical_reward(dp_model, fc_feats, att_feats,
                                              att_masks, data, gen_result, opt)
            loss = rl_crit(sample_logprobs, gen_result.data,
                           torch.from_numpy(reward).float().cuda())

        loss.backward()

        torch.nn.utils.clip_grad_norm_(dp_model.parameters(), opt.grad_clip)

        train_loss = loss.item()

        optimizer.step()

        if iteration % opt.print_every == 0:
            torch.cuda.synchronize()
            end = time.time()
            if not sc_flag:
                print("iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}" \
                      .format(iteration, epoch, train_loss, end - start))
            else:
                print("iter {} (epoch {}), train_loss = {:.3f}, avg_reward = {:.3f}, time/batch = {:.3f}" \
                      .format(iteration, epoch, train_loss, np.mean(reward[:, 0]), end - start))
            start = time.time()

        iteration += 1

        if data['bounds']['wrapped']:
            epoch += 1
        #-------------------------------------------------------------------#
        if (iteration % opt.checkpoint_every == 0):
            add_summary_value(tb_summary_writer, 'train_loss', train_loss,
                              iteration)
            add_summary_value(tb_summary_writer, 'learning_rate',
                              opt.current_lr, iteration)

            if sc_flag:
                add_summary_value(tb_summary_writer, 'avg_reward',
                                  np.mean(reward[:, 0]), iteration)

            loss_history[iteration] = train_loss if not sc_flag else np.mean(
                reward[:, 0])
            lr_history[iteration] = opt.current_lr

        #-------------------------------------------------------------------#
        if (iteration % opt.save_every == 0):
            val_loss, predictions, lang_stats = eval.eval_split(
                dp_model, crit, loader, 'val', opt)
            add_summary_value(tb_summary_writer, 'validation loss', val_loss,
                              iteration)

            if lang_stats is not None:
                for k, v in lang_stats.items():
                    add_summary_value(tb_summary_writer, k, v, iteration)
                    print('{} : {}'.format(k, v))
            val_result_history[iteration] = {
                'loss': val_loss,
                'lang_stats': lang_stats,
                'predictions': predictions
            }

            current_score = lang_stats['CIDEr']
            if current_score > opt.best_cider_score:
                print('New Best Cider Score: {}'.format(current_score))
                opt.best_cider_score = current_score
                checkpoint_path = os.path.join(opt.checkpoint_path,
                                               'model-best.pth')
                torch.save(model.state_dict(), checkpoint_path)
                print('Save best model!')

        if epoch >= opt.max_epochs and opt.max_epochs >= 0:
            break
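
RewardCriterion above is the self-critical (SCST) policy-gradient loss. A common implementation, in the style of self-critical.pytorch (hedged: this repository's version is not shown), weights each sampled word's log-probability by its reward:

import torch
import torch.nn as nn

class RewardCriterion(nn.Module):
    def forward(self, input, seq, reward):
        # input: (batch, seq_len) log-probs of the sampled words
        # seq: (batch, seq_len) sampled word indices, 0 = padding
        # reward: (batch, seq_len) advantage (sampled score minus greedy baseline)
        input = input.contiguous().view(-1)
        reward = reward.contiguous().view(-1)
        # keep the first step plus every step whose previous word is not padding
        mask = (seq > 0).float()
        mask = torch.cat([mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1).view(-1)
        output = -input * reward * mask
        return torch.sum(output) / torch.sum(mask)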
Example #11
def main(args):
    # code.interact(local=locals())

    # 1. Load the data
    # load the sentences
    train_en, train_cn = utils.load_data(args.train_file)
    dev_en, dev_cn = utils.load_data(args.dev_file)
    # store the counts on args
    args.num_train = len(train_en)
    args.num_dev = len(dev_en)

    # 2. Build the word dictionaries
    if os.path.isfile(args.vocab_file):
        en_dict, cn_dict, en_total_words, cn_total_words = pickle.load(
            open(args.vocab_file, "rb"))
    else:
        # build the dictionaries
        en_dict, en_total_words = utils.build_dict(train_en)
        cn_dict, cn_total_words = utils.build_dict(train_cn)
        pickle.dump([en_dict, cn_dict, en_total_words, cn_total_words],
                    open(args.vocab_file, "wb"))
    # store the vocabulary sizes on args
    args.en_total_words = en_total_words
    args.cn_total_words = cn_total_words

    # invert the dictionaries: index -> word
    inv_en_dict = {v: k for k, v in en_dict.items()}
    inv_cn_dict = {v: k for k, v in cn_dict.items()}

    # encode the words: word -> index
    train_en, train_cn = utils.encode(train_en, train_cn, en_dict, cn_dict)
    dev_en, dev_cn = utils.encode(dev_en, dev_cn, en_dict, cn_dict)

    # convert to numpy tensors
    train_data = utils.gen_examples(train_en, train_cn, args.batch_size)
    dev_data = utils.gen_examples(dev_en, dev_cn, args.batch_size)

    # initialize the model
    if os.path.isfile(args.model_file):
        model = torch.load(args.model_file)
    elif args.model == "EncoderDecoderModel":
        model = EncoderDecoderModel(args)

    if args.use_cuda:
        model = model.cuda()

    # cross-entropy loss function
    crit = utils.LanguageModelCriterion()

    # evaluate metrics
    print("start evaluating on dev...")
    correct_count, loss, num_words = eval(model, dev_data, args, crit)

    loss = loss / num_words
    acc = correct_count / num_words
    print("dev loss %s" % (loss))
    print("dev accuracy %f" % (acc))
    print("dev total number of words %f" % (num_words))
    best_acc = acc

    # set the learning rate
    learning_rate = args.learning_rate

    # create the optimizer
    optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                               lr=learning_rate)

    total_num_sentences = 0.
    total_time = 0.
    for epoch in range(args.num_epoches):
        np.random.shuffle(train_data)
        total_train_loss = 0.
        total_num_words = 0.
        # iterate over the training minibatches with their indices
        for idx, (mb_x, mb_x_mask, mb_y,
                  mb_y_mask) in tqdm(enumerate(train_data)):
            # mini-batch size
            batch_size = mb_x.shape[0]
            total_num_sentences += batch_size
            # convert the numpy arrays to torch tensors, then wrap them in Variables
            mb_x = Variable(torch.from_numpy(mb_x)).long()
            mb_x_mask = Variable(torch.from_numpy(mb_x_mask)).long()

            # LSTM hidden state
            hidden = model.init_hidden(batch_size)
            # given prefix of the target sentence
            mb_input = Variable(torch.from_numpy(mb_y[:, :-1])).long()
            # target suffix of the sentence to predict
            mb_out = Variable(torch.from_numpy(mb_y[:, 1:])).long()
            mb_out_mask = Variable(torch.from_numpy(mb_y_mask[:, 1:]))

            if args.use_cuda:
                mb_x = mb_x.cuda()
                mb_x_mask = mb_x_mask.cuda()
                mb_input = mb_input.cuda()
                mb_out = mb_out.cuda()
                mb_out_mask = mb_out_mask.cuda()

            # model forward pass
            mb_pred, hidden = model(mb_x, mb_x_mask, mb_input, hidden)

            # cross-entropy loss measures the gap between mb_pred and mb_out
            loss = crit(mb_pred, mb_out, mb_out_mask)
            num_words = torch.sum(mb_out_mask).data[0]
            total_train_loss += loss.data[0] * num_words
            total_num_words += num_words

            # update the model
            # first clear the previous gradients
            optimizer.zero_grad()
            # compute gradients of the loss w.r.t. the parameters
            loss.backward()
            # take a gradient-descent step
            optimizer.step()

        # print the training loss
        print("training loss: %f" % (total_train_loss / total_num_words))

        # evaluate every eval_epoch epochs
        if (epoch + 1) % args.eval_epoch == 0:
            print("start evaluating on dev...")
            # run evaluation
            correct_count, loss, num_words = eval(model, dev_data, args, crit)
            # compute loss and accuracy
            loss = loss / num_words
            acc = correct_count / num_words
            print("dev loss %s" % (loss))
            print("dev accuracy %f" % (acc))
            print("dev total number of words %f" % (num_words))

            # save the model with the best accuracy so far
            if acc >= best_acc:
                torch.save(model, args.model_file)
                best_acc = acc
                print("model saved...")
            else:
                learning_rate *= 0.5
                optimizer = getattr(optim, args.optimizer)(model.parameters(),
                                                           lr=learning_rate)

            # print the best accuracy
            print("best dev accuracy: %f" % best_acc)
            print("#" * 60)

    # load the test data
    test_en, test_cn = utils.load_data(args.test_file)
    args.num_test = len(test_en)
    test_en, test_cn = utils.encode(test_en, test_cn, en_dict, cn_dict)
    test_data = utils.gen_examples(test_en, test_cn, args.batch_size)

    # evaluate on the test set
    correct_count, loss, num_words = eval(model, test_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("test loss %s" % (loss))
    print("test accuracy %f" % (acc))
    print("test total number of words %f" % (num_words))

    # evaluate on the training set
    correct_count, loss, num_words = eval(model, train_data, args, crit)
    loss = loss / num_words
    acc = correct_count / num_words
    print("train loss %s" % (loss))
    print("train accuracy %f" % (acc))
def main(args, opt):

    testpath = 'D:\\College\\Research\\2019 Video Captioning Attack Conference Paper\\youtube2text_preprocessed_for_arctic_capgen_vid\\youtube2text_iccv15\\dict_movieID_caption.pkl'

    with open(testpath, 'rb') as f:
        data = pickle.load(f, encoding='latin1')
    print(data)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    #model, videopath, targetcap, dataset, config, optimizer, crit, window

    #config: batch_size, c, learning rate, num it,input shape

    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # model = torch.nn.Sequential(torch.nn.Conv2d(in_channels=3, out_channels=96, kernel_size=3, padding=0, stride=2,
    #                                             bias=False), full_decoder)

    #loader, model, crit, optimizer, lr_scheduler, opt, rl_crit=None

    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            num_workers=16,
                            shuffle=True)

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)