def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)


    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    #video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    carlini = CarliniAttack(oracle=full_decoder, video_path=video_path, target=target_caption, dataset=dataset)

    carlini.execute(video_path)
def main(opt):

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    # S2VT_Attack arguments: model, videopath, targetcap, dataset, config, optimizer, crit, window

    # config keys: batch_size, c, learning_rate, num_iterations, input_shape

    config = {
        # Earlier runs used lr 0.005, dimensions 224, c = 100; best for Show-and-Fool was lr 0.06, c = 1.
        "batch_size": BATCH_SIZE,
        "c": 10000,
        "learning_rate": 0.2,
        "num_iterations": 1000,
        "input_shape": (224, 224),
        "num_frames": 288,
        "dimensions": 224,
        "k": 0.1,
        # "attack_algorithm": "showandfool"
        "attack_algorithm": "carliniwagner"
    }

    convnet = 'vgg16'
    # convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    full_decoder = ConvS2VT(convnet, model, opt)
    '''
    Layer freezing experiment.
    
    Top 10 contributing layers: 
    conv.cell_stem_1.comb_iter_0_right.separable_1.depthwise_conv2d.weight
    conv.cell_stem_1.comb_iter_2_right.separable_2.depthwise_conv2d.weight
    conv.cell_stem_1.comb_iter_1_right.separable_1.depthwise_conv2d.weight
    conv.cell_16.comb_iter_4_left.separable_1.depthwise_conv2d.weight
    conv.cell_17.comb_iter_4_left.separable_1.depthwise_conv2d.weight
    conv.cell_16.comb_iter_4_left.separable_1.pointwise_conv2d.weight
    conv.cell_13.comb_iter_4_left.bn_sep_1.weight
    conv.reduction_cell_0.conv_prev_1x1.bn.weight
    conv.cell_17.comb_iter_4_left.separable_2.depthwise_conv2d.weight
    conv.cell_13.comb_iter_0_left.bn_sep_1.weight
    
    
    '''

    top = open("top_layers.txt", "r")
    top_layers = top.readlines()
    top.close()
    print(top_layers)

    # Zero the gradients of the layers that shouldn't contribute;
    # an empty top_layers list freezes every layer in the loop below.
    top_layers = []

    for name, parameters in full_decoder.named_parameters():
        reset = True
        for f in top_layers:
            if name in f:
                reset = False

        if reset:
            # freeze the layer and clear any gradient it already accumulated
            parameters.requires_grad = False
            if parameters.grad is not None:
                print(name)
                parameters.grad.data.zero_()

    # for name, parameters in full_decoder.named_parameters():
    #     for f in top_layers:
    #         if name not in f:
    #             print(name)
    #             parameters.require_grad = False
    #             if parameters.grad is not None:
    #                 # parameters.data = 0
    #                 parameters.grad.data.zero_()
    #         else:
    #             # print(name)
    #             continue

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    #target_caption = np.random.choice(viable_target_captions)
    # 5 captions:
    '''
    <sos> A person is typing into a laptop computer <eos>
    <sos> A boy is kicking a soccer ball into the goal <eos>
    <sos> Someone is frying fish <eos>
    <sos> A dog is running with a ball <eos>
    <sos> The cat approaches on grass <eos>
    
    '''
    captions = {
        1: '<sos> A woman is talking <eos>',
        2: '<sos> A boy is kicking a soccer ball into the goal <eos>',
        3: '<sos> A man is frying fish <eos>',
        4: '<sos> A dog is running with a ball <eos>',
        5: '<sos> A cat is walking on grass <eos>'
    }

    #1 doesn't work
    videos = {

        #2 is too high res or something, replaced X6uJyuD_Zso_3_17.avi with nc8hwLaOyZU_1_19.avi
        #5,'ceOXCFUmxzA_100_110.avi' out of memory, replaced with 'X7sQq-Iu1gQ_12_22'
        #1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }
    #"D:\College\Research\December 2018 Video Captioning Attack\video captioner\YouTubeClips\AJJ-iQkbRNE_97_109.avi"
    # video_path = ''

    video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\' + videos[2]
    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    #Just switch the number to get a target caption.
    target_caption = captions[1]

    # Should use the original caption function we use in the attack because the scaling is slightly different
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    #video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    #/96 gives 3 frames
    # length = math.ceil(len(skvideo.io.vread(video_path,num_frames=config["num_frames"]))/96)
    #12 frames
    length = 3
    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    #model is full_decoder

    optimizer = ['Adam', (0.9, 0.999)]

    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # model, videopath, targetcap, dataset, config, optimizer, crit, window

    while frame_counter < length:
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1
        if length - frame_counter < BATCH_SIZE:
            window = [frame_counter, length]
            frame_counter = frame_counter + (length - frame_counter)
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))
            attack_package = S2VT_Attack(model=full_decoder,
                                         video_path=video_path,
                                         target=target_caption,
                                         dataset=dataset,
                                         config=config,
                                         optimizer=optimizer,
                                         crit=crit,
                                         seq_decoder=seq_decoder,
                                         window=window)
            carlini = Attack(attack_package=attack_package)
            finished_frames = carlini.execute(functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())

        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))

            attack_package = S2VT_Attack(model=full_decoder,
                                         video_path=video_path,
                                         target=target_caption,
                                         dataset=dataset,
                                         config=config,
                                         optimizer=optimizer,
                                         crit=crit,
                                         seq_decoder=seq_decoder,
                                         window=window)
            carlini = Attack(attack_package=attack_package)
            finished_frames = carlini.execute(functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())
            frame_counter = frame_counter + BATCH_SIZE

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    print("\nSaving to: {}".format(adv_path))
    # adv_frames_1 = np.concatenate(adv_frames, axis=0)
    # # batches = create_batches(adv_frames[0].astype(np.uint8), load_img_fn, tf_img_fn)
    # batches = exp_create_batches(adv_frames_1.astype(np.uint8), 3)
    # seq_prob, seq_preds = full_decoder(batches, mode='inference')
    # sents = utils.decode_sequence(vocab, seq_preds)

    # print("Adversarial Frames 1: {}".format(sents[0]))
    adv_frames = np.concatenate(adv_frames, axis=0)
    # batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
    # seq_prob, seq_preds = full_decoder(batches, mode='inference')
    # sents = utils.decode_sequence(vocab, seq_preds)
    #
    # print("Adversarial Frames 2: {}".format(sents[0]))

    outputfile = adv_path

    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            # huffyuv is lossless; r10k is really good

            # '-c:v': 'libx264',  # use the h.264 codec
            '-c:v': 'huffyuv',  # alternatives: r210, r10k
            # '-pix_fmt': 'rgb32',
            # '-crf': '0',  # constant rate factor 0 is lossless
            # '-preset': 'ultrafast'  # ultrafast..veryslow; the slower, the better the compression, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)

    writer.close()

    # np_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarialWINDOW')
    # np.save(np_path, adv_frames)
    #ffv1 0.215807946043995
    #huffyuv 0.21578424050191813
    #libx264 0.2341074901578537
    #r210 -0.7831487262059795, -0.7833399258537526
    #gif 0.6889478809555243
    #png 0.2158991440582696 0.21616862708842177
    #qtrle  0.21581286337807626
    #flashsv 0.21610510459932186 0.21600030673323545
    #ffvhuff 0.21620682250167533
    #r10k similar to r210
    #rawvideo 0.21595001

    with torch.no_grad():

        #getting a new model to see how it actually works now
        # full_decoder = ConvS2VT(convnet, model, opt)
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)

        frames = np.float32(frames)
        plt.imshow(frames[0] / 255.)
        plt.show()

        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        # use np.load to read the saved array back

        exp = np.load('difference_tmp.npy')

        # numpy_frames = np.load(np_path+'.npy')
        # print("Are numpy frames == adv frames: ", np.array_equal(numpy_frames, adv_frames))
        # print("Is the saved array equal to loaded array for difference: ", np.array_equal(exp, difference))

        frames = frames + difference

        # batches = exp_create_batches(numpy_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        # numpy_caption = sents[0]
        #
        # print("Numpy Frames exp: {}".format(numpy_caption))
        #

        # numpy_frames_tensor = torch.tensor(numpy_frames)
        # numpy_frames_tensor = numpy_frames_tensor.float()
        # batches = exp_create_batches(numpy_frames_tensor, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        # numpy_caption_tensor = sents[0]
        #
        # print("Numpy Frames tensor: {}".format(numpy_caption_tensor))

        # numpy_frames = numpy_frames.astype(np.uint8)
        # batches = create_batches(numpy_frames, load_img_fn, tf_img_fn)
        #
        # # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        #
        # print("Numpy Frames originalscale: {}".format(sents[0]))
        # # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        # frames = frames.astype(np.uint8)
        # batches = create_batches(frames, load_img_fn, tf_img_fn)

        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
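
# The examples here call create_batches(frames, load_img_fn, tf_img_fn) without
# showing it. A minimal sketch of such a helper, assuming it only maps raw
# uint8 frames through PIL and the pretrainedmodels TransformImage into one
# stacked tensor (the repository's real implementation may batch differently):
def create_batches_sketch(frames, load_img_fn, tf_img_fn):
    import torch

    # load_img_fn is PIL.Image.fromarray; tf_img_fn maps a PIL image
    # to a normalized (3, H, W) tensor
    imgs = [tf_img_fn(load_img_fn(f)) for f in frames]
    return torch.stack(imgs)  # (num_frames, 3, H, W)
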
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    vocab = dataset.get_vocab()

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    video_path = opt['videos'][0]
    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]
    # orig_captions = [' '.join(toks) for toks in dataset.vid_to_meta[vid_id]['final_captions']]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    target_caption = np.random.choice(viable_target_captions)
    interval = BATCH_SIZE

    num_seconds = 0.5
    numIt = 4  # int(24 * num_seconds)
    real_len = len(skvideo.io.vread(video_path))
    assert numIt <= real_len

    print("\t\t{} iterations to do.".format(numIt))
    counter = 0
    totalframes = []
    adv_batches = []

    while numIt > (interval - 1):
        window = range(counter, counter + interval)
        counter += interval
        carlini = CarliniAttack(oracle=full_decoder,
                                video_path=video_path,
                                target=target_caption,
                                dataset=dataset,
                                window=window)
        frames = carlini.execute(video_path, window=window, functional=True)
        totalframes.append(frames.detach().cpu().numpy())
        adv_batches.append(
            create_batches(frames, batch_size=interval).detach().cpu().numpy())
        numIt -= interval
        print("\t\tWindow {}".format(numIt))

    if numIt > 0:
        window = range(counter, counter + numIt)
        carlini = CarliniAttack(oracle=full_decoder,
                                video_path=video_path,
                                target=target_caption,
                                dataset=dataset,
                                window=window)
        frames = carlini.execute(video_path, window=window, functional=True)
        totalframes.append(frames.detach().cpu().numpy())
        adv_batches.append(
            create_batches(frames, batch_size=interval).detach().cpu().numpy())
        print("\t\tWindow {}".format(numIt))

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarial.avi')

    frames = np.concatenate(totalframes, axis=0)
    save_frames_to_video(frames, adv_path)

    batches = np.concatenate(adv_batches, axis=0)

    with torch.no_grad():
        print(frames.shape)

        # bp ---
        seq_prob, seq_preds = full_decoder(batches,
                                           mode='inference',
                                           single_batch=False)
        sents = vcp_utils.decode_sequence(vocab, seq_preds)

        print(sents[0])
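
# The example above calls save_frames_to_video(frames, adv_path), which is not
# shown. A minimal sketch, assuming it writes an (N, H, W, 3) array in
# [0, 255] losslessly, mirroring the FFmpegWriter setup in the other examples:
def save_frames_to_video_sketch(frames, path):
    import numpy as np
    import skvideo.io

    writer = skvideo.io.FFmpegWriter(path, outputdict={'-c:v': 'huffyuv'})
    for f in frames.astype(np.uint8):
        writer.writeFrame(f)
    writer.close()
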
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    time_stamp = get_time_stamp()

    if not os.path.isdir(os.path.join(opt['adv_dir'], time_stamp)):
        os.makedirs(os.path.join(opt['adv_dir'], time_stamp))

    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    video_names = os.listdir(opt['from_dir'])

    viable_ids = dataset.splits['test'] + dataset.splits['val']

    for vn in video_names:
        video_path = os.path.join(opt['from_dir'], vn)
        vid_id = video_path.split('\\')[-1]
        vid_id = vid_id.split('.')[0]
        orig_captions = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[vid_id]['final_captions']
        ]
        original_caption = np.random.choice(orig_captions)

        viable_target_captions = []
        for v_id in viable_ids:
            if v_id == vid_id:
                continue
            plausible_caps = [
                ' '.join(toks)
                for toks in dataset.vid_to_meta[v_id]['final_captions']
                if len(toks) <= MAX_TARGET_LEN
            ]
            viable_target_captions.extend(plausible_caps)

        target_caption = np.random.choice(viable_target_captions)

        carlini = CarliniAttack(oracle=full_decoder,
                                video_path=video_path,
                                target=target_caption,
                                dataset=dataset)

        stats_obj = carlini.execute(video_path, functional=True, stats=True)
        stats_obj['original_caption'] = original_caption
        stats_obj['target_caption'] = target_caption

        base_name = ''.join(vn.split('.')[:-1])
        adv_path = os.path.join(opt['adv_dir'], time_stamp,
                                base_name + '_adversarial.avi')
        adv_raw_path = os.path.join(opt['adv_dir'], time_stamp,
                                    base_name + '_adversarial.pkl')

        save_tensor_to_video(stats_obj['pass_in'], adv_path)
        # write the perturbation to its own file so it does not overwrite the
        # adversarial video ('_delta.avi' is an assumed name)
        delta_path = os.path.join(opt['adv_dir'], time_stamp,
                                  base_name + '_delta.avi')
        save_tensor_to_video(stats_obj['delta'], delta_path)
        pickle_write(adv_raw_path, stats_obj)
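
# pickle_write(adv_raw_path, stats_obj) above is not shown either; assuming it
# is a thin wrapper over the pickle module, a sketch:
def pickle_write_sketch(path, obj):
    import pickle

    with open(path, 'wb') as f:
        pickle.dump(obj, f)
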
def main(opt):
    def loss(seq_prob, crit):
        loss = crit(seq_prob, tlabel[:, 1:].cuda(), tmask[:, 1:].cuda())
        return loss

    def produce_t_mask():
        mask = torch.zeros(dataset.max_len)
        captions = [target_caption.split(' ')]
        gts = torch.zeros(len(captions), dataset.max_len).long()
        for i, cap in enumerate(captions):
            if len(cap) > dataset.max_len:
                cap = cap[:dataset.max_len]
                cap[-1] = '<eos>'
            for j, w in enumerate(cap):
                gts[i, j] = dataset.word_to_ix[w]

        label = gts[0]
        non_zero = (label == 0).nonzero()
        mask[:int(non_zero[0]) + 1] = 1

        return label.unsqueeze(0), mask.unsqueeze(0)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    #model, videopath, targetcap, dataset, config, optimizer, crit, window

    #config: batch_size, c, learning rate, num it,input shape

    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    #Random target caption
    # target_caption = np.random.choice(viable_target_captions)
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    # Should use the original caption function we use in the attack because the scaling is slightly different
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    #video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    #/96 gives 3 frames
    length = math.ceil(
        len(skvideo.io.vread(video_path, num_frames=config["num_frames"])) /
        96)

    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    #model is full_decoder

    optimizer = optim.Adam(full_decoder.parameters(),
                           lr=0.005,
                           betas=(0.9, 0.999))

    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # model, videopath, targetcap, dataset, config, optimizer, crit, window

    frames = skvideo.io.vread(video_path)[0:BATCH_SIZE]
    original = torch.tensor(frames)
    original = (original.float()).cuda()

    batch = exp_create_batches(frames_to_do=original, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
        feats, mode='inference')

    tlabel, tmask = produce_t_mask()

    cost = loss(seq_prob, crit)

    optimizer.zero_grad()
    cost.backward()
    original_grads = {}
    for name, parameter in full_decoder.named_parameters():
        original_grads[name] = parameter.grad

    print(len(original_grads.keys()))
    # for key, value in original_grads.items():
    #     print(key)

    #Adversarial

    full_decoder = ConvS2VT(convnet, model, opt)

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    adv_frames = skvideo.io.vread(adv_path)
    adv_frames = np.float32(adv_frames)

    adv_frames = torch.tensor(adv_frames)
    adv_frames = (adv_frames.float()).cuda()

    batch = exp_create_batches(frames_to_do=adv_frames, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
        feats, mode='inference')

    tlabel, tmask = produce_t_mask()

    cost = loss(seq_prob, crit)

    optimizer = optim.Adam(full_decoder.parameters(),
                           lr=0.005,
                           betas=(0.9, 0.999))

    optimizer.zero_grad()
    cost.backward()
    adv_grads = {}
    for name, parameter in full_decoder.named_parameters():
        adv_grads[name] = parameter.grad

    # for key, value in adv_grads.items():
    #     print(key)

    print('\n\n\n------')
    for key, value in adv_grads.items():
        if 'weight' in key:
            print(key)

    output = open("s2vt_weightoutput.txt", "w")

    l2norm_layers = []
    for key, value in original_grads.items():
        if 'weight' in key and value is not None:
            adv_weight = adv_grads[key]
            # print(value, adv_weight)
            # relative L2 change between the clean and adversarial gradients;
            # move to CPU numpy so np.linalg.norm also works for CUDA tensors
            diff = (value - adv_weight).detach().cpu().numpy()
            net_change = np.linalg.norm(diff) / np.linalg.norm(
                value.detach().cpu().numpy())
            output.write("{}, {}\n".format(key, net_change))
            l2norm_layers.append([key, net_change])
    output.close()
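
# The loop above reduces each layer to the relative L2 change
# ||g_orig - g_adv|| / ||g_orig|| between clean and adversarial gradients.
# The same statistic in isolation, as a sketch over torch tensors:
def relative_grad_change(g_orig, g_adv):
    import torch

    return (torch.norm(g_orig - g_adv) / torch.norm(g_orig)).item()
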
def main(opt):

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    #Random target caption
    # target_caption = np.random.choice(viable_target_captions)
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    with torch.no_grad():
        frames = skvideo.io.vread(video_path)

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    #video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    # /96 gives 3 frames; keep length an integer for the window arithmetic below
    length = int(np.ceil(len(skvideo.io.vread(video_path)) / 96))

    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)
    while frame_counter < length:
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1
        if length - frame_counter < BATCH_SIZE:
            window = [frame_counter, length]
            frame_counter = frame_counter + (length - frame_counter)
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))
            carlini = CarliniAttack(oracle=full_decoder,
                                    video_path=video_path,
                                    target=target_caption,
                                    dataset=dataset,
                                    window=window)
            finished_frames = carlini.execute(video_path,
                                              window=window,
                                              functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())

        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))
            carlini = CarliniAttack(oracle=full_decoder,
                                    video_path=video_path,
                                    target=target_caption,
                                    dataset=dataset,
                                    window=window)
            finished_frames = carlini.execute(video_path,
                                              window=window,
                                              functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())
            frame_counter = frame_counter + BATCH_SIZE

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    print("\nSaving to: {}".format(adv_path))

    adv_frames = np.concatenate(adv_frames, axis=0)

    outputfile = adv_path

    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            # huffyuv is lossless; r10k is really good

            # '-c:v': 'libx264',  # use the h.264 codec
            '-c:v': 'huffyuv',  # alternatives: r210, r10k
            # '-pix_fmt': 'rgb32',
            # '-crf': '0',  # constant rate factor 0 is lossless
            # '-preset': 'ultrafast'  # ultrafast..veryslow; the slower, the better the compression, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)

    writer.close()

    #ffv1 0.215807946043995
    #huffyuv 0.21578424050191813
    #libx264 0.2341074901578537
    #r210 -0.7831487262059795, -0.7833399258537526
    #gif 0.6889478809555243
    #png 0.2158991440582696 0.21616862708842177
    #qtrle  0.21581286337807626
    #flashsv 0.21610510459932186 0.21600030673323545
    #ffvhuff 0.21620682250167533
    #r10k similar to r210
    #rawvideo 0.21595001

    with torch.no_grad():
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)

        frames = np.float32(frames)

        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        # use np.load to read the saved array back

        exp = np.load('difference_tmp.npy')

        print("Is the saved array equal to loaded array for difference: ",
              np.array_equal(exp, difference))

        frames = frames + difference

        # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        # frames = frames.astype(np.uint8)
        # batches = create_batches(frames, load_img_fn, tf_img_fn)

        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
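
# The example above slides a BATCH_SIZE window over the frame indices, with a
# shorter final window for the remainder. The same schedule factored out as a
# generator (a sketch; endpoints follow the example's [start, end] convention):
def frame_windows_sketch(length, batch_size):
    frame_counter = 0
    while frame_counter < length:
        if length - frame_counter < batch_size:
            yield [frame_counter, length]
            frame_counter = length
        else:
            yield [frame_counter, frame_counter + batch_size - 1]
            frame_counter += batch_size
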
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    length = len(skvideo.io.vread(video_path)) // 8
    print("Total number of frames: {}".format(len(
        skvideo.io.vread(video_path))))
    print("Total number of frames to do: {}".format(length))

    with torch.no_grad():
        frames = skvideo.io.vread(video_path)

        # bp ---

        attn_weights = []

        total_iterations = np.ceil(length / BATCH_SIZE)
        iteration = 1
        frame_counter = 0

        while frame_counter < length:
            if length - frame_counter < BATCH_SIZE:
                batches = create_batches(frames[frame_counter:int(length)],
                                         load_img_fn, tf_img_fn)
                attn = full_decoder(batches, mode='inference', get_attn=True)
                frame_counter = frame_counter + (length - frame_counter)
            else:
                batches = create_batches(
                    frames[frame_counter:frame_counter + BATCH_SIZE - 1],
                    load_img_fn, tf_img_fn)
                attn = full_decoder(batches, mode='inference', get_attn=True)
                frame_counter = frame_counter + BATCH_SIZE
            # print(attn.shape, attn[0].shape, type(attn))

            attn = attn.cpu().detach().numpy().tolist()[0]

            print("Weights for batch {}: {}".format(iteration, attn))
            for f in attn:
                attn_weights.append(f)
            iteration = iteration + 1

            # attn_weights.append(attn.cpu().detach().numpy().tolist()[0])

        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches,
                                           mode='inference',
                                           get_attn=False)
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    print(attn_weights)

    att_window = np.sort(
        np.argpartition(attn_weights,
                        -ATTACK_BATCH_SIZE)[-ATTACK_BATCH_SIZE:]).tolist()

    print("Indices of frames with highest attention weights: {}".format(
        att_window))
    #video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption must be defined before the attack below
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    adv_frames = []
    carlini = CarliniAttack(oracle=full_decoder,
                            video_path=video_path,
                            target=target_caption,
                            dataset=dataset,
                            att_window=att_window)
    finished_frames = carlini.execute(video_path,
                                      att_window=att_window,
                                      functional=True)
    adv_frames.append(finished_frames.detach().cpu().numpy())

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarial.avi')

    print("\nSaving to: {}".format(adv_path))
    adv_frames = np.concatenate(adv_frames, axis=0)
    outputfile = adv_path
    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            '-vcodec': 'libx264',  # use the h.264 codec
            '-crf': '0',  # constant rate factor 0 is lossless
            '-vb': '50M',
            '-r': '25',
            '-preset': 'ultrafast'  # the slower the preset, the better the compression, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)
    print(len(adv_frames))

    # skvideo.io.vwrite(adv_path, adv_frames)
    writer.close()

    with torch.no_grad():
        a_frames = skvideo.io.vread(adv_path)

        # frames = skvideo.io.vread(video_path)

        # for f in range(0, len(att_window)):
        #     frames[att_window[f]] = a_frames[f]

        # frames = frames[:50]
        # frames = adv_frames
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        # plt.imshow(frames[0])
        # plt.show()
        #
        # plt.imshow(adv_frames[0]/255.)
        # plt.show()

        # bp ---

        batches = create_batches(a_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
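
# The example above picks the ATTACK_BATCH_SIZE frames with the highest
# attention weights via np.argpartition; the same selection in isolation
# (a sketch on toy numbers, not values from the example):
def top_k_indices_sketch(weights, k):
    import numpy as np

    return np.sort(np.argpartition(weights, -k)[-k:]).tolist()

# top_k_indices_sketch([0.1, 0.9, 0.3, 0.7], 2) -> [1, 3]
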
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    # convnet = 'vgg16'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)
    #D:\College\Research\December 2018 Video Captioning Attack\video captioner\YouTubeClips\CNN_SaYwh6chmiw_15_40.npy
    videos = {

        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19_adversarialWINDOW.avi',
        3: 'O2qiPS2NCeY_2_18_adversarialWINDOW.avi',
        4: 'kI6MWZrl8v8_149_161_adversarialWINDOW.avi',
        5: 'X7sQq-Iu1gQ_12_22_adversarialWINDOW.avi',
        6: '77iDIp40m9E_159_181_adversarialWINDOW.avi',
        7: 'SaYwh6chmiw_15_40_adversarialWINDOW.avi',
        8: 'pFSoWsocv0g_8_17_adversarialWINDOW.avi',
        9: 'HmVPxs4ygMc_44_53_adversarialWINDOW.avi',
        10: 'glii-kazad8_21_29_adversarialWINDOW.avi',
        11: 'AJJ-iQkbRNE_97_109_adversarialWINDOW.avi'
    }

    videos_CNN = {

        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }

    #video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # video_path = opt['videos'][0]

    modelname = 'nasnetalarge'

    o_video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\' + videos_CNN[2]

    video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\{}Adversarial_'.format(modelname) + videos_CNN[2]

    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\vgg16Adversarial_SaYwh6chmiw_15_40.avi'

    numpy_path = "D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\{}CNN_{}.npy".format(
        modelname, videos_CNN[2].split('.')[0])
    adv_frames = np.load(numpy_path)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    print(video_path)
    with torch.no_grad():
        frames = skvideo.io.vread(o_video_path)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Original: ", sents[0])

        frames = skvideo.io.vread(video_path)
        print("Total frames: {}".format(len(frames)))
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        plt.imshow(frames[0] / 255.)
        plt.show()

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial huffyuv: ", sents[0])

        np_frames = adv_frames.astype(np.uint8)
        print("Numpy CNN frames \nTotal frames: {}".format(len(np_frames)))
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        plt.imshow(np_frames[0] / 255.)
        plt.show()

        # bp ---
        batches = create_batches(np_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial numpy: ", sents[0])
def main(args, opt):

    testpath = 'D:\\College\\Research\\2019 Video Captioning Attack Conference Paper\\youtube2text_preprocessed_for_arctic_capgen_vid\\youtube2text_iccv15\\dict_movieID_caption.pkl'

    with open(testpath, 'rb') as f:
        data = pickle.load(f, encoding='latin1')
    print(data)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    #model, videopath, targetcap, dataset, config, optimizer, crit, window

    #config: batch_size, c, learning rate, num it,input shape

    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # model = torch.nn.Sequential(torch.nn.Conv2d(in_channels=3, out_channels=96, kernel_size=3, padding=0, stride=2,
    #                                             bias=False), full_decoder)

    # train(loader, model, crit, optimizer, lr_scheduler, opt, rl_crit=None)

    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset,
                            batch_size=opt["batch_size"],
                            num_workers=16,
                            shuffle=True)

    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])

    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
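
# The StepLR scheduler above multiplies the learning rate by
# learning_rate_decay_rate every learning_rate_decay_every epochs. A
# standalone toy showing the decay (illustrative values, not the
# repository's settings):
def steplr_decay_demo():
    import torch
    from torch import optim

    p = torch.nn.Parameter(torch.zeros(1))
    toy_opt = optim.Adam([p], lr=0.005)
    toy_sched = optim.lr_scheduler.StepLR(toy_opt, step_size=10, gamma=0.5)
    for _ in range(20):
        toy_opt.step()    # optimizer step first,
        toy_sched.step()  # then one scheduler step per epoch
    print(toy_opt.param_groups[0]['lr'])  # 0.00125 after two decays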