Example #1
def infer_recitation_to_text(args):
    prefix = args.filename_prefix
    decoder_model = tf.keras.models.load_model(os.path.join(args.output_dir, f'decoder-model-{prefix}.h5'))
    encoder_model = tf.keras.models.load_model(os.path.join(args.output_dir, f'encoder_model-{prefix}.h5'))
    print("Models loaded")

    encoder_input_data, decoder_input_data, decoder_target_data = get_seq2seq_data()
    print("Data loaded")

    max_decoder_seq_length = decoder_input_data.shape[1]
    num_decoder_tokens = decoder_input_data.shape[-1]

    one_hot_obj = get_one_hot_encodings()
    reverse_target_char_index = one_hot_obj['int_to_char']
    reverse_target_char_index[num_decoder_tokens - 2] = '->'
    reverse_target_char_index[num_decoder_tokens - 1] = '<-'

    # Perform inference on some of the audio files
    with open(os.path.join(args.output_dir, f'inference-{prefix}.txt'), 'w') as f:
        num_predict = args.num_predict
        if num_predict == -1:
            num_predict = encoder_input_data.shape[0]
        for seq_index in range(num_predict):
            print(seq_index, end=' ')
            input_seq = encoder_input_data[seq_index: seq_index + 1]
            decoded_sentence = decode_sequence(
                input_seq, num_decoder_tokens, encoder_model, decoder_model, max_decoder_seq_length)

            true_array = decoder_target_data[seq_index]
            true_sentence = ''
            for pos in range(true_array.shape[0]):
                sampled_token_index = np.argmax(true_array[pos])
                sampled_char = reverse_target_char_index[sampled_token_index]
                true_sentence += sampled_char
            f.write(true_sentence + ',' + decoded_sentence + '\n')
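The decode_sequence helper called above is not part of this snippet. Below is a minimal sketch of what such a helper typically looks like for a Keras character-level encoder/decoder pair, assuming LSTM models that return hidden and cell states, the same '->' start and '<-' end token indices used above, and a module-level reverse_target_char_index lookup; the actual implementation may differ.

import numpy as np

def decode_sequence(input_seq, num_decoder_tokens, encoder_model,
                    decoder_model, max_decoder_seq_length):
    # Encode the input sequence into the decoder's initial states (assumes LSTM).
    states_value = encoder_model.predict(input_seq)
    # Seed the decoder with the one-hot start token ('->' above).
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, num_decoder_tokens - 2] = 1.0

    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        # Stop on the end token ('<-' above) or once the sentence is long enough.
        if (sampled_token_index == num_decoder_tokens - 1
                or len(decoded_sentence) >= max_decoder_seq_length):
            break
        # reverse_target_char_index is assumed to be defined at module level,
        # mirroring the lookup built in the caller above.
        decoded_sentence += reverse_target_char_index[sampled_token_index]
        # Feed the sampled character back in and carry the states forward.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0
        states_value = [h, c]
    return decoded_sentence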
Example #2
def val(split="val"):
    net.eval()
    data_val = CocoCaptionsFeature(fc_dir=opt.input_fc_dir,
                                   att_dir=opt.input_att_dir,
                                   label_file=opt.input_label_h5,
                                   info_file=opt.input_json,
                                   split=split,
                                   opt=opt)
    evalloader = iter(
        DataLoader(data_val, batch_size=opt.val_images_use, num_workers=1))

    #loader = tqdm(enumerate(trainloader), total=len(trainloader), ascii=True)
    fc, att, labels = next(evalloader)

    if use_cuda:
        fc, att, labels = fc.cuda(), att.cuda(), labels.cuda()
    fc, att, labels = Variable(fc, requires_grad=False), Variable(
        att, requires_grad=False), Variable(labels, requires_grad=False)
    fc = torch.stack([fc] * opt.seq_per_img).view(-1, *fc.shape[1:])
    att = torch.stack([att] * opt.seq_per_img).view(-1, *att.shape[1:])
    labels = labels.transpose(1, 0).contiguous().view(-1, *labels.shape[2:])

    labels = labels.long()
    outputs, *_ = net(fc_feats=fc, att_feats=att)
    #loss = criterion(outputs, labels)

    txts = utils.decode_sequence(data.dictionary, outputs.data)
    for txt in txts:
        print(txt)
Example #3
def test():
    net.eval()

    loader = tqdm(enumerate(dataloader), total=len(dataloader), ascii=True)

    min_loss = 1e9

    for batch_idx, (fc, att, labels, data_info) in loader:
        if use_cuda:
            fc, att, labels = fc.cuda(), att.cuda(), labels.cuda()
        fc, att, labels = Variable(fc, requires_grad=False), Variable(att, requires_grad=False), Variable(labels, requires_grad=False)
        fc = torch.stack([fc]*opt.seq_per_img).view(-1, *fc.shape[1:])
        att = torch.stack([att]*opt.seq_per_img).view(-1, *att.shape[1:])
        origin_labels = labels.view(-1, *labels.shape[2:])
        labels = labels.transpose(1, 0).contiguous().view(-1, *labels.shape[2:])

        labels = labels.long()
        outputs, _ = net(fc_feats=fc, att_feats=att, seq=labels)
        loss = criterion(outputs, labels)

        if loss.data[0] < min_loss:
            min_loss = loss.data[0]

            outputs, alpha = net(fc_feats=fc, att_feats=att)
            min_txts = utils.decode_sequence(data.dictionary, outputs.data)
            min_txts_target = utils.decode_sequence(data.dictionary, origin_labels.data)
            file_path = data_info['file_path']

        loader.set_description("Loss: {:.6f} | Min Loss: {:.6f}".format(loss.data[0], min_loss))

        if min_loss < 1.54:
            break

    loader.set_description("Loss: {:.6f} | Min Loss: {:.6f}".format(loss.data[0], min_loss))

    for idx, (txt) in enumerate(min_txts):
        if idx % opt.seq_per_img == 0:
            print(file_path[idx // opt.seq_per_img])
        print(txt)
        print(min_txts_target[idx])
        if idx % opt.seq_per_img == 4:
            print("")

    print(min_loss)
    att_path = './alpha.pt'
    torch.save(alpha.data.cpu(), att_path)
Example #4
def eval_external_ensemble(ensemble, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    num_images = eval_kwargs.get('num_images', -1)
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    logger = eval_kwargs.get('logger')
    caption_model = eval_kwargs.get('caption_model')
    vocab_size = eval_kwargs.get('vocab_size')
    dump_path = eval_kwargs.get('dump_path')

    # Make sure in the evaluation mode
    for cnn_model in ensemble.cnn_models:
        cnn_model.eval()

    for model in ensemble.models:
        model.eval()

    loader.reset_iterator(split)

    n = 0
    predictions = []
    Feats = []
    seq_per_img = 5
    while True:
        data = loader.get_batch(split, seq_per_img=seq_per_img)
        n = n + loader.batch_size

        # forward the model to get loss
        images = data['images']
        images = Variable(torch.from_numpy(images), volatile=True).cuda()

        att_feats_ens, fc_feats_ens = ensemble.get_feats(images)
        seq, probs = ensemble.sample(fc_feats_ens, att_feats_ens, eval_kwargs)
        sents = utils.decode_sequence(loader.get_vocab(), seq)

        for k, sent in enumerate(sents):
            spath = short_path(data['infos'][k]['file_path'])
            print_sampled(spath, sent)
            entry = {'image_id': spath, 'caption': sent}
            predictions.append(entry)
            #  logger.debug('image %s: %s' %(entry['image_id'], entry['caption']))
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        #  logger.warn('ix1 = %d - ix0 = %d' % (ix1, ix0))
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        #  logger.debug('validation loss ... %d/%d (%f)' %(ix0 - 1, ix1, loss))
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    #  pickle.dump(Feats, open('cnn_features.pkl', 'w'))
    return predictions
Example #5
def eval_external(cnn_model, model, loader, eval_kwargs={}):
    num_images = eval_kwargs.get('num_images', -1)
    split = eval_kwargs.get('split', 'val')
    # serves no purpose except to have the same signature for get_batch
    beam_size = eval_kwargs.get('beam_size', 1)
    logger = eval_kwargs.get('logger')
    caption_model = eval_kwargs.get('caption_model')
    sample_max = eval_kwargs.get('sample_max', 1)
    temperature = eval_kwargs.get('temperature', 0.5)
    forbid_unk = eval_kwargs.get('forbid_unk', 1)

    print("Eval %s" % caption_model)

    # Make sure in the evaluation mode
    cnn_model.eval()
    model.eval()
    loader.reset_iterator(split)

    n = 0
    predictions = []
    seq_per_img = 1
    while True:
        data = loader.get_batch(split, seq_per_img=seq_per_img)
        n = n + loader.batch_size
        # forward the model to get loss
        images = data['images']
        images = Variable(torch.from_numpy(images), volatile=True).cuda()
        att_feats, fc_feats, att_unique, fc_unique = cnn_model.forward_caps(
            images, seq_per_img, return_unique=True)
        seq, _ = model.sample(
            fc_feats, att_feats, {
                'beam_size': beam_size,
                'forbid_unk': forbid_unk,
                "sample_max": sample_max,
                "temperature": temperature
            })
        sents = decode_sequence(loader.get_vocab(), seq)

        for k, sent in enumerate(sents):
            spath = short_path(data['infos'][k]['file_path'])
            entry = {'image_id': spath, 'caption': sent}
            print_sampled(spath, sent)
            predictions.append(entry)
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    # Switch back to training mode
    model.train()
    return predictions
Example #6
def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result,
                             vocab, cocoid2caps, seqLen, opt):
    # batch_size = gen_result.size(0)  # batch_size = sample_size * seq_per_img
    batch_size = len(gen_result)
    # seq_per_img = batch_size // len(data['gts'])

    # get greedy decoding baseline
    model.eval()
    with torch.no_grad():
        # greedy_res, _ = model(fc_feats, att_feats, att_masks=att_masks, mode='sample')
        word_idx, father_idx, mask = model._greedy_search(fc_feats,
                                                          att_feats,
                                                          max_seq_length=40)
    model.train()
    greedy_res = utils.decode_sequence(vocab, word_idx, father_idx, mask)

    res = OrderedDict()
    for i in range(batch_size):
        res[i] = [gen_result[i]]
    for i in range(batch_size):
        res[batch_size + i] = [greedy_res[i]]

    gts = OrderedDict()
    for i in range(batch_size):
        gts[i] = cocoid2caps[data['image_id'][i].item()]

    res_ = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)]
    res__ = {i: res[i] for i in range(2 * batch_size)}
    gts = {i: gts[i % batch_size] for i in range(2 * batch_size)}

    # for i in range(2 * batch_size):
    #     print(res[i], gts[i])

    if opt.cider_reward_weight > 0:
        _, cider_scores = CiderD_scorer.compute_score(gts, res_)
        print('Cider scores:', _ * 0.1)
    else:
        cider_scores = 0

    if opt.bleu_reward_weight > 0:
        _, bleu_scores = Bleu_scorer.compute_score(gts, res__)
        bleu_scores = np.array(bleu_scores[3])
        print('Bleu scores:', _[3])
    else:
        bleu_scores = 0

    scores = opt.cider_reward_weight * cider_scores + opt.bleu_reward_weight * bleu_scores
    scores = scores[:batch_size] - scores[batch_size:]
    scores = scores * 0.1
    print('Mean reward:', scores.mean())
    rewards = np.repeat(scores[:, np.newaxis], seqLen, 1)

    return rewards
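The per-timestep rewards returned above are typically plugged into a REINFORCE-style (self-critical) loss over the sampled captions. A minimal sketch, assuming sample_logprobs holds the log-probability of each sampled word (shape batch x seqLen) and mask is a 0/1 tensor of the same shape marking real tokens; neither tensor appears in the snippet above, so treat the names as hypothetical.

import torch

def rl_criterion(sample_logprobs, mask, rewards):
    # Weight each word's log-probability by its baseline-subtracted reward,
    # zero out padded positions, and average over the real tokens.
    rewards = torch.as_tensor(rewards, dtype=sample_logprobs.dtype,
                              device=sample_logprobs.device)
    loss = -sample_logprobs * rewards * mask
    return loss.sum() / mask.sum()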
Example #7
 def create_model(self, pred_sequence, inp_shape):
     ## change sequence to its decoded value
     pred_sequence = utils.decode_sequence(
         utils.vocab_dict(self.target_classes), pred_sequence)
     ## set optimizer parameters
     if self.optimizer == 'sgd':
         optim = optimizers.SGD(lr=self.lr,
                                decay=self.decay,
                                momentum=self.momentum)
     else:
         optim = getattr(optimizers, self.optimizer)(lr=self.lr,
                                                     decay=self.decay)
     ## generate a sequential architecture for the sequence
     ## add flatten if data is 3d or more
     if len(inp_shape) > 1:
         model = Sequential()
         model.add(Flatten(name='flatten', input_shape=inp_shape))
         for i in range(len(pred_sequence)):
             if pred_sequence[i] == 'dropout':
                 model.add(Dropout(self.dropout))
             else:
                 model.add(
                     Dense(units=pred_sequence[i][0],
                           activation=pred_sequence[i][1]))
         model.compile(loss=self.loss_func,
                       optimizer=optim,
                       metrics=self.metrics)
         return model
     else:
         model = Sequential()
         for i in range(len(pred_sequence)):
             if i == 0:
                 model.add(
                     Dense(units=pred_sequence[i][0],
                           activation=pred_sequence[i][1],
                           input_shape=inp_shape))
             elif pred_sequence[i] == 'dropout':
                 model.add(Dropout(self.dropout))
             else:
                 model.add(
                     Dense(units=pred_sequence[i][0],
                           activation=pred_sequence[i][1]))
         model.compile(loss=self.loss_func,
                       optimizer=optim,
                       metrics=self.metrics)
         return model
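For context, the decoded pred_sequence that the loops above iterate over is assumed to mix (units, activation) pairs with the literal string 'dropout'; a hypothetical example of that layout:

## Hypothetical decoded architecture: dense, dropout, dense, output layer.
## (The real encoding comes from utils.decode_sequence / utils.vocab_dict.)
example_sequence = [(128, 'relu'), 'dropout', (64, 'relu'), (10, 'softmax')]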
Example #8
def score_trads(preds, trg_loader, eval_kwargs):
    split = eval_kwargs.get('split', 'val')
    batch_size = eval_kwargs.get('batch_size', 80)
    verbose = eval_kwargs.get('verbose', 0)
    ground_truths = []
    trg_loader.reset_iterator(split)
    n = 0
    while True:
        # get batch
        data_trg = trg_loader.get_trg_batch(split,
                                            range(batch_size),
                                            batch_size)
        output_lines_trg_gold = data_trg['out_labels']
        n += batch_size
        # Decode a minibatch greedily __TODO__ add beam search decoding
        # Do the same for gold sentences
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    output_lines_trg_gold,
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        if not verbose:
            verb = not (n % 1000)
        else:
            verb = verbose
        for (l, gl) in zip(preds, sent_gold):
            ground_truths.append(gl)
            if verb:
                lg.print_sampled("", gl, l)
        ix1 = data_trg['bounds']['it_max']
        if data_trg['bounds']['wrapped']:
            break
        if n >= ix1:
            print('Evaluated the required samples (%s)' % n)
            break
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    scores = {'Bleu': bleu_moses}
    return scores
Example #9
def validate(model, criterion, loader, opt, max_iters=None, type='val'):
    model.eval()
    loader.reset()

    num_videos = loader.get_num_videos()
    batch_size = loader.get_batch_size()
    if max_iters is None:
        num_iters = int(math.ceil(num_videos * 1.0 / batch_size))
    else:
        num_iters = max_iters
    last_batch_size = num_videos % batch_size
    seq_per_img = loader.get_seq_per_img()
    model.set_seq_per_img(seq_per_img)

    loss_sum = 0
    logger.info('#num_iters: %d, batch_size: %d, seq_per_img: %d', num_iters,
                batch_size, seq_per_img)
    predictions = []
    gt_avglogps = []
    test_avglogps = []
    prec_recs = dict()
    for ii in range(num_iters):
        data = loader.get_batch()
        feats = data['feats']
        bfeats = data['bfeats']
        if loader.has_label:
            labels = data['labels']
            masks = data['masks']
            labels_svo = data['labels_svo']

        if ii == (num_iters - 1) and last_batch_size > 0:
            feats = [f[:last_batch_size] for f in feats]
            bfeats = [f[:last_batch_size] for f in bfeats]
            if loader.has_label:
                labels = labels[:last_batch_size *
                                seq_per_img]  # labels shape is DxN
                masks = masks[:last_batch_size * seq_per_img]
                labels_svo = labels_svo[:last_batch_size *
                                        seq_per_img]  # labels shape is DxN

        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            bfeats = [bfeat.cuda() for bfeat in bfeats]
            if loader.has_label:
                labels = labels.cuda()
                masks = masks.cuda()
                labels_svo = labels_svo.cuda()

        if loader.has_label and model.gt_concepts_while_testing == 0:
            pred, gt_seq, gt_logseq, _, _, _ = model(feats, bfeats, labels,
                                                     labels_svo)
            # memReport()
            if opt.output_logp == 1:
                gt_avglogp = utils.compute_avglogp(gt_seq, gt_logseq.data)
                gt_avglogps.extend(gt_avglogp)

            loss = criterion(pred, labels[:, 1:], masks[:, 1:])
            loss_sum += loss.item()
            del pred, gt_seq, gt_logseq
            torch.cuda.empty_cache()

        seq, logseq, _, concept_seq = model.sample(
            feats, bfeats, labels_svo, {'beam_size': opt.beam_size})
        sents = utils.decode_sequence(opt.vocab, seq)
        if opt.output_logp == 1:
            test_avglogp = utils.compute_avglogp(seq, logseq)
            test_avglogps.extend(test_avglogp)

        if concept_seq is not None:

            # if type == 'test':
            #     if concept_seq.shape[0] != 136:
            #         print()
            labels_svo = torch.reshape(
                labels_svo, (-1, opt.test_seq_per_img, opt.num_concepts))[:, 0]
            # concept_seq = torch.reshape(concept_seq, (-1, opt.test_seq_per_img, opt.num_concepts))[:, 0]

            concept_seq_words = utils.decode_sequence(opt.vocab, concept_seq)

            # Calculate TP,FP,FN for precision and recall calcs
            if opt.grounder_type in ['niuc', 'nioc', 'iuc', 'ioc']:
                gt_concept_seq_words = utils.decode_sequence(
                    opt.vocab, labels_svo)
                gt_concept_seq_words = [
                    g.split(' ') for g in gt_concept_seq_words
                ]
                for bi in range(len(gt_concept_seq_words)):
                    pr_words = list()
                    repeat = int(
                        len(gt_concept_seq_words) / len(concept_seq_words))

                    for pr in concept_seq_words[int(
                            math.floor(float(bi) / repeat))].split(' '):
                        pr_word = pr.split(' ')[0]
                        pr_words.append(pr_word)
                        if pr_word not in prec_recs:
                            prec_recs[pr_word] = [0, 0, 0]
                        if pr_word in gt_concept_seq_words[bi]:
                            prec_recs[pr_word][0] += 1  # TP
                        else:
                            prec_recs[pr_word][1] += 1  # FP
                    for gt in gt_concept_seq_words[bi]:
                        if gt not in prec_recs:
                            prec_recs[gt] = [0, 0, 0]
                        if gt not in pr_words:
                            prec_recs[gt][2] += 1  # FN
            try:
                for jj, (sent,
                         sent_svo) in enumerate(zip(sents, concept_seq_words)):
                    if opt.output_logp == 1:
                        entry = {
                            'image_id': data['ids'][jj],
                            'caption': sent,
                            'svo': sent_svo,
                            'avglogp': test_avglogp[jj],
                            'box_att': model.attention_record[jj].tolist()
                        }
                    else:
                        entry = {
                            'image_id': data['ids'][jj],
                            'caption': sent,
                            'svo': sent_svo
                        }  #, 'box_att': model.attention_record[jj].tolist()}  # todo removed fot transformer model
                    predictions.append(entry)
                    logger.debug('[%d] video %s: %s pr(%s) gt(%s)' %
                                 (jj, entry['image_id'], entry['caption'],
                                  entry['svo'], gt_concept_seq_words[jj]))
            except IndexError:
                print()
        else:

            for jj, sent in enumerate(sents):
                if opt.output_logp == 1:
                    entry = {
                        'image_id': data['ids'][jj],
                        'caption': sent,
                        'avglogp': test_avglogp[jj],
                        'box_att': model.attention_record[jj].tolist()
                    }
                else:
                    entry = {'image_id': data['ids'][jj], 'caption': sent}
                predictions.append(entry)
                logger.debug('[%d] video %s: %s' %
                             (jj, entry['image_id'], entry['caption']))

        del feats, labels, masks, labels_svo, seq, logseq
        torch.cuda.empty_cache()

    loss = round(loss_sum / num_iters, 3)
    results = {}
    lang_stats = {}

    if opt.language_eval == 1 and loader.has_label:
        logger.info('>>> Language evaluating ...')
        tmp_checkpoint_json = os.path.join(
            opt.model_file.split('.')[0] + '_' + type + '.json')
        json.dump(predictions, open(tmp_checkpoint_json, 'w'))
        lang_stats = utils.language_eval(loader.cocofmt_file,
                                         tmp_checkpoint_json)
        # os.remove(tmp_checkpoint_json)

    results['predictions'] = predictions
    results['scores'] = {'Loss': -loss}
    results['scores'].update(lang_stats)

    if opt.output_logp == 1:
        avglogp = sum(test_avglogps) / float(len(test_avglogps))
        results['scores'].update({'avglogp': avglogp})

        gt_avglogps = np.array(gt_avglogps).reshape(-1, seq_per_img)
        assert num_videos == gt_avglogps.shape[0]

        gt_avglogps_file = opt.model_file.replace('.pth', '_gt_avglogps.pkl',
                                                  1)
        cPickle.dump(gt_avglogps,
                     open(gt_avglogps_file, 'w'),
                     protocol=cPickle.HIGHEST_PROTOCOL)

        logger.info('Wrote GT logp to: %s', gt_avglogps_file)

    if len(prec_recs.keys()) > 0:
        prec = dict()
        rec = dict()
        for k, v in prec_recs.items():
            if v[0] + v[1] > 0:
                prec[k] = v[0] / float(v[0] + v[1])
            else:
                prec[k] = 0
            if v[0] + v[2] > 0:
                rec[k] = v[0] / float(v[0] + v[2])
            else:
                rec[k] = 0

        precv = sum(prec.values()) / len(prec_recs)
        recv = sum(rec.values()) / len(prec_recs)
        results['scores'].update({'prec': precv, 'rec': recv})
        print('prec: ', precv, ' .. rec: ', recv)
        logger.debug('prec: ' + str(prec))
        logger.debug('rec: ' + str(rec))
    return results
Example #10
def validate(model, criterion, loader, opt):
    model.eval()
    loader.reset()

    num_videos = loader.get_num_videos()
    batch_size = loader.get_batch_size()
    num_iters = int(math.ceil(num_videos * 1.0 / batch_size))
    last_batch_size = num_videos % batch_size
    seq_per_img = loader.get_seq_per_img()
    model.set_seq_per_img(seq_per_img)

    loss_sum = 0
    logger.info('#num_iters: %d, batch_size: %d, seq_per_img: %d', num_iters,
                batch_size, seq_per_img)
    predictions = []
    gt_avglogps = []
    test_avglogps = []
    for ii in range(num_iters):
        data = loader.get_batch()
        feats = [Variable(feat, volatile=True) for feat in data['feats']]
        if loader.has_label:
            labels = Variable(data['labels'], volatile=True)
            masks = Variable(data['masks'], volatile=True)

        if ii == (num_iters - 1) and last_batch_size > 0:
            feats = [f[:last_batch_size] for f in feats]
            if loader.has_label:
                labels = labels[:last_batch_size *
                                seq_per_img]  # labels shape is DxN
                masks = masks[:last_batch_size * seq_per_img]

        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            if loader.has_label:
                labels = labels.cuda()
                masks = masks.cuda()

        if loader.has_label:
            pred, gt_seq, gt_logseq = model(feats, labels)
            if opt.output_logp == 1:
                gt_avglogp = utils.compute_avglogp(gt_seq, gt_logseq.data)
                gt_avglogps.extend(gt_avglogp)

            loss = criterion(pred, labels[:, 1:], masks[:, 1:])
            loss_sum += loss.data[0]

        seq, logseq = model.sample(feats, {'beam_size': opt.beam_size})
        sents = utils.decode_sequence(opt.vocab, seq)
        if opt.output_logp == 1:
            test_avglogp = utils.compute_avglogp(seq, logseq)
            test_avglogps.extend(test_avglogp)

        for jj, sent in enumerate(sents):
            if opt.output_logp == 1:
                entry = {
                    'image_id': data['ids'][jj],
                    'caption': sent,
                    'avglogp': test_avglogp[jj]
                }
            else:
                entry = {'image_id': data['ids'][jj], 'caption': sent}
            predictions.append(entry)
            logger.debug('[%d] video %s: %s' %
                         (jj, entry['image_id'], entry['caption']))

    loss = round(loss_sum / num_iters, 3)
    results = {}
    lang_stats = {}

    if opt.language_eval == 1 and loader.has_label:
        logger.info('>>> Language evaluating ...')
        tmp_checkpoint_json = os.path.join(opt.model_file + str(uuid.uuid4()) +
                                           '.json')
        json.dump(predictions, open(tmp_checkpoint_json, 'w'))
        lang_stats = utils.language_eval(loader.cocofmt_file,
                                         tmp_checkpoint_json)
        os.remove(tmp_checkpoint_json)

    results['predictions'] = predictions
    results['scores'] = {'Loss': -loss}
    results['scores'].update(lang_stats)

    if opt.output_logp == 1:
        avglogp = sum(test_avglogps) / float(len(test_avglogps))
        results['scores'].update({'avglogp': avglogp})

        gt_avglogps = np.array(gt_avglogps).reshape(-1, seq_per_img)
        assert num_videos == gt_avglogps.shape[0]

        gt_avglogps_file = opt.model_file.replace('.pth', '_gt_avglogps.pkl',
                                                  1)
        cPickle.dump(gt_avglogps,
                     open(gt_avglogps_file, 'w'),
                     protocol=cPickle.HIGHEST_PROTOCOL)

        logger.info('Wrote GT logp to: %s', gt_avglogps_file)

    return results
Example #11
def evaluate_model(model, src_loader, trg_loader, logger, eval_kwargs):
    """Evaluate model."""
    preds = []
    ground_truths = []
    batch_size = eval_kwargs.get('batch_size', 1)
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk

    # Make sure to be in evaluation mode
    model.eval()
    src_loader.reset_iterator(split)
    trg_loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    while True:
        # get batch
        data_src, order = src_loader.get_src_batch(split, batch_size)
        tmp = [data_src['labels']]
        input_lines_src, = [Variable(torch.from_numpy(_),
                                    requires_grad=False).cuda()
                           for _ in tmp]
        src_lengths = data_src['lengths']
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        tmp = [data_trg['labels'], data_trg['out_labels'], data_trg['mask']]
        input_lines_trg_gold, output_lines_trg_gold, mask = [Variable(torch.from_numpy(_),
                                                                      requires_grad=False).cuda()
                                                             for _ in tmp]
        trg_lengths = data_trg['lengths']
        n += batch_size

        # decoder_logit = model(input_lines_src, input_lines_trg_gold)
        # if model.opt.sample_reward:
            # ml_loss, loss, stats = model.crit(model, input_lines_src, input_lines_trg_gold,
                                              # output_lines_trg_gold, mask)
        # else:
            # ml_loss, loss, stats = model.crit(decoder_logit, output_lines_trg_gold, mask)

        ml_loss, loss, _ = model.step(input_lines_src, src_lengths,
                                      input_lines_trg_gold, trg_lengths,
                                      output_lines_trg_gold,
                                      mask)
        loss_sum += loss.data.item()
        ml_loss_sum += ml_loss.data.item()
        loss_evals = loss_evals + 1
        # Initialize target with <BOS> for every sentence Index = 2
        # print('Sampling sentence')
        # print('GPU:', os.environ['CUDA_VISIBLE_DEVICES'])
        start = time.time()
        # print('>>> Sampling:')
        batch_preds, _ = model.sample(input_lines_src, src_lengths, opt=eval_kwargs)
        if isinstance(batch_preds, list):
            # with beam size, unpadded preds
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos)[0]
                          for pred in batch_preds]
        else:
            # decode
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos)
        # Do the same for gold sentences
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      input_lines_src.data.cpu().numpy(),
                                      eos=src_loader.eos, bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    output_lines_trg_gold.data.cpu().numpy(),
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        if not verbose:
            verb = not (n % 300)
        else:
            verb = verbose
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        ix1 = data_src['bounds']['it_max']
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    # print('Predictions lenght:', len(preds), len(ground_truths))
    # assert(len(preds) == trg_loader.h5_file['labels_val'].shape[0])
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses
Example #12
def eval_ensemble(ens_model, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    seq_length = eval_kwargs.get('seq_length', 16)
    split = eval_kwargs.get('split', 'test')
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    batch_size = eval_kwargs.get('batch_size', 1)
    val_images_use = eval_kwargs.get('val_images_use', -1)
    print('Evaluating ', val_images_use, ' images')

    # Make sure in the evaluation mode
    for cnn_model in ens_model.cnn_models:
        cnn_model.eval()

    for model in ens_model.models:
        model.eval()

    loader.reset_iterator(split)

    n = 0
    # loss_sum = 0
    # real_loss_sum = 0
    # loss_evals = 0
    predictions = []

    while True:
        # fetch a batch of data
        data = loader.get_batch(split, batch_size)
        n = n + batch_size

        #evaluate loss if we have the labels
        # loss = 0

        # Get the image features first
        tmp = [
            data['images'],
            data.get('labels', np.zeros(1)),
            data.get('masks', np.zeros(1)), data['scores']
        ]
        tmp = [
            Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp
        ]
        images, labels, masks, scores = tmp

        att_feats_ens = []
        fc_feats_ens = []
        for cnn_model in ens_model.cnn_models:
            att_feats, fc_feats = cnn_model.forward(images)
            att_feats_ens.append(att_feats)
            fc_feats_ens.append(fc_feats)
        # Evaluate the loss:
        # real_loss, loss = ens_model.step(data)
        # loss_sum = loss_sum + loss.data[0]
        # real_loss_sum += real_loss.data[0]
        # loss_evals = loss_evals + 1

        seq, probs = ens_model.sample(fc_feats_ens, att_feats_ens, eval_kwargs)
        sent_scores = probs.cpu().numpy().sum(axis=1)
        sents = utils.decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            # print('id:', data['infos'][k]['id'])
            entry = {
                'image_id': data['infos'][k]['id'],
                'caption': sent,
                'score': str(round(sent_scores[k], 4)),
                "source": 'gen'
            }
            predictions.append(entry)
            if verbose:
                print(('image %s (%s) %s' %
                       (entry['image_id'], entry['score'], entry['caption'])))
        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            ens_model.logger.warn('Evaluated the required samples (%s)' % n)
            break
    lang_stats = None
    unseen_grams = None
    if lang_eval == 1:
        lang_stats, unseen_grams = language_eval(dataset,
                                                 predictions,
                                                 ens_opt.logger,
                                                 get_creativity=False)
    # Switch back to training mode
    # model.train()
    return predictions, lang_stats
Example #13
def eval_multiple(cnn_model, model, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    score_ground_truth = eval_kwargs.get('score_ground_truth', False)
    n_gen = eval_kwargs.get('n_gen', 5)
    num_images = eval_kwargs.get('num_images', -1)
    seq_length = eval_kwargs.get('seq_length', 16)
    split = eval_kwargs.get('split', 'test')
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    batch_size = eval_kwargs.get('batch_size', 1)

    # Make sure in the evaluation mode
    cnn_model.eval()
    model.eval()
    loader.reset_iterator(split)

    n = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []

    while True:
        # fetch a batch of data
        data = loader.get_batch(split, batch_size)
        n = n + batch_size

        #evaluate loss if we have the labels
        loss = 0

        # Get the image features first
        tmp = [
            data['images'],
            data.get('labels', np.zeros(1)),
            data.get('masks', np.zeros(1)), data['scores']
        ]
        tmp = [
            Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp
        ]
        images, labels, masks, scores = tmp

        att_feats, fc_feats = cnn_model.forward(images)
        _att_feats = att_feats
        _fc_feats = fc_feats

        # forward the model to get loss
        if data.get('labels', None) is not None:
            att_feats = att_feats.unsqueeze(1).expand(*((
                att_feats.size(0),
                loader.seq_per_img,
            ) + att_feats.size()[1:])).contiguous().view(
                *((att_feats.size(0) * loader.seq_per_img, ) +
                  att_feats.size()[1:]))
            fc_feats = fc_feats.unsqueeze(1).expand(*((
                fc_feats.size(0),
                loader.seq_per_img,
            ) + fc_feats.size()[1:])).contiguous().view(
                *((fc_feats.size(0) * loader.seq_per_img, ) +
                  fc_feats.size()[1:]))

            input = model(fc_feats, att_feats, labels)
            probs = input
            N = input.size(0)
            mask = masks[:, 1:]
            target = labels[:, 1:]
            target = target[:, :input.size(1)]
            mask = mask[:, :input.size(1)]
            input = utils.to_contiguous(input).view(-1, input.size(2))
            target = utils.to_contiguous(target).view(-1, 1)
            mask = mask[:, :input.size(1)]
            mask = utils.to_contiguous(mask).view(-1, 1)
            output = input.gather(1, target) * mask
            output = output.cpu().data.numpy()
            # sum over seq_length
            gt_scores = [
                np.sum(output[seq_length * i:seq_length * (i + 1)])
                for i in np.arange(N)
            ]
            gt_sents = decode_sequence(loader.get_vocab(), labels[:, 1:].data)
            real_loss, loss = crit(probs, labels[:, 1:], masks[:, 1:], scores)
            loss_sum = loss_sum + loss.item()
            loss_evals = loss_evals + 1

        # forward the model to also get generated samples for each image
        # Only leave one feature for each image, in case duplicate sample
        fc_feats, att_feats = _fc_feats, _att_feats
        # forward the model to also get generated samples for each image
        for _ in range(n_gen):
            seq, probs = model.sample(fc_feats, att_feats, eval_kwargs)
            sent_scores = probs.cpu().numpy().sum(axis=1)
            #set_trace()
            sents = decode_sequence(loader.get_vocab(), seq)
            print('Gen:', len(sents), len(sent_scores))
            for k, sent in enumerate(sents):
                # print('id:', data['infos'][k]['id'])
                if loader.flip:
                    entry = {
                        'image_id': data['infos'][k // 2]['id'],
                        'caption': sent,
                        'score': str(round(sent_scores[k], 4)),
                        "source": 'gen'
                    }
                    if not k % 2:
                        unflipped = entry
                    else:
                        # compare the new entry to unflipped and keep the best candidate
                        # print('Comparing:', entry, ' to ', unflipped)
                        if float(entry['score']) > float(unflipped['score']):
                            predictions.append(entry)
                            # print('picking:', entry)
                        else:
                            predictions.append(unflipped)
                            # print('picking:', unflipped)
                else:
                    entry = {
                        'image_id': data['infos'][k]['id'],
                        'caption': sent,
                        'score': str(round(sent_scores[k], 4)),
                        "source": 'gen'
                    }
                    predictions.append(entry)
                if verbose:
                    # print(entry)
                    print(
                        ('%s >>  %s' % (entry['image_id'], entry['caption'])))
        if score_ground_truth:
            print('Gt:', len(gt_sents), len(gt_scores))
            for k, sent in enumerate(gt_sents):
                if loader.flip:
                    entry = {
                        'image_id':
                        data['infos'][k // (loader.seq_per_img * 2)]['id'],
                        'caption':
                        sent,
                        'score':
                        str(round(gt_scores[k], 4)),
                        "source":
                        'gt'
                    }
                else:
                    entry = {
                        'image_id':
                        data['infos'][k // loader.seq_per_img]['id'],
                        'caption': sent,
                        'score': str(round(gt_scores[k], 4)),
                        "source": 'gt'
                    }
                predictions.append(entry)
                if verbose:
                    print((
                        'image %s (GT : %s) %s' %
                        (entry['image_id'], entry['score'], entry['caption'])))

        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()

        if verbose:
            print('evaluating validation performance... %d/%d (%f)' %
                  (ix0 - 1, ix1, loss.item()))

        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break

    lang_stats = None
    unseen_grams = None
    if lang_eval == 1:
        lang_stats, unseen_grams = language_eval(dataset,
                                                 predictions,
                                                 logger=None)  # FIXME
    # Switch back to training mode
    model.train()
    return loss_sum / loss_evals, predictions, lang_stats, unseen_grams
Example #14
def eval_split(cnn_model, model, loader, logger, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', False)
    dataset = eval_kwargs.get('dataset', 'coco')
    split = eval_kwargs.get('split', 'val')
    val_images_use = eval_kwargs.get('val_images_use', -1)
    lang_eval = eval_kwargs.get('language_eval', 1)
    language_creativity = eval_kwargs.get('language_creativity', 1)
    all_metrics = eval_kwargs.get('all_metrics', 0)
    single_metrics = eval_kwargs.get('single_metrics', 0)

    beam_size = eval_kwargs.get('beam_size', 1)
    sample_max = eval_kwargs.get('sample_max', 1)
    temperature = eval_kwargs.get('temperature', 0.5)
    forbid_unk = eval_kwargs.get('forbid_unk', 1)
    batch_size = eval_kwargs.get('batch_size', 1)
    seq_per_img = eval_kwargs.get('seq_per_img')
    region_size = model.region_size
    # Make sure to be in the evaluation mode
    cnn_model.eval()
    model.eval()
    logger.warn('Evaluating the %s split (%d)' % (split, val_images_use))
    loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    predictions = []
    while True:
        data = loader.get_batch(split,
                                batch_size=batch_size,
                                seq_per_img=seq_per_img)
        n = n + loader.batch_size
        images = data['images']
        images = Variable(torch.from_numpy(images), requires_grad=False).cuda()
        att_feats, fc_feats, att_unique, fc_unique = cnn_model.forward_caps(
            images, seq_per_img, return_unique=True)
        ml_loss, loss, stats = model.step(data,
                                          att_feats,
                                          fc_feats,
                                          train=False)
        # print('Scores : ', stats)
        ml_loss_sum += ml_loss.item()
        loss_sum += loss.item()
        loss_evals = loss_evals + 1
        # TODO Only leave one feature for each image, in case duplicate sample
        seq, probs = model.sample(fc_unique,
                                  att_unique,
                                  opt={
                                      'beam_size': beam_size,
                                      "forbid_unk": forbid_unk,
                                      "sample_max": sample_max,
                                      "temperature": temperature
                                  })
        sent_scores = probs.cpu().numpy().sum(axis=1)
        sents = decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            if loader.flip:
                entry = {
                    'image_id': data['infos'][k // 2]['id'],
                    'caption': sent,
                    'score': sent_scores[k]
                }
                if not k % 2:
                    unflipped = entry
                else:
                    if entry['score'] > unflipped['score']:
                        del entry['score']
                        predictions.append(entry)
                    else:
                        del unflipped['score']
                        predictions.append(unflipped)
            else:
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                predictions.append(entry)
            print_sampled(entry['image_id'], entry['caption'])
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    lang_stats = None
    if lang_eval:
        lang_stats, preds, _ = language_eval(dataset, predictions, logger,
                                             all_metrics, single_metrics,
                                             language_creativity)
        print('preds:', preds)
    # Back to training:
    model.train()
    if model.cnn_finetuning:
        logger.warn('Finetuning cnn ON, filtering the BN layers')
        cnn_model.train()
        cnn_model.filter_bn()
    return ml_loss_sum / loss_evals, loss_sum / loss_evals, predictions, lang_stats, preds
Example #15
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    # convnet = 'vgg16'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)
    #D:\College\Research\December 2018 Video Captioning Attack\video captioner\YouTubeClips\CNN_SaYwh6chmiw_15_40.npy
    videos = {

        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19_adversarialWINDOW.avi',
        3: 'O2qiPS2NCeY_2_18_adversarialWINDOW.avi',
        4: 'kI6MWZrl8v8_149_161_adversarialWINDOW.avi',
        5: 'X7sQq-Iu1gQ_12_22_adversarialWINDOW.avi',
        6: '77iDIp40m9E_159_181_adversarialWINDOW.avi',
        7: 'SaYwh6chmiw_15_40_adversarialWINDOW.avi',
        8: 'pFSoWsocv0g_8_17_adversarialWINDOW.avi',
        9: 'HmVPxs4ygMc_44_53_adversarialWINDOW.avi',
        10: 'glii-kazad8_21_29_adversarialWINDOW.avi',
        11: 'AJJ-iQkbRNE_97_109_adversarialWINDOW.avi'
    }

    videos_CNN = {

        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }

    #video_path = 'D:\\College\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # video_path = opt['videos'][0]

    modelname = 'nasnetalarge'

    o_video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\' + videos_CNN[
        2]

    video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\{}Adversarial_'.format(modelname) + \
                 videos_CNN[2]

    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\vgg16Adversarial_SaYwh6chmiw_15_40.avi'

    numpy_path = "D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\{}CNN_{}.npy".format(
        modelname, videos_CNN[2].split('.')[0])
    adv_frames = np.load(numpy_path)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    print(video_path)
    with torch.no_grad():
        frames = skvideo.io.vread(o_video_path)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Original: ", sents[0])

        frames = skvideo.io.vread(video_path)
        print("Total frames: {}".format(len(frames)))
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        plt.imshow(frames[0] / 255.)
        plt.show()

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial huffyuv: ", sents[0])

        np_frames = adv_frames.astype(np.uint8)
        print("Numpy CNN frames \nTotal frames: {}".format(len(np_frames)))
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        plt.imshow(np_frames[0] / 255.)
        plt.show()

        # bp ---
        batches = create_batches(np_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial numpy: ", sents[0])
Example #16
 def architecture_search(self):
     ## initialise network modelling and controller instances
     self.nn = gnn.NeuralNetwork(self.target_classes)
     self.nn.optimizer = self.nn_optim
     self.nn.lr = self.nn_lr
     self.nn.decay = self.nn_decay
     self.nn.momentum = self.nn_momentum
     self.nn.dropout = self.dropout
     self.cntrl = lstm.LSTMController(self.max_len, self.nb_classes,
                                      self.target_classes,
                                      (1, self.max_len - 1), len(self.data))
     self.cntrl.lstm_dim = self.lstm_dim
     self.cntrl.use_attention = self.controller_attention
     self.cntrl.optimizer = self.controller_optim
     self.cntrl.lr = self.controller_lr
     self.cntrl.decay = self.controller_decay
     self.cntrl.momentum = self.controller_momentum
     ## start architecture search
     for n in range(self.cntrl_epochs):
         # self.pre_training = False
         print("Controller epoch:", n + 1)
         self.curr_epoch = n
         ## generate sequences using random probabilistic sampling
         sequences = self.cntrl.sample_arch_sequences(self.mc_samples)
         ## train predictor and get predicted accuracies for new sequences
         pred_val_acc = self.cntrl.get_predicted_accuracies_hybrid_model(
             sequences)
         ## for each randomly generated sample
         for i in range(len(sequences)):
             print("probabilistic sampling. model no:", i + 1)
             print(utils.decode_sequence(self.vocab, sequences[i]))
             ## create model. train model
             print("training model...")
             if self.target_classes == 2:
                 self.nn.loss_func = 'binary_crossentropy'
             model = self.nn.create_model(sequences[i],
                                          np.shape(self.x_data[0]))
             print("predicted validation accuracy:", pred_val_acc[i])
             x, y = utils.unison_shuffled_copies(self.x_data, self.y_data)
             history = self.nn.train_model(model, x, y, self.nn_epochs)
             ## condition to avoid error for nn_epochs = 1
             if len(history.history['val_acc']) == 1:
                 self.data.append([
                     sequences[i], history.history['val_acc'][0],
                     pred_val_acc[i]
                 ])
             else:
                 self.data.append([
                     sequences[i],
                     np.ma.average(history.history['val_acc'],
                                   weights=np.arange(
                                       1,
                                       len(history.history['val_acc']) + 1),
                                   axis=-1), pred_val_acc[i]
                 ])
         cntrl_sequences = pad_sequences(sequences,
                                         maxlen=self.max_len,
                                         padding='post')
         xc = cntrl_sequences[:, :-1].reshape(len(cntrl_sequences), 1,
                                              self.max_len - 1)
         yc = to_categorical(cntrl_sequences[:, -1], self.nb_classes)
         ## sequence, validation accuracy data sorted by validation accuracy
         print("[sequence, val acc, predicted val acc]")
         for data in self.data:
             print(data)
         ## train the controller
         val_acc_target = [item[1] for item in self.data]
         self.cntrl.train_hybrid_model(xc, yc,
                                       val_acc_target[-self.mc_samples:],
                                       self.custom_loss, len(self.data),
                                       self.hybrid_model_epochs)
     val_accs = [item[1] for item in self.data]
     sorted_idx = np.argsort(val_accs)[::-1]
     self.data = [self.data[x] for x in sorted_idx]
     print(
         "saving tested architectures, their validation accuracy and predicted accuracy..."
     )
     with open(
             'logdir/tested_archs_data{}.pkl'.format(
                 datetime.now().strftime("%H%M")), 'wb') as file:
         pickle.dump(self.data, file)
     print("saving encoding-decoding dictionary...")
     with open('logdir/encode_decode_dict.pkl', 'wb') as file:
         pickle.dump(self.vocab, file)
     return self.data
Example #17
def train(model,
          criterion,
          optimizer,
          train_loader,
          val_loader,
          opt,
          rl_criterion=None):

    infos = {
        'iter': 0,
        'epoch': 0,
        'start_epoch': 0,
        'best_score': float('-inf'),
        'best_iter': 0,
        'best_epoch': opt.max_epochs
    }

    checkpoint_checked = False
    rl_training = False
    seq_per_img = train_loader.get_seq_per_img()
    infos_history = {}

    if os.path.exists(opt.start_from):
        if os.path.isdir(opt.start_from):
            # loading the same model file at a different experiment dir
            start_from_file = os.path.join(opt.start_from,
                                           os.path.basename(opt.model_file))
        else:
            start_from_file = opt.start_from
        logger.info('Loading state from: %s', start_from_file)
        checkpoint = torch.load(start_from_file)
        model.load_state_dict(checkpoint['model'])
        infos = checkpoint['infos']
        infos['start_epoch'] = infos['epoch']
        checkpoint_checked = True  # this epoch is already checked
    else:
        logger.info('No checkpoint found! Training from scratch')

    if opt.use_rl == 1 and opt.use_rl_after == 0:
        opt.use_rl_after = infos['epoch']
        opt.use_cst_after = infos['epoch']
        train_loader.set_current_epoch(infos['epoch'])

    if opt.grounder_type in ['niuc', 'iuc']:
        # get class weights
        one_hot_sums = None
        totes = 0
        cur_index = train_loader.get_current_index()
        train_loader.reset()
        ep = infos['epoch']
        while True:
            data = train_loader.get_batch()
            labels_svo = data['labels_svo']
            one_hot = torch.clamp(
                torch.sum(torch.nn.functional.one_hot(
                    labels_svo, num_classes=model.vocab_size),
                          axis=1), 0, 1)
            one_hot[:, 0] = 0  # make the padding index 0
            totes += one_hot.shape[0]
            if one_hot_sums is None:
                one_hot_sums = torch.sum(one_hot, axis=0)
            else:
                one_hot_sums += torch.sum(one_hot, axis=0)

            if ep < train_loader.get_current_epoch():
                one_hot_negs = -one_hot_sums + totes
                pos_weight = one_hot_negs.type(torch.FloatTensor) / (
                    1 + one_hot_sums.type(torch.FloatTensor))
                pos_weight = pos_weight.cuda()

                train_loader.set_current_index(index=cur_index)
                break

    while True:
        t_start = time.time()
        model.train()
        data = train_loader.get_batch()
        feats = data['feats']
        bfeats = data['bfeats']
        labels = data['labels']
        masks = data['masks']
        labels_svo = data['labels_svo']
        masks_svo = data['masks_svo']

        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            bfeats = [bfeat.cuda() for bfeat in bfeats]
            labels = labels.cuda()
            masks = masks.cuda()
            labels_svo = labels_svo.cuda()
            masks_svo = masks_svo.cuda()

        # implement scheduled sampling
        opt.ss_prob = 0
        if opt.use_ss == 1 and infos['epoch'] >= opt.use_ss_after:
            annealing_prob = opt.ss_k / \
                (opt.ss_k + np.exp((infos['epoch'] - opt.use_ss_after) / opt.ss_k))
            opt.ss_prob = min(1 - annealing_prob, opt.ss_max_prob)
            model.set_ss_prob(opt.ss_prob)

        if opt.use_rl == 1 and infos[
                'epoch'] >= opt.use_rl_after and not rl_training:
            logger.info('Using RL objective...')
            rl_training = True
            bcmr_scorer = {
                'Bleu_4': Bleu(),
                'CIDEr': Cider(df=opt.train_cached_tokens),
                'METEOR': Meteor(),
                'ROUGE_L': Rouge(),
                'SPICE': Spice()
            }[opt.eval_metric]

            #logger.info('loading gt refs: %s', train_loader.cocofmt_file)
            #gt_refs = utils.load_gt_refs(train_loader.cocofmt_file)

        mixer_from = opt.mixer_from
        if opt.use_mixer == 1 and rl_training:
            #annealing_mixer = opt.ss_k / \
            #    (opt.ss_k + np.exp((infos['epoch'] - opt.use_rl_after) / opt.ss_k))
            #annealing_mixer = int(round(annealing_mixer * opt.seq_length))

            # -1 for annealing
            if opt.mixer_from == -1:
                annealing_mixer = opt.seq_length - int(
                    np.ceil((infos['epoch'] - opt.use_rl_after + 1) /
                            float(opt.mixer_descrease_every)))
                mixer_from = max(1, annealing_mixer)

            model.set_mixer_from(mixer_from)

        scb_captions = opt.scb_captions
        if opt.use_cst == 1 and rl_training:
            # if opt.use_cst == 1 and opt.ss_k == 0,
            # then do not use annealing, but use the fixed scb_captions provided
            #annealing_robust = opt.ss_k / \
            #    (opt.ss_k + np.exp((infos['epoch'] - opt.use_rl_after) / opt.ss_k))
            #annealing_robust = int(round((1 - annealing_robust) * seq_per_img))

            # do not use robust before fully mixed
            # if opt.use_mixer == 1 and mixer_from > 1:
            #    opt.use_cst_after = infos['epoch']

            # if opt.scb_captions is -1, then use the annealing value,
            # otherwise, use the set value
            if opt.scb_captions == -1:
                annealing_robust = int(
                    np.ceil((infos['epoch'] - opt.use_cst_after + 1) /
                            float(opt.cst_increase_every)))
                scb_captions = min(annealing_robust, seq_per_img - 1)

        optimizer.zero_grad()
        model.set_seq_per_img(seq_per_img)

        if rl_training:
            # sampling from model distribution
            # model_res, logprobs = model.sample(
            #    feats, {'sample_max': 0, 'expand_feat': opt.expand_feat, 'temperature': 1})

            # using mixer
            pred, model_res, logprobs, pred_svo, res_svo, logprobs_svo = model(
                feats, bfeats, labels, labels_svo)

            if opt.use_cst == 0:
                # greedy decoding baseline in SCST paper
                greedy_baseline, _, _, _ = model.sample(
                    [Variable(f.data, volatile=True) for f in feats],
                    [Variable(f.data, volatile=True) for f in bfeats], {
                        'sample_max': 1,
                        'expand_feat': opt.expand_feat
                    })

            if opt.use_cst == 1:
                bcmrscores = data['bcmrscores']
                reward, m_score, g_score = utils.get_cst_reward(
                    model_res,
                    data['gts'],
                    bcmr_scorer,
                    bcmrscores=bcmrscores,
                    expand_feat=opt.expand_feat,
                    seq_per_img=train_loader.get_seq_per_img(),
                    scb_captions=scb_captions,
                    scb_baseline=opt.scb_baseline,
                    use_eos=opt.use_eos,
                    use_mixer=opt.use_mixer)
            else:
                # use greedy baseline by default, compute self-critical reward
                reward, m_score, g_score = utils.get_self_critical_reward(
                    model_res,
                    greedy_baseline,
                    data['gts'],
                    bcmr_scorer,
                    expand_feat=opt.expand_feat,
                    seq_per_img=train_loader.get_seq_per_img(),
                    use_eos=opt.use_eos)

            loss = rl_criterion(
                model_res, logprobs,
                Variable(torch.from_numpy(reward).float().cuda(),
                         requires_grad=False))
            loss_svo = criterion(pred_svo, labels_svo,
                                 torch.ones(labels.shape).cuda())
            loss = loss + (opt.labda / 10.0) * loss_svo

        else:
            pred, _, _, pred_svo, svo_it, svo_gath = model(
                feats, bfeats, labels, labels_svo)
            loss_cap = criterion(pred,
                                 labels[:, 1:],
                                 masks[:, 1:],
                                 bcmrscores=torch.from_numpy(
                                     data['bcmrscores'].astype(
                                         np.float32)).cuda())
            if opt.grounder_type in ['None', 'none']:
                loss = loss_cap
            else:
                if opt.grounder_type in ['niuc', 'iuc']:  # unordered
                    svo_criterion = torch.nn.BCEWithLogitsLoss(
                        pos_weight=pos_weight)
                    concepts_one_hot = torch.clamp(
                        torch.sum(torch.nn.functional.one_hot(
                            labels_svo, num_classes=model.vocab_size),
                                  axis=1), 0, 1)
                    loss_svo = svo_criterion(
                        pred_svo[:, 0],
                        concepts_one_hot.type(torch.FloatTensor).cuda()
                    )  # pred_svo[: 0] undoes the repeat at the end of non_iterative_grounder()
                else:
                    loss_svo = criterion(pred_svo, labels_svo,
                                         torch.ones(labels.shape).cuda())
                    # loss_svo = criterion(pred_svo, labels_svo, masks_svo)

                if random.random() < 0.01:  # compare the svos during training
                    print('---------------------')
                    print(utils.decode_sequence(opt.vocab, pred.argmax(-1)))
                    print(utils.decode_sequence(opt.vocab, labels_svo)[0])
                    print(utils.decode_sequence(opt.vocab, svo_it)[0])
                loss = loss_cap + (opt.labda / 10.0) * loss_svo

        loss.backward()
        clip_grad_norm_(model.parameters(), opt.grad_clip)
        optimizer.step()
        # memReport()
        del pred, feats, labels, masks, labels_svo
        torch.cuda.empty_cache()

        infos['TrainLoss'] = loss.item()
        # in RL mode there is no separate captioning cross-entropy loss, so fall back to the total loss
        infos['CAPTrainLoss'] = loss.item() if rl_training else loss_cap.item()
        if opt.grounder_type not in ['None', 'none']:
            infos['SVOTrainLoss'] = loss_svo.item()
        else:
            infos['SVOTrainLoss'] = 0
        infos['mixer_from'] = mixer_from
        infos['scb_captions'] = scb_captions

        if infos['iter'] % opt.print_log_interval == 0:
            elapsed_time = time.time() - t_start

            log_info = [('Epoch', infos['epoch']), ('Iter', infos['iter']),
                        ('Loss', infos['TrainLoss']),
                        ('CAP Loss', infos['CAPTrainLoss']),
                        ('SVO Loss', infos['SVOTrainLoss'])]

            if rl_training:
                log_info += [('Reward', np.mean(reward[:, 0])),
                             ('{} (m)'.format(opt.eval_metric), m_score),
                             ('{} (b)'.format(opt.eval_metric), g_score)]

            if opt.use_ss == 1:
                log_info += [('ss_prob', opt.ss_prob)]

            if opt.use_mixer == 1:
                log_info += [('mixer_from', mixer_from)]

            if opt.use_cst == 1:
                log_info += [('scb_captions', scb_captions)]

            log_info += [('Time', elapsed_time)]
            logger.info(
                '%s',
                '\t'.join(['{}: {}'.format(k, v) for (k, v) in log_info]))

        infos['iter'] += 1

        if infos['epoch'] < train_loader.get_current_epoch():
            infos['epoch'] = train_loader.get_current_epoch()
            checkpoint_checked = False
            learning_rate = utils.adjust_learning_rate(
                opt, optimizer, infos['epoch'] - infos['start_epoch'])
            logger.info('===> Learning rate: %f: ', learning_rate)

        # checkpoint_checked = False
        # if 1:  # TODO (debugging): jump straight to validation
        if (infos['epoch'] >= opt.save_checkpoint_from
                and infos['epoch'] % opt.save_checkpoint_every == 0
                and not checkpoint_checked):
            # evaluate the validation performance
            results = validate(model, criterion, val_loader, opt)
            logger.info(
                'Validation output: %s',
                json.dumps(results['scores'], indent=4, sort_keys=True))
            # infos.update(results['scores'])

            # todo added training set eval to check for overfitting
            cur_index = train_loader.get_current_index()
            train_loader.reset()
            results_train = validate(model,
                                     criterion,
                                     train_loader,
                                     opt,
                                     max_iters=20,
                                     type='train')
            train_loader.set_current_index(index=cur_index)
            for k, v in results_train['scores'].items():
                results['scores']['Train_' + k] = v

            logger.info(
                'Training output: %s',
                json.dumps(results_train['scores'], indent=4, sort_keys=True))
            infos.update(results['scores'])

            check_model(model, opt, infos, infos_history)
            checkpoint_checked = True

        if (infos['epoch'] >= opt.max_epochs
                or infos['epoch'] - infos['best_epoch'] > opt.max_patience):
            logger.info('>>> Terminating...')
            break

    return infos
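The scheduled-sampling probability in the loop above follows an inverse-sigmoid schedule: it grows from near zero as the epoch passes opt.use_ss_after and is capped at opt.ss_max_prob. A standalone sketch of that schedule (the parameter values are illustrative, not the repository defaults):

import numpy as np

def scheduled_sampling_prob(epoch, use_ss_after=0, ss_k=25.0, ss_max_prob=0.25):
    # inverse-sigmoid decay of teacher forcing, mirroring the formula in train()
    annealing_prob = ss_k / (ss_k + np.exp((epoch - use_ss_after) / ss_k))
    return min(1.0 - annealing_prob, ss_max_prob)

for epoch in (0, 10, 25, 50, 100):
    print(epoch, round(scheduled_sampling_prob(epoch), 3))
# 0 0.038, 10 0.056, 25 0.098, 50 0.228, 100 0.25 (capped at ss_max_prob)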
Ejemplo n.º 18
0
def train(opt):
    # setup gpu
    try:
        import subprocess
        # gpu_id = subprocess.check_output('source gpu_setVisibleDevices.sh', shell=True)
        gpu_id = int(subprocess.check_output('gpu_getIDs.sh', shell=True))
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        opt.logger.warn('GPU ID: %s | available memory: %dM' \
                        % (os.environ['CUDA_VISIBLE_DEVICES'], get_gpu_memory(gpu_id)))

    except Exception:
        opt.logger.warn("Requested gpu_id : %s" % opt.gpu_id)
        os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
        opt.logger.warn('GPU ID: %s | available memory: %dM' \
                        % (os.environ['CUDA_VISIBLE_DEVICES'], get_gpu_memory(opt.gpu_id)))

    from loader import textDataLoader
    from utils import decode_sequence

    # reproducibility:
    opt.logger.info('Reading data ...')
    src_loader = textDataLoader(
        {
            'h5_file': opt.input_data_src + '.h5',
            'infos_file': opt.input_data_src + '.infos',
            "max_seq_length": opt.max_src_length,
            'batch_size': opt.batch_size
        },
        logger=opt.logger)

    trg_loader = textDataLoader(
        {
            'h5_file': opt.input_data_trg + '.h5',
            'infos_file': opt.input_data_trg + '.infos',
            "max_seq_length": opt.max_trg_length,
            'batch_size': opt.batch_size
        },
        logger=opt.logger)

    goon = True
    bound = 0
    while goon:
        # Load data from train split (0)
        data_src, order = src_loader.get_src_batch('test')
        input_lines_src = data_src['labels']
        data_trg = trg_loader.get_trg_batch('test', order)
        output_lines_trg = data_trg['out_labels']
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      input_lines_src,
                                      eos=src_loader.eos,
                                      bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    output_lines_trg,
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)

        for i, (src, trg) in enumerate(zip(sent_source, sent_gold)):
            if bound + i in [134, 1924, 2092]:
                print(bound + i, '>>>')
                print('Source:', src)
                print('Target:', trg)
        bound = data_src['bounds']['it_pos_now']
        goon = bound < 2100
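decode_sequence is used throughout these examples to turn index tensors back into sentences, but its implementation is not shown here. The following is only a rough sketch of the usual pattern (an index-to-word mapping, an optional BOS id to skip and an EOS/padding id to stop at), not the repository's actual helper:

def decode_sequence_sketch(ix_to_word, seqs, eos=0, bos=None):
    # map each row of token ids to a space-joined sentence
    sentences = []
    for row in seqs:
        words = []
        for ix in row:
            ix = int(ix)
            if bos is not None and ix == bos:
                continue
            if ix == eos:
                break
            words.append(ix_to_word[ix])
        sentences.append(' '.join(words))
    return sentences

# toy usage with a hypothetical vocabulary
print(decode_sequence_sketch({1: 'a', 2: 'dog', 3: 'runs'}, [[1, 2, 3, 0, 0]]))  # ['a dog runs']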
Ejemplo n.º 19
0
def main(opt):

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    #Random target caption
    # target_caption = np.random.choice(viable_target_captions)
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    with torch.no_grad():
        frames = skvideo.io.vread(video_path)

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    #video_path = 'D:\\College\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    #/96 gives 3 frames
    length = len(skvideo.io.vread(video_path)) / 96

    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)
    while (frame_counter < length):
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1
        if length - frame_counter < BATCH_SIZE:
            window = [frame_counter, length]
            frame_counter = frame_counter + (length - frame_counter)
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))
            carlini = CarliniAttack(oracle=full_decoder,
                                    video_path=video_path,
                                    target=target_caption,
                                    dataset=dataset,
                                    window=window)
            finished_frames = carlini.execute(video_path,
                                              window=window,
                                              functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())

        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))
            carlini = CarliniAttack(oracle=full_decoder,
                                    video_path=video_path,
                                    target=target_caption,
                                    dataset=dataset,
                                    window=window)
            finished_frames = carlini.execute(video_path,
                                              window=window,
                                              functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())
            frame_counter = frame_counter + BATCH_SIZE

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    print("\nSaving to: {}".format(adv_path))

    adv_frames = np.concatenate(adv_frames, axis=0)

    outputfile = adv_path

    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            #huffyuv is lossless. r10k is really good

            # '-c:v': 'libx264', #libx264 # use the h.264 codec
            '-c:v': 'huffyuv',  #r210 huffyuv r10k
            # '-pix_fmt': 'rgb32',
            # '-crf': '0', # set the constant rate factor to 0, which is lossless
            # '-preset': 'ultrafast'  # ultrafast/veryslow; the slower the preset, the better the compression, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)

    writer.close()

    #ffv1 0.215807946043995
    #huffyuv 0.21578424050191813
    #libx264 0.2341074901578537
    #r210 -0.7831487262059795, -0.7833399258537526
    #gif 0.6889478809555243
    #png 0.2158991440582696 0.21616862708842177
    #qtrle  0.21581286337807626
    #flashsv 0.21610510459932186 0.21600030673323545
    #ffvhuff 0.21620682250167533
    #r10k similar to r210
    #rawvideo 0.21595001

    with torch.no_grad():
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)

        frames = np.float32(frames)

        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        #loadtxt to load np array from txt

        exp = np.load('difference_tmp.npy')

        print("Is the saved array equal to loaded array for difference: ",
              np.array_equal(exp, difference))

        frames = frames + difference

        # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        # frames = frames.astype(np.uint8)
        # batches = create_batches(frames, load_img_fn, tf_img_fn)

        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
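The attack above walks over the video in windows of BATCH_SIZE frames, with a shorter final window when the frame count is not a multiple of the batch size. A small sketch of just that windowing logic (the frame counts are illustrative); note that, as in the original loop, full windows use an inclusive end index while the final window ends at the total length:

def frame_windows(num_frames, batch_size):
    # yields [start, end] windows, mirroring the while loop in main()
    frame_counter = 0
    while frame_counter < num_frames:
        if num_frames - frame_counter < batch_size:
            window = [frame_counter, num_frames]
            frame_counter = num_frames
        else:
            window = [frame_counter, frame_counter + batch_size - 1]
            frame_counter += batch_size
        yield window

print(list(frame_windows(num_frames=10, batch_size=4)))  # [[0, 3], [4, 7], [8, 10]]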
Ejemplo n.º 20
0
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    length = len(skvideo.io.vread(video_path)) / 8
    print("Total number of frames: {}".format(len(
        skvideo.io.vread(video_path))))
    print("Total number of frames to do: {}".format(length))

    with torch.no_grad():
        frames = skvideo.io.vread(video_path)

        # bp ---

        attn_weights = []

        total_iterations = np.ceil(length / BATCH_SIZE)
        iteration = 1
        frame_counter = 0

        while (frame_counter < length):
            if length - frame_counter < BATCH_SIZE:
                batches = create_batches(frames[frame_counter:int(length)],
                                         load_img_fn, tf_img_fn)
                attn = full_decoder(batches, mode='inference', get_attn=True)
                frame_counter = frame_counter + (length - frame_counter)
            else:
                batches = create_batches(
                    frames[frame_counter:frame_counter + BATCH_SIZE - 1],
                    load_img_fn, tf_img_fn)
                attn = full_decoder(batches, mode='inference', get_attn=True)
                frame_counter = frame_counter + BATCH_SIZE
            # print(attn.shape, attn[0].shape, type(attn))

            attn = attn.cpu().detach().numpy().tolist()[0]

            print("Weights for batch {}: {}".format(iteration, attn))
            for f in attn:
                attn_weights.append(f)
            iteration = iteration + 1

            # attn_weights.append(attn.cpu().detach().numpy().tolist()[0])

        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches,
                                           mode='inference',
                                           get_attn=False)
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    print(attn_weights)

    att_window = np.sort(
        np.argpartition(attn_weights,
                        -ATTACK_BATCH_SIZE)[-ATTACK_BATCH_SIZE:]).tolist()

    print("Indices of frames with highest attention weights: {}".format(
        att_window))
    #video_path = 'D:\\College\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # target_caption = '<sos> A man is moving a toy <eos>'
    # a target caption must be defined before the attack below, so the second option is uncommented here
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    adv_frames = []
    carlini = CarliniAttack(oracle=full_decoder,
                            video_path=video_path,
                            target=target_caption,
                            dataset=dataset,
                            att_window=att_window)
    finished_frames = carlini.execute(video_path,
                                      att_window=att_window,
                                      functional=True)
    adv_frames.append(finished_frames.detach().cpu().numpy())

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarial.avi')

    print("\nSaving to: {}".format(adv_path))
    adv_frames = np.concatenate(adv_frames, axis=0)
    outputfile = adv_path
    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            '-vcodec': 'libx264',  # use the h.264 codec
            '-crf': '0',  # constant rate factor 0, i.e. lossless
            '-vb': '50M',
            '-r': '25',
            '-preset': 'ultrafast'  # the slower the preset, the better the compression, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)
    print(len(adv_frames))

    # skvideo.io.vwrite(adv_path, adv_frames)
    writer.close()

    with torch.no_grad():
        a_frames = skvideo.io.vread(adv_path)

        # frames = skvideo.io.vread(video_path)

        # for f in range(0, len(att_window)):
        #     frames[att_window[f]] = a_frames[f]

        # frames = frames[:50]
        # frames = adv_frames
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        # plt.imshow(frames[0])
        # plt.show()
        #
        # plt.imshow(adv_frames[0]/255.)
        # plt.show()

        # bp ---

        batches = create_batches(a_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
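The attention-guided variant above selects the ATTACK_BATCH_SIZE frames with the largest attention weights using np.argpartition, then sorts the chosen indices back into temporal order. A tiny self-contained sketch of that selection (the weights are made up):

import numpy as np

attn_weights = np.array([0.05, 0.30, 0.10, 0.25, 0.02, 0.28])
k = 3  # stands in for ATTACK_BATCH_SIZE

# argpartition places the k largest entries in the last k slots (in no particular
# order); sorting the selected indices restores the frames' temporal order
att_window = np.sort(np.argpartition(attn_weights, -k)[-k:]).tolist()
print(att_window)  # [1, 3, 5]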
Ejemplo n.º 21
0
def main(opt):

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    #model, videopath, targetcap, dataset, config, optimizer, crit, window

    #config: batch_size, c, learning rate, num it,input shape

    config = {
        #lr 0.005 and dimensions 224, c was 100. #Best was 0.06 lr, c = 1 for show and fool.
        #
        "batch_size": BATCH_SIZE,
        "c": 10000,
        "learning_rate": 0.2,
        "num_iterations": 1000,
        "input_shape": (224, 224),
        "num_frames": 288,
        "dimensions": 224,
        "k": 0.1,
        # "attack_algorithm": "showandfool"
        "attack_algorithm": "carliniwagner"
    }

    convnet = 'vgg16'
    # convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    full_decoder = ConvS2VT(convnet, model, opt)
    '''
    Layer freezing experiment.
    
    Top 10 contributing layers: 
    conv.cell_stem_1.comb_iter_0_right.separable_1.depthwise_conv2d.weight
    conv.cell_stem_1.comb_iter_2_right.separable_2.depthwise_conv2d.weight
    conv.cell_stem_1.comb_iter_1_right.separable_1.depthwise_conv2d.weight
    conv.cell_16.comb_iter_4_left.separable_1.depthwise_conv2d.weight
    conv.cell_17.comb_iter_4_left.separable_1.depthwise_conv2d.weight
    conv.cell_16.comb_iter_4_left.separable_1.pointwise_conv2d.weight
    conv.cell_13.comb_iter_4_left.bn_sep_1.weight
    conv.reduction_cell_0.conv_prev_1x1.bn.weight
    conv.cell_17.comb_iter_4_left.separable_2.depthwise_conv2d.weight
    conv.cell_13.comb_iter_0_left.bn_sep_1.weight
    
    
    '''

    top = open("top_layers.txt", "r")
    top_layers = top.readlines()
    top.close()
    print(top_layers)

    #set the gradients on the layers you don't want to contribute to 0
    top_layers = []

    for name, parameters in full_decoder.named_parameters():
        reset = True
        for f in top_layers:
            if name in f:
                reset = False

        if reset:
            parameters.requires_grad = False  # note: the attribute is requires_grad, not require_grad
            if parameters.grad is not None:
                print(name)
                parameters.grad.data.zero_()

    # for name, parameters in full_decoder.named_parameters():
    #     for f in top_layers:
    #         if name not in f:
    #             print(name)
    #             parameters.require_grad = False
    #             if parameters.grad is not None:
    #                 # parameters.data = 0
    #                 parameters.grad.data.zero_()
    #         else:
    #             # print(name)
    #             continue

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    #target_caption = np.random.choice(viable_target_captions)
    # 5 captions:
    '''
    <sos> A person is typing into a laptop computer <eos>
    <sos> A boy is kicking a soccer ball into the goal <eos>
    <sos> Someone is frying fish <eos>
    <sos> A dog is running with a ball <eos>
    <sos> The cat approaches on grass <eos>
    
    '''
    captions = {
        1: '<sos> A woman is talking <eos>',
        2: '<sos> A boy is kicking a soccer ball into the goal <eos>',
        3: '<sos> A man is frying fish <eos>',
        4: '<sos> A dog is running with a ball <eos>',
        5: '<sos> A cat is walking on grass <eos>'
    }

    #1 doesn't work
    videos = {

        #2 is too high res or something, replaced X6uJyuD_Zso_3_17.avi with nc8hwLaOyZU_1_19.avi
        #5,'ceOXCFUmxzA_100_110.avi' out of memory, replaced with 'X7sQq-Iu1gQ_12_22'
        #1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }
    #"D:\College\Research\December 2018 Video Captioning Attack\video captioner\YouTubeClips\AJJ-iQkbRNE_97_109.avi"
    # video_path = ''

    video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\' + videos[
        2]
    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    #Just switch the number to get a target caption.
    target_caption = captions[1]

    #Should use the original caption function we use in the attack because the scaling is slightly different
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    #video_path = 'D:\\College\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    #/96 gives 3 frames
    # length = math.ceil(len(skvideo.io.vread(video_path,num_frames=config["num_frames"]))/96)
    #12 frames
    length = 3
    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    #model is full_decoder

    optimizer = ['Adam', (0.9, 0.999)]

    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # model, videopath, targetcap, dataset, config, optimizer, crit, window

    while (frame_counter < length):
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1
        if length - frame_counter < BATCH_SIZE:
            window = [frame_counter, length]
            frame_counter = frame_counter + (length - frame_counter)
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))
            attack_package = S2VT_Attack(model=full_decoder,
                                         video_path=video_path,
                                         target=target_caption,
                                         dataset=dataset,
                                         config=config,
                                         optimizer=optimizer,
                                         crit=crit,
                                         seq_decoder=seq_decoder,
                                         window=window)
            carlini = Attack(attack_package=attack_package)
            finished_frames = carlini.execute(functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())

        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]
            print("Using frames {}".format(window))
            print("Frame counter at: {}\nTotal length is: {}\n".format(
                frame_counter, length))

            attack_package = S2VT_Attack(model=full_decoder,
                                         video_path=video_path,
                                         target=target_caption,
                                         dataset=dataset,
                                         config=config,
                                         optimizer=optimizer,
                                         crit=crit,
                                         seq_decoder=seq_decoder,
                                         window=window)
            carlini = Attack(attack_package=attack_package)
            finished_frames = carlini.execute(functional=True)
            adv_frames.append(finished_frames.detach().cpu().numpy())
            frame_counter = frame_counter + BATCH_SIZE

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    print("\nSaving to: {}".format(adv_path))
    # adv_frames_1 = np.concatenate(adv_frames, axis=0)
    # # batches = create_batches(adv_frames[0].astype(np.uint8), load_img_fn, tf_img_fn)
    # batches = exp_create_batches(adv_frames_1.astype(np.uint8), 3)
    # seq_prob, seq_preds = full_decoder(batches, mode='inference')
    # sents = utils.decode_sequence(vocab, seq_preds)

    # print("Adversarial Frames 1: {}".format(sents[0]))
    adv_frames = np.concatenate(adv_frames, axis=0)
    # batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
    # seq_prob, seq_preds = full_decoder(batches, mode='inference')
    # sents = utils.decode_sequence(vocab, seq_preds)
    #
    # print("Adversarial Frames 2: {}".format(sents[0]))

    outputfile = adv_path

    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            #huffyuv is lossless. r10k is really good

            # '-c:v': 'libx264', #libx264 # use the h.264 codec
            '-c:v': 'huffyuv',  #r210 huffyuv r10k
            # '-pix_fmt': 'rgb32',
            # '-crf': '0', # set the constant rate factor to 0, which is lossless
            # '-preset': 'ultrafast'  # ultrafast/veryslow; the slower the preset, the better the compression, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)

    writer.close()

    # np_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarialWINDOW')
    # np.save(np_path, adv_frames)
    #ffv1 0.215807946043995
    #huffyuv 0.21578424050191813
    #libx264 0.2341074901578537
    #r210 -0.7831487262059795, -0.7833399258537526
    #gif 0.6889478809555243
    #png 0.2158991440582696 0.21616862708842177
    #qtrle  0.21581286337807626
    #flashsv 0.21610510459932186 0.21600030673323545
    #ffvhuff 0.21620682250167533
    #r10k similar to r210
    #rawvideo 0.21595001

    with torch.no_grad():

        #getting a new model to see how it actually works now
        # full_decoder = ConvS2VT(convnet, model, opt)
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)

        frames = np.float32(frames)
        plt.imshow(frames[0] / 255.)
        plt.show()

        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        #loadtxt to load np array from txt

        exp = np.load('difference_tmp.npy')

        # numpy_frames = np.load(np_path+'.npy')
        # print("Are numpy frames == adv frames: ", np.array_equal(numpy_frames, adv_frames))
        # print("Is the saved array equal to loaded array for difference: ", np.array_equal(exp, difference))

        frames = frames + difference

        # batches = exp_create_batches(numpy_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        # numpy_caption = sents[0]
        #
        # print("Numpy Frames exp: {}".format(numpy_caption))
        #

        # numpy_frames_tensor = torch.tensor(numpy_frames)
        # numpy_frames_tensor = numpy_frames_tensor.float()
        # batches = exp_create_batches(numpy_frames_tensor, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        # numpy_caption_tensor = sents[0]
        #
        # print("Numpy Frames tensor: {}".format(numpy_caption_tensor))

        # numpy_frames = numpy_frames.astype(np.uint8)
        # batches = create_batches(numpy_frames, load_img_fn, tf_img_fn)
        #
        # # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')
        #
        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        # sents = utils.decode_sequence(vocab, seq_preds)
        #
        # print("Numpy Frames originalscale: {}".format(sents[0]))
        # # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(adv_frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)

        # batches = exp_create_batches(frames, BATCH_SIZE)
        # feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        # seq_prob, seq_preds = full_decoder.encoder_decoder_forward(feats, mode='inference')

        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        # frames = frames.astype(np.uint8)
        # batches = create_batches(frames, load_img_fn, tf_img_fn)

        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward((batches.unsqueeze(0)))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')

        # seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
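The layer-freezing experiment above keeps only a whitelist of layers trainable by clearing requires_grad (and any stale gradients) on everything else. A minimal sketch of that pattern on a toy model; the layer names here are hypothetical:

import torch.nn as nn

model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
keep = ['2.weight']  # hypothetical whitelist of parameter names to keep trainable

for name, param in model.named_parameters():
    param.requires_grad = any(name in entry for entry in keep)

print([(n, p.requires_grad) for n, p in model.named_parameters()])
# only '2.weight' stays trainable; frozen parameters never receive gradients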
Ejemplo n.º 22
0
def main(opt):
    def loss(seq_prob, crit):
        loss = crit(seq_prob, tlabel[:, 1:].cuda(), tmask[:, 1:].cuda())
        return loss

    def produce_t_mask():
        mask = torch.zeros(dataset.max_len)
        captions = [target_caption.split(' ')]
        gts = torch.zeros(len(captions), dataset.max_len).long()
        for i, cap in enumerate(captions):
            if len(cap) > dataset.max_len:
                cap = cap[:dataset.max_len]
                cap[-1] = '<eos>'
            for j, w in enumerate(cap):
                gts[i, j] = dataset.word_to_ix[w]

        label = gts[0]
        non_zero = (label == 0).nonzero()
        mask[:int(non_zero[0]) + 1] = 1

        return label.unsqueeze(0), mask.unsqueeze(0)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    #model, videopath, targetcap, dataset, config, optimizer, crit, window

    #config: batch_size, c, learning rate, num it,input shape

    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    #'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]

    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    #Random target caption
    # target_caption = np.random.choice(viable_target_captions)
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    #Should use the original caption function we use in the attack because the scaling is slightly different
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])

        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)

        original_caption = sents[0]

    #video_path = 'D:\\College\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'

    #/96 gives 3 frames
    length = math.ceil(
        len(skvideo.io.vread(video_path, num_frames=config["num_frames"])) /
        96)

    print("Total number of frames: {}".format(length))
    adv_frames = []
    iteration = 1
    frame_counter = 0

    total_iterations = np.ceil(length / BATCH_SIZE)

    #model is full_decoder

    optimizer = optim.Adam(full_decoder.parameters(),
                           lr=0.005,
                           betas=(0.9, 0.999))

    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # model, videopath, targetcap, dataset, config, optimizer, crit, window

    frames = skvideo.io.vread(video_path)[0:BATCH_SIZE]
    original = torch.tensor(frames)
    original = (original.float()).cuda()

    batch = exp_create_batches(frames_to_do=original, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
        feats, mode='inference')

    tlabel, tmask = produce_t_mask()

    cost = loss(seq_prob, crit)

    optimizer.zero_grad()
    cost.backward()
    original_grads = {}
    for name, parameter in full_decoder.named_parameters():
        original_grads[name] = parameter.grad

    print(len(original_grads.keys()))
    # for key, value in original_grads.items():
    #     print(key)

    #Adversarial

    full_decoder = ConvS2VT(convnet, model, opt)

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    adv_frames = skvideo.io.vread(adv_path)
    adv_frames = np.float32(adv_frames)

    adv_frames = torch.tensor(adv_frames)
    adv_frames = (adv_frames.float()).cuda()

    batch = exp_create_batches(frames_to_do=adv_frames, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
        feats, mode='inference')

    tlabel, tmask = produce_t_mask()

    cost = loss(seq_prob, crit)

    optimizer = optim.Adam(full_decoder.parameters(),
                           lr=0.005,
                           betas=(0.9, 0.999))

    optimizer.zero_grad()
    cost.backward()
    adv_grads = {}
    for name, parameter in full_decoder.named_parameters():
        adv_grads[name] = parameter.grad

    # for key, value in adv_grads.items():
    #     print(key)

    print('\n\n\n------')
    for key, value in adv_grads.items():
        if 'weight' in key:
            print(key)

    output = open("s2vt_weightoutput.txt", "w")

    l2norm_layers = []
    for key, value in original_grads.items():
        if 'weight' in key:
            if (value is not None):
                adv_weight = adv_grads[key]
                # print(value, adv_weight)
                diff = value - adv_weight
                net_change = np.linalg.norm(diff) / np.linalg.norm(value)
                output.write("{}, {}\n".format(key, net_change))
                l2norm_layers.append([key, net_change])
    output.close()
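The comparison above logs, per weight tensor, the relative L2 change between the gradient computed on the clean video and the one computed on the adversarial video. A small sketch of that metric on toy tensors:

import torch

def relative_grad_change(g_clean, g_adv):
    # ||g_clean - g_adv|| / ||g_clean||, the per-layer quantity written to the log file
    return (torch.norm(g_clean - g_adv) / torch.norm(g_clean)).item()

g_clean = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
g_adv = torch.tensor([[1.5, 2.0], [2.0, 4.0]])
print(relative_grad_change(g_clean, g_adv))  # ~0.204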
Ejemplo n.º 23
0
 def train_best_architectures(self,
                              best_archs,
                              use_shared_weights=False,
                              earlyStopping=True):
     if use_shared_weights and earlyStopping:
         mode = 'sw_eS'
     elif use_shared_weights:
         mode = 'sw'
     elif earlyStopping:
         mode = 'eS'
     else:
         mode = 'full'
     val_accs = []
     max_val_acc = 0.
     for seq in best_archs[:self.nb_final_archs]:
         if self.target_classes == 2:
             self.nn.loss_func = 'binary_crossentropy'
         ## train every model
         print("architecture:", utils.decode_sequence(self.vocab, seq))
         model = self.nn.create_model(seq, np.shape(self.x_data[0]))
         ## use early stopping
         if earlyStopping:
             callbacks = [EarlyStopping(monitor='val_acc', patience=0)]
         else:
             callbacks = None
         x, y = utils.unison_shuffled_copies(self.x_data, self.y_data)
         if use_shared_weights:
             ## use pre-trained shared weights without updating them
             history = self.nn.train_model(model,
                                           x,
                                           y,
                                           self.final_nn_train_epochs,
                                           validation_split=0.1,
                                           update_shared_weights=False,
                                           callbacks=callbacks)
         else:
             history = model.fit(x,
                                 y,
                                 epochs=self.final_nn_train_epochs,
                                 validation_split=0.1,
                                 callbacks=callbacks)
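         ## epoch-weighted average of validation accuracy (later epochs count more)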
         val_accs.append(
             np.ma.average(history.history['val_acc'],
                           weights=np.arange(
                               1,
                               len(history.history['val_acc']) + 1),
                           axis=-1))
         ## store model, model_weights if mean weighted rolling
         ## validation accuracy better than previous models
         if val_accs[-1] > max_val_acc:
             best_arch_vals = {}
             best_arch_vals.update({tuple(seq): model.get_weights()})
             max_val_acc = val_accs[-1]
     ## return validation accuracy of all trained architectures
     ## return best architecture, its weights
     best_archs_dict = {}
     for i in range(self.nb_final_archs):
         best_archs_dict.update({tuple(best_archs[i]): val_accs[i]})
     top_arch = utils.decode_sequence(self.vocab,
                                      list(list(best_arch_vals.keys())[0]))
     print("top {} architectures:".format(self.nb_final_archs),
           best_archs[:self.nb_final_archs])
     print("corresponding validation accuracies:", val_accs)
     print("best architecture:", top_arch)
     print("it's validation accuracy:", max_val_acc)
     print("saving best weights...")
     best_weights_file = 'logdir/best_arch_weights{}{}.pkl'.format(
         mode,
         datetime.now().strftime("%H%M"))
     with open(best_weights_file, 'wb') as file:
         pickle.dump(best_arch_vals, file)
     print("saving top architectures and their validation accuracies...")
     best_archs_file = 'logdir/top{}archs{}{}.pkl'.format(
         self.nb_final_archs, mode,
         datetime.now().strftime("%H%M"))
     with open(best_archs_file, 'wb') as file:
         pickle.dump(best_archs_dict, file)
     return val_accs, top_arch
Ejemplo n.º 24
0
def generate_caps(encoder, decoder, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    split = eval_kwargs.get('split', 'train')
    lang_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    beam_size = 1  # beam search is forced off here; sampling always uses beam_size = 1
    logger = eval_kwargs.get('logger')
    lm_model = eval_kwargs.get('lm_model')
    vocab_size = eval_kwargs.get('vocab_size')
    sample_max = eval_kwargs.get('sample_max')
    temperature = eval_kwargs.get('temperature')
    tries = eval_kwargs.get('tries', 5)
    sample_limited_vocab = eval_kwargs.get('sample_limited_vocab', 0)
    output_file = eval_kwargs.get('output_file')

    print('Using sample_max = %d  ||  temperature %.2f' %
          (sample_max, temperature))

    # Make sure in the evaluation mode
    encoder.eval()
    decoder.eval()
    logger.warn('Generating captions for the full training set')
    loader.reset_iterator(split)
    n = 0
    blobs = []
    SENTS = []
    gen_SENTS = []
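    # Loop over the split; the loader is expected to yield one image per batch (see the assert below).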
    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size
        # forward the model to get loss
        #  if n > 100:
        #      break
        infos = data['infos']
        ids = [inf['id'] for inf in infos]
        assert len(ids) == 1, "Batch size larger than 1"
        tmp = [data['labels'], data['masks']]
        tmp = [
            Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp
        ]
        labels, masks = tmp
        tr = 0
        gt = decode_sequence(loader.get_vocab(), labels[:, 1:].data)
        SENTS += gt
        blob_batch = {"id": ids[0], "gt": gt, "sampled": []}
        for igt in gt:
            print_sampled(ids[0], igt)

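        # Draw `tries` caption samples for this image from the encoder's latent code(s).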
        while tr < tries:
            #  z_mu, z_var, codes = encoder(labels)
            if lm_model == "rnn_vae":
                codes = encoder.sample(labels)
            elif lm_model == "rnn_multi_vae":
                codes = encoder.sample_group(labels)
                #  scodes = encoder.sample(labels)
            else:
                codes = encoder(labels)
            if sample_limited_vocab:
                sample_vocab = np.unique(labels[:, 1:].cpu().data.numpy())
                print("sample_vocab:", sample_vocab.tolist())
                seq, _ = decoder.sample_ltd(
                    codes, sample_vocab, {
                        'beam_size': beam_size,
                        "vocab_size": vocab_size,
                        "sample_max": sample_max,
                        "temperature": temperature
                    })
            else:
                seq, _ = decoder.sample(
                    codes, {
                        'beam_size': beam_size,
                        "vocab_size": vocab_size,
                        "sample_max": sample_max,
                        "temperature": temperature
                    })

            sents = decode_sequence(loader.get_vocab(), seq)
            #  ssents = decode_sequence(loader.get_vocab(), sseq)
            gen_SENTS += sents
            #  gen_SENTS += ssents
            for isent in sents:
                print_sampled(0, isent, warn=True)
            #  print '--------------------(SINGLE)------------------------'
            #  for isent in ssents:
            #      print _WARNING + isent + _ENDC
            print('----------------------------------------------------')

            blob_batch['sampled'] += sents
            #  blob_batch['sampled'] += ssents
            tr += 1
        # Store this batch's blob before checking the bounds so the final batch is not dropped.
        blobs.append(blob_batch)
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
        #  print "Blob batch:", blob_batch
    with open(output_file, 'w') as f:
        json.dump(blobs, f)
    if lang_eval:
        lang_stats = language_lm_eval(SENTS, gen_SENTS)
        print(lang_stats)
    encoder.train()
    decoder.train()
    return 1