def infer_recitation_to_text(args):
    prefix = args.filename_prefix
    decoder_model = tf.keras.models.load_model(
        os.path.join(args.output_dir, f'decoder-model-{prefix}.h5'))
    encoder_model = tf.keras.models.load_model(
        os.path.join(args.output_dir, f'encoder_model-{prefix}.h5'))
    print("Models loaded")

    encoder_input_data, decoder_input_data, decoder_target_data = get_seq2seq_data()
    print("Data loaded")

    max_decoder_seq_length = decoder_input_data.shape[1]
    num_decoder_tokens = decoder_input_data.shape[-1]

    one_hot_obj = get_one_hot_encodings()
    reverse_target_char_index = one_hot_obj['int_to_char']
    reverse_target_char_index[num_decoder_tokens - 2] = '->'
    reverse_target_char_index[num_decoder_tokens - 1] = '<-'

    # Perform inference on some of the audio files
    with open(os.path.join(args.output_dir, f'inference-{prefix}.txt'), 'w') as f:
        num_predict = args.num_predict
        if num_predict == -1:
            num_predict = encoder_input_data.shape[0]
        for seq_index in range(num_predict):
            print(seq_index, end=' ')
            input_seq = encoder_input_data[seq_index: seq_index + 1]
            decoded_sentence = decode_sequence(
                input_seq, num_decoder_tokens, encoder_model, decoder_model,
                max_decoder_seq_length)
            true_array = decoder_target_data[seq_index]
            true_sentence = ''
            for pos in range(true_array.shape[0]):
                sampled_token_index = np.argmax(true_array[pos])
                sampled_char = reverse_target_char_index[sampled_token_index]
                true_sentence += sampled_char
            f.write(true_sentence + ',' + decoded_sentence + '\n')
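# decode_sequence() above is not defined in this file. A minimal sketch with a
# matching signature follows; it assumes the standard Keras character-level
# seq2seq inference loop, where the encoder returns its final LSTM states and
# the decoder consumes a one-hot token plus those states (as in the classic
# Keras seq2seq tutorial). Everything here is an assumption, not this repo's
# actual helper.
def decode_sequence_sketch(input_seq, num_decoder_tokens, encoder_model,
                           decoder_model, max_decoder_seq_length):
    # Rebuild the int -> char map the same way infer_recitation_to_text does.
    int_to_char = get_one_hot_encodings()['int_to_char']
    int_to_char[num_decoder_tokens - 2] = '->'  # start token
    int_to_char[num_decoder_tokens - 1] = '<-'  # end token
    # Encode the input; the final states seed the decoder.
    states_value = encoder_model.predict(input_seq)
    # Prime the decoder with the start token.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, num_decoder_tokens - 2] = 1.0
    decoded_sentence = ''
    while len(decoded_sentence) < max_decoder_seq_length:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_char = int_to_char[sampled_token_index]
        if sampled_char == '<-':  # end token reached
            break
        decoded_sentence += sampled_char
        # Feed the sampled token back in and carry the states forward.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0
        states_value = [h, c]
    return decoded_sentence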
def val(split="val"):
    net.eval()
    data_val = CocoCaptionsFeature(fc_dir=opt.input_fc_dir,
                                   att_dir=opt.input_att_dir,
                                   label_file=opt.input_label_h5,
                                   info_file=opt.input_json,
                                   split=split,
                                   opt=opt)
    evalloader = iter(
        DataLoader(data_val, batch_size=opt.val_images_use, num_workers=1))
    # loader = tqdm(enumerate(trainloader), total=len(trainloader), ascii=True)
    fc, att, labels = next(evalloader)
    if use_cuda:
        fc, att, labels = fc.cuda(), att.cuda(), labels.cuda()
    fc, att, labels = Variable(fc, requires_grad=False), Variable(
        att, requires_grad=False), Variable(labels, requires_grad=False)
    fc = torch.stack([fc] * opt.seq_per_img).view(-1, *fc.shape[1:])
    att = torch.stack([att] * opt.seq_per_img).view(-1, *att.shape[1:])
    labels = labels.transpose(1, 0).contiguous().view(-1, *labels.shape[2:])
    labels = labels.long()
    outputs, *_ = net(fc_feats=fc, att_feats=att)
    # loss = criterion(outputs, labels)
    txts = utils.decode_sequence(data.dictionary, outputs.data)
    for txt in txts:
        print(txt)
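# utils.decode_sequence() is called throughout this file but defined elsewhere.
# A minimal sketch of the usual index-matrix-to-sentence helper (mirroring the
# common NeuralTalk2-style implementation), assuming ix_to_word maps integer
# ids to tokens and 0 is the pad/stop id; the variants used later in this file
# (extra eos/bos arguments, father_idx/mask) are not covered by this sketch.
def decode_sequence_sketch(ix_to_word, seq):
    # seq: N x T tensor of word ids; each sentence stops at the first 0.
    N, T = seq.size(0), seq.size(1)
    out = []
    for i in range(N):
        words = []
        for t in range(T):
            ix = int(seq[i, t])
            if ix == 0:  # padding / end of sentence
                break
            words.append(ix_to_word[ix])
        out.append(' '.join(words))
    return out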
def test():
    net.eval()
    loader = tqdm(enumerate(dataloader), total=len(dataloader), ascii=True)
    min_loss = 1e9
    for batch_idx, (fc, att, labels, data_info) in loader:
        if use_cuda:
            fc, att, labels = fc.cuda(), att.cuda(), labels.cuda()
        fc, att, labels = Variable(fc, requires_grad=False), Variable(
            att, requires_grad=False), Variable(labels, requires_grad=False)
        fc = torch.stack([fc] * opt.seq_per_img).view(-1, *fc.shape[1:])
        att = torch.stack([att] * opt.seq_per_img).view(-1, *att.shape[1:])
        origin_labels = labels.view(-1, *labels.shape[2:])
        labels = labels.transpose(1, 0).contiguous().view(-1, *labels.shape[2:])
        labels = labels.long()
        outputs, _ = net(fc_feats=fc, att_feats=att, seq=labels)
        loss = criterion(outputs, labels)
        if loss.data[0] < min_loss:
            min_loss = loss.data[0]
            outputs, alpha = net(fc_feats=fc, att_feats=att)
            min_txts = utils.decode_sequence(data.dictionary, outputs.data)
            min_txts_target = utils.decode_sequence(data.dictionary,
                                                    origin_labels.data)
            file_path = data_info['file_path']
        loader.set_description("Loss: {:.6f} | Min Loss: {:.6f}".format(
            loss.data[0], min_loss))
        if min_loss < 1.54:
            break
    loader.set_description("Loss: {:.6f} | Min Loss: {:.6f}".format(
        loss.data[0], min_loss))
    for idx, txt in enumerate(min_txts):
        if idx % opt.seq_per_img == 0:
            print(file_path[idx // opt.seq_per_img])
        print(txt)
        print(min_txts_target[idx])
        if idx % opt.seq_per_img == 4:
            print("")
    print(min_loss)
    att_path = './alpha.pt'
    torch.save(alpha.data.cpu(), att_path)
def eval_external_ensemble(ensemble, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    num_images = eval_kwargs.get('num_images', -1)
    split = eval_kwargs.get('split', 'val')
    lang_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    logger = eval_kwargs.get('logger')
    caption_model = eval_kwargs.get('caption_model')
    vocab_size = eval_kwargs.get('vocab_size')
    dump_path = eval_kwargs.get('dump_path')
    # Make sure in the evaluation mode
    for cnn_model in ensemble.cnn_models:
        cnn_model.eval()
    for model in ensemble.models:
        model.eval()
    loader.reset_iterator(split)
    n = 0
    predictions = []
    Feats = []
    seq_per_img = 5
    while True:
        data = loader.get_batch(split, seq_per_img=seq_per_img)
        n = n + loader.batch_size
        # forward the model to get loss
        images = data['images']
        images = Variable(torch.from_numpy(images), volatile=True).cuda()
        att_feats_ens, fc_feats_ens = ensemble.get_feats(images)
        seq, probs = ensemble.sample(fc_feats_ens, att_feats_ens, eval_kwargs)
        sents = utils.decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            spath = short_path(data['infos'][k]['file_path'])
            print_sampled(spath, sent)
            entry = {'image_id': spath, 'caption': sent}
            predictions.append(entry)
            # logger.debug('image %s: %s' % (entry['image_id'], entry['caption']))
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        # logger.warn('ix1 = %d - ix0 = %d' % (ix1, ix0))
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        # logger.debug('validation loss ... %d/%d (%f)' % (ix0 - 1, ix1, loss))
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    # pickle.dump(Feats, open('cnn_features.pkl', 'w'))
    return predictions
def eval_external(cnn_model, model, loader, eval_kwargs={}):
    num_images = eval_kwargs.get('num_images', -1)
    # split serves no purpose except to keep the same signature for get_batch
    split = eval_kwargs.get('split', 'val')
    beam_size = eval_kwargs.get('beam_size', 1)
    logger = eval_kwargs.get('logger')
    caption_model = eval_kwargs.get('caption_model')
    sample_max = eval_kwargs.get('sample_max', 1)
    temperature = eval_kwargs.get('temperature', 0.5)
    forbid_unk = eval_kwargs.get('forbid_unk', 1)
    print("Eval %s" % caption_model)
    # Make sure in the evaluation mode
    cnn_model.eval()
    model.eval()
    loader.reset_iterator(split)
    n = 0
    predictions = []
    seq_per_img = 1
    while True:
        data = loader.get_batch(split, seq_per_img=seq_per_img)
        n = n + loader.batch_size
        # forward the model to get loss
        images = data['images']
        images = Variable(torch.from_numpy(images), volatile=True).cuda()
        att_feats, fc_feats, att_unique, fc_unique = cnn_model.forward_caps(
            images, seq_per_img, return_unique=True)
        seq, _ = model.sample(
            fc_feats, att_feats, {
                'beam_size': beam_size,
                'forbid_unk': forbid_unk,
                "sample_max": sample_max,
                "temperature": temperature
            })
        sents = decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            spath = short_path(data['infos'][k]['file_path'])
            entry = {'image_id': spath, 'caption': sent}
            print_sampled(spath, sent)
            predictions.append(entry)
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    # Switch back to training mode
    model.train()
    return predictions
def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result,
                             vocab, cocoid2caps, seqLen, opt):
    # batch_size = gen_result.size(0)  # batch_size = sample_size * seq_per_img
    batch_size = len(gen_result)
    # seq_per_img = batch_size // len(data['gts'])

    # get greedy decoding baseline
    model.eval()
    with torch.no_grad():
        # greedy_res, _ = model(fc_feats, att_feats, att_masks=att_masks, mode='sample')
        word_idx, father_idx, mask = model._greedy_search(fc_feats, att_feats,
                                                          max_seq_length=40)
    model.train()
    greedy_res = utils.decode_sequence(vocab, word_idx, father_idx, mask)

    res = OrderedDict()
    for i in range(batch_size):
        res[i] = [gen_result[i]]
    for i in range(batch_size):
        res[batch_size + i] = [greedy_res[i]]

    gts = OrderedDict()
    for i in range(batch_size):
        gts[i] = cocoid2caps[data['image_id'][i].item()]

    res_ = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)]
    res__ = {i: res[i] for i in range(2 * batch_size)}
    gts = {i: gts[i % batch_size] for i in range(2 * batch_size)}
    # for i in range(2 * batch_size):
    #     print(res[i], gts[i])

    if opt.cider_reward_weight > 0:
        _, cider_scores = CiderD_scorer.compute_score(gts, res_)
        print('CIDEr scores:', _ * 0.1)
    else:
        cider_scores = 0
    if opt.bleu_reward_weight > 0:
        _, bleu_scores = Bleu_scorer.compute_score(gts, res__)
        bleu_scores = np.array(bleu_scores[3])
        print('Bleu scores:', _[3])
    else:
        bleu_scores = 0

    scores = opt.cider_reward_weight * cider_scores + \
        opt.bleu_reward_weight * bleu_scores
    # reward = sampled score minus the greedy baseline score
    scores = scores[:batch_size] - scores[batch_size:]
    scores = scores * 0.1
    print('Mean reward:', scores.mean())
    rewards = np.repeat(scores[:, np.newaxis], seqLen, 1)
    return rewards
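# A minimal sketch of how the per-timestep rewards returned by
# get_self_critical_reward() are typically consumed in a self-critical (SCST)
# policy-gradient loss: the sampled sequence's log-probs are weighted by the
# baselined reward and masked at padding. The names sample_logprobs and
# seq_mask are illustrative assumptions, not this repo's API.
def scst_loss_sketch(sample_logprobs, rewards, seq_mask):
    # sample_logprobs: (batch, seqLen) log p(w_t) of the sampled words
    # rewards: (batch, seqLen) numpy array from get_self_critical_reward
    # seq_mask: (batch, seqLen) float tensor, 1 where the sample is valid
    rewards = torch.from_numpy(rewards).float().to(sample_logprobs.device)
    loss = -sample_logprobs * rewards * seq_mask
    return loss.sum() / seq_mask.sum()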
def create_model(self, pred_sequence, inp_shape):
    ## change sequence to its decoded value
    pred_sequence = utils.decode_sequence(
        utils.vocab_dict(self.target_classes), pred_sequence)
    ## set optimizer parameters
    if self.optimizer == 'sgd':
        optim = optimizers.SGD(lr=self.lr,
                               decay=self.decay,
                               momentum=self.momentum)
    else:
        optim = getattr(optimizers, self.optimizer)(lr=self.lr,
                                                    decay=self.decay)
    ## generate a sequential architecture for the sequence
    ## add flatten if data is 3d or more
    if len(inp_shape) > 1:
        model = Sequential()
        model.add(Flatten(name='flatten', input_shape=inp_shape))
        for i in range(len(pred_sequence)):
            if pred_sequence[i] == 'dropout':
                model.add(Dropout(self.dropout))
            else:
                model.add(
                    Dense(units=pred_sequence[i][0],
                          activation=pred_sequence[i][1]))
        model.compile(loss=self.loss_func,
                      optimizer=optim,
                      metrics=self.metrics)
        return model
    else:
        model = Sequential()
        for i in range(len(pred_sequence)):
            if i == 0:
                model.add(
                    Dense(units=pred_sequence[i][0],
                          activation=pred_sequence[i][1],
                          input_shape=inp_shape))
            elif pred_sequence[i] == 'dropout':
                model.add(Dropout(self.dropout))
            else:
                model.add(
                    Dense(units=pred_sequence[i][0],
                          activation=pred_sequence[i][1]))
        model.compile(loss=self.loss_func,
                      optimizer=optim,
                      metrics=self.metrics)
        return model
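# Illustrative usage of create_model() above. Judging from how the loop
# consumes pred_sequence, decode_sequence() presumably yields a mix of
# (units, activation) pairs and the literal token 'dropout'; the decoded
# sequence and input shape below are made-up examples, not repo data:
#
#   decoded = [(64, 'relu'), 'dropout', (32, 'relu'), (2, 'softmax')]
#   model = self.create_model(encoded_sequence, inp_shape=(784,))
#   model.summary()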
def score_trads(preds, trg_loader, eval_kwargs):
    split = eval_kwargs.get('split', 'val')
    batch_size = eval_kwargs.get('batch_size', 80)
    verbose = eval_kwargs.get('verbose', 0)
    ground_truths = []
    trg_loader.reset_iterator(split)
    n = 0
    while True:
        # get batch
        data_trg = trg_loader.get_trg_batch(split, range(batch_size),
                                            batch_size)
        output_lines_trg_gold = data_trg['out_labels']
        n += batch_size
        # Decode a minibatch greedily __TODO__ add beam search decoding
        # Do the same for gold sentences
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    output_lines_trg_gold,
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        if not verbose:
            verb = not (n % 1000)
        else:
            verb = verbose
        for (l, gl) in zip(preds, sent_gold):
            ground_truths.append(gl)
            if verb:
                lg.print_sampled("", gl, l)
        ix1 = data_trg['bounds']['it_max']
        if data_trg['bounds']['wrapped']:
            break
        if n >= ix1:
            print('Evaluated the required samples (%s)' % n)
            break
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    scores = {'Bleu': bleu_moses}
    return scores
def validate(model, criterion, loader, opt, max_iters=None, type='val'):
    model.eval()
    loader.reset()
    num_videos = loader.get_num_videos()
    batch_size = loader.get_batch_size()
    if max_iters is None:
        num_iters = int(math.ceil(num_videos * 1.0 / batch_size))
    else:
        num_iters = max_iters
    last_batch_size = num_videos % batch_size
    seq_per_img = loader.get_seq_per_img()
    model.set_seq_per_img(seq_per_img)
    loss_sum = 0
    logger.info('#num_iters: %d, batch_size: %d, seq_per_img: %d',
                num_iters, batch_size, seq_per_img)
    predictions = []
    gt_avglogps = []
    test_avglogps = []
    prec_recs = dict()
    for ii in range(num_iters):
        data = loader.get_batch()
        feats = data['feats']
        bfeats = data['bfeats']
        if loader.has_label:
            labels = data['labels']
            masks = data['masks']
            labels_svo = data['labels_svo']
        if ii == (num_iters - 1) and last_batch_size > 0:
            feats = [f[:last_batch_size] for f in feats]
            bfeats = [f[:last_batch_size] for f in bfeats]
            if loader.has_label:
                # labels shape is DxN
                labels = labels[:last_batch_size * seq_per_img]
                masks = masks[:last_batch_size * seq_per_img]
                labels_svo = labels_svo[:last_batch_size * seq_per_img]
        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            bfeats = [bfeat.cuda() for bfeat in bfeats]
            if loader.has_label:
                labels = labels.cuda()
                masks = masks.cuda()
                labels_svo = labels_svo.cuda()
        if loader.has_label and model.gt_concepts_while_testing == 0:
            pred, gt_seq, gt_logseq, _, _, _ = model(feats, bfeats, labels,
                                                     labels_svo)
            # memReport()
            if opt.output_logp == 1:
                gt_avglogp = utils.compute_avglogp(gt_seq, gt_logseq.data)
                gt_avglogps.extend(gt_avglogp)
            loss = criterion(pred, labels[:, 1:], masks[:, 1:])
            loss_sum += loss.item()
            del pred, gt_seq, gt_logseq
            torch.cuda.empty_cache()
        seq, logseq, _, concept_seq = model.sample(
            feats, bfeats, labels_svo, {'beam_size': opt.beam_size})
        sents = utils.decode_sequence(opt.vocab, seq)
        if opt.output_logp == 1:
            test_avglogp = utils.compute_avglogp(seq, logseq)
            test_avglogps.extend(test_avglogp)
        if concept_seq is not None:
            # if type == 'test':
            #     if concept_seq.shape[0] != 136:
            #         print()
            labels_svo = torch.reshape(
                labels_svo, (-1, opt.test_seq_per_img, opt.num_concepts))[:, 0]
            # concept_seq = torch.reshape(
            #     concept_seq, (-1, opt.test_seq_per_img, opt.num_concepts))[:, 0]
            concept_seq_words = utils.decode_sequence(opt.vocab, concept_seq)
            # Calculate TP, FP, FN for precision and recall calcs
            if opt.grounder_type in ['niuc', 'nioc', 'iuc', 'ioc']:
                gt_concept_seq_words = utils.decode_sequence(
                    opt.vocab, labels_svo)
                gt_concept_seq_words = [
                    g.split(' ') for g in gt_concept_seq_words
                ]
                for bi in range(len(gt_concept_seq_words)):
                    pr_words = list()
                    repeat = int(
                        len(gt_concept_seq_words) / len(concept_seq_words))
                    for pr in concept_seq_words[int(
                            math.floor(float(bi) / repeat))].split(' '):
                        pr_word = pr.split(' ')[0]
                        pr_words.append(pr_word)
                        if pr_word not in prec_recs:
                            prec_recs[pr_word] = [0, 0, 0]
                        if pr_word in gt_concept_seq_words[bi]:
                            prec_recs[pr_word][0] += 1  # TP
                        else:
                            prec_recs[pr_word][1] += 1  # FP
                    for gt in gt_concept_seq_words[bi]:
                        if gt not in prec_recs:
                            prec_recs[gt] = [0, 0, 0]
                        if gt not in pr_words:
                            prec_recs[gt][2] += 1  # FN
            try:
                for jj, (sent, sent_svo) in enumerate(
                        zip(sents, concept_seq_words)):
                    if opt.output_logp == 1:
                        entry = {
                            'image_id': data['ids'][jj],
                            'caption': sent,
                            'svo': sent_svo,
                            'avglogp': test_avglogp[jj],
                            'box_att': model.attention_record[jj].tolist()
                        }
                    else:
                        entry = {
                            'image_id': data['ids'][jj],
                            'caption': sent,
                            'svo': sent_svo
                        }  # , 'box_att': model.attention_record[jj].tolist()}
                        # todo: removed for transformer model
                    predictions.append(entry)
                    logger.debug('[%d] video %s: %s pr(%s) gt(%s)' %
                                 (jj, entry['image_id'], entry['caption'],
                                  entry['svo'], gt_concept_seq_words[jj]))
            except IndexError:
                print()
        else:
            for jj, sent in enumerate(sents):
                if opt.output_logp == 1:
                    entry = {
                        'image_id': data['ids'][jj],
                        'caption': sent,
                        'avglogp': test_avglogp[jj],
                        'box_att': model.attention_record[jj].tolist()
                    }
                else:
                    entry = {'image_id': data['ids'][jj], 'caption': sent}
                predictions.append(entry)
                logger.debug('[%d] video %s: %s' %
                             (jj, entry['image_id'], entry['caption']))
        del feats, labels, masks, labels_svo, seq, logseq
        torch.cuda.empty_cache()
    loss = round(loss_sum / num_iters, 3)
    results = {}
    lang_stats = {}
    if opt.language_eval == 1 and loader.has_label:
        logger.info('>>> Language evaluating ...')
        tmp_checkpoint_json = os.path.join(
            opt.model_file.split('.')[0] + '_' + type + '.json')
        json.dump(predictions, open(tmp_checkpoint_json, 'w'))
        lang_stats = utils.language_eval(loader.cocofmt_file,
                                         tmp_checkpoint_json)
        # os.remove(tmp_checkpoint_json)
    results['predictions'] = predictions
    results['scores'] = {'Loss': -loss}
    results['scores'].update(lang_stats)
    if opt.output_logp == 1:
        avglogp = sum(test_avglogps) / float(len(test_avglogps))
        results['scores'].update({'avglogp': avglogp})
        gt_avglogps = np.array(gt_avglogps).reshape(-1, seq_per_img)
        assert num_videos == gt_avglogps.shape[0]
        gt_avglogps_file = opt.model_file.replace('.pth', '_gt_avglogps.pkl', 1)
        cPickle.dump(gt_avglogps,
                     open(gt_avglogps_file, 'w'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
        logger.info('Wrote GT logp to: %s', gt_avglogps_file)
    if len(prec_recs.keys()) > 0:
        prec = dict()
        rec = dict()
        for k, v in prec_recs.items():
            if v[0] + v[1] > 0:
                prec[k] = v[0] / float(v[0] + v[1])
            else:
                prec[k] = 0
            if v[0] + v[2] > 0:
                rec[k] = v[0] / float(v[0] + v[2])
            else:
                rec[k] = 0
        precv = sum(prec.values()) / len(prec_recs)
        recv = sum(rec.values()) / len(prec_recs)
        results['scores'].update({'prec': precv, 'rec': recv})
        print('prec: ', precv, ' .. rec: ', recv)
        logger.debug('prec: ' + str(prec))
        logger.debug('rec: ' + str(rec))
    return results
def validate(model, criterion, loader, opt):
    model.eval()
    loader.reset()
    num_videos = loader.get_num_videos()
    batch_size = loader.get_batch_size()
    num_iters = int(math.ceil(num_videos * 1.0 / batch_size))
    last_batch_size = num_videos % batch_size
    seq_per_img = loader.get_seq_per_img()
    model.set_seq_per_img(seq_per_img)
    loss_sum = 0
    logger.info('#num_iters: %d, batch_size: %d, seq_per_img: %d',
                num_iters, batch_size, seq_per_img)
    predictions = []
    gt_avglogps = []
    test_avglogps = []
    for ii in range(num_iters):
        data = loader.get_batch()
        feats = [Variable(feat, volatile=True) for feat in data['feats']]
        if loader.has_label:
            labels = Variable(data['labels'], volatile=True)
            masks = Variable(data['masks'], volatile=True)
        if ii == (num_iters - 1) and last_batch_size > 0:
            feats = [f[:last_batch_size] for f in feats]
            if loader.has_label:
                # labels shape is DxN
                labels = labels[:last_batch_size * seq_per_img]
                masks = masks[:last_batch_size * seq_per_img]
        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            if loader.has_label:
                labels = labels.cuda()
                masks = masks.cuda()
        if loader.has_label:
            pred, gt_seq, gt_logseq = model(feats, labels)
            if opt.output_logp == 1:
                gt_avglogp = utils.compute_avglogp(gt_seq, gt_logseq.data)
                gt_avglogps.extend(gt_avglogp)
            loss = criterion(pred, labels[:, 1:], masks[:, 1:])
            loss_sum += loss.data[0]
        seq, logseq = model.sample(feats, {'beam_size': opt.beam_size})
        sents = utils.decode_sequence(opt.vocab, seq)
        if opt.output_logp == 1:
            test_avglogp = utils.compute_avglogp(seq, logseq)
            test_avglogps.extend(test_avglogp)
        for jj, sent in enumerate(sents):
            if opt.output_logp == 1:
                entry = {
                    'image_id': data['ids'][jj],
                    'caption': sent,
                    'avglogp': test_avglogp[jj]
                }
            else:
                entry = {'image_id': data['ids'][jj], 'caption': sent}
            predictions.append(entry)
            logger.debug('[%d] video %s: %s' %
                         (jj, entry['image_id'], entry['caption']))
    loss = round(loss_sum / num_iters, 3)
    results = {}
    lang_stats = {}
    if opt.language_eval == 1 and loader.has_label:
        logger.info('>>> Language evaluating ...')
        tmp_checkpoint_json = os.path.join(opt.model_file + str(uuid.uuid4()) +
                                           '.json')
        json.dump(predictions, open(tmp_checkpoint_json, 'w'))
        lang_stats = utils.language_eval(loader.cocofmt_file,
                                         tmp_checkpoint_json)
        os.remove(tmp_checkpoint_json)
    results['predictions'] = predictions
    results['scores'] = {'Loss': -loss}
    results['scores'].update(lang_stats)
    if opt.output_logp == 1:
        avglogp = sum(test_avglogps) / float(len(test_avglogps))
        results['scores'].update({'avglogp': avglogp})
        gt_avglogps = np.array(gt_avglogps).reshape(-1, seq_per_img)
        assert num_videos == gt_avglogps.shape[0]
        gt_avglogps_file = opt.model_file.replace('.pth', '_gt_avglogps.pkl', 1)
        cPickle.dump(gt_avglogps,
                     open(gt_avglogps_file, 'w'),
                     protocol=cPickle.HIGHEST_PROTOCOL)
        logger.info('Wrote GT logp to: %s', gt_avglogps_file)
    return results
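# utils.compute_avglogp() (used in both validate() variants above) is defined
# elsewhere; a plausible minimal sketch, assuming seq holds sampled word ids
# (0 = pad) and logseq the per-step log-probabilities, so each caption gets
# its mean token log-prob. This is an assumption about the helper, not its
# actual implementation.
def compute_avglogp_sketch(seq, logseq, eps=1e-8):
    mask = (seq > 0).float()
    avglogp = (logseq * mask).sum(1) / (mask.sum(1) + eps)
    return avglogp.tolist()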
def evaluate_model(model, src_loader, trg_loader, logger, eval_kwargs):
    """Evaluate model."""
    preds = []
    ground_truths = []
    batch_size = eval_kwargs.get('batch_size', 1)
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk
    # Make sure to be in evaluation mode
    model.eval()
    src_loader.reset_iterator(split)
    trg_loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    while True:
        # get batch
        data_src, order = src_loader.get_src_batch(split, batch_size)
        tmp = [data_src['labels']]
        input_lines_src, = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        src_lengths = data_src['lengths']
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        tmp = [data_trg['labels'], data_trg['out_labels'], data_trg['mask']]
        input_lines_trg_gold, output_lines_trg_gold, mask = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        trg_lengths = data_trg['lengths']
        n += batch_size
        # decoder_logit = model(input_lines_src, input_lines_trg_gold)
        # if model.opt.sample_reward:
        #     ml_loss, loss, stats = model.crit(model, input_lines_src,
        #                                       input_lines_trg_gold,
        #                                       output_lines_trg_gold, mask)
        # else:
        #     ml_loss, loss, stats = model.crit(decoder_logit,
        #                                       output_lines_trg_gold, mask)
        ml_loss, loss, _ = model.step(input_lines_src, src_lengths,
                                      input_lines_trg_gold, trg_lengths,
                                      output_lines_trg_gold, mask)
        loss_sum += loss.data.item()
        ml_loss_sum += ml_loss.data.item()
        loss_evals = loss_evals + 1
        # Initialize target with <BOS> for every sentence
        Index = 2
        # print('Sampling sentence')
        # print('GPU:', os.environ['CUDA_VISIBLE_DEVICES'])
        start = time.time()
        # print('>>> Sampling:')
        batch_preds, _ = model.sample(input_lines_src, src_lengths,
                                      opt=eval_kwargs)
        if isinstance(batch_preds, list):
            # with beam search, the preds come back as unpadded lists
            sent_preds = [
                decode_sequence(trg_loader.get_vocab(),
                                np.array(pred).reshape(1, -1),
                                eos=trg_loader.eos,
                                bos=trg_loader.bos)[0] for pred in batch_preds
            ]
        else:
            # decode
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos)
        # Do the same for gold sentences
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      input_lines_src.data.cpu().numpy(),
                                      eos=src_loader.eos,
                                      bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    output_lines_trg_gold.data.cpu().numpy(),
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        if not verbose:
            verb = not (n % 300)
        else:
            verb = verbose
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        ix1 = data_src['bounds']['it_max']
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    # print('Predictions length:', len(preds), len(ground_truths))
    # assert(len(preds) == trg_loader.h5_file['labels_val'].shape[0])
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses
def eval_ensemble(ens_model, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    seq_length = eval_kwargs.get('seq_length', 16)
    split = eval_kwargs.get('split', 'test')
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    batch_size = eval_kwargs.get('batch_size', 1)
    val_images_use = eval_kwargs.get('val_images_use', -1)
    print('Evaluating ', val_images_use, ' images')
    # Make sure in the evaluation mode
    for cnn_model in ens_model.cnn_models:
        cnn_model.eval()
    for model in ens_model.models:
        model.eval()
    loader.reset_iterator(split)
    n = 0
    # loss_sum = 0
    # real_loss_sum = 0
    # loss_evals = 0
    predictions = []
    while True:
        # fetch a batch of data
        data = loader.get_batch(split, batch_size)
        n = n + batch_size
        # evaluate loss if we have the labels
        # loss = 0
        # Get the image features first
        tmp = [
            data['images'],
            data.get('labels', np.zeros(1)),
            data.get('masks', np.zeros(1)), data['scores']
        ]
        tmp = [
            Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp
        ]
        images, labels, masks, scores = tmp
        att_feats_ens = []
        fc_feats_ens = []
        for cnn_model in ens_model.cnn_models:
            att_feats, fc_feats = cnn_model.forward(images)
            att_feats_ens.append(att_feats)
            fc_feats_ens.append(fc_feats)
        # Evaluate the loss:
        # real_loss, loss = ens_model.step(data)
        # loss_sum = loss_sum + loss.data[0]
        # real_loss_sum += real_loss.data[0]
        # loss_evals = loss_evals + 1
        seq, probs = ens_model.sample(fc_feats_ens, att_feats_ens, eval_kwargs)
        sent_scores = probs.cpu().numpy().sum(axis=1)
        sents = utils.decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            # print('id:', data['infos'][k]['id'])
            entry = {
                'image_id': data['infos'][k]['id'],
                'caption': sent,
                'score': str(round(sent_scores[k], 4)),
                "source": 'gen'
            }
            predictions.append(entry)
            if verbose:
                print(('image %s (%s) %s' %
                       (entry['image_id'], entry['score'], entry['caption'])))
        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            ens_model.logger.warn('Evaluated the required samples (%s)' % n)
            break
    lang_stats = None
    unseen_grams = None
    if lang_eval == 1:
        lang_stats, unseen_grams = language_eval(dataset, predictions,
                                                 ens_opt.logger,
                                                 get_creativity=False)
    # Switch back to training mode
    # model.train()
    return predictions, lang_stats
def eval_multiple(cnn_model, model, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    score_ground_truth = eval_kwargs.get('score_ground_truth', False)
    n_gen = eval_kwargs.get('n_gen', 5)
    num_images = eval_kwargs.get('num_images', -1)
    seq_length = eval_kwargs.get('seq_length', 16)
    split = eval_kwargs.get('split', 'test')
    lang_eval = eval_kwargs.get('language_eval', 0)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    batch_size = eval_kwargs.get('batch_size', 1)
    # Make sure in the evaluation mode
    cnn_model.eval()
    model.eval()
    loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    while True:
        # fetch a batch of data
        data = loader.get_batch(split, batch_size)
        n = n + batch_size
        # evaluate loss if we have the labels
        loss = 0
        # Get the image features first
        tmp = [
            data['images'],
            data.get('labels', np.zeros(1)),
            data.get('masks', np.zeros(1)), data['scores']
        ]
        tmp = [
            Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp
        ]
        images, labels, masks, scores = tmp
        att_feats, fc_feats = cnn_model.forward(images)
        _att_feats = att_feats
        _fc_feats = fc_feats
        # forward the model to get loss
        if data.get('labels', None) is not None:
            att_feats = att_feats.unsqueeze(1).expand(*((
                att_feats.size(0),
                loader.seq_per_img,
            ) + att_feats.size()[1:])).contiguous().view(
                *((att_feats.size(0) * loader.seq_per_img, ) +
                  att_feats.size()[1:]))
            fc_feats = fc_feats.unsqueeze(1).expand(*((
                fc_feats.size(0),
                loader.seq_per_img,
            ) + fc_feats.size()[1:])).contiguous().view(
                *((fc_feats.size(0) * loader.seq_per_img, ) +
                  fc_feats.size()[1:]))
            input = model(fc_feats, att_feats, labels)
            probs = input
            N = input.size(0)
            mask = masks[:, 1:]
            target = labels[:, 1:]
            target = target[:, :input.size(1)]
            mask = mask[:, :input.size(1)]
            input = utils.to_contiguous(input).view(-1, input.size(2))
            target = utils.to_contiguous(target).view(-1, 1)
            mask = mask[:, :input.size(1)]
            mask = utils.to_contiguous(mask).view(-1, 1)
            output = input.gather(1, target) * mask
            output = output.cpu().data.numpy()
            # sum over seq_length
            gt_scores = [
                np.sum(output[seq_length * i:seq_length * (i + 1)])
                for i in np.arange(N)
            ]
            gt_sents = decode_sequence(loader.get_vocab(), labels[:, 1:].data)
            real_loss, loss = crit(probs, labels[:, 1:], masks[:, 1:], scores)
            loss_sum = loss_sum + loss.item()
            loss_evals = loss_evals + 1
        # forward the model to also get generated samples for each image
        # Only leave one feature for each image, in case of duplicate samples
        fc_feats, att_feats = _fc_feats, _att_feats
        for _ in range(n_gen):
            seq, probs = model.sample(fc_feats, att_feats, eval_kwargs)
            sent_scores = probs.cpu().numpy().sum(axis=1)
            # set_trace()
            sents = decode_sequence(loader.get_vocab(), seq)
            print('Gen:', len(sents), len(sent_scores))
            for k, sent in enumerate(sents):
                # print('id:', data['infos'][k]['id'])
                if loader.flip:
                    entry = {
                        'image_id': data['infos'][k // 2]['id'],
                        'caption': sent,
                        'score': str(round(sent_scores[k], 4)),
                        "source": 'gen'
                    }
                    if not k % 2:
                        unflipped = entry
                    else:
                        # compare the new entry to unflipped and keep the
                        # best candidate
                        # print('Comparing:', entry, ' to ', unflipped)
                        if float(entry['score']) > float(unflipped['score']):
                            predictions.append(entry)
                            # print('picking:', entry)
                        else:
                            predictions.append(unflipped)
                            # print('picking:', unflipped)
                else:
                    entry = {
                        'image_id': data['infos'][k]['id'],
                        'caption': sent,
                        'score': str(round(sent_scores[k], 4)),
                        "source": 'gen'
                    }
                    predictions.append(entry)
                if verbose:
                    # print(entry)
                    print(('%s >> %s' %
                           (entry['image_id'], entry['caption'])))
        if score_ground_truth:
            print('Gt:', len(gt_sents), len(gt_scores))
            for k, sent in enumerate(gt_sents):
                if loader.flip:
                    entry = {
                        'image_id':
                        data['infos'][k // (loader.seq_per_img * 2)]['id'],
                        'caption': sent,
                        'score': str(round(gt_scores[k], 4)),
                        "source": 'gt'
                    }
                else:
                    entry = {
                        'image_id':
                        data['infos'][k // loader.seq_per_img]['id'],
                        'caption': sent,
                        'score': str(round(gt_scores[k], 4)),
                        "source": 'gt'
                    }
                predictions.append(entry)
                if verbose:
                    print(('image %s (GT : %s) %s' %
                           (entry['image_id'], entry['score'],
                            entry['caption'])))
        # if we wrapped around the split or used up val imgs budget then bail
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        if verbose:
            print('evaluating validation performance... %d/%d (%f)' %
                  (ix0 - 1, ix1, loss.item()))
        if data['bounds']['wrapped']:
            break
        if num_images >= 0 and n >= num_images:
            break
    lang_stats = None
    unseen_grams = None
    if lang_eval == 1:
        lang_stats, unseen_grams = language_eval(dataset, predictions,
                                                 logger=None)  # FIXME
    # Switch back to training mode
    model.train()
    return loss_sum / loss_evals, predictions, lang_stats, unseen_grams
def eval_split(cnn_model, model, loader, logger, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', False)
    dataset = eval_kwargs.get('dataset', 'coco')
    split = eval_kwargs.get('split', 'val')
    val_images_use = eval_kwargs.get('val_images_use', -1)
    lang_eval = eval_kwargs.get('language_eval', 1)
    language_creativity = eval_kwargs.get('language_creativity', 1)
    all_metrics = eval_kwargs.get('all_metrics', 0)
    single_metrics = eval_kwargs.get('single_metrics', 0)
    beam_size = eval_kwargs.get('beam_size', 1)
    sample_max = eval_kwargs.get('sample_max', 1)
    temperature = eval_kwargs.get('temperature', 0.5)
    forbid_unk = eval_kwargs.get('forbid_unk', 1)
    batch_size = eval_kwargs.get('batch_size', 1)
    seq_per_img = eval_kwargs.get('seq_per_img')
    region_size = model.region_size
    # Make sure to be in the evaluation mode
    cnn_model.eval()
    model.eval()
    logger.warn('Evaluating the %s split (%d)' % (split, val_images_use))
    loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    predictions = []
    while True:
        data = loader.get_batch(split, batch_size=batch_size,
                                seq_per_img=seq_per_img)
        n = n + loader.batch_size
        images = data['images']
        images = Variable(torch.from_numpy(images),
                          requires_grad=False).cuda()
        att_feats, fc_feats, att_unique, fc_unique = cnn_model.forward_caps(
            images, seq_per_img, return_unique=True)
        ml_loss, loss, stats = model.step(data, att_feats, fc_feats,
                                          train=False)
        # print('Scores : ', stats)
        ml_loss_sum += ml_loss.item()
        loss_sum += loss.item()
        loss_evals = loss_evals + 1
        # TODO Only leave one feature for each image, in case duplicate sample
        seq, probs = model.sample(fc_unique, att_unique,
                                  opt={
                                      'beam_size': beam_size,
                                      "forbid_unk": forbid_unk,
                                      "sample_max": sample_max,
                                      "temperature": temperature
                                  })
        sent_scores = probs.cpu().numpy().sum(axis=1)
        sents = decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            if loader.flip:
                entry = {
                    'image_id': data['infos'][k // 2]['id'],
                    'caption': sent,
                    'score': sent_scores[k]
                }
                if not k % 2:
                    unflipped = entry
                else:
                    if entry['score'] > unflipped['score']:
                        del entry['score']
                        predictions.append(entry)
                    else:
                        del unflipped['score']
                        predictions.append(unflipped)
            else:
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                predictions.append(entry)
            print_sampled(entry['image_id'], entry['caption'])
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    lang_stats = None
    if lang_eval:
        lang_stats, preds, _ = language_eval(dataset, predictions, logger,
                                             all_metrics, single_metrics,
                                             language_creativity)
        print('preds:', preds)
    # Back to training:
    model.train()
    if model.cnn_finetuning:
        logger.warn('Finetuning cnn ON, filtering the BN layers')
        cnn_model.train()
        cnn_model.filter_bn()
    return (ml_loss_sum / loss_evals, loss_sum / loss_evals, predictions,
            lang_stats, preds)
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(
    #         torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    # convnet = 'vgg16'
    vocab = dataset.get_vocab()
    full_decoder = ConvS2VT(convnet, model, opt)

    # D:\College\Research\December 2018 Video Captioning Attack\video captioner\YouTubeClips\CNN_SaYwh6chmiw_15_40.npy
    videos = {
        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19_adversarialWINDOW.avi',
        3: 'O2qiPS2NCeY_2_18_adversarialWINDOW.avi',
        4: 'kI6MWZrl8v8_149_161_adversarialWINDOW.avi',
        5: 'X7sQq-Iu1gQ_12_22_adversarialWINDOW.avi',
        6: '77iDIp40m9E_159_181_adversarialWINDOW.avi',
        7: 'SaYwh6chmiw_15_40_adversarialWINDOW.avi',
        8: 'pFSoWsocv0g_8_17_adversarialWINDOW.avi',
        9: 'HmVPxs4ygMc_44_53_adversarialWINDOW.avi',
        10: 'glii-kazad8_21_29_adversarialWINDOW.avi',
        11: 'AJJ-iQkbRNE_97_109_adversarialWINDOW.avi'
    }
    videos_CNN = {
        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }
    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # video_path = opt['videos'][0]
    modelname = 'nasnetalarge'
    o_video_path = ('D:\\College\\Research\\December 2018 Video Captioning Attack\\'
                    'video captioner\\YouTubeClips\\' + videos_CNN[2])
    video_path = ('D:\\College\\Research\\December 2018 Video Captioning Attack\\'
                  'video captioner\\YouTubeClips\\'
                  '{}Adversarial_'.format(modelname) + videos_CNN[2])
    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\vgg16Adversarial_SaYwh6chmiw_15_40.avi'
    numpy_path = ('D:\\College\\Research\\December 2018 Video Captioning Attack\\'
                  'video captioner\\YouTubeClips\\{}CNN_{}.npy'.format(
                      modelname, videos_CNN[2].split('.')[0]))
    adv_frames = np.load(numpy_path)

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    print(video_path)

    with torch.no_grad():
        frames = skvideo.io.vread(o_video_path)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Original: ", sents[0])

        frames = skvideo.io.vread(video_path)
        print("Total frames: {}".format(len(frames)))
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        plt.imshow(frames[0] / 255.)
        plt.show()
        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial huffyuv: ", sents[0])

        np_frames = adv_frames.astype(np.uint8)
        print("Numpy CNN frames \nTotal frames: {}".format(len(np_frames)))
        # print(frames[[0, 1, 2, 3, 4, 5]].shape)
        plt.imshow(np_frames[0] / 255.)
        plt.show()
        # bp ---
        batches = create_batches(np_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial numpy: ", sents[0])
def architecture_search(self):
    ## initialise network modelling and controller instances
    self.nn = gnn.NeuralNetwork(self.target_classes)
    self.nn.optimizer = self.nn_optim
    self.nn.lr = self.nn_lr
    self.nn.decay = self.nn_decay
    self.nn.momentum = self.nn_momentum
    self.nn.dropout = self.dropout
    self.cntrl = lstm.LSTMController(self.max_len, self.nb_classes,
                                     self.target_classes,
                                     (1, self.max_len - 1), len(self.data))
    self.cntrl.lstm_dim = self.lstm_dim
    self.cntrl.use_attention = self.controller_attention
    self.cntrl.optimizer = self.controller_optim
    self.cntrl.lr = self.controller_lr
    self.cntrl.decay = self.controller_decay
    self.cntrl.momentum = self.controller_momentum
    ## start architecture search
    for n in range(self.cntrl_epochs):
        # self.pre_training = False
        print("Controller epoch:", n + 1)
        self.curr_epoch = n
        ## generate sequences using random probabilistic sampling
        sequences = self.cntrl.sample_arch_sequences(self.mc_samples)
        ## train predictor and get predicted accuracies for new sequences
        pred_val_acc = self.cntrl.get_predicted_accuracies_hybrid_model(
            sequences)
        ## for each randomly generated sample
        for i in range(len(sequences)):
            print("probabilistic sampling. model no:", i + 1)
            print(utils.decode_sequence(self.vocab, sequences[i]))
            ## create model, train model
            print("training model...")
            if self.target_classes == 2:
                self.nn.loss_func = 'binary_crossentropy'
            model = self.nn.create_model(sequences[i],
                                         np.shape(self.x_data[0]))
            print("predicted validation accuracy:", pred_val_acc[i])
            x, y = utils.unison_shuffled_copies(self.x_data, self.y_data)
            history = self.nn.train_model(model, x, y, self.nn_epochs)
            ## condition to avoid error for nn_epochs = 1
            if len(history.history['val_acc']) == 1:
                self.data.append([
                    sequences[i], history.history['val_acc'][0],
                    pred_val_acc[i]
                ])
            else:
                self.data.append([
                    sequences[i],
                    np.ma.average(history.history['val_acc'],
                                  weights=np.arange(
                                      1,
                                      len(history.history['val_acc']) + 1),
                                  axis=-1),
                    pred_val_acc[i]
                ])
        cntrl_sequences = pad_sequences(sequences,
                                        maxlen=self.max_len,
                                        padding='post')
        xc = cntrl_sequences[:, :-1].reshape(len(cntrl_sequences), 1,
                                             self.max_len - 1)
        yc = to_categorical(cntrl_sequences[:, -1], self.nb_classes)
        ## sequence, validation accuracy data sorted by validation accuracy
        print("[sequence, val acc, predicted val acc]")
        for data in self.data:
            print(data)
        ## train the controller
        val_acc_target = [item[1] for item in self.data]
        self.cntrl.train_hybrid_model(xc, yc,
                                      val_acc_target[-self.mc_samples:],
                                      self.custom_loss, len(self.data),
                                      self.hybrid_model_epochs)
        val_accs = [item[1] for item in self.data]
        sorted_idx = np.argsort(val_accs)[::-1]
        self.data = [self.data[x] for x in sorted_idx]
    print("saving tested architectures, their validation accuracy "
          "and predicted accuracy...")
    with open('logdir/tested_archs_data{}.pkl'.format(
            datetime.now().strftime("%H%M")), 'wb') as file:
        pickle.dump(self.data, file)
    print("saving encoding-decoding dictionary...")
    with open('logdir/encode_decode_dict.pkl', 'wb') as file:
        pickle.dump(self.vocab, file)
    return self.data
def train(model, criterion, optimizer, train_loader, val_loader, opt,
          rl_criterion=None):
    infos = {
        'iter': 0,
        'epoch': 0,
        'start_epoch': 0,
        'best_score': float('-inf'),
        'best_iter': 0,
        'best_epoch': opt.max_epochs
    }
    checkpoint_checked = False
    rl_training = False
    seq_per_img = train_loader.get_seq_per_img()
    infos_history = {}

    if os.path.exists(opt.start_from):
        if os.path.isdir(opt.start_from):
            # loading the same model file at a different experiment dir
            start_from_file = os.path.join(opt.start_from,
                                           os.path.basename(opt.model_file))
        else:
            start_from_file = opt.start_from
        logger.info('Loading state from: %s', start_from_file)
        checkpoint = torch.load(start_from_file)
        model.load_state_dict(checkpoint['model'])
        infos = checkpoint['infos']
        infos['start_epoch'] = infos['epoch']
        checkpoint_checked = True  # this epoch is already checked
    else:
        logger.info('No checkpoint found! Training from scratch')

    if opt.use_rl == 1 and opt.use_rl_after == 0:
        opt.use_rl_after = infos['epoch']
        opt.use_cst_after = infos['epoch']
    train_loader.set_current_epoch(infos['epoch'])

    if opt.grounder_type in ['niuc', 'iuc']:
        # get class weights
        one_hot_sums = None
        totes = 0
        cur_index = train_loader.get_current_index()
        train_loader.reset()
        ep = infos['epoch']
        while True:
            data = train_loader.get_batch()
            labels_svo = data['labels_svo']
            one_hot = torch.clamp(
                torch.sum(torch.nn.functional.one_hot(
                    labels_svo, num_classes=model.vocab_size),
                          axis=1), 0, 1)
            one_hot[:, 0] = 0  # make the padding index 0
            totes += one_hot.shape[0]
            if one_hot_sums is None:
                one_hot_sums = torch.sum(one_hot, axis=0)
            else:
                one_hot_sums += torch.sum(one_hot, axis=0)
            if ep < train_loader.get_current_epoch():
                one_hot_negs = -one_hot_sums + totes
                pos_weight = one_hot_negs.type(torch.FloatTensor) / (
                    1 + one_hot_sums.type(torch.FloatTensor))
                pos_weight = pos_weight.cuda()
                train_loader.set_current_index(index=cur_index)
                break

    while True:
        t_start = time.time()
        model.train()
        data = train_loader.get_batch()
        feats = data['feats']
        bfeats = data['bfeats']
        labels = data['labels']
        masks = data['masks']
        labels_svo = data['labels_svo']
        masks_svo = data['masks_svo']
        if torch.cuda.is_available():
            feats = [feat.cuda() for feat in feats]
            bfeats = [bfeat.cuda() for bfeat in bfeats]
            labels = labels.cuda()
            masks = masks.cuda()
            labels_svo = labels_svo.cuda()
            masks_svo = masks_svo.cuda()

        # implement scheduled sampling
        opt.ss_prob = 0
        if opt.use_ss == 1 and infos['epoch'] >= opt.use_ss_after:
            annealing_prob = opt.ss_k / \
                (opt.ss_k + np.exp((infos['epoch'] - opt.use_ss_after)
                                   / opt.ss_k))
            opt.ss_prob = min(1 - annealing_prob, opt.ss_max_prob)
            model.set_ss_prob(opt.ss_prob)

        if opt.use_rl == 1 and infos['epoch'] >= opt.use_rl_after \
                and not rl_training:
            logger.info('Using RL objective...')
            rl_training = True
            bcmr_scorer = {
                'Bleu_4': Bleu(),
                'CIDEr': Cider(df=opt.train_cached_tokens),
                'METEOR': Meteor(),
                'ROUGE_L': Rouge(),
                'SPICE': Spice()
            }[opt.eval_metric]
            # logger.info('loading gt refs: %s', train_loader.cocofmt_file)
            # gt_refs = utils.load_gt_refs(train_loader.cocofmt_file)

        mixer_from = opt.mixer_from
        if opt.use_mixer == 1 and rl_training:
            # annealing_mixer = opt.ss_k / \
            #     (opt.ss_k + np.exp((infos['epoch'] - opt.use_rl_after) / opt.ss_k))
            # annealing_mixer = int(round(annealing_mixer * opt.seq_length))
            # -1 for annealing
            if opt.mixer_from == -1:
                annealing_mixer = opt.seq_length - int(
                    np.ceil((infos['epoch'] - opt.use_rl_after + 1) /
                            float(opt.mixer_descrease_every)))
                mixer_from = max(1, annealing_mixer)
            model.set_mixer_from(mixer_from)

        scb_captions = opt.scb_captions
        if opt.use_cst == 1 and rl_training:
            # if opt.use_cst == 1 and opt.ss_k == 0,
            # then do not use annealing, but the fixed scb_captions provided
            # annealing_robust = opt.ss_k / \
            #     (opt.ss_k + np.exp((infos['epoch'] - opt.use_rl_after) / opt.ss_k))
            # annealing_robust = int(round((1 - annealing_robust) * seq_per_img))
            # do not use robust before fully mixed
            # if opt.use_mixer == 1 and mixer_from > 1:
            #     opt.use_cst_after = infos['epoch']
            # if opt.scb_captions is -1, then use the annealing value,
            # otherwise, use the set value
            if opt.scb_captions == -1:
                annealing_robust = int(
                    np.ceil((infos['epoch'] - opt.use_cst_after + 1) /
                            float(opt.cst_increase_every)))
                scb_captions = min(annealing_robust, seq_per_img - 1)

        optimizer.zero_grad()
        model.set_seq_per_img(seq_per_img)

        if rl_training:
            # sampling from model distribution
            # model_res, logprobs = model.sample(
            #     feats, {'sample_max': 0, 'expand_feat': opt.expand_feat,
            #             'temperature': 1})
            # using mixer
            pred, model_res, logprobs, pred_svo, res_svo, logprobs_svo = model(
                feats, bfeats, labels, labels_svo)
            if opt.use_cst == 0:
                # greedy decoding baseline in SCST paper
                greedy_baseline, _, _, _ = model.sample(
                    [Variable(f.data, volatile=True) for f in feats],
                    [Variable(f.data, volatile=True) for f in bfeats], {
                        'sample_max': 1,
                        'expand_feat': opt.expand_feat
                    })
            if opt.use_cst == 1:
                bcmrscores = data['bcmrscores']
                reward, m_score, g_score = utils.get_cst_reward(
                    model_res,
                    data['gts'],
                    bcmr_scorer,
                    bcmrscores=bcmrscores,
                    expand_feat=opt.expand_feat,
                    seq_per_img=train_loader.get_seq_per_img(),
                    scb_captions=scb_captions,
                    scb_baseline=opt.scb_baseline,
                    use_eos=opt.use_eos,
                    use_mixer=opt.use_mixer)
            else:
                # use greedy baseline by default, compute self-critical reward
                reward, m_score, g_score = utils.get_self_critical_reward(
                    model_res,
                    greedy_baseline,
                    data['gts'],
                    bcmr_scorer,
                    expand_feat=opt.expand_feat,
                    seq_per_img=train_loader.get_seq_per_img(),
                    use_eos=opt.use_eos)
            loss = rl_criterion(
                model_res, logprobs,
                Variable(torch.from_numpy(reward).float().cuda(),
                         requires_grad=False))
            loss_svo = criterion(pred_svo, labels_svo,
                                 torch.ones(labels.shape).cuda())
            loss = loss + (opt.labda / 10.0) * loss_svo
        else:
            pred, _, _, pred_svo, svo_it, svo_gath = model(
                feats, bfeats, labels, labels_svo)
            loss_cap = criterion(pred, labels[:, 1:], masks[:, 1:],
                                 bcmrscores=torch.from_numpy(
                                     data['bcmrscores'].astype(
                                         np.float32)).cuda())
            if opt.grounder_type in ['None', 'none']:
                loss = loss_cap
            else:
                if opt.grounder_type in ['niuc', 'iuc']:  # unordered
                    svo_criterion = torch.nn.BCEWithLogitsLoss(
                        pos_weight=pos_weight)
                    concepts_one_hot = torch.clamp(
                        torch.sum(torch.nn.functional.one_hot(
                            labels_svo, num_classes=model.vocab_size),
                                  axis=1), 0, 1)
                    # pred_svo[:, 0] undoes the repeat at the end of
                    # non_iterative_grounder()
                    loss_svo = svo_criterion(
                        pred_svo[:, 0],
                        concepts_one_hot.type(torch.FloatTensor).cuda())
                else:
                    loss_svo = criterion(pred_svo, labels_svo,
                                         torch.ones(labels.shape).cuda())
                    # loss_svo = criterion(pred_svo, labels_svo, masks_svo)
                if random.random() < 0.01:
                    # compare the svos during training
                    print('---------------------')
                    print(utils.decode_sequence(opt.vocab, pred.argmax(-1)))
                    print(utils.decode_sequence(opt.vocab, labels_svo)[0])
                    print(utils.decode_sequence(opt.vocab, svo_it)[0])
                loss = loss_cap + (opt.labda / 10.0) * loss_svo

        loss.backward()
        clip_grad_norm_(model.parameters(), opt.grad_clip)
        optimizer.step()
        # memReport()
        del pred, feats, labels, masks, labels_svo
        torch.cuda.empty_cache()

        infos['TrainLoss'] = loss.item()
        infos['CAPTrainLoss'] = loss_cap.item()
        if opt.grounder_type not in ['None', 'none']:
            infos['SVOTrainLoss'] = loss_svo.item()
        else:
            infos['SVOTrainLoss'] = 0
        infos['mixer_from'] = mixer_from
        infos['scb_captions'] = scb_captions

        if infos['iter'] % opt.print_log_interval == 0:
            elapsed_time = time.time() - t_start
            log_info = [('Epoch', infos['epoch']), ('Iter', infos['iter']),
                        ('Loss', infos['TrainLoss']),
                        ('CAP Loss', infos['CAPTrainLoss']),
                        ('SVO Loss', infos['SVOTrainLoss'])]
            if rl_training:
                log_info += [('Reward', np.mean(reward[:, 0])),
                             ('{} (m)'.format(opt.eval_metric), m_score),
                             ('{} (b)'.format(opt.eval_metric), g_score)]
            if opt.use_ss == 1:
                log_info += [('ss_prob', opt.ss_prob)]
            if opt.use_mixer == 1:
                log_info += [('mixer_from', mixer_from)]
            if opt.use_cst == 1:
                log_info += [('scb_captions', scb_captions)]
            log_info += [('Time', elapsed_time)]
            logger.info('%s', '\t'.join(
                ['{}: {}'.format(k, v) for (k, v) in log_info]))

        infos['iter'] += 1
        if infos['epoch'] < train_loader.get_current_epoch():
            infos['epoch'] = train_loader.get_current_epoch()
            checkpoint_checked = False
            learning_rate = utils.adjust_learning_rate(
                opt, optimizer, infos['epoch'] - infos['start_epoch'])
            logger.info('===> Learning rate: %f: ', learning_rate)

        # checkpoint_checked = False
        # if 1:  # todo: debugging, jump straight to validation
        if (infos['epoch'] >= opt.save_checkpoint_from
                and infos['epoch'] % opt.save_checkpoint_every == 0
                and not checkpoint_checked):
            # evaluate the validation performance
            results = validate(model, criterion, val_loader, opt)
            logger.info('Validation output: %s',
                        json.dumps(results['scores'], indent=4,
                                   sort_keys=True))
            # infos.update(results['scores'])

            # todo: added training set eval to check for overfitting
            cur_index = train_loader.get_current_index()
            train_loader.reset()
            results_train = validate(model, criterion, train_loader, opt,
                                     max_iters=20, type='train')
            train_loader.set_current_index(index=cur_index)
            for k, v in results_train['scores'].items():
                results['scores']['Train_' + k] = v
            logger.info('Training output: %s',
                        json.dumps(results_train['scores'], indent=4,
                                   sort_keys=True))
            infos.update(results['scores'])

            check_model(model, opt, infos, infos_history)
            checkpoint_checked = True

        if (infos['epoch'] >= opt.max_epochs
                or infos['epoch'] - infos['best_epoch'] > opt.max_patience):
            logger.info('>>> Terminating...')
            break

    return infos
def train(opt):
    # setup gpu
    try:
        import subprocess
        # gpu_id = subprocess.check_output('source gpu_setVisibleDevices.sh', shell=True)
        gpu_id = int(subprocess.check_output('gpu_getIDs.sh', shell=True))
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        opt.logger.warn('GPU ID: %s | available memory: %dM'
                        % (os.environ['CUDA_VISIBLE_DEVICES'],
                           get_gpu_memory(gpu_id)))
    except:
        opt.logger.warn("Requested gpu_id : %s" % opt.gpu_id)
        os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu_id
        opt.logger.warn('GPU ID: %s | available memory: %dM'
                        % (os.environ['CUDA_VISIBLE_DEVICES'],
                           get_gpu_memory(opt.gpu_id)))

    from loader import textDataLoader
    from utils import decode_sequence

    # reproducibility:
    opt.logger.info('Reading data ...')
    src_loader = textDataLoader(
        {
            'h5_file': opt.input_data_src + '.h5',
            'infos_file': opt.input_data_src + '.infos',
            "max_seq_length": opt.max_src_length,
            'batch_size': opt.batch_size
        },
        logger=opt.logger)
    trg_loader = textDataLoader(
        {
            'h5_file': opt.input_data_trg + '.h5',
            'infos_file': opt.input_data_trg + '.infos',
            "max_seq_length": opt.max_trg_length,
            'batch_size': opt.batch_size
        },
        logger=opt.logger)

    goon = True
    bound = 0
    while goon:
        # Load data from train split (0)
        data_src, order = src_loader.get_src_batch('test')
        input_lines_src = data_src['labels']
        data_trg = trg_loader.get_trg_batch('test', order)
        output_lines_trg = data_trg['out_labels']
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      input_lines_src,
                                      eos=src_loader.eos,
                                      bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    output_lines_trg,
                                    eos=trg_loader.eos,
                                    bos=trg_loader.bos)
        for i, (src, trg) in enumerate(zip(sent_source, sent_gold)):
            if bound + i in [134, 1924, 2092]:
                print(bound + i, '>>>')
                print('Source:', src)
                print('Target:', trg)
        bound = data_src['bounds']['it_pos_now']
        goon = bound < 2100
def main(opt): dataset = VideoDataset(opt, 'inference') opt["vocab_size"] = dataset.get_vocab_size() opt["seq_length"] = dataset.max_len if opt['beam_size'] != 1: assert opt["batch_size"] == 1 if opt["model"] == 'S2VTModel': model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"], rnn_dropout_p=opt["rnn_dropout_p"]) elif opt["model"] == "S2VTAttModel": encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], bidirectional=opt["bidirectional"], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"]) decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'], input_dropout_p=opt["input_dropout_p"], rnn_dropout_p=opt["rnn_dropout_p"], bidirectional=opt["bidirectional"]) model = S2VTAttModel(encoder, decoder) else: return # if torch.cuda.device_count() > 1: # print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count())) # model = nn.DataParallel(model) convnet = 'nasnetalarge' full_decoder = ConvS2VT(convnet, model, opt) #'A woman is cutting a green onion' video_path = opt['videos'][0] tf_img_fn = ptm_utils.TransformImage(full_decoder.conv) load_img_fn = PIL.Image.fromarray vocab = dataset.get_vocab() vid_id = video_path.split('/')[-1] vid_id = vid_id.split('.')[0] viable_ids = dataset.splits['test'] + dataset.splits['val'] viable_target_captions = [] for v_id in viable_ids: if v_id == vid_id: continue plausible_caps = [ ' '.join(toks) for toks in dataset.vid_to_meta[v_id]['final_captions'] ] viable_target_captions.extend(plausible_caps) #Random target caption # target_caption = np.random.choice(viable_target_captions) # target_caption = '<sos> A man is moving a toy <eos>' target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>' with torch.no_grad(): frames = skvideo.io.vread(video_path) # bp --- batches = create_batches(frames, load_img_fn, tf_img_fn) seq_prob, seq_preds = full_decoder(batches, mode='inference') sents = utils.decode_sequence(vocab, seq_preds) original_caption = sents[0] #video_path = 'D:\\College\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi' # target_caption = '<sos> A man is moving a toy <eos>' # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>' #/96 gives 3 frames length = len(skvideo.io.vread(video_path)) / 96 print("Total number of frames: {}".format(length)) adv_frames = [] iteration = 1 frame_counter = 0 total_iterations = np.ceil(length / BATCH_SIZE) while (frame_counter < length): print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations))) iteration = iteration + 1 if length - frame_counter < BATCH_SIZE: window = [frame_counter, length] frame_counter = frame_counter + (length - frame_counter) print("Using frames {}".format(window)) print("Frame counter at: {}\nTotal length is: {}\n".format( frame_counter, length)) carlini = CarliniAttack(oracle=full_decoder, video_path=video_path, target=target_caption, dataset=dataset, window=window) finished_frames = carlini.execute(video_path, window=window, functional=True) adv_frames.append(finished_frames.detach().cpu().numpy()) else: window = [frame_counter, frame_counter + BATCH_SIZE - 1] print("Using frames {}".format(window)) print("Frame counter at: {}\nTotal length is: {}\n".format( 
            frame_counter, length))
        carlini = CarliniAttack(oracle=full_decoder, video_path=video_path,
                                target=target_caption, dataset=dataset,
                                window=window)
        finished_frames = carlini.execute(video_path, window=window,
                                          functional=True)
        adv_frames.append(finished_frames.detach().cpu().numpy())
        frame_counter = frame_counter + BATCH_SIZE

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')
    print("\nSaving to: {}".format(adv_path))

    adv_frames = np.concatenate(adv_frames, axis=0)
    outputfile = adv_path
    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            # huffyuv is lossless; r10k is also very good.
            # '-c:v': 'libx264',  # the h.264 codec
            '-c:v': 'huffyuv',  # alternatives: r210, r10k
            # '-pix_fmt': 'rgb32',
            # '-crf': '0',  # constant rate factor 0 is lossless for libx264
            # '-preset': 'ultrafast'  # ultrafast..veryslow; slower compresses better, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)
    writer.close()

    # Per-codec comparison results:
    # ffv1      0.215807946043995
    # huffyuv   0.21578424050191813
    # libx264   0.2341074901578537
    # r210     -0.7831487262059795, -0.7833399258537526
    # gif       0.6889478809555243
    # png       0.2158991440582696, 0.21616862708842177
    # qtrle     0.21581286337807626
    # flashsv   0.21610510459932186, 0.21600030673323545
    # ffvhuff   0.21620682250167533
    # r10k      similar to r210
    # rawvideo  0.21595001

    with torch.no_grad():
        full_decoder = full_decoder.eval()
        frames = skvideo.io.vread(adv_path)
        frames = np.float32(frames)

        # Keep the float residual lost to integer quantization so the exact
        # adversarial frames can be reconstructed from the decoded video.
        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        # np.load restores the array saved with np.save
        exp = np.load('difference_tmp.npy')
        print("Is the saved array equal to loaded array for difference: ",
              np.array_equal(exp, difference))
        frames = frames + difference

        # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward(batches.unsqueeze(0))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward(batches.unsqueeze(0))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
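# The codec comparison above motivates checking how much a given codec
# changes frames on a write/read round trip. Below is a minimal,
# self-contained sketch of such a check, assuming only numpy and
# scikit-video; codec_roundtrip_error, the codec list, and the frame sizes
# are illustrative and not part of the original code.
import numpy as np
import skvideo.io

def codec_roundtrip_error(frames, codec, path='codec_test.avi'):
    """Write uint8 frames with `codec`, read them back, and return the
    signed mean difference (0.0 would be a perfect round trip)."""
    writer = skvideo.io.FFmpegWriter(path, outputdict={'-c:v': codec})
    for f in frames:
        writer.writeFrame(f)
    writer.close()
    decoded = skvideo.io.vread(path)
    return float(np.mean(frames.astype(np.float64) - decoded.astype(np.float64)))

# Example: a lossless codec should land much closer to zero than h.264.
test_frames = np.random.randint(0, 256, size=(8, 64, 64, 3), dtype=np.uint8)
for codec in ['huffyuv', 'ffv1', 'libx264']:
    print(codec, codec_roundtrip_error(test_frames, codec))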
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"],
                             opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    num_frames = len(skvideo.io.vread(video_path))
    length = num_frames / 8
    print("Total number of frames: {}".format(num_frames))
    print("Total number of frames to do: {}".format(length))

    with torch.no_grad():
        frames = skvideo.io.vread(video_path)
        # bp ---
        # Collect per-frame attention weights, one batch at a time.
        attn_weights = []
        total_iterations = np.ceil(length / BATCH_SIZE)
        iteration = 1
        frame_counter = 0
        while frame_counter < length:
            if length - frame_counter < BATCH_SIZE:
                batches = create_batches(frames[frame_counter:int(length)],
                                         load_img_fn, tf_img_fn)
                attn = full_decoder(batches, mode='inference', get_attn=True)
                frame_counter = frame_counter + (length - frame_counter)
            else:
                # note: this slice takes BATCH_SIZE - 1 frames while the
                # counter advances by BATCH_SIZE
                batches = create_batches(
                    frames[frame_counter:frame_counter + BATCH_SIZE - 1],
                    load_img_fn, tf_img_fn)
                attn = full_decoder(batches, mode='inference', get_attn=True)
                frame_counter = frame_counter + BATCH_SIZE

            attn = attn.cpu().detach().numpy().tolist()[0]
            print("Weights for batch {}: {}".format(iteration, attn))
            for f in attn:
                attn_weights.append(f)
            iteration = iteration + 1

        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference',
                                           get_attn=False)
        sents = utils.decode_sequence(vocab, seq_preds)
        original_caption = sents[0]

    print(attn_weights)
    # Attack only the frames that received the highest attention weights.
    att_window = np.sort(
        np.argpartition(attn_weights,
                        -ATTACK_BATCH_SIZE)[-ATTACK_BATCH_SIZE:]).tolist()
    print("Indices of frames with highest attention weights: {}".format(att_window))

    # video_path = 'D:\\College\\Research\\December 2018 Video Captioning Attack\\video captioner\\YouTubeClips\\ACOmKiJDkA4_49_54.avi'
    # target_caption = '<sos> A man is moving a toy <eos>'
    # target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    adv_frames = []
    carlini = CarliniAttack(oracle=full_decoder, video_path=video_path,
                            target=target_caption, dataset=dataset,
                            att_window=att_window)
    finished_frames = carlini.execute(video_path, att_window=att_window,
                                      functional=True)
    adv_frames.append(finished_frames.detach().cpu().numpy())

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarial.avi')
    print("\nSaving to: {}".format(adv_path))

    adv_frames = np.concatenate(adv_frames, axis=0)
    outputfile = adv_path
    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            '-vcodec': 'libx264',  # use the h.264 codec
            '-crf': '0',  # constant rate factor 0 is lossless
            '-vb': '50M',
            '-r': '25',
            '-preset': 'ultrafast'  # slower presets compress better, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)
    print(len(adv_frames))
    # skvideo.io.vwrite(adv_path, adv_frames)
    writer.close()

    with torch.no_grad():
        a_frames = skvideo.io.vread(adv_path)
        # Earlier experiments spliced the adversarial frames back into the
        # original video and plotted them here:
        # frames = skvideo.io.vread(video_path)
        # for f in range(0, len(att_window)):
        #     frames[att_window[f]] = a_frames[f]
        # plt.imshow(frames[0]); plt.show()
        # bp ---
        batches = create_batches(a_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
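# The att_window computation above is a top-k selection over per-frame
# attention weights. A standalone illustration with plain numpy and made-up
# weights (k stands in for ATTACK_BATCH_SIZE):
import numpy as np

attn_weights = [0.02, 0.30, 0.05, 0.41, 0.08, 0.14]  # hypothetical per-frame weights
k = 3
# argpartition places the indices of the k largest entries in the last k
# slots in O(n); the final sort restores temporal order for the window.
att_window = np.sort(np.argpartition(attn_weights, -k)[-k:]).tolist()
print(att_window)  # -> [1, 3, 5]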
def main(opt):
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"],
                             opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    # model, videopath, targetcap, dataset, config, optimizer, crit, window
    # config: batch_size, c, learning rate, num iterations, input shape
    config = {
        # lr 0.005 and dimensions 224; c was 100.
        # Best was lr 0.06, c = 1 for Show and Fool.
        # "batch_size": BATCH_SIZE,
        "c": 10000,
        "learning_rate": 0.2,
        "num_iterations": 1000,
        "input_shape": (224, 224),
        "num_frames": 288,
        "dimensions": 224,
        "k": 0.1,
        # "attack_algorithm": "showandfool"
        "attack_algorithm": "carliniwagner"
    }

    convnet = 'vgg16'
    # convnet = 'nasnetalarge'
    # convnet = 'resnet152'
    full_decoder = ConvS2VT(convnet, model, opt)

    '''
    Layer freezing experiment.
    Top 10 contributing layers:
        conv.cell_stem_1.comb_iter_0_right.separable_1.depthwise_conv2d.weight
        conv.cell_stem_1.comb_iter_2_right.separable_2.depthwise_conv2d.weight
        conv.cell_stem_1.comb_iter_1_right.separable_1.depthwise_conv2d.weight
        conv.cell_16.comb_iter_4_left.separable_1.depthwise_conv2d.weight
        conv.cell_17.comb_iter_4_left.separable_1.depthwise_conv2d.weight
        conv.cell_16.comb_iter_4_left.separable_1.pointwise_conv2d.weight
        conv.cell_13.comb_iter_4_left.bn_sep_1.weight
        conv.reduction_cell_0.conv_prev_1x1.bn.weight
        conv.cell_17.comb_iter_4_left.separable_2.depthwise_conv2d.weight
        conv.cell_13.comb_iter_0_left.bn_sep_1.weight
    '''
    top = open("top_layers.txt", "r")
    top_layers = top.readlines()
    top.close()
    print(top_layers)

    # Freeze every layer that should not contribute and zero its gradient.
    top_layers = []
    for name, parameters in full_decoder.named_parameters():
        reset = True
        for f in top_layers:
            if name in f:
                reset = False
        if reset:
            parameters.requires_grad = False  # was 'require_grad', a silent no-op
            if parameters.grad is not None:
                print(name)
                parameters.grad.data.zero_()

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]
    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)
    # target_caption = np.random.choice(viable_target_captions)

    # 5 example captions:
    '''
    <sos> A person is typing into a laptop computer <eos>
    <sos> A boy is kicking a soccer ball into the goal <eos>
    <sos> Someone is frying fish <eos>
    <sos> A dog is running with a ball <eos>
    <sos> The cat approaches on grass <eos>
    '''
    captions = {
        1: '<sos> A woman is talking <eos>',  # caption 1 doesn't work
        2: '<sos> A boy is kicking a soccer ball into the goal <eos>',
        3: '<sos> A man is frying fish <eos>',
        4: '<sos> A dog is running with a ball <eos>',
        5: '<sos> A cat is walking on grass <eos>'
    }

    videos = {
        # 2 was too high-resolution; X6uJyuD_Zso_3_17.avi was replaced with nc8hwLaOyZU_1_19.avi
        # 5 ('ceOXCFUmxzA_100_110.avi') ran out of memory; replaced with 'X7sQq-Iu1gQ_12_22'
        # 1: 'RSx5G0_xH48_12_17.avi',
        2: 'nc8hwLaOyZU_1_19.avi',
        3: 'O2qiPS2NCeY_2_18.avi',
        4: 'kI6MWZrl8v8_149_161.avi',
        5: 'X7sQq-Iu1gQ_12_22.avi',
        6: '77iDIp40m9E_159_181.avi',
        7: 'SaYwh6chmiw_15_40.avi',
        8: 'pFSoWsocv0g_8_17.avi',
        9: 'HmVPxs4ygMc_44_53.avi',
        10: 'glii-kazad8_21_29.avi',
        11: 'AJJ-iQkbRNE_97_109.avi'
    }
    video_path = ('D:\\College\\Research\\December 2018 Video Captioning Attack\\'
                  'video captioner\\YouTubeClips\\' + videos[2])

    # Just switch the number to pick a target caption.
    target_caption = captions[1]

    # Use the caption function from the attack itself, because its scaling is slightly different.
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])
        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        original_caption = sents[0]

    # Dividing by 96 gives 3 frames (an earlier setting used 12 frames):
    # length = math.ceil(len(skvideo.io.vread(video_path, num_frames=config["num_frames"])) / 96)
    length = 3
    print("Total number of frames: {}".format(length))

    adv_frames = []
    iteration = 1
    frame_counter = 0
    total_iterations = np.ceil(length / BATCH_SIZE)

    # model is full_decoder
    optimizer = ['Adam', (0.9, 0.999)]
    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # model, videopath, targetcap, dataset, config, optimizer, crit, window
    while frame_counter < length:
        print("\n\n\nIteration {}/{}".format(iteration, int(total_iterations)))
        iteration = iteration + 1
        # The two branches differ only in the window bounds, so they share
        # the attack code below.
        if length - frame_counter < BATCH_SIZE:
            window = [frame_counter, length]
            frame_counter = frame_counter + (length - frame_counter)
        else:
            window = [frame_counter, frame_counter + BATCH_SIZE - 1]
            frame_counter = frame_counter + BATCH_SIZE
        print("Using frames {}".format(window))
        print("Frame counter at: {}\nTotal length is: {}\n".format(frame_counter, length))

        attack_package = S2VT_Attack(model=full_decoder,
                                     video_path=video_path,
                                     target=target_caption,
                                     dataset=dataset,
                                     config=config,
                                     optimizer=optimizer,
                                     crit=crit,
                                     seq_decoder=seq_decoder,
                                     window=window)
        carlini = Attack(attack_package=attack_package)
        finished_frames = carlini.execute(functional=True)
        adv_frames.append(finished_frames.detach().cpu().numpy())

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')
    print("\nSaving to: {}".format(adv_path))

    adv_frames = np.concatenate(adv_frames, axis=0)
    outputfile = adv_path
    writer = skvideo.io.FFmpegWriter(
        outputfile,
        outputdict={
            # huffyuv is lossless; r10k is also very good.
            # '-c:v': 'libx264',
            '-c:v': 'huffyuv',  # alternatives: r210, r10k
            # '-pix_fmt': 'rgb32',
            # '-crf': '0',  # constant rate factor 0 is lossless for libx264
            # '-preset': 'ultrafast'  # slower presets compress better, in principle
        })
    for f in adv_frames:
        writer.writeFrame(f)
    writer.close()

    # np_path = os.path.join('/'.join(base_dir_toks), base_name + '_adversarialWINDOW')
    # np.save(np_path, adv_frames)

    # Per-codec results, same measurements as in the windowed variant above:
    # huffyuv, ffv1, png, qtrle, flashsv, ffvhuff and rawvideo all land near
    # 0.2158-0.2162; libx264 at 0.2341; gif at 0.6889; r210/r10k at about -0.783.

    with torch.no_grad():
        # Get a fresh model to see how it behaves now:
        # full_decoder = ConvS2VT(convnet, model, opt)
        full_decoder = full_decoder.eval()

        frames = skvideo.io.vread(adv_path)
        frames = np.float32(frames)

        plt.imshow(frames[0] / 255.)
        plt.show()

        # Keep the float residual lost to integer quantization so the exact
        # adversarial frames can be reconstructed from the decoded video.
        difference = np.array(adv_frames) - np.array(frames)
        np.save('difference_tmp', difference)
        exp = np.load('difference_tmp.npy')
        print("Is the saved array equal to loaded array for difference: ",
              np.array_equal(exp, difference))
        frames = frames + difference

        # bp ---
        adv_frames = adv_frames.astype(np.uint8)
        batches = create_batches(adv_frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames old: {}".format(sents[0]))

        batches = exp_create_batches(adv_frames, BATCH_SIZE)
        feats = full_decoder.conv_forward(batches.unsqueeze(0))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("Adversarial Frames new: {}".format(sents[0]))

        frames = frames.astype(np.uint8)
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        print("frames old caption: ", sents[0])

        batches = exp_create_batches(frames, BATCH_SIZE)
        feats = full_decoder.conv_forward(batches.unsqueeze(0))
        seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
            feats, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        adv_caption = sents[0]

    print(
        "\nOriginal Caption: {}\nTarget Caption: {}\nAdversarial Caption: {}".
        format(original_caption, target_caption, adv_caption))
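# The difference array used above exists because the attack produces float
# frames while video containers store integers: saving the residual and
# adding it back after decoding recovers the exact float frames. A minimal
# numpy-only sketch of that reconstruction (all values here are synthetic):
import numpy as np

adv = np.random.rand(4, 8, 8, 3).astype(np.float32) * 255.0  # float adversarial frames
decoded = np.round(adv).astype(np.uint8)                     # what a lossless codec hands back
residual = adv - np.float32(decoded)                         # persisted with np.save(...)
restored = np.float32(decoded) + residual                    # the float frames again
assert np.allclose(restored, adv)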
def main(opt):
    def loss(seq_prob, crit):
        loss = crit(seq_prob, tlabel[:, 1:].cuda(), tmask[:, 1:].cuda())
        return loss

    def produce_t_mask():
        # Build the label and mask tensors for the target caption.
        mask = torch.zeros(dataset.max_len)
        captions = [target_caption.split(' ')]
        gts = torch.zeros(len(captions), dataset.max_len).long()
        for i, cap in enumerate(captions):
            if len(cap) > dataset.max_len:
                cap = cap[:dataset.max_len]
                cap[-1] = '<eos>'
            for j, w in enumerate(cap):
                gts[i, j] = dataset.word_to_ix[w]
        label = gts[0]
        non_zero = (label == 0).nonzero()
        mask[:int(non_zero[0]) + 1] = 1
        return label.unsqueeze(0), mask.unsqueeze(0)

    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'],
                          rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"],
                             opt["dim_hidden"], opt["dim_word"],
                             n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        return

    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)

    # model, videopath, targetcap, dataset, config, optimizer, crit, window
    # config: batch_size, c, learning rate, num iterations, input shape
    config = {
        "batch_size": BATCH_SIZE,
        "c": 100,
        "learning_rate": 0.005,
        "num_iterations": 1000,
        "input_shape": (299, 299),
        "num_frames": 288,
        "dimensions": 331
    }

    convnet = 'nasnetalarge'
    full_decoder = ConvS2VT(convnet, model, opt)

    # 'A woman is cutting a green onion'
    video_path = opt['videos'][0]

    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray
    vocab = dataset.get_vocab()

    vid_id = video_path.split('/')[-1]
    vid_id = vid_id.split('.')[0]
    viable_ids = dataset.splits['test'] + dataset.splits['val']
    viable_target_captions = []
    for v_id in viable_ids:
        if v_id == vid_id:
            continue
        plausible_caps = [
            ' '.join(toks)
            for toks in dataset.vid_to_meta[v_id]['final_captions']
        ]
        viable_target_captions.extend(plausible_caps)

    # Random target caption:
    # target_caption = np.random.choice(viable_target_captions)
    # target_caption = '<sos> A man is moving a toy <eos>'
    target_caption = '<sos> A boy is kicking a soccer ball into the goal <eos>'

    # Use the caption function from the attack itself, because its scaling is slightly different.
    with torch.no_grad():
        frames = skvideo.io.vread(video_path, num_frames=config["num_frames"])
        # bp ---
        batches = create_batches(frames, load_img_fn, tf_img_fn)
        seq_prob, seq_preds = full_decoder(batches, mode='inference')
        sents = utils.decode_sequence(vocab, seq_preds)
        original_caption = sents[0]

    # /96 gives 3 frames
    length = math.ceil(
        len(skvideo.io.vread(video_path, num_frames=config["num_frames"])) / 96)
    print("Total number of frames: {}".format(length))

    adv_frames = []
    iteration = 1
    frame_counter = 0
    total_iterations = np.ceil(length / BATCH_SIZE)

    # model is full_decoder
    optimizer = optim.Adam(full_decoder.parameters(), lr=0.005, betas=(0.9, 0.999))
    crit = utils.LanguageModelCriterion()
    seq_decoder = utils.decode_sequence

    # Gradients for the original video.
    frames = skvideo.io.vread(video_path)[0:BATCH_SIZE]
    original = torch.tensor(frames)
    original = (original.float()).cuda()

    batch = exp_create_batches(frames_to_do=original, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
        feats, mode='inference')

    tlabel, tmask = produce_t_mask()
    cost = loss(seq_prob, crit)
    optimizer.zero_grad()
    cost.backward()

    original_grads = {}
    for name, parameter in full_decoder.named_parameters():
        original_grads[name] = parameter.grad
    print(len(original_grads.keys()))

    # Gradients for the adversarial video, on a fresh model.
    full_decoder = ConvS2VT(convnet, model, opt)

    base_toks = video_path.split('/')
    base_dir_toks = base_toks[:-1]
    base_filename = base_toks[-1]
    base_name = ''.join(base_filename.split('.')[:-1])
    adv_path = os.path.join('/'.join(base_dir_toks),
                            base_name + '_adversarialWINDOW.avi')

    adv_frames = skvideo.io.vread(adv_path)
    adv_frames = np.float32(adv_frames)
    adv_frames = torch.tensor(adv_frames)
    adv_frames = (adv_frames.float()).cuda()

    batch = exp_create_batches(frames_to_do=adv_frames, batch_size=BATCH_SIZE)
    feats = full_decoder.conv_forward(batch.unsqueeze(0))
    seq_prob, seq_preds = full_decoder.encoder_decoder_forward(
        feats, mode='inference')

    tlabel, tmask = produce_t_mask()
    cost = loss(seq_prob, crit)
    optimizer = optim.Adam(full_decoder.parameters(), lr=0.005, betas=(0.9, 0.999))
    optimizer.zero_grad()
    cost.backward()

    adv_grads = {}
    for name, parameter in full_decoder.named_parameters():
        adv_grads[name] = parameter.grad

    print('\n\n\n------')
    for key, value in adv_grads.items():
        if 'weight' in key:
            print(key)

    # Relative L2 change of every weight gradient between the clean and
    # adversarial inputs, written out one layer per line.
    output = open("s2vt_weightoutput.txt", "w")
    l2norm_layers = []
    for key, value in original_grads.items():
        if 'weight' in key and value is not None:
            adv_weight = adv_grads[key]
            # move to CPU first; np.linalg.norm cannot consume CUDA tensors
            diff = (value - adv_weight).cpu().numpy()
            net_change = np.linalg.norm(diff) / np.linalg.norm(value.cpu().numpy())
            output.write("{}, {}\n".format(key, net_change))
            l2norm_layers.append([key, net_change])
    output.close()
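# The layer comparison above reduces to a relative L2 change per gradient
# tensor. A standalone numpy sketch; the function name and values are
# illustrative, not from the original code:
import numpy as np

def relative_grad_change(clean_grad, adv_grad):
    """||g_clean - g_adv||_2 / ||g_clean||_2 for one layer's gradient."""
    return np.linalg.norm(clean_grad - adv_grad) / np.linalg.norm(clean_grad)

g_clean = np.array([0.5, -0.2, 0.1])
g_adv = np.array([0.4, -0.25, 0.3])
# Larger values mean the layer's gradient reacts more to the perturbation.
print(relative_grad_change(g_clean, g_adv))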
def train_best_architectures(self, best_archs, use_shared_weights=False, earlyStopping=True):
    if use_shared_weights and earlyStopping:
        mode = 'sw_eS'
    elif use_shared_weights:
        mode = 'sw'
    elif earlyStopping:
        mode = 'eS'
    else:
        mode = 'full'

    val_accs = []
    max_val_acc = 0.
    best_arch_vals = {}
    for seq in best_archs[:self.nb_final_archs]:
        if self.target_classes == 2:
            self.nn.loss_func = 'binary_crossentropy'
        ## train every candidate model
        print("architecture:", utils.decode_sequence(self.vocab, seq))
        model = self.nn.create_model(seq, np.shape(self.x_data[0]))
        ## use early stopping
        if earlyStopping:
            callbacks = [EarlyStopping(monitor='val_acc', patience=0)]
        else:
            callbacks = None
        x, y = utils.unison_shuffled_copies(self.x_data, self.y_data)
        if use_shared_weights:
            ## use pre-trained shared weights without updating them
            history = self.nn.train_model(model, x, y, self.final_nn_train_epochs,
                                          validation_split=0.1,
                                          update_shared_weights=False,
                                          callbacks=callbacks)
        else:
            history = model.fit(x, y, epochs=self.final_nn_train_epochs,
                                validation_split=0.1, callbacks=callbacks)
        val_accs.append(
            np.ma.average(history.history['val_acc'],
                          weights=np.arange(1, len(history.history['val_acc']) + 1),
                          axis=-1))
        ## keep the model weights if the mean weighted rolling validation
        ## accuracy beats every previous model
        if val_accs[-1] > max_val_acc:
            best_arch_vals = {tuple(seq): model.get_weights()}
            max_val_acc = val_accs[-1]

    ## report the validation accuracy of all trained architectures and
    ## return the best architecture together with its weights
    best_archs_dict = {}
    for i in range(self.nb_final_archs):
        best_archs_dict.update({tuple(best_archs[i]): val_accs[i]})
    top_arch = utils.decode_sequence(self.vocab, list(list(best_arch_vals.keys())[0]))
    print("top {} architectures:".format(self.nb_final_archs),
          best_archs[:self.nb_final_archs])
    print("corresponding validation accuracies:", val_accs)
    print("best architecture:", top_arch)
    print("its validation accuracy:", max_val_acc)

    print("saving best weights...")
    best_weights_file = 'logdir/best_arch_weights{}{}.pkl'.format(
        mode, datetime.now().strftime("%H%M"))
    with open(best_weights_file, 'wb') as file:
        pickle.dump(best_arch_vals, file)
    print("saving top architectures and their validation accuracies...")
    best_archs_file = 'logdir/top{}archs{}{}.pkl'.format(
        self.nb_final_archs, mode, datetime.now().strftime("%H%M"))
    with open(best_archs_file, 'wb') as file:
        pickle.dump(best_archs_dict, file)
    return val_accs, top_arch
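# The val_accs entries above are epoch-weighted averages: later epochs count
# more, which damps early-training noise. A standalone illustration,
# assuming only numpy (the accuracy history is made up):
import numpy as np

val_acc_history = [0.60, 0.68, 0.71, 0.73]  # stands in for history.history['val_acc']
weights = np.arange(1, len(val_acc_history) + 1)  # 1, 2, 3, 4
weighted = np.ma.average(val_acc_history, weights=weights, axis=-1)
print(weighted)  # 0.701 = (0.60*1 + 0.68*2 + 0.71*3 + 0.73*4) / 10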
def generate_caps(encoder, decoder, crit, loader, eval_kwargs={}):
    verbose = eval_kwargs.get('verbose', True)
    split = eval_kwargs.get('split', 'train')
    lang_eval = eval_kwargs.get('language_eval', 1)
    dataset = eval_kwargs.get('dataset', 'coco')
    beam_size = eval_kwargs.get('beam_size', 1)
    beam_size = 1  # forced to 1 regardless of eval_kwargs
    logger = eval_kwargs.get('logger')
    lm_model = eval_kwargs.get('lm_model')
    vocab_size = eval_kwargs.get('vocab_size')
    sample_max = eval_kwargs.get('sample_max')
    temperature = eval_kwargs.get('temperature')
    tries = eval_kwargs.get('tries', 5)
    sample_limited_vocab = eval_kwargs.get('sample_limited_vocab', 0)
    output_file = eval_kwargs.get('output_file')
    print('Using sample_max = %d || temperature %.2f' % (sample_max, temperature))

    # Make sure we are in evaluation mode
    encoder.eval()
    decoder.eval()
    logger.warn('Generating captions for the full training set')
    loader.reset_iterator(split)
    n = 0
    blobs = []
    SENTS = []
    gen_SENTS = []
    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size
        # forward the model to get the loss
        infos = data['infos']
        ids = [inf['id'] for inf in infos]
        assert len(ids) == 1, "Batch size larger than 1"
        tmp = [data['labels'], data['masks']]
        tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
        labels, masks = tmp
        tr = 0
        gt = decode_sequence(loader.get_vocab(), labels[:, 1:].data)
        SENTS += gt
        blob_batch = {"id": ids[0], "gt": gt, "sampled": []}
        for igt in gt:
            print_sampled(ids[0], igt)
        while tr < tries:
            # z_mu, z_var, codes = encoder(labels)
            if lm_model == "rnn_vae":
                codes = encoder.sample(labels)
            elif lm_model == "rnn_multi_vae":
                codes = encoder.sample_group(labels)
            else:
                codes = encoder(labels)
            if sample_limited_vocab:
                # restrict sampling to the words of the ground-truth captions
                sample_vocab = np.unique(labels[:, 1:].cpu().data.numpy())
                print("sample_vocab:", sample_vocab.tolist())
                seq, _ = decoder.sample_ltd(codes, sample_vocab,
                                            {'beam_size': beam_size,
                                             "vocab_size": vocab_size,
                                             "sample_max": sample_max,
                                             "temperature": temperature})
            else:
                seq, _ = decoder.sample(codes,
                                        {'beam_size': beam_size,
                                         "vocab_size": vocab_size,
                                         "sample_max": sample_max,
                                         "temperature": temperature})
            sents = decode_sequence(loader.get_vocab(), seq)
            gen_SENTS += sents
            for isent in sents:
                print_sampled(0, isent, warn=True)
            print('----------------------------------------------------')
            blob_batch['sampled'] += sents
            tr += 1
        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
        # note: blob_batch from the final (wrapped) batch is never appended
        blobs.append(blob_batch)
    json.dump(blobs, open(output_file, 'w'))
    if lang_eval:
        lang_stats = language_lm_eval(SENTS, gen_SENTS)
        print(lang_stats)
    encoder.train()
    decoder.train()
    return 1
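# generate_caps draws `tries` samples per image and toggles between greedy
# decoding (sample_max) and temperature sampling. A minimal numpy sketch of
# that choice for a single decoding step; sample_token is a hypothetical
# stand-in for what decoder.sample does internally:
import numpy as np

def sample_token(logits, sample_max=True, temperature=1.0):
    if sample_max:
        return int(np.argmax(logits))  # greedy: deterministic
    probs = np.exp(logits / temperature)
    probs /= probs.sum()               # softmax with temperature
    return int(np.random.choice(len(logits), p=probs))

logits = np.array([1.0, 3.0, 0.5])
print(sample_token(logits))                                     # always 1
print(sample_token(logits, sample_max=False, temperature=0.7))  # stochastic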