def eval_external_ensemble(ensemble, loader, eval_kwargs=None):
    """Sample captions from a model ensemble on an external image split.

    Args:
        ensemble: object exposing ``cnn_models``, ``models``, ``get_feats``
            and ``sample``.
        loader: data loader exposing ``reset_iterator``, ``get_batch``,
            ``get_vocab`` and ``batch_size``.
        eval_kwargs: optional dict of options; keys read here are
            'num_images' (default -1, i.e. all), 'split' (default 'val')
            and 'logger'. The full dict is forwarded to ``ensemble.sample``.

    Returns:
        list of ``{'image_id', 'caption'}`` prediction entries.
    """
    # Avoid the shared mutable-default-argument pitfall.
    eval_kwargs = {} if eval_kwargs is None else eval_kwargs
    num_images = eval_kwargs.get('num_images', -1)
    split = eval_kwargs.get('split', 'val')
    logger = eval_kwargs.get('logger')
    # Make sure every member of the ensemble is in evaluation mode.
    for cnn_model in ensemble.cnn_models:
        cnn_model.eval()
    for model in ensemble.models:
        model.eval()
    loader.reset_iterator(split)
    n = 0
    predictions = []
    seq_per_img = 5
    while True:
        data = loader.get_batch(split, seq_per_img=seq_per_img)
        n = n + loader.batch_size
        images = data['images']
        images = Variable(torch.from_numpy(images), volatile=True).cuda()
        att_feats_ens, fc_feats_ens = ensemble.get_feats(images)
        seq, probs = ensemble.sample(fc_feats_ens, att_feats_ens, eval_kwargs)
        sents = utils.decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            spath = short_path(data['infos'][k]['file_path'])
            print_sampled(spath, sent)
            entry = {'image_id': spath, 'caption': sent}
            predictions.append(entry)
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        # Drop any extra predictions collected past the requested count.
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    return predictions
def eval_external(cnn_model, model, loader, eval_kwargs=None):
    """Sample captions from a single model on an external image split.

    Args:
        cnn_model: CNN feature extractor exposing ``forward_caps``.
        model: caption decoder exposing ``sample``.
        loader: data loader exposing ``reset_iterator``, ``get_batch``,
            ``get_vocab`` and ``batch_size``.
        eval_kwargs: optional dict of options ('num_images', 'split',
            'beam_size', 'sample_max', 'temperature', 'forbid_unk',
            'logger', 'caption_model').

    Returns:
        list of ``{'image_id', 'caption'}`` prediction entries.
    """
    # Avoid the shared mutable-default-argument pitfall.
    eval_kwargs = {} if eval_kwargs is None else eval_kwargs
    num_images = eval_kwargs.get('num_images', -1)
    # split serves no purpose except to have the same signature for get_batch
    split = eval_kwargs.get('split', 'val')
    beam_size = eval_kwargs.get('beam_size', 1)  # was fetched twice before
    logger = eval_kwargs.get('logger')
    caption_model = eval_kwargs.get('caption_model')
    sample_max = eval_kwargs.get('sample_max', 1)
    temperature = eval_kwargs.get('temperature', 0.5)
    forbid_unk = eval_kwargs.get('forbid_unk', 1)
    print("Eval %s" % caption_model)
    # Make sure in the evaluation mode
    cnn_model.eval()
    model.eval()
    loader.reset_iterator(split)
    n = 0
    predictions = []
    seq_per_img = 1
    while True:
        data = loader.get_batch(split, seq_per_img=seq_per_img)
        n = n + loader.batch_size
        images = data['images']
        images = Variable(torch.from_numpy(images), volatile=True).cuda()
        att_feats, fc_feats, att_unique, fc_unique = cnn_model.forward_caps(
            images, seq_per_img, return_unique=True)
        seq, _ = model.sample(
            fc_feats, att_feats, {
                'beam_size': beam_size,
                'forbid_unk': forbid_unk,
                "sample_max": sample_max,
                "temperature": temperature
            })
        sents = decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            spath = short_path(data['infos'][k]['file_path'])
            entry = {'image_id': spath, 'caption': sent}
            print_sampled(spath, sent)
            predictions.append(entry)
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        # Drop any extra predictions collected past the requested count.
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    # Switch back to training mode
    model.train()
    return predictions
def score_trads(preds, trg_loader, eval_kwargs):
    """Score pre-computed translations against the gold references.

    Walks the target loader over the requested split, decodes the gold
    sentences and computes corpus-level (Moses) BLEU between ``preds``
    and the collected references.

    Returns:
        dict with a single key 'Bleu'.
    """
    split = eval_kwargs.get('split', 'val')
    batch_size = eval_kwargs.get('batch_size', 80)
    verbose = eval_kwargs.get('verbose', 0)
    trg_loader.reset_iterator(split)
    references = []
    seen = 0
    while True:
        # Fetch the next batch of gold targets.
        batch = trg_loader.get_trg_batch(split, range(batch_size), batch_size)
        seen += batch_size
        # Decode a minibatch greedily __TODO__ add beam search decoding
        gold_sents = decode_sequence(trg_loader.get_vocab(),
                                     batch['out_labels'],
                                     eos=trg_loader.eos,
                                     bos=trg_loader.bos)
        # Print periodically unless verbosity is forced on.
        chatty = verbose or not (seen % 1000)
        for pred, gold in zip(preds, gold_sents):
            references.append(gold)
            if chatty:
                lg.print_sampled("", gold, pred)
        limit = batch['bounds']['it_max']
        if batch['bounds']['wrapped']:
            break
        if seen >= limit:
            print('Evaluated the required samples (%s)' % seen)
            break
    bleu_moses, _ = corpus_bleu(preds, references)
    return {'Bleu': bleu_moses}
def evaluate_model(model, src_loader, trg_loader, logger, eval_kwargs):
    """Evaluate a seq2seq model: validation losses, samples and BLEU.

    Args:
        model: model exposing ``step`` (teacher-forced losses) and ``sample``.
        src_loader / trg_loader: source / target data loaders.
        logger: logger used for progress messages.
        eval_kwargs: evaluation options ('batch_size', 'split', 'verbose', ...);
            mutated in place to carry the target special-token ids.

    Returns:
        (predictions, mean ML loss, mean total loss, corpus BLEU).
    """
    preds = []
    ground_truths = []
    batch_size = eval_kwargs.get('batch_size', 1)
    split = eval_kwargs.get('split', 'val')
    verbose = eval_kwargs.get('verbose', 0)
    # Expose the special-token ids to the sampler via the options dict.
    eval_kwargs['BOS'] = trg_loader.bos
    eval_kwargs['EOS'] = trg_loader.eos
    eval_kwargs['PAD'] = trg_loader.pad
    eval_kwargs['UNK'] = trg_loader.unk
    # Make sure to be in evaluation mode
    model.eval()
    src_loader.reset_iterator(split)
    trg_loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    while True:
        # Source batch; `order` aligns the target batch with the sources.
        data_src, order = src_loader.get_src_batch(split, batch_size)
        input_lines_src = Variable(torch.from_numpy(data_src['labels']),
                                   requires_grad=False).cuda()
        src_lengths = data_src['lengths']
        data_trg = trg_loader.get_trg_batch(split, order, batch_size)
        tmp = [data_trg['labels'], data_trg['out_labels'], data_trg['mask']]
        input_lines_trg_gold, output_lines_trg_gold, mask = [
            Variable(torch.from_numpy(_), requires_grad=False).cuda()
            for _ in tmp
        ]
        trg_lengths = data_trg['lengths']
        n += batch_size
        # Teacher-forced pass for the validation losses.
        ml_loss, loss, _ = model.step(input_lines_src, src_lengths,
                                      input_lines_trg_gold, trg_lengths,
                                      output_lines_trg_gold, mask)
        loss_sum += loss.data.item()
        ml_loss_sum += ml_loss.data.item()
        loss_evals = loss_evals + 1
        # Free-running sampling for BLEU evaluation.
        batch_preds, _ = model.sample(input_lines_src, src_lengths,
                                      opt=eval_kwargs)
        if isinstance(batch_preds, list):
            # With beam search the predictions come back as unpadded lists,
            # one per sentence; decode them one at a time.
            sent_preds = [decode_sequence(trg_loader.get_vocab(),
                                          np.array(pred).reshape(1, -1),
                                          eos=trg_loader.eos,
                                          bos=trg_loader.bos)[0]
                          for pred in batch_preds]
        else:
            # decode
            sent_preds = decode_sequence(trg_loader.get_vocab(), batch_preds,
                                         eos=trg_loader.eos,
                                         bos=trg_loader.bos)
        # Do the same for the sources and the gold sentences (for logging).
        sent_source = decode_sequence(src_loader.get_vocab(),
                                      input_lines_src.data.cpu().numpy(),
                                      eos=src_loader.eos, bos=src_loader.bos)
        sent_gold = decode_sequence(trg_loader.get_vocab(),
                                    output_lines_trg_gold.data.cpu().numpy(),
                                    eos=trg_loader.eos, bos=trg_loader.bos)
        if not verbose:
            verb = not (n % 300)
        else:
            verb = verbose
        for (sl, l, gl) in zip(sent_source, sent_preds, sent_gold):
            preds.append(l)
            ground_truths.append(gl)
            if verb:
                lg.print_sampled(sl, gl, l)
        ix1 = data_src['bounds']['it_max']
        if data_src['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    bleu_moses, _ = corpus_bleu(preds, ground_truths)
    return preds, ml_loss_sum / loss_evals, loss_sum / loss_evals, bleu_moses
def eval_split(cnn_model, model, loader, logger, eval_kwargs=None):
    """Evaluate a captioning model (CNN encoder + decoder) on a split.

    Computes the validation losses, samples captions and optionally runs
    language-metric evaluation on the sampled captions.

    Args:
        cnn_model: CNN feature extractor exposing ``forward_caps``.
        model: caption decoder exposing ``step``, ``sample`` and
            ``cnn_finetuning``.
        loader: image/caption data loader.
        logger: logger for progress messages.
        eval_kwargs: optional dict of options (split, beam_size, sampling
            options, language-eval toggles, ...).

    Returns:
        (mean ML loss, mean total loss, predictions, lang_stats, preds);
        ``lang_stats`` and ``preds`` are None when language eval is disabled.
    """
    # Avoid the shared mutable-default-argument pitfall.
    eval_kwargs = {} if eval_kwargs is None else eval_kwargs
    dataset = eval_kwargs.get('dataset', 'coco')
    split = eval_kwargs.get('split', 'val')
    val_images_use = eval_kwargs.get('val_images_use', -1)
    lang_eval = eval_kwargs.get('language_eval', 1)
    language_creativity = eval_kwargs.get('language_creativity', 1)
    all_metrics = eval_kwargs.get('all_metrics', 0)
    single_metrics = eval_kwargs.get('single_metrics', 0)
    beam_size = eval_kwargs.get('beam_size', 1)
    sample_max = eval_kwargs.get('sample_max', 1)
    temperature = eval_kwargs.get('temperature', 0.5)
    forbid_unk = eval_kwargs.get('forbid_unk', 1)
    batch_size = eval_kwargs.get('batch_size', 1)
    seq_per_img = eval_kwargs.get('seq_per_img')
    # Make sure to be in the evaluation mode
    cnn_model.eval()
    model.eval()
    logger.warn('Evaluating the %s split (%d)' % (split, val_images_use))
    loader.reset_iterator(split)
    n = 0
    loss_sum = 0
    ml_loss_sum = 0
    loss_evals = 0
    predictions = []
    while True:
        data = loader.get_batch(split, batch_size=batch_size,
                                seq_per_img=seq_per_img)
        n = n + loader.batch_size
        images = data['images']
        images = Variable(torch.from_numpy(images),
                          requires_grad=False).cuda()
        att_feats, fc_feats, att_unique, fc_unique = cnn_model.forward_caps(
            images, seq_per_img, return_unique=True)
        # Teacher-forced pass for the validation losses.
        ml_loss, loss, _ = model.step(data, att_feats, fc_feats, train=False)
        ml_loss_sum += ml_loss.item()
        loss_sum += loss.item()
        loss_evals = loss_evals + 1
        # Sample from the de-duplicated features (one per image).
        seq, probs = model.sample(fc_unique, att_unique, opt={
            'beam_size': beam_size,
            "forbid_unk": forbid_unk,
            "sample_max": sample_max,
            "temperature": temperature
        })
        sent_scores = probs.cpu().numpy().sum(axis=1)
        sents = decode_sequence(loader.get_vocab(), seq)
        for k, sent in enumerate(sents):
            if loader.flip:
                # Consecutive captions come from the original and the
                # flipped image; keep whichever scores higher.
                entry = {
                    'image_id': data['infos'][k // 2]['id'],
                    'caption': sent,
                    'score': sent_scores[k]
                }
                if not k % 2:
                    unflipped = entry
                else:
                    if entry['score'] > unflipped['score']:
                        del entry['score']
                        predictions.append(entry)
                    else:
                        del unflipped['score']
                        predictions.append(unflipped)
            else:
                entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
                predictions.append(entry)
            print_sampled(entry['image_id'], entry['caption'])
        ix1 = data['bounds']['it_max']
        if val_images_use != -1:
            ix1 = min(ix1, val_images_use)
        # Drop any extra predictions collected past the requested count.
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    lang_stats = None
    preds = None  # BUGFIX: was unbound at the return when lang_eval is falsy
    if lang_eval:
        lang_stats, preds, _ = language_eval(dataset, predictions, logger,
                                             all_metrics, single_metrics,
                                             language_creativity)
        print('preds:', preds)
    # Back to training:
    model.train()
    if model.cnn_finetuning:
        logger.warn('Finetuning cnn ON, filtering the BN layers')
        cnn_model.train()
        cnn_model.filter_bn()
    return (ml_loss_sum / loss_evals, loss_sum / loss_evals,
            predictions, lang_stats, preds)
def generate_caps(encoder, decoder, crit, loader, eval_kwargs=None):
    """Generate `tries` caption samples per training image and dump to JSON.

    Args:
        encoder: sentence encoder; sampled from when `lm_model` is a VAE
            variant, otherwise called directly on the labels.
        decoder: caption decoder exposing ``sample`` / ``sample_ltd``.
        crit: unused here; kept for signature compatibility with callers.
        loader: data loader (its batch size must be 1).
        eval_kwargs: optional dict of generation options ('split',
            'language_eval', 'logger', 'lm_model', 'vocab_size',
            'sample_max', 'temperature', 'tries', 'sample_limited_vocab',
            'output_file').

    Returns:
        1 (legacy success flag).
    """
    # Avoid the shared mutable-default-argument pitfall.
    eval_kwargs = {} if eval_kwargs is None else eval_kwargs
    split = eval_kwargs.get('split', 'train')
    lang_eval = eval_kwargs.get('language_eval', 1)
    # Beam search was unconditionally overridden to 1; keep that behavior
    # without the dead read of eval_kwargs['beam_size'].
    beam_size = 1
    logger = eval_kwargs.get('logger')
    lm_model = eval_kwargs.get('lm_model')
    vocab_size = eval_kwargs.get('vocab_size')
    sample_max = eval_kwargs.get('sample_max')
    temperature = eval_kwargs.get('temperature')
    tries = eval_kwargs.get('tries', 5)
    sample_limited_vocab = eval_kwargs.get('sample_limited_vocab', 0)
    output_file = eval_kwargs.get('output_file')
    print('Using sample_max = %d || temperature %.2f' % (sample_max,
                                                         temperature))
    # Make sure in the evaluation mode
    encoder.eval()
    decoder.eval()
    logger.warn('Generating captions for the full training set')
    loader.reset_iterator(split)
    n = 0
    blobs = []
    SENTS = []
    gen_SENTS = []
    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size
        infos = data['infos']
        ids = [inf['id'] for inf in infos]
        assert len(ids) == 1, "Batch size larger than 1"
        tmp = [data['labels'], data['masks']]
        tmp = [
            Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp
        ]
        labels, masks = tmp
        # Decode the gold captions (skip the leading BOS column).
        gt = decode_sequence(loader.get_vocab(), labels[:, 1:].data)
        SENTS += gt
        blob_batch = {"id": ids[0], "gt": gt, "sampled": []}
        for igt in gt:
            # BUGFIX: print each ground-truth sentence; the original printed
            # the whole list `gt` on every iteration.
            print_sampled(ids[0], igt)
        for tr in range(tries):
            # Obtain a latent code for the batch.
            if lm_model == "rnn_vae":
                codes = encoder.sample(labels)
            elif lm_model == "rnn_multi_vae":
                codes = encoder.sample_group(labels)
            else:
                codes = encoder(labels)
            if sample_limited_vocab:
                # Restrict sampling to the words of the gold captions.
                sample_vocab = np.unique(labels[:, 1:].cpu().data.numpy())
                print("sample_vocab:", sample_vocab.tolist())
                seq, _ = decoder.sample_ltd(codes, sample_vocab, {
                    'beam_size': beam_size,
                    "vocab_size": vocab_size,
                    "sample_max": sample_max,
                    "temperature": temperature
                })
            else:
                seq, _ = decoder.sample(codes, {
                    'beam_size': beam_size,
                    "vocab_size": vocab_size,
                    "sample_max": sample_max,
                    "temperature": temperature
                })
            sents = decode_sequence(loader.get_vocab(), seq)
            gen_SENTS += sents
            for isent in sents:
                print_sampled(0, isent, warn=True)
            print('----------------------------------------------------')
            blob_batch['sampled'] += sents
        # BUGFIX: record the batch before the stop checks so the final batch
        # is not silently dropped (the original appended after `break`).
        blobs.append(blob_batch)
        ix1 = data['bounds']['it_max']
        if data['bounds']['wrapped']:
            break
        if n >= ix1:
            logger.warn('Evaluated the required samples (%s)' % n)
            break
    # Write the dump through a context manager so the handle is closed.
    with open(output_file, 'w') as fout:
        json.dump(blobs, fout)
    if lang_eval:
        lang_stats = language_lm_eval(SENTS, gen_SENTS)
        print(lang_stats)
    encoder.train()
    decoder.train()
    return 1