def accuracies_on_ds(data_file, inputs, model, n_ans):
    """Evaluate `model` on the dev split across all latent interpretations.

    First runs a plain (non-interpretable) evaluation pass and prints EM/F1,
    then re-runs prediction once per latent interpretation and accumulates
    per-threshold correctness counts and per-interpretation F1/overlap scores.

    Args:
        data_file: dataset name, used only in the printed report.
        inputs: tuple as produced by utils.load_data:
            (train, dev, dev_y, train_y, embedding, opt, q_labels, ql_mask).
        model: DocReaderModel-like object exposing predict / predict_inter
            and a mutable `opt` dict (its 'interpret' flag is toggled here).
        n_ans: number of ground-truth answers that must each score F1 > p
            for a question to be counted correct at threshold p.

    Returns:
        t_total_a: total number of evaluated examples.
        f1s_a: np.ndarray of per-example, per-interpretation F1 scores.
        ovs_a: np.ndarray of per-example, per-interpretation overlap scores.
        t_a: dict mapping F1 threshold p -> count of questions whose n_ans
            ground-truth answers all exceeded p.
    """
    train, dev, dev_y, train_y, embedding, opt, q_labels, ql_mask = inputs

    # Baseline pass with interpretation disabled.
    model.opt['interpret'] = False
    batches = utils.BatchGen(dev, batch_size=args.batch_size,
                             evaluation=True, gpu=args.cuda)
    predictions = []
    for i, batch in enumerate(batches):
        pred = model.predict(batch)[0]
        predictions.extend(pred)
    em, f1 = utils.score(predictions, dev_y)
    print("[EM: {0:.2f} F1: {1:.2f}] on {2}".format(em, f1, data_file))

    # Re-batch (shuffled) for the per-interpretation evaluation.
    batches = utils.BatchGen(dev, batch_size=args.batch_size,
                             evaluation=True, gpu=args.cuda, shuffle=True)
    model.opt['interpret'] = True
    t_a, t_total_a = {0.1: 0, 0.2: 0, 0.3: 0, 0.4: 0, 0.5: 0,
                      0.6: 0, 0.7: 0, 0.8: 0, 0.9: 0}, 0
    f1s_a = []
    ovs_a = []
    # evaluate the model for all interpretations and all answers
    # if f1 score for all GT answers is > p then count answer as correct
    for i, batch in tqdm(enumerate(batches)):
        i_predictions = []
        # Align ground truth with the shuffled batch order.
        truth = np.take(dev_y, batches.indices[i], 0)
        if args.n_actions > 0:
            # One full prediction pass per fixed latent interpretation a.
            for a in range(args.n_actions):
                latent_a = Variable(torch.ones(batch[0].size(0)) * a).long().cuda()
                pred = model.predict_inter(batch, latent_a=latent_a)
                i_predictions.append(pred[0])
        else:
            # No latent actions: i_predictions is indexed by example only.
            i_predictions = model.predict(batch)[0]
        for b in range(batch[0].size(0)):
            f1s = []
            for ta in truth[b]:
                # Best F1 over interpretations for this single GT answer.
                f1_v = []
                for a in range(args.n_actions):
                    _, f1_a = utils.score_test_alli([i_predictions[a][b]], [[ta]])
                    f1_v += [f1_a]
                if args.n_actions > 0:
                    f1s += [max(f1_v)]
                else:
                    _, f1_v = utils.score_test_alli([i_predictions[b]], [[ta]])
                    f1s += [f1_v]
            f1s = np.array(f1s)
            for p in t_a.keys():
                # Count the question only if all n_ans answers clear threshold p.
                t_a[p] = t_a[p] + int((f1s > p).sum() == n_ans)
            # Per-interpretation F1/overlap against the full GT answer set.
            f1_i = []
            ov_i = []
            for a in range(args.n_actions):
                _, f1_a = utils.score_test_alli([i_predictions[a][b]], [truth[b]])
                ov_a = utils.overlap([i_predictions[a][b]], [truth[b]])
                f1_i += [f1_a]
                ov_i += [ov_a]
            if args.n_actions == 0:
                _, f1_i = utils.score_test_alli([i_predictions[b]], [truth[b]])
                ov_i = utils.overlap([i_predictions[b]], [truth[b]])
            f1s_a += [f1_i]
            ovs_a += [ov_i]
        t_total_a += batch[0].size(0)
    f1s_a = np.array(f1s_a)
    ovs_a = np.array(ovs_a)
    return t_total_a, f1s_a, ovs_a, t_a
def main():
    """Restore a trained DocReaderModel, evaluate every latent interpretation
    on the dev set, and report max/min/avg/confidence-selected EM and F1.
    Also dumps per-question answers for all interpretations to
    'predictions_a.json'.

    Relies on module-level globals: args, model_dir, utils, DocReaderModel,
    msgpack, torch, Variable, np, tqdm, json.
    """
    print('[program starts.]')
    train, dev, dev_y, train_y, embedding, opt, q_labels, ql_mask = utils.load_data(
        vars(args), args)
    if args.resume:
        print('[loading previous model...]')
        checkpoint = torch.load(
            os.path.join(model_dir, args.restore_dir, args.resume))
        if args.resume_options:
            # Prefer the configuration stored with the checkpoint.
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = DocReaderModel(opt, embedding, state_dict)
    else:
        # This entry point only evaluates; a checkpoint is mandatory.
        raise RuntimeError('Include checkpoint of the trained model')
    if args.cuda:
        model.cuda()
    with open(args.data_file, 'rb') as f:
        data = msgpack.load(f, encoding='utf8')
    # Question ids used as keys in the dumped predictions file.
    dev_ids = data['dev_ids']
    # evaluate restored model
    model.opt['interpret'] = False
    batches = utils.BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
    predictions = []
    for i, batch in enumerate(batches):
        predictions.extend(model.predict(batch)[0])
    em, f1 = utils.score(predictions, dev_y)
    print("[sampled EM: {} F1: {}]".format(em, f1))
    batches = utils.BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
    model.opt['interpret'] = True
    # Running EM/F1 totals for the confidence-selected interpretation,
    # plus total example count.
    t_em_c, t_f1_c, t_total = [0] * 3
    f1s = []
    ems = []
    pred_answers = {}
    # evaluate the model for all interpretations and select the one with highest accuracy
    for i, batch in tqdm(enumerate(batches)):
        i_predictions = []
        truth = np.take(dev_y, batches.indices[i], 0)
        confidence = []
        ans_a = []
        for a in range(args.n_actions):
            # Force interpretation `a` for the whole batch.
            latent_a = Variable(torch.ones(batch[0].size(0)) * a).long().cuda()
            pred = model.predict_inter(batch, latent_a=latent_a)
            i_predictions.append(pred[0])
            # NOTE(review): computed_a is overwritten each iteration, so only
            # the value from the last action's pass is kept below — confirm
            # model.predict_inter returns the same computed_a regardless of
            # the forced latent_a.
            computed_a = pred[-1]
            confidence.append(pred[-2])
            ans_a += [pred[0]]
        # Shape: (n_actions, batch) — per-interpretation confidence.
        confidence = np.array(confidence)
        for b in range(batch[0].size(0)):
            em_v, f1_v = [], []
            # Interpretation the model is most confident about for example b.
            a = np.argmax(confidence[:, b])
            em_c, f1_c = utils.score_test_alli([i_predictions[a][b]], [truth[b]])
            # NOTE(review): the loop below clobbers `a`; the confidence-selected
            # scores were already captured in em_c/f1_c above.
            for a in range(args.n_actions):
                em_a, f1_a = utils.score_test_alli([i_predictions[a][b]], [truth[b]])
                em_v += [em_a]
                f1_v += [f1_a]
            # Record all candidate answers, their F1s, and the model-chosen
            # interpretation for this question id.
            pred_answers[dev_ids[i * args.batch_size + b]] = [
                [a_i[b] for a_i in ans_a],
                list(map(str, f1_v)),
                str(computed_a[b])]
            f1s += [f1_v]
            ems += [em_v]
            t_em_c += em_c
            t_f1_c += f1_c
        t_total += batch[0].size(0)
    with open('predictions_a.json', 'w') as f:
        json.dump(pred_answers, f)

    def toscore(score):
        # Convert a summed per-example score into a percentage.
        return 100. * score / t_total
    f1s = np.array(f1s)
    ems = np.array(ems)
    # Oracle best / worst / average over interpretations, and the
    # confidence-selected ("con") scores.
    print("[max EM: {} F1: {}]".format(toscore(np.max(ems, 1).sum()),
                                       toscore(np.max(f1s, 1).sum())))
    print("[min EM: {} F1: {}]".format(toscore(np.min(ems, 1).sum()),
                                       toscore(np.min(f1s, 1).sum())))
    print("[avg EM: {} F1: {}]".format(toscore(np.average(ems, 1).sum()),
                                       toscore(np.average(f1s, 1).sum())))
    print("[con EM: {} F1: {}]".format(toscore(t_em_c), toscore(t_f1_c)))
def main():
    """Restore a trained model, collect document encodings for every latent
    interpretation over a window of dev batches, and visualize them in 2D
    with tSNE. Induced-interpretation encodings are drawn as colored dots;
    the model's own chosen interpretations as 'x' markers.

    Relies on module-level globals: args, model_dir, squad_dir, utils,
    DocReaderModel, msgpack, torch, Variable, np, TSNE, plt, discrete_cmap,
    log.
    """
    log.info('[program starts.]')
    train, dev, dev_y, train_y, embedding, opt, q_labels, ql_mask = utils.load_data(
        vars(args), args)
    log.info('[Data loaded.ql_mask]')
    if args.resume:
        log.info('[loading previous model...]')
        checkpoint = torch.load(
            os.path.join(model_dir, args.restore_dir, args.resume))
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = DocReaderModel(opt, embedding, state_dict)
    else:
        raise RuntimeError('Include checkpoint of the trained model')
    if args.cuda:
        model.cuda()
    # Vocabulary for turning token-id tensors back into readable text.
    with open(os.path.join(squad_dir, 'meta.msgpack'), 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    vocab = meta['vocab']
    ids_word = {i: w for i, w in enumerate(vocab)}

    def to_text(inp):
        # Decode a 1-D tensor of token ids into a space-joined string.
        s = ""
        for ids in inp.numpy():
            s += ids_word[ids] + " "
        return s
    # evaluate restored model
    batches = utils.BatchGen(dev, batch_size=100, evaluation=True, gpu=args.cuda)
    predictions = []
    for i, batch in enumerate(batches):
        predictions.extend(model.predict(batch)[0])
    em, f1 = utils.score(predictions, dev_y)
    log.info("[dev EM: {} F1: {}]".format(em, f1))
    batches = utils.BatchGen(dev, batch_size=args.batch_size, evaluation=True, gpu=args.cuda)
    model.opt['interpret'] = True
    #itrs = [30, 58]
    # Inclusive window [start, end] of batch indices to visualize.
    itrs = [0, 30]
    outputs = ""
    # collect document encodings for induced interpretations (embeds) and interpretations chosen by the model (computed_a)
    X = [[] for _ in range(itrs[1] - itrs[0] + 1)]
    for i, batch in enumerate(batches):
        if i < itrs[0]:
            continue
        truth = np.take(dev_y, batches.indices[i], 0)
        i_predictions = []
        for a in range(args.n_actions):
            latent_a = Variable(torch.ones(args.batch_size) * a).long().cuda()
            i_predictions.append(
                model.predict_inter(batch, latent_a=latent_a)[0])
        for b in range(len(batch[0])):
            # Dump question/document text and per-interpretation answers.
            # NOTE(review): batch[-2] presumably holds raw text and batch[5]
            # token ids — confirm against BatchGen's layout.
            outputs += batch[-2][b] + '\n' + to_text(batch[5][b]) + '\n'
            outputs += "idx = {} truth={}".format(
                (i - itrs[0]) * args.batch_size + b, truth[b]) + '\n'
            for a in range(args.n_actions):
                em_v, f1_v = utils.score([i_predictions[a][b]], [truth[b]])
                # NOTE(review): the format string has only {0}/{1} slots, so
                # em_v/f1_v passed here are never printed.
                outputs += i_predictions[a][b] + '\n' + "b={0} a={1} ".format(
                    i - itrs[0], a, em_v, f1_v) + '\n'
            outputs += '\n'
        for a in range(args.n_actions):
            latent_a = Variable(torch.ones(args.batch_size) * a).long().cuda()
            embeds, actions, questions, computed_a = model.get_embeddings(
                batch, latent_a=[1, latent_a])
            X[i - itrs[0]].append([embeds, actions, questions, computed_a])
        if i >= itrs[1]:
            break
    print(outputs)
    # rearrange encodings
    # Flatten X[iteration][action][field][example] into flat per-row lists,
    # ordered example-major then action, matching the reshape below.
    x_emb, x_l, x_q, computed_a = [], [], [], []
    for it in range(itrs[1] - itrs[0] + 1):
        for b in range(args.batch_size):
            for a in range(args.n_actions):
                x_emb.append(X[it][a][0][b])
                x_l.append(X[it][a][1][b])
                x_q.append(X[it][a][2][b])
                computed_a.append(X[it][a][3][b])
    x_emb = np.array(x_emb)
    x_l = np.array(x_l)
    x_q = np.array(x_q)
    computed_a = np.array(computed_a).astype(int)
    # 256D -> 2D
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99,
                      init='pca')
    tsne_d = tsne_model.fit_transform(x_emb)
    # find document encodings for selected interpretations
    a = np.reshape(computed_a, ((itrs[1] - itrs[0] + 1) * args.batch_size))
    # One-hot over actions selects, per example, the 2D point of the
    # interpretation the model actually chose.
    a_oh = np.expand_dims(np.eye(args.n_actions)[a], -1)
    tsne_d_r = np.reshape(
        tsne_d,
        ((itrs[1] - itrs[0] + 1) * args.batch_size, args.n_actions, -1))
    sel_tsne_d = np.sum(tsne_d_r * a_oh, 1)
    # setup the plot
    N = args.n_actions
    c = x_l.astype(int)
    x = tsne_d[:, 0]
    y = tsne_d[:, 1]
    # All induced-interpretation encodings, colored by label.
    plt.scatter(x, y, c=c, s=40, cmap=discrete_cmap(N, 'jet'), alpha=0.5)
    names = [str(i // (args.n_actions)) for i in range(tsne_d.shape[0])]
    for i, txt in enumerate(names):
        plt.annotate(txt, (x[i], y[i]), size='x-small')
    c = computed_a.astype(int)
    x = sel_tsne_d[:, 0]
    y = sel_tsne_d[:, 1]
    # Model-chosen interpretations overlaid as 'x' markers.
    plt.scatter(x, y, c=c, s=70, marker='x', cmap=discrete_cmap(N, 'jet'))
    names = [str(i) for i in range(sel_tsne_d.shape[0])]
    for i, txt in enumerate(names):
        plt.annotate(txt, (x[i], y[i]), size='x-small')
    plt.colorbar(ticks=range(N))
    plt.clim(-0.5, N - 0.5)
    plt.title("tSNE")
    plt.show()
def main():
    """Train (or resume training of) a DocReaderModel.

    Supports three update regimes chosen from args: a VAE framework with
    optional RL fine-tuning scopes ('pgm'/'pg'/'sc' rewards), a self-critic
    framework, and plain supervised updates. Evaluates on train/dev each
    args.eval_per_epoch epochs, checkpoints per epoch, tracks the best dev
    F1, and finishes with a final evaluation on data reloaded via
    utils.load_data.

    Relies on module-level globals: args, model_dir, current_dir, bs_valid,
    timestamp, writer, log, utils, DocReaderModel, msgpack, torch, Variable,
    np, random, datetime, copyfile.
    """
    log.info('[program starts.]')
    train, dev, dev_y, train_y, embedding, opt, q_labels, ql_mask = utils.load_data_train(
        vars(args), args)
    log.info('[Data loaded.ql_mask]')
    log.info('vocab size = %d' % opt['vocab_size'])
    # Answer-existence flags (SQuAD 2.0-style "no answer" supervision).
    with open(args.data_file, 'rb') as f:
        data = msgpack.load(f, encoding='utf8')
    dev_ae = list(data['dev_ans_exists'])
    trn_ae = list(data['trn_ans_exists'])
    #dev_ae = [1]*len(dev_y); trn_ae = [1]*len(train_y)
    if args.resume:
        log.info('[loading previous model...]')
        checkpoint = torch.load(
            os.path.join(model_dir, args.restore_dir, args.resume))
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = DocReaderModel(opt, embedding, state_dict)
        epoch_0 = checkpoint['epoch'] + 1
        indices = list(range(len(train)))
        # Replay the shuffles of the completed epochs so the resumed data
        # order matches what an uninterrupted run would have produced.
        for i in range(checkpoint['epoch']):
            random.shuffle(indices)  # synchronize random seed
        train = [train[i] for i in indices]
        trn_ae = [trn_ae[i] for i in indices]
        train_y = [train_y[i] for i in indices]
        q_labels = [q_labels[i] for i in indices]
        ql_mask = [ql_mask[i] for i in indices]
        if args.reduce_lr:
            utils.lr_decay(model.optimizer, args.reduce_lr, log)
    else:
        model = DocReaderModel(opt, embedding)
        epoch_0 = 1
    train_y = np.array(train_y)  # text answers for training set
    q_labels = np.array(q_labels)
    ql_mask = np.array(ql_mask)
    print("timestamp {}".format(timestamp))
    trn_eval_size = len(trn_ae)
    dev_y = np.array(dev_y)
    if args.cuda:
        model.cuda()
    # evaluate pre-trained model
    if args.resume and not args.debug:
        batches = utils.BatchGen(train[:trn_eval_size], batch_size=bs_valid,
                                 evaluation=True, gpu=args.cuda)
        predictions = []
        ae_ta = []  # predicted answer-existence flags
        for batch in batches:
            if args.squad == 2:
                ans_b, _, _, ae_i = model.predict(batch)
                ae_ta.extend(ae_i)
                predictions.extend(ans_b)
            else:
                predictions.extend(model.predict(batch)[0])
        em_t, f1_t = utils.score(predictions, train_y[:trn_eval_size])
        if 'exist' in args.ae_archt:
            # Rescore taking answer existence into account and report the
            # answer-existence confusion counts.
            em_t, f1_t = utils.score_list(predictions, train_y[:trn_eval_size],
                                          trn_ae[:trn_eval_size])
            n_ae = sum(trn_ae[:trn_eval_size])
            n_dae = trn_eval_size - n_ae
            print('tot_pos=%d, true_pos=%d, cor_p=%d, cor_n=%d'%(sum(ae_ta), sum(trn_ae[:trn_eval_size]), \
                (np.array(trn_ae[:trn_eval_size]).squeeze()*np.array(ae_ta).squeeze()).sum(),\
                ((np.array(trn_ae[:trn_eval_size]).squeeze()==0)*(np.array(ae_ta).squeeze()==0)).sum()))
        log.info("[train EM: {0:.3f} F1: {1:3f}]".format(em_t, f1_t))
        batches = utils.BatchGen(dev, batch_size=bs_valid, evaluation=True,
                                 gpu=args.cuda)
        predictions = []
        ae_ta = []
        for batch in batches:
            if args.squad == 2:
                ans_b, _, _, ae_i = model.predict(batch)
                ae_ta.extend(ae_i)
                predictions.extend(ans_b)
            else:
                predictions.extend(model.predict(batch)[0])
        em_v, f1_v = utils.score(predictions, dev_y)
        if 'exist' in args.ae_archt:
            em_v, f1_v = utils.score_list(predictions, np.array(dev_y), dev_ae)
            n_ae = sum(dev_ae)
            n_dae = len(dev_ae) - n_ae
            print('tot_pos=%d, true_pos=%d, cor_p=%d, cor_n=%d'%(sum(ae_ta), sum(dev_ae), \
                (np.array(dev_ae).squeeze()*np.array(ae_ta).squeeze()).sum(),\
                ((np.array(dev_ae).squeeze()==0)*(np.array(ae_ta).squeeze()==0)).sum()))
        log.info("[val EM: {} F1: {}]".format(em_v, f1_v))
        # Resumed runs start from the restored model's dev F1.
        best_val_score = f1_v
        if args.summary:
            writer.add_scalars('accuracies', {
                'em_t': em_t,
                'f1_t': f1_t,
                'em_v': em_v,
                'f1_v': f1_v
            }, epoch_0 - 1)
    else:
        best_val_score = 0.0
    # KL/auxiliary loss weights, parsed from e.g. "const_5" -> 0.5.
    # NOTE(review): beta/alpha stay unbound if args.beta/args.alpha lack
    # 'const', yet both are used unconditionally below — confirm the arg
    # parser guarantees the 'const_*' form.
    if 'const' in args.beta:
        beta = float(args.beta.split('_')[1]) * 0.1
    if 'const' in args.alpha:
        alpha = float(args.alpha.split('_')[1]) * 0.1
    scope = 'pi_q'
    if args.select_i:
        scope = 'select_i'
    dummy_r = np.zeros(args.batch_size)
    latent_a = None
    target_i = None
    indices = None  # induced interpretation
    rewards = dummy_r
    # training
    for epoch in range(epoch_0, epoch_0 + args.epochs):
        log.warn('Epoch {} timestamp {}'.format(epoch, timestamp))
        batches = utils.BatchGen(train, batch_size=args.batch_size, gpu=args.cuda)
        start = datetime.now()
        if args.vae and not args.select_i:
            # Scope may switch per epoch (e.g. to 'rl') on a schedule.
            scope = utils.select_scope_update(args, epoch - epoch_0)
        print("scope = {} beta = {} alpha = {} ".format(scope, beta, alpha))
        for i, batch in enumerate(batches):
            inds = batches.indices[i]
            # synchronize available interpretations with the current batch
            labels = np.take(q_labels, inds, 0)
            l_mask = np.take(ql_mask, inds, 0)
            if args.vae:
                # VAE framework
                if scope == 'rl':
                    if args.rl_tuning == 'pgm':
                        # policy gradient with EM scores for rewards
                        truth = np.take(train_y, inds, 0)
                        pred_m, latent_a, indices = model.predict(batch)[:3]
                        _, f1_m = utils.score_em(None, pred_m, truth)
                        rewards = f1_m
                        # normalize rewards over batch
                        rewards -= rewards.mean()
                        rewards /= (rewards.std() + 1e-08)
                    elif args.rl_tuning == 'pg':
                        # policy gradient with F1 scores for rewards
                        truth = np.take(train_y, inds, 0)
                        pred_m, latent_a, indices = model.predict(batch)[:3]
                        _, f1_m = utils.score_sc(None, pred_m, truth)
                        rewards = f1_m
                        # normalize rewards over batch
                        rewards -= rewards.mean()
                        rewards /= (rewards.std() + 1e-08)
                    elif args.rl_tuning == 'sc':
                        # reward computed by self-critic
                        truth = np.take(train_y, inds, 0)
                        pred_s, pred_m, latent_a, indices = model.predict_self_critic(
                            batch)
                        rs, rm = utils.score_sc(pred_s, pred_m, truth)
                        rewards = rs - rm
                else:
                    rewards = dummy_r
                if args.select_i:
                    # Supervise interpretation choice with the argmax-F1
                    # interpretation per example.
                    i_predictions = []
                    truth = np.take(train_y, batches.indices[i], 0)
                    for a in range(args.n_actions):
                        latent_a = Variable(torch.ones(batch[0].size(0)) * a).long().cuda()
                        i_predictions.append(
                            model.predict_inter(batch, latent_a=latent_a)[0])
                    f1_all = []
                    for b in range(batch[0].size(0)):
                        f1_v = []
                        for a in range(args.n_actions):
                            _, f1_a = utils.score_test_alli(
                                [i_predictions[a][b]], [truth[b]])
                            f1_v += [f1_a]
                        f1_all += [f1_v]
                    target_i = np.argmax(np.array(f1_all), 1)
                model.update(batch, q_l=[labels, l_mask], r=rewards, scope=scope, beta=beta, alpha=alpha, \
                    latent_a=latent_a, target_i=target_i, span=indices)
            elif args.self_critic:
                # self-critic framework where rewards are computed as difference between the F1 score produced
                # by the current model during greedy inference and by sampling
                truth = np.take(train_y, inds, 0)
                if args.critic_loss:
                    pred_m, latent_a, indices = model.predict(batch)[:3]
                    _, f1_m = utils.score_sc(None, pred_m, truth)
                    rewards = f1_m
                else:
                    pred_s, pred_m, latent_a, indices = model.predict_self_critic(
                        batch)
                    rs, rm = utils.score_sc(pred_s, pred_m, truth)
                    rewards = rs - rm
                model.update(batch, r=rewards, q_l=[labels, l_mask],
                             latent_a=latent_a)
            else:
                # Plain supervised update.
                model.update(batch, q_l=[labels, l_mask])
            if i % args.log_per_updates == 0:
                # printing
                if args.vae and not args.select_i:
                    log.info('updates[{0:6}] l_p[{1:.3f}] l_q[{2:.3f}] l_rl[{3:.3f}] l_ae[{4:.3f}] l_ce[{5:.3f}] l_cr[{6:.3f}] remaining[{7}]'.format(
                        model.updates, model.train_loss['p'].avg, model.train_loss['q'].avg, model.train_loss['rl'].avg, model.train_loss['ae'].avg,\
                        model.train_loss['ce'].avg, model.train_loss['cr'].avg,
                        str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
                    if args.summary:
                        writer.add_scalars('losses', {'p':model.train_loss['p'].avg, 'q':model.train_loss['q'].avg, 'ce':model.train_loss['ce'].avg, \
                            'ae':model.train_loss['ae'].avg,'rl':model.train_loss['rl'].avg, 'cr':model.train_loss['cr'].avg,}, (epoch-1)*len(batches)+i)
                else:
                    log.info(
                        'updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
                            model.updates, model.train_loss.avg,
                            str((datetime.now() - start) / (i + 1) * (len(batches) - i - 1)).split('.')[0]))
                    if args.summary:
                        writer.add_scalar('loss', model.train_loss.avg,
                                          (epoch - 1) * len(batches) + i)
            # NOTE(review): `i % 4 * args.log_per_updates` parses as
            # `(i % 4) * args.log_per_updates`, i.e. true whenever i % 4 == 0.
            # Probably `i % (4 * args.log_per_updates)` was intended — confirm.
            if scope == 'rl' and (i % 4 * args.log_per_updates == 0):
                vbatches = utils.BatchGen(dev, batch_size=bs_valid,
                                          evaluation=True, gpu=args.cuda)
                predictions = []
                for batch in vbatches:
                    predictions.extend(model.predict(batch)[0])
                em_v, f1_v = utils.score(predictions, dev_y)
                log.warn("val EM: {0:.3f} F1: {1:3f}".format(em_v, f1_v))
        # eval
        if epoch % args.eval_per_epoch == 0:
            batches = utils.BatchGen(dev, batch_size=bs_valid, evaluation=True,
                                     gpu=args.cuda)
            predictions = []
            ae_ta = []
            for i, batch in enumerate(batches):
                if args.squad == 2:
                    ans_b, _, _, ae_i = model.predict(batch)
                    ae_ta.extend(ae_i)
                    predictions.extend(ans_b)
                else:
                    predictions.extend(model.predict(batch)[0])
            em_v, f1_v = utils.score(predictions, dev_y)
            if 'exist' in args.ae_archt:
                em_v, f1_v = utils.score_list(predictions, dev_y, dev_ae)
                # NOTE(review): slicing dev_ae by trn_eval_size looks like a
                # copy-paste from the train branch — confirm intent.
                n_ae = sum(dev_ae[:trn_eval_size])
                n_dae = len(dev_ae) - n_ae
                print('tot_pos=%d, true_pos=%d, cor_p=%d, cor_n=%d'%(sum(ae_ta), sum(dev_ae), \
                    (np.array(dev_ae).squeeze()*np.array(ae_ta).squeeze()).sum(),\
                    ((np.array(dev_ae).squeeze()==0)*(np.array(ae_ta).squeeze()==0)).sum()))
            log.info("[val EM: {} F1: {}]".format(em_v, f1_v))
            batches = utils.BatchGen(train[:trn_eval_size], batch_size=bs_valid,
                                     evaluation=True, gpu=args.cuda)
            predictions = []
            ae_ta = []
            for batch in batches:
                if args.squad == 2:
                    ans_b, _, _, ae_i = model.predict(batch)
                    ae_ta.extend(ae_i)
                    predictions.extend(ans_b)
                else:
                    predictions.extend(model.predict(batch)[0])
            em_t, f1_t = utils.score(predictions, train_y[:trn_eval_size])
            if 'exist' in args.ae_archt:
                em_t, f1_t = utils.score_list(predictions, train_y[:trn_eval_size],
                                              trn_ae[:trn_eval_size])
                n_ae = sum(trn_ae[:trn_eval_size])
                n_dae = trn_eval_size - n_ae
                print('tot_pos=%d, true_pos=%d, cor_p=%d, cor_n=%d'%(sum(ae_ta), sum(trn_ae[:trn_eval_size]), \
                    (np.array(trn_ae[:trn_eval_size]).squeeze()*np.array(ae_ta).squeeze()).sum(),\
                    ((np.array(trn_ae[:trn_eval_size]).squeeze()==0)*(np.array(ae_ta).squeeze()==0)).sum()))
            log.info("[train EM: {0:.3f} F1: {1:3f}]".format(em_t, f1_t))
            print("current_dir {}".format(current_dir))
            if args.summary:
                writer.add_scalars('accuracies', {
                    'em_t': em_t,
                    'f1_t': f1_t,
                    'em_v': em_v,
                    'f1_v': f1_v
                }, epoch)
        # save
        if not args.save_last_only or epoch == epoch_0 + args.epochs - 1:
            # Drop the previous epoch's checkpoint (best-effort), save the
            # current one, and refresh best_model.pt on a new best dev F1.
            try:
                os.remove(
                    os.path.join(current_dir,
                                 'checkpoint_epoch_{}.pt'.format(epoch - 1)))
            except OSError:
                pass
            model_file = os.path.join(current_dir,
                                      'checkpoint_epoch_{}.pt'.format(epoch))
            model.save(model_file, epoch)
            if f1_v > best_val_score:
                best_val_score = f1_v
                copyfile(model_file, os.path.join(current_dir, 'best_model.pt'))
                log.info('[new best model saved.]')
    # load test data that is the development set
    train, dev, dev_y, train_y, embedding, opt, q_labels, ql_mask = utils.load_data(
        vars(args), args)
    batches = utils.BatchGen(dev, batch_size=bs_valid, evaluation=True,
                             gpu=args.cuda)
    predictions = []
    ae_ta = []
    for batch in batches:
        if args.squad == 2:
            ans_b, _, _, ae_i = model.predict(batch)
            ae_ta.extend(ae_i)
            predictions.extend(ans_b)
        else:
            predictions.extend(model.predict(batch)[0])
    em_v, f1_v = utils.score(predictions, dev_y)
    if 'exist' in args.ae_archt:
        em_v, f1_v = utils.score_list(predictions, np.array(dev_y), dev_ae)
        n_ae = sum(dev_ae)
        n_dae = len(dev_ae) - n_ae
        print('tot_pos=%d, true_pos=%d, cor_p=%d, cor_n=%d'%(sum(ae_ta), sum(dev_ae), \
            (np.array(dev_ae).squeeze()*np.array(ae_ta).squeeze()).sum(),\
            ((np.array(dev_ae).squeeze()==0)*(np.array(ae_ta).squeeze()==0)).sum()))
    log.info("[test EM: {} F1: {}]".format(em_v, f1_v))
    if args.summary:
        # export scalar data to JSON for external processing
        writer.export_scalars_to_json(
            os.path.join(current_dir, "all_scalars.json"))
        writer.close()
def main():
    """Restore a trained model and dump per-interpretation answers.

    Evaluates the restored model on dev, then for each batch compares the
    model's own interpretation choice against every forced interpretation,
    logging per-example EM/F1 and writing human-readable output to
    'interpret.txt' and the collected answers to 'ints.msgpack' in
    current_dir.

    Relies on module-level globals: args, model_dir, squad_dir, current_dir,
    utils, DocReaderModel, msgpack, torch, Variable, np, log.
    """
    log.info('[program starts.]')
    train, dev, dev_y, train_y, embedding, opt, q_labels, ql_mask = utils.load_data(
        vars(args), args)
    log.info('[Data loaded.ql_mask]')
    if args.resume:
        log.info('[loading previous model...]')
        checkpoint = torch.load(
            os.path.join(model_dir, args.restore_dir, args.resume))
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = DocReaderModel(opt, embedding, state_dict)
    else:
        raise RuntimeError('Include checkpoint of the trained model')
    if args.cuda:
        model.cuda()
    outputs = ""
    # evaluate restored model
    model.opt['interpret'] = False
    batches = utils.BatchGen(dev, batch_size=100, evaluation=True, gpu=args.cuda)
    predictions = []
    for i, batch in enumerate(batches):
        predictions.extend(model.predict(batch)[0])
    em, f1 = utils.score(predictions, dev_y)
    log.info("[dev EM: {} F1: {}]".format(em, f1))
    outputs += "[dev EM: {} F1: {}]\n".format(em, f1)
    # Vocabulary for decoding token-id tensors back to text.
    with open(os.path.join(squad_dir, 'meta.msgpack'), 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    vocab = meta['vocab']
    ids_word = {i: w for i, w in enumerate(vocab)}

    def to_text(inp):
        # Decode a 1-D tensor of token ids into a space-joined string.
        s = ""
        for ids in inp.numpy():
            s += ids_word[ids] + " "
        return s
    # Answers collected per interpretation, dumped at the end.
    test_int = {i: [] for i in range(args.n_actions)}
    batches = utils.BatchGen(dev, batch_size=args.batch_size, evaluation=True,
                             gpu=args.cuda, shuffle=True)
    for i, batch in enumerate(batches):
        model.opt['interpret'] = False
        # collect predicted answers for various interpretations
        # With interpret off, the model picks its own interpretation (acts).
        predictions, acts = model.predict_inter(batch)[:2]
        truth = np.take(dev_y, batches.indices[i], 0)
        for b in range(len(predictions)):
            em_v, f1_v = utils.score([predictions[b]], [truth[b]])
            log.warn("b={0} a={1} EM: {2:.3f} F1: {3:3f}".format(
                b, acts[b], em_v, f1_v))
        model.opt['interpret'] = True
        i_predictions = []
        for a in range(args.n_actions):
            # Force interpretation `a` for the whole batch.
            latent_a = Variable(torch.ones(batch[0].size()[0]) * a).long().cuda()
            i_predictions.append(
                model.predict_inter(batch, latent_a=latent_a)[0])
        for b in range(batch[0].size()[0]):
            f1s = []
            for a in range(args.n_actions):
                em_v, f1_v = utils.score([i_predictions[a][b]], [truth[b]])
                f1s.append(f1_v)
            # NOTE(review): `>= 1` is always true for non-empty f1s, so every
            # example is dumped; `> 1` (interpretations disagree) may have
            # been intended — confirm.
            if len(set(f1s)) >= 1:
                # NOTE(review): batch[-2] presumably holds raw text and
                # batch[5] token ids — confirm against BatchGen's layout.
                outputs += batch[-2][b] + '\n' + to_text(batch[5][b]) + '\n'
                outputs += "pred_a={} truth={}".format(acts[b], truth[b]) + '\n'
                for a in range(args.n_actions):
                    test_int[a] += [i_predictions[a][b]]
                    em_v, f1_v = utils.score([i_predictions[a][b]], [truth[b]])
                    outputs += i_predictions[a][
                        b] + '\n' + "b={0} a={1} EM: {2:.3f} F1: {3:3f}".format(
                            b, a, em_v, f1_v) + '\n'
                    log.warn("b={0} a={1} EM: {2:.3f} F1: {3:3f}".format(
                        b, a, em_v, f1_v))
                outputs += '\n'
    with open(os.path.join(current_dir, 'ints.msgpack'), 'wb') as f:
        msgpack.dump(test_int, f)
    with open(os.path.join(current_dir, "interpret.txt"), "w") as txtf:
        txtf.write(outputs)