Example #1
def compute_and_log_metrics(model, train_dataloader, test_dataloader,
                            augmentations, device, step, dataset_version):
    training_accuracy, training_IoU = compute_metrics(
        model,
        train_dataloader,
        augmentations,
        device,
        dataset_version=dataset_version)
    test_accuracy, test_IoU = compute_metrics(model,
                                              test_dataloader,
                                              augmentations,
                                              device,
                                              dataset_version=dataset_version)
    wandb.log(
        {
            'training IoU': training_IoU,
            'training accuracy': training_accuracy
        },
        step=step)
    wandb.log(
        {
            'test IoU': test_IoU,
            'test accuracy': test_accuracy
        },
        step=step)
    return test_accuracy, test_IoU
Example #2
def evaluate(nlp, task, docs_golds):
    tok2vec = nlp.get_pipe(PIPES.tok2vec)
    textcat = nlp.get_pipe(PIPES.textcat)
    right = 0
    total = 0
    guesses = []
    truths = []
    labels = textcat.labels
    for batch in minibatch(docs_golds, size=HP.eval_batch_size):
        docs, golds = zip(*batch)
        docs = list(textcat.pipe(tok2vec.pipe(docs)))
        for doc, gold in zip(docs, golds):
            guess, _ = max(doc.cats.items(), key=lambda it: it[1])
            truth, _ = max(gold.cats.items(), key=lambda it: it[1])
            if guess not in labels:
                msg = (f"Unexpected label {guess} predicted. "
                       f"Expected labels: {', '.join(labels)}")
                raise ValueError(msg)
            if truth not in labels:
                msg = (f"Unexpected label {truth} in gold data. "
                       f"Expected labels: {', '.join(labels)}")
                raise ValueError(msg)
            guesses.append(labels.index(guess))
            truths.append(labels.index(truth))
            right += guess == truth
            total += 1
            free_tensors(doc)
    main_name, metrics = compute_metrics(task, numpy.array(guesses),
                                         numpy.array(truths))
    metrics["_accuracy"] = right / total
    metrics["_right"] = right
    metrics["_total"] = total
    metrics["_main"] = metrics[main_name]
    return metrics[main_name], metrics
Example #3
def validate(task,
             val_iter,
             model,
             logger,
             field,
             world_size,
             rank,
             num_print=10,
             args=None):
    model.eval()
    required_names = ['greedy', 'answer']
    optional_names = ['context', 'question']
    loss, predictions, answers, results = gather_results(
        model, val_iter, field, world_size, optional_names=optional_names)
    predictions = [p.replace('UNK', 'OOV') for p in predictions]
    names = required_names + optional_names
    if hasattr(val_iter.dataset.examples[0], 'wikisql_id') or hasattr(
            val_iter.dataset.examples[0], 'squad_id') or hasattr(
                val_iter.dataset.examples[0], 'woz_id'):
        answers = [
            val_iter.dataset.all_answers[sid] for sid in answers.tolist()
        ]
    metrics, answers = compute_metrics(predictions,
                                       answers,
                                       bleu='iwslt' in task
                                       or 'multi30k' in task,
                                       dialogue='woz' in task,
                                       rouge='cnn' in task,
                                       logical_form='sql' in task,
                                       corpus_f1='zre' in task,
                                       args=args)
    results = [predictions, answers] + results
    print_results(names, results, rank=rank, num_print=num_print)

    return loss, metrics
Example #4
def train_one_epoch(train_loader, model, loss_fn, optimizer):
    losses = []
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = [], [], [], [], [], []
    for ref_cloud, src_cloud, gtR, gtt in tqdm(train_loader):
        ref_cloud, src_cloud, gtR, gtt = ref_cloud.cuda(), src_cloud.cuda(), \
                                         gtR.cuda(), gtt.cuda()
        optimizer.zero_grad()
        R, t, pred_ref_clouds = model(
            src_cloud.permute(0, 2, 1).contiguous(),
            ref_cloud.permute(0, 2, 1).contiguous())
        loss = compute_loss(ref_cloud, pred_ref_clouds, loss_fn)
        loss.backward()
        optimizer.step()

        cur_r_mse, cur_r_mae, cur_t_mse, cur_t_mae, cur_r_isotropic, \
        cur_t_isotropic = compute_metrics(R, t, gtR, gtt)
        losses.append(loss.item())
        r_mse.append(cur_r_mse)
        r_mae.append(cur_r_mae)
        t_mse.append(cur_t_mse)
        t_mae.append(cur_t_mae)
        r_isotropic.append(cur_r_isotropic.cpu().detach().numpy())
        t_isotropic.append(cur_t_isotropic.cpu().detach().numpy())
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
        summary_metrics(r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)
    results = {
        'loss': np.mean(losses),
        'r_mse': r_mse,
        'r_mae': r_mae,
        't_mse': t_mse,
        't_mae': t_mae,
        'r_isotropic': r_isotropic,
        't_isotropic': t_isotropic
    }
    return results
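The point-cloud registration examples in this section call compute_metrics(R, t, gtR, gtt) and summary_metrics(...) to turn predicted and ground-truth rigid transforms into rotation and translation errors. As a rough reference only, here is a minimal NumPy sketch of how such errors are commonly defined (a hypothetical registration_errors helper, not the compute_metrics used by these examples; the isotropic translation error in particular varies between codebases):
import numpy as np
from scipy.spatial.transform import Rotation

def registration_errors(R_pred, t_pred, R_gt, t_gt):
    """Hypothetical sketch of common rigid-registration error metrics."""
    # Anisotropic errors: compare Euler angles (degrees) and translation components.
    euler_pred = Rotation.from_matrix(R_pred).as_euler('xyz', degrees=True)
    euler_gt = Rotation.from_matrix(R_gt).as_euler('xyz', degrees=True)
    r_mse = float(np.mean((euler_pred - euler_gt) ** 2))
    r_mae = float(np.mean(np.abs(euler_pred - euler_gt)))
    t_mse = float(np.mean((t_pred - t_gt) ** 2))
    t_mae = float(np.mean(np.abs(t_pred - t_gt)))
    # Isotropic rotation error: angle of the relative rotation R_gt^T R_pred.
    cos_angle = np.clip((np.trace(R_gt.T @ R_pred) - 1.0) / 2.0, -1.0, 1.0)
    r_isotropic = float(np.degrees(np.arccos(cos_angle)))
    # One common choice of isotropic translation error: the residual norm.
    t_isotropic = float(np.linalg.norm(t_gt - t_pred))
    return r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic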
Example #5
def test_one_epoch(test_loader, model, loss_fn):
    model.eval()
    losses = []
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = [], [], [], [], [], []
    with torch.no_grad():
        for ref_cloud, src_cloud, gtR, gtt in tqdm(test_loader):
            ref_cloud, src_cloud, gtR, gtt = ref_cloud.cuda(), src_cloud.cuda(), \
                                             gtR.cuda(), gtt.cuda()
            R, t, pred_ref_clouds = model(
                src_cloud.permute(0, 2, 1).contiguous(),
                ref_cloud.permute(0, 2, 1).contiguous())
            loss = compute_loss(ref_cloud, pred_ref_clouds, loss_fn)
            cur_r_mse, cur_r_mae, cur_t_mse, cur_t_mae, cur_r_isotropic, \
            cur_t_isotropic = compute_metrics(R, t, gtR, gtt)

            losses.append(loss.item())
            r_mse.append(cur_r_mse)
            r_mae.append(cur_r_mae)
            t_mse.append(cur_t_mse)
            t_mae.append(cur_t_mae)
            r_isotropic.append(cur_r_isotropic.cpu().detach().numpy())
            t_isotropic.append(cur_t_isotropic.cpu().detach().numpy())
    model.train()
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
        summary_metrics(r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)
    results = {
        'loss': np.mean(losses),
        'r_mse': r_mse,
        'r_mae': r_mae,
        't_mse': t_mse,
        't_mae': t_mae,
        'r_isotropic': r_isotropic,
        't_isotropic': t_isotropic
    }
    return results
Example #6
def evaluate(test_loader, model, epoch, args, dataset_name):
    all_txt_embd = []
    all_video_embd = []
    model.eval()
    if args.rank == 0:  
        log('Evaluating on {}'.format(dataset_name), args)
    with torch.no_grad():
        for i_batch, data in enumerate(test_loader):
            text = data['text'].cuda()
            video = data['video'].float().cuda()
            video = video / 255.0
            video = video.view(-1, video.shape[2], video.shape[3], video.shape[4], video.shape[5])
            video_embd, text_embd = model(video, text)
            video_embd = video_embd.view(text_embd.shape[0], args.num_windows_test, text_embd.shape[1])
            video_embd = video_embd.mean(dim=1)
            video_embd = allgather(video_embd, args)
            text_embd = allgather(text_embd, args)
            if args.rank == 0:
                text_embd = text_embd.cpu().numpy()
                video_embd = video_embd.cpu().numpy()
                all_txt_embd.append(text_embd)
                all_video_embd.append(video_embd)
    model.train()
    if args.rank == 0:
        all_txt_embd = np.concatenate(all_txt_embd, axis=0)
        all_video_embd = np.concatenate(all_video_embd, axis=0)
        metrics = compute_metrics(np.dot(all_txt_embd, all_video_embd.T))
        log('Epoch {} results: {}'.format(epoch, metrics), args)
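compute_metrics here receives the full text-video similarity matrix, np.dot(all_txt_embd, all_video_embd.T). A minimal sketch of how recall@K and median rank are typically derived from such a matrix, assuming the i-th text is paired with the i-th video (hypothetical helper, not the exact one used above):
import numpy as np

def retrieval_metrics(sim_matrix):
    """Hypothetical sketch: recall@K / median rank from a text-video similarity matrix."""
    # The diagonal holds the score of each ground-truth pair; its rank within the
    # row is the retrieval rank of the correct video for that text query.
    diag = np.diag(sim_matrix)[:, None]
    ranks = (sim_matrix >= diag).sum(axis=1)
    return {
        'R@1': float(np.mean(ranks <= 1)),
        'R@5': float(np.mean(ranks <= 5)),
        'R@10': float(np.mean(ranks <= 10)),
        'MedianR': float(np.median(ranks)),
    }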
Example #7
def eval_step(batch_imgs, depth_gt):
    # batch_imgs: (batch_size, height, width, 9) Three images concatenated on the channels dimension
    # depth_gt: (batch_size, height, width)

    img_before = batch_imgs[:, :, :, :3]
    img_target = batch_imgs[:, :, :, 3:6]
    img_after = batch_imgs[:, :, :, 6:]

    disps = depth_net(
        img_target
    )  # disparities at different scales, in increasing resolution

    # Loss:
    T_before_target = pose_net(concat_images(img_before,
                                             img_target))  # (bs, 6)
    T_target_after = pose_net(concat_images(img_target, img_after))  # (bs, 6)
    matrixT_before_target = make_transformation_matrix(T_before_target,
                                                       False)  # (bs, 4, 4)
    matrixT_after_target = make_transformation_matrix(T_target_after,
                                                      True)  # (bs, 4, 4)
    loss_value, image_from_before, image_from_after = \
        loss_layer(disps, matrixT_before_target, matrixT_after_target, img_before, img_target, img_after)

    metrics = compute_metrics(disps[-1], depth_gt)

    return loss_value, metrics, disps, image_from_before, image_from_after
Example #8
def evaluate(args, model, tokenizer, prefix=""): 
    eval_task_names = ("snli",)
    eval_outputs_dirs = (args.output_dir,)
    
    results = {}
    for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)

        if not os.path.exists(eval_output_dir):
            os.makedirs(eval_output_dir)

        args.eval_batch_size = 16
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
        logger.info("  Num examples = %d", len(eval_dataset))
        logger.info("  Batch size = %d", args.eval_batch_size)
        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        for batch in tqdm(eval_dataloader, desc="Evaluating", position=0, leave=True, ncols=100):
            model.eval()
            batch = tuple(t.to(args.device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2],
                          'labels':         batch[3],
                          'task':                 0,
                          }
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs['labels'].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = np.argmax(preds, axis=1)

        result = compute_metrics('snli', preds, out_label_ids)
        results.update(result)

        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results {} *****".format(prefix))
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
                
    return results
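For SNLI-style classification, compute_metrics('snli', preds, out_label_ids) typically boils down to plain accuracy over the arg-maxed predictions; a minimal sketch under that assumption (hypothetical, the real helper may also report F1 or other scores):
import numpy as np

def snli_accuracy(preds, labels):
    """Hypothetical sketch: accuracy over predicted vs. gold label ids."""
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    return {'acc': float((preds == labels).mean())}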
Example #9
def evaluate(args, model, eval_dataset, prefix=""):
    eval_output_dir = args.output_dir

    results = {}

    if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(eval_output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[3]}
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs['labels'].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps
    if args.output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif args.output_mode == "regression":
        preds = np.squeeze(preds)
    result = compute_metrics(preds, out_label_ids)
    results.update(result)

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return results
Example #10
def compute_classifier_loss(model, data, target, optimizer=dict()):
    shape = data.shape
    x = data.reshape(shape[0], numpy.prod(shape[1:]))
    out = model(x)
    output = out['output']
    # cross entropy
    loss = -(target * torch.log(output) +
             (1 - target) * torch.log(1 - output)).mean()
    if optimizer:
        loss.backward()
    for opt in optimizer.values():
        opt.step()
    result = defaultdict(list)
    out_bin = output.detach().cpu().numpy() > 0.5
    compute_metrics(result, target.detach().cpu().numpy(), out_bin)
    result['loss'].append(loss.cpu().detach().numpy())
    return result
Example #11
def test(nn_model, dataset):
    data_loader = data.DataLoader(dataset, batch_size=1)
    cosines_list = []
    similarity_vector_list = []
    for batch in tqdm(data_loader):
        cosines, similarity_vector = test_step(nn_model, batch)
        cosines_list.append(cosines)
        similarity_vector_list.append(similarity_vector)
    return compute_metrics(cosines_list, similarity_vector_list)
Example #12
def xgboost_test(extractor, opt):
    import xgboost as xgb
    res = defaultdict(list)
    res_train = defaultdict(list)
    for study_num in range(7):
        #print(study_name)
        train_set, test_set = get_merged_common_dataset(opt,
                                                        skip_study=study_num)
        train_data, train_labels = get_data(train_set)
        val_data, val_labels = get_data(test_set)
        if True:
            train_features = extractor(train_data).detach().numpy()
            val_features = extractor(val_data).detach().numpy()
        else:
            train_features = train_data
            val_features = val_data
        # train the model
        model = xgb.XGBClassifier()
        clf = model.fit(train_features,
                        train_labels.astype(int),
                        eval_set=[(val_features, val_labels)],
                        early_stopping_rounds=50,
                        verbose=True,
                        eval_metric='auc')
        #model = LogisticRegression()
        #model = SVC(probability=True, class_weight='balanced')
        #clf = model.fit(train_features, train_labels.astype(int))

        print(val_data.shape)
        res['bias'].append(val_labels.sum() / len(val_labels))
        print(res['bias'][-1])
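        # positive-class probabilities on the validation and training folds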
        y_pred = clf.predict_proba(val_features)[:, 1]
        x_pred = clf.predict_proba(train_features)[:, 1]
        compute_metrics(res, val_labels.flatten() > 0.5, y_pred > 0.5)
        compute_auc(res, val_labels.flatten() > 0.5, y_pred)
        compute_metrics(res_train, train_labels.flatten() > 0.5, x_pred > 0.5)
        compute_auc(res_train, train_labels.flatten() > 0.5, x_pred)
    for key in res_train:
        ave = numpy.asarray(res_train[key]).mean(axis=0)
        print('Train {0}: {1}'.format(key, ave))
    for key in res:
        ave = numpy.asarray(res[key]).mean(axis=0)
        print('Test {0}: {1}'.format(key, ave))
Example #13
def get_test_score(task_eval, qa_results, score_dict):
    score = compute_metrics(
            qa_results,
            bleu='iwslt.en.de' in task_eval or 'multinli.in.out' in task_eval,
            dialogue='woz.en' in task_eval,
            rouge='cnn_dailymail' in task_eval,
            logical_form='wikisql' in task_eval,
            corpus_f1='zre' in task_eval
    )
    score_dict[task_eval] = score
Example #14
def Eval_retrieval(model, eval_dataloader, dataset_name):
    model.eval()
    print('Evaluating Text-Video retrieval on {} data'.format(dataset_name))
    with th.no_grad():
        for i_batch, data in enumerate(eval_dataloader):
            text = data['text'].cuda()
            video = data['video'].cuda()
            m = model(video, text)
            m = m.cpu().detach().numpy()
            metrics = compute_metrics(m)
            print_computed_metrics(metrics)
Example #15
def validation_epoch(cb, opt, model, val_loader):
    """logic for each validation epoch"""
    model.eval()

    # metrics to return
    losses = []
    prec = []
    rec = []
    f1 = []
    ap = []
    iou = []
    l_ship = []
    l_bbox = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            for key in batch.keys():
                batch[key] = batch[key].to(opt.device)

            # validation step
            input, target = batch['input'], batch['target']
            output = model(input)

            loss, _l_ship, _l_bbox = compute_loss(output, target)
            _prec, _rec, _f1, _ap, _iou = compute_metrics(output, target)

            # append in case analysis of the distribution is of interest
            losses.append(loss)
            l_ship.append(_l_ship)
            l_bbox.append(_l_bbox)
            prec.append(_prec)
            rec.append(_rec)
            f1.append(_f1)
            ap.append(_ap)
            iou.append(_iou)

    loss_avg = torch.mean(torch.cat(losses))
    l_ship = torch.mean(torch.cat(l_ship))
    l_bbox = torch.mean(torch.cat(l_bbox))

    metrics = {}
    for k, m in zip(["prec", "rec", "f1", "ap", "iou"],
                    [prec, rec, f1, ap, iou]):
        m = sum(m) / len(m)
        metrics[k] = m

    cb.on_validation_end(opt=opt,
                         output=loss_avg,
                         metrics=metrics,
                         l_ship=l_ship,
                         l_bbox=l_bbox)

    return loss_avg
Example #16
def evaluate_benchmark_icp(args, test_loader):
    in_dim = 6 if args.normal else 3
    model = IterativeBenchmark(in_dim=in_dim, niters=args.niters, gn=args.gn)
    if args.cuda:
        model = model.cuda()
        model.load_state_dict(torch.load(args.checkpoint))
    else:
        model.load_state_dict(
            torch.load(args.checkpoint, map_location=torch.device('cpu')))
    model.eval()

    dura = []
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = [], [], [], [], [], []
    with torch.no_grad():
        for i, (ref_cloud, src_cloud, gtR,
                gtt) in tqdm(enumerate(test_loader)):
            if args.cuda:
                ref_cloud, src_cloud, gtR, gtt = ref_cloud.cuda(), src_cloud.cuda(), \
                                                 gtR.cuda(), gtt.cuda()
            tic = time.time()
            R1, t1, pred_ref_cloud = model(
                src_cloud.permute(0, 2, 1).contiguous(),
                ref_cloud.permute(0, 2, 1).contiguous())
            ref_cloud = torch.squeeze(ref_cloud).cpu().numpy()
            src_cloud_tmp = torch.squeeze(pred_ref_cloud[-1]).cpu().numpy()
            R2, t2, pred_ref_cloud = icp(npy2pcd(src_cloud_tmp),
                                         npy2pcd(ref_cloud))
            R2, t2 = torch.from_numpy(R2)[None, ...].to(R1), \
                     torch.from_numpy(t2)[None, ...].to(R1)
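            # compose the network's coarse estimate (R1, t1) with the ICP refinement (R2, t2)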
            R, t = R2 @ R1, torch.squeeze(R2 @ t1[:, :, None], dim=-1) + t2
            toc = time.time()
            dura.append(toc - tic)
            cur_r_mse, cur_r_mae, cur_t_mse, cur_t_mae, cur_r_isotropic, \
            cur_t_isotropic = compute_metrics(R, t, gtR, gtt)
            r_mse.append(cur_r_mse)
            r_mae.append(cur_r_mae)
            t_mse.append(cur_t_mse)
            t_mae.append(cur_t_mae)
            r_isotropic.append(cur_r_isotropic.cpu().detach().numpy())
            t_isotropic.append(cur_t_isotropic.cpu().detach().numpy())

            if args.show:
                src_cloud = torch.squeeze(src_cloud).cpu().numpy()
                pcd1 = npy2pcd(ref_cloud, 0)
                pcd2 = npy2pcd(src_cloud, 1)
                pcd3 = pred_ref_cloud
                o3d.visualization.draw_geometries([pcd1, pcd2, pcd3])

    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
        summary_metrics(r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)

    return dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic
Example #17
def main():
    granularity = 0.001
    trial_cnt = 10
    fout = open('trial_data.txt', 'w')
    print('DupRatio\tAUC\tBER', file=fout)
    for dup_ratio in np.arange(0, 1, granularity):
        print('==========DupRatio=%s==========' % dup_ratio)
        for _ in range(trial_cnt):
            generate_sample(dup_ratio)
            auc, ber = compute_metrics()
            print('%s\t%s\t%s' % (dup_ratio, auc, ber), file=fout)
    fout.close()
    return
Example #18
def compute_and_log_metrics(disp_pred, depth_gt, step_count):
    # disp_pred: (batch_size, height, width, 1)
    # depth_gt: (batch_size, depth_height, depth_width)
    abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3 = compute_metrics(
        disp_pred, depth_gt)
    with train_summary_writer.as_default():
        tf.summary.scalar('abs_rel', abs_rel, step=step_count)
        tf.summary.scalar('sq_rel', sq_rel, step=step_count)
        tf.summary.scalar('rmse', rmse, step=step_count)
        tf.summary.scalar('rmse_log', rmse_log, step=step_count)
        tf.summary.scalar('a1', a1, step=step_count)
        tf.summary.scalar('a2', a2, step=step_count)
        tf.summary.scalar('a3', a3, step=step_count)
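The depth metrics logged above (abs_rel, sq_rel, rmse, rmse_log, a1/a2/a3) follow the standard monocular-depth evaluation protocol. A minimal NumPy sketch of those formulas, assuming pred and gt are valid depth maps of the same shape (hypothetical helper; the real compute_metrics also has to resize and invert the disparity prediction and mask invalid pixels):
import numpy as np

def depth_errors(pred, gt):
    """Hypothetical sketch of the standard monocular-depth error metrics."""
    ratio = np.maximum(pred / gt, gt / pred)
    a1 = float(np.mean(ratio < 1.25))
    a2 = float(np.mean(ratio < 1.25 ** 2))
    a3 = float(np.mean(ratio < 1.25 ** 3))
    abs_rel = float(np.mean(np.abs(pred - gt) / gt))
    sq_rel = float(np.mean(((pred - gt) ** 2) / gt))
    rmse = float(np.sqrt(np.mean((pred - gt) ** 2)))
    rmse_log = float(np.sqrt(np.mean((np.log(pred) - np.log(gt)) ** 2)))
    return abs_rel, sq_rel, rmse, rmse_log, a1, a2, a3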
Example #19
def compute_metrics_i(i):

    if os.path.isfile(get_save_path_probs_i(i)):

        start = time.time()

        probs, gt, _ = probs_gt_load(i)
        metrics, components = compute_metrics(probs, gt)

        metrics_dump(metrics, i)
        components_dump(components, i)

        print("image", i,
              "processed in {}s\r".format(round(time.time() - start)))
Example #20
def compute_baselines_part1():
    query_database = UbuntuDatabase()
    validation_set = query_database.get_validation_dataset()
    testing_set = query_database.get_testing_dataset()

    metrics_list = []
    for dataset in (validation_set, testing_set):
        similarity_vector_list = []
        bm25_scores_list = []
        for ind_sample in dataset:
            similarity_vector_list.append(ind_sample["similarity_vec"].numpy())
            bm25_scores_list.append(ind_sample["bm25_scores"].numpy())
        metrics_list.append(
            metrics.compute_metrics(bm25_scores_list, similarity_vector_list))
    return {"validation": metrics_list[0], "testing": metrics_list[1]}
Example #21
def validate(task, val_iter, model, logger, field, world_size, rank, num_print=10, args=None):
    with torch.no_grad():
        model.eval()
        required_names = ['greedy', 'answer']
        optional_names = ['context', 'question']
        loss, predictions, answers, results = gather_results(model, val_iter, field, world_size, optional_names=optional_names)
        predictions = [p.replace('UNK', 'OOV') for p in predictions]
        names = required_names + optional_names 
        if hasattr(val_iter.dataset.examples[0], 'wikisql_id') or hasattr(val_iter.dataset.examples[0], 'squad_id') or hasattr(val_iter.dataset.examples[0], 'woz_id'):
            answers = [val_iter.dataset.all_answers[sid] for sid in answers.tolist()]
        metrics, answers = compute_metrics(predictions, answers, bleu='iwslt' in task or 'multi30k' in task, dialogue='woz' in task,
            rouge='cnn' in task, logical_form='sql' in task, corpus_f1='zre' in task, args=args)
        results = [predictions, answers] + results
        print_results(names, results, rank=rank, num_print=num_print)

        return loss, metrics
Example #22
def main():
    """Ensemble all models inside the experiments folder"""
    # we assume all the experiments are saved
    # in the experiments folder
    path = Path('experiments')
    # get a list of all experiments name
    experiment_list = os.listdir(path)
    assert len(experiment_list) > 1, \
           'there is not enough experiments to ensemble'
    predictions = []
    # for every experiment
    for experiment in experiment_list:
        # create a path to the valid prediction file
        path_to_pred = path.joinpath(experiment, 'prediction', 'valid.csv')
        if not os.path.exists(path_to_pred):
            continue
        # if this file exists, we read it and
        # set the experiment column to the name of this experiment
        pred_exp = load_data.read_csv(path_to_pred)
        pred_exp = pred_exp.assign(experiment=experiment)
        predictions.append(pred_exp)
    # concat all the predictions
    predictions = pd.concat(predictions)
    # create the target by dropping all duplicates
    target = predictions.drop_duplicates(subset=['period', 'timedelta'])
    target.reset_index(drop=True, inplace=True)
    target.drop(columns=default.yhat, inplace=True)

    # ensemble
    predictions_ensemble = ensemble(predictions)

    target_ensemble = target.merge(predictions_ensemble,
                                   on=['period', 'timedelta'],
                                   how='left')
    # check there are no NaN values
    assert target_ensemble[default.yhat].isna().sum().sum() == 0
    # compute the metrics
    ensemble_metrics = compute_metrics(target_ensemble)
    experiment_list = list(predictions['experiment'].unique())
    ensemble_metrics['experiment'] = '__'.join(experiment_list)
    ensemble_metrics['n_model'] = len(experiment_list)
    results = pd.DataFrame([ensemble_metrics])
    # print scores
    print(results.head())
    # save the ensemble results in a CSV file
    results.to_csv(path / 'ensemble_summary.csv', index=False)
Example #23
def evaluate_benchmark(args, test_loader):
    model = IterativeBenchmark(in_dim=args.in_dim,
                               niters=args.niters,
                               gn=args.gn)
    if args.cuda:
        model = model.cuda()
        model.load_state_dict(torch.load(args.checkpoint))
    else:
        model.load_state_dict(torch.load(args.checkpoint, map_location=torch.device('cpu')))
    model.eval()

    dura = []
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = [], [], [], [], [], []
    with torch.no_grad():
        for i, (ref_cloud, src_cloud, gtR, gtt) in tqdm(enumerate(test_loader)):
            if args.cuda:
                ref_cloud, src_cloud, gtR, gtt = ref_cloud.cuda(), src_cloud.cuda(), \
                                                 gtR.cuda(), gtt.cuda()
            tic = time.time()
            R, t, pred_ref_cloud = model(src_cloud.permute(0, 2, 1).contiguous(),
                    ref_cloud.permute(0, 2, 1).contiguous())
            toc = time.time()
            dura.append(toc - tic)
            cur_r_mse, cur_r_mae, cur_t_mse, cur_t_mae, cur_r_isotropic, \
            cur_t_isotropic = compute_metrics(R, t, gtR, gtt)
            r_mse.append(cur_r_mse)
            r_mae.append(cur_r_mae)
            t_mse.append(cur_t_mse)
            t_mae.append(cur_t_mae)
            r_isotropic.append(cur_r_isotropic.cpu().detach().numpy())
            t_isotropic.append(cur_t_isotropic.cpu().detach().numpy())

            if args.show:
                ref_cloud = torch.squeeze(ref_cloud).cpu().numpy()
                src_cloud = torch.squeeze(src_cloud).cpu().numpy()
                pred_ref_cloud = torch.squeeze(pred_ref_cloud[-1]).cpu().numpy()
                pcd1 = npy2pcd(ref_cloud, 0)
                pcd2 = npy2pcd(src_cloud, 1)
                pcd3 = npy2pcd(pred_ref_cloud, 2)
                o3d.visualization.draw_geometries([pcd1, pcd2, pcd3])

    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
        summary_metrics(r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)

    return dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic
Example #24
def evaluate(model, test_loader):
    model.eval()
    y_pred = []
    y_true = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            labels=labels,
                            return_dict=False)
            loss, output = outputs
            y_pred.extend(torch.argmax(output, 1).tolist())
            y_true.extend(labels.tolist())
    print('Classification Report:')
    metrics = compute_metrics(y_pred, y_true)
    print(metrics)
Example #25
def run(num_classes,learning_rate,width,depth,mini_batch_size):

	precision = accuracy = recall = f_score = np.array([])


	X_train,X_test,y_train,y_test,unknown_data = dp.load_data()
	X_train,X_test,y_train,y_test,unknown_data,dtype = dp.prepare_data(X_train,X_test,y_train,y_test,unknown_data)


	for _ in range(1):

		model = NN.Net1(num_classes,depth=depth,width=width).type(dtype)
		opt = optim.SGD(params=model.parameters(),lr=learning_rate,momentum=rp.m,nesterov=True)
		train_losses,test_losses = model.train_validate(X_train,y_train,X_test,y_test,opt,mini_batch_size,dtype)

		model = torch.load("Models/Best_Model.pkl")

		y_pred,_ = model.test(X_test)

		# Calculate metrics
		y_true = y_test.data.cpu().numpy()
		y_pred = y_pred.data.cpu().numpy()
		a,p,r,f = m.compute_metrics(y_true,y_pred)

		accuracy = np.append(accuracy,a)
		precision = np.append(precision,p)
		recall = np.append(recall,r)
		f_score = np.append(f_score,f)


	accuracy = np.mean(accuracy)
	precision = np.mean(precision)
	recall = np.mean(recall)
	f_score = np.mean(f_score)

	m.show_results(accuracy,precision,recall,f_score,num_classes,train_losses,test_losses)
	
	#g.generate_graph(model,X_train)
	
	fw.create_data_csv(learning_rate,depth,width,mini_batch_size,rp.m,len(test_losses)-10,accuracy)

	# Store unknown_data prediction 
	y_pred,_ = model.test(unknown_data)
	fw.store_prediction(y_pred.data.cpu().numpy())
Example #26
def evaluate_fgr(args, test_loader):
    dura = []
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = [], [], [], [], [], []
    for i, (ref_cloud, src_cloud, gtR, gtt) in tqdm(enumerate(test_loader)):
        if args.cuda:
            ref_cloud, src_cloud, gtR, gtt = ref_cloud.cuda(), src_cloud.cuda(), \
                                             gtR.cuda(), gtt.cuda()

        ref_points = torch.squeeze(ref_cloud).cpu().numpy()[:, :3]
        src_points = torch.squeeze(src_cloud).cpu().numpy()[:, :3]
        ref_normals = torch.squeeze(ref_cloud).cpu().numpy()[:, 3:]
        src_normals = torch.squeeze(src_cloud).cpu().numpy()[:, 3:]

        tic = time.time()
        R, t, pred_ref_cloud = fgr(source=npy2pcd(src_points),
                                   target=npy2pcd(ref_points),
                                   src_normals=src_normals,
                                   tgt_normals=ref_normals)
        toc = time.time()
        R = torch.from_numpy(np.expand_dims(R, 0)).to(gtR)
        t = torch.from_numpy(np.expand_dims(t, 0)).to(gtt)
        dura.append(toc - tic)

        cur_r_mse, cur_r_mae, cur_t_mse, cur_t_mae, cur_r_isotropic, \
        cur_t_isotropic = compute_metrics(R, t, gtR, gtt)
        r_mse.append(cur_r_mse)
        r_mae.append(cur_r_mae)
        t_mse.append(cur_t_mse)
        t_mae.append(cur_t_mae)
        r_isotropic.append(cur_r_isotropic.cpu().detach().numpy())
        t_isotropic.append(cur_t_isotropic.cpu().detach().numpy())

        if args.show:
            print(cur_r_isotropic, cur_t_isotropic)
            pcd1 = npy2pcd(ref_points, 0)
            pcd2 = npy2pcd(src_points, 1)
            pcd3 = pred_ref_cloud
            o3d.visualization.draw_geometries([pcd1, pcd2, pcd3])

    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
        summary_metrics(r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)

    return dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic
Example #27
def do_eval(model, task_name, eval_dataloader, device, output_mode,
            eval_labels, num_labels):
    eval_loss = 0
    nb_eval_steps = 0
    preds = []

    for batch_ in tqdm(eval_dataloader, desc="Evaluating"):
        batch_ = tuple(t.to(device) for t in batch_)
        with torch.no_grad():
            input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch_

            logits, _, _ = model(input_ids, segment_ids, input_mask)

        # create eval loss and other metric required by the task
        if output_mode == "classification":
            loss_fct = CrossEntropyLoss()
            tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                     label_ids.view(-1))
        elif output_mode == "regression":
            loss_fct = MSELoss()
            tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(preds[0],
                                 logits.detach().cpu().numpy(),
                                 axis=0)

    eval_loss = eval_loss / nb_eval_steps

    preds = preds[0]
    if output_mode == "classification":
        preds = np.argmax(preds, axis=1)
    elif output_mode == "regression":
        preds = np.squeeze(preds)
    result = compute_metrics(task_name, preds, eval_labels.numpy())
    result['eval_loss'] = eval_loss

    return result
Example #28
def evaluate(train_loader, model, args):
    all_txt_embd = []
    all_video_embd = []
    with torch.no_grad():
        for i_batch, data in enumerate(tqdm(train_loader)):
            text = data['text'].cuda()
            video = data['video'].float().cuda()
            video = video / 255.0
            video = video.view(-1, video.shape[2], video.shape[3], video.shape[4], video.shape[5])
            video_embd, text_embd = model(video, text)
            text_embd  = text_embd.cpu().numpy()
            video_embd = video_embd.view(text_embd.shape[0], args.num_windows_test, text_embd.shape[1])
            video_embd = video_embd.mean(dim=1)
            video_embd  = video_embd.cpu().numpy()
            all_txt_embd.append(text_embd)
            all_video_embd.append(video_embd)
    all_txt_embd = np.concatenate(all_txt_embd, axis=0)
    all_video_embd = np.concatenate(all_video_embd, axis=0)
    metrics = compute_metrics(np.dot(all_txt_embd, all_video_embd.T))
    print_computed_metrics(metrics)
Example #29
def run_eval(sess, args, igraph, tgraph, data, set_name):
    """ runs one evaluation against the full epoch of data """

    bg = utils.batch_generator(
        (data["encoded_data"][set_name], data["scores"][set_name]),
        args.batch_size,
        skip_last_batch=False,
        num_epochs=1,
        shuffle=False)

    # get all the predicted and true labels in batches
    predicted_scores = np.zeros(data["scores"][set_name].shape)
    true_scores = np.zeros(data["scores"][set_name].shape)

    start = time.time()
    for batch_num, batch_data in enumerate(bg):
        ed_batch, sc_batch = batch_data

        # fill the feed dict with the next batch
        feed_dict = {
            igraph["ph_inputs_dict"]["raw_seqs"]: ed_batch,
            igraph["ph_inputs_dict"]["scores"]: sc_batch
        }

        # start and end index for this batch
        start_index = batch_num * args.batch_size
        end_index = start_index + args.batch_size

        # get predicted labels for evaluating metrics using sklearn
        preds = sess.run(igraph["predictions"], feed_dict=feed_dict)
        predicted_scores[start_index:end_index] = preds
        true_scores[start_index:end_index] = sc_batch
    duration = time.time() - start

    evaluation_dict = metrics.compute_metrics(true_scores, predicted_scores)

    print("Evaluation ({} set) completed in {:.3} sec.".format(
        set_name, duration))

    return evaluation_dict
Example #30
def main():
    """Main function"""
    inputs = []
    with io.open("data/test.txt", mode="r", encoding="utf-8") as file_:
        for line in file_:
            if line is not None and len(line) > 0:
                inputs += [remove_diacritics(line)]
    wrapper = RegexWrapper(TEMPLATE)
    templates = []
    for input_ in inputs:
        template = wrapper.fill_template(input_)
        print(exercise_output(template))
        templates.append(template)

    precision, recall = metrics.compute_metrics(templates)
    avg_precision = precision.mean(skipna=True)
    avg_recall = recall.mean(skipna=True)
    print('Precision:\n' + str(precision))
    print('\nRecall:\n' + str(recall))

    print('\n\nAverage Precision: %.4f' % avg_precision)
    print('Average Recall: %.4f' % avg_recall)
Example #31
def evaluate_icp(args, test_loader):
    dura = []
    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = [], [], [], [], [], []
    for i, (ref_cloud, src_cloud, gtR, gtt) in tqdm(enumerate(test_loader)):
        if args.cuda:
            ref_cloud, src_cloud, gtR, gtt = ref_cloud.cuda(), src_cloud.cuda(), \
                                             gtR.cuda(), gtt.cuda()

        ref_cloud = torch.squeeze(ref_cloud).cpu().numpy()
        src_cloud = torch.squeeze(src_cloud).cpu().numpy()

        tic = time.time()
        R, t, pred_ref_cloud = icp(npy2pcd(src_cloud), npy2pcd(ref_cloud))
        toc = time.time()
        R = torch.from_numpy(np.expand_dims(R, 0)).to(gtR)
        t = torch.from_numpy(np.expand_dims(t, 0)).to(gtt)
        dura.append(toc - tic)

        cur_r_mse, cur_r_mae, cur_t_mse, cur_t_mae, cur_r_isotropic, \
        cur_t_isotropic = compute_metrics(R, t, gtR, gtt)
        r_mse.append(cur_r_mse)
        r_mae.append(cur_r_mae)
        t_mse.append(cur_t_mse)
        t_mae.append(cur_t_mae)
        r_isotropic.append(cur_r_isotropic.cpu().detach().numpy())
        t_isotropic.append(cur_t_isotropic.cpu().detach().numpy())

        if args.show:
            pcd1 = npy2pcd(ref_cloud, 0)
            pcd2 = npy2pcd(src_cloud, 1)
            pcd3 = pred_ref_cloud
            o3d.visualization.draw_geometries([pcd1, pcd2, pcd3])

    r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic = \
        summary_metrics(r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic)

    return dura, r_mse, r_mae, t_mse, t_mae, r_isotropic, t_isotropic
Example #32
def run(args, field, val_sets, model):
    device = set_seed(args)
    print(f'Preparing iterators')
    if len(args.val_batch_size) == 1 and len(val_sets) > 1:
        args.val_batch_size *= len(val_sets)
    iters = [(name, to_iter(x, bs, device)) for name, x, bs in zip(args.tasks, val_sets, args.val_batch_size)]
 
    def mult(ps):
        r = 0
        for p in ps:
            this_r = 1
            for s in p.size():
                this_r *= s
            r += this_r
        return r
    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    num_param = mult(params)
    print(f'{args.model} has {num_param:,} parameters')
    model.to(device)

    decaScore = []
    model.eval()
    with torch.no_grad():
        for task, it in iters:
            print(task)
            prediction_file_name = os.path.join(os.path.splitext(args.best_checkpoint)[0], args.evaluate, task + '.txt')
            answer_file_name = os.path.join(os.path.splitext(args.best_checkpoint)[0], args.evaluate, task + '.gold.txt')
            results_file_name = answer_file_name.replace('gold', 'results')
            if 'sql' in task or 'squad' in task:
                ids_file_name = answer_file_name.replace('gold', 'ids')
            if os.path.exists(prediction_file_name):
                print('** ', prediction_file_name, ' already exists -- this is where predictions are stored **')
                if args.overwrite:
                    print('**** overwriting ', prediction_file_name, ' ****')
            if os.path.exists(answer_file_name):
                print('** ', answer_file_name, ' already exists -- this is where ground truth answers are stored **')
                if args.overwrite:
                    print('**** overwriting ', answer_file_name, ' ****')
            if os.path.exists(results_file_name):
                print('** ', results_file_name, ' already exists -- this is where metrics are stored **')
                if args.overwrite:
                    print('**** overwriting ', results_file_name, ' ****')
                else:
                    with open(results_file_name) as results_file:
                        if not args.silent:
                            for l in results_file:
                                print(l)
                        metrics = json.loads(results_file.readlines()[0])
                        decaScore.append(metrics[args.task_to_metric[task]])
                    continue

            for x in [prediction_file_name, answer_file_name, results_file_name]:
                os.makedirs(os.path.dirname(x), exist_ok=True)
    
            if not os.path.exists(prediction_file_name) or args.overwrite:
                with open(prediction_file_name, 'w') as prediction_file:
                    predictions = []
                    ids = []
                    for batch_idx, batch in enumerate(it):
                        _, p = model(batch)
                        p = field.reverse(p)
                        for i, pp in enumerate(p):
                            if 'sql' in task:
                                ids.append(int(batch.wikisql_id[i]))
                            if 'squad' in task:
                                ids.append(it.dataset.q_ids[int(batch.squad_id[i])])
                            prediction_file.write(pp + '\n')
                            predictions.append(pp) 
                if 'sql' in task:
                    with open(ids_file_name, 'w') as id_file:
                        for i in ids:
                            id_file.write(json.dumps(i) + '\n')
                if 'squad' in task:
                    with open(ids_file_name, 'w') as id_file:
                        for i in ids:
                            id_file.write(i + '\n')
            else:
                with open(prediction_file_name) as prediction_file:
                    predictions = [x.strip() for x in prediction_file.readlines()] 
                if 'sql' in task or 'squad' in task:
                    with open(ids_file_name) as id_file:
                        ids = [int(x.strip()) for x in id_file.readlines()]
   
            def from_all_answers(an):
                return [it.dataset.all_answers[sid] for sid in an.tolist()] 
    
            if not os.path.exists(answer_file_name) or args.overwrite:
                with open(answer_file_name, 'w') as answer_file:
                    answers = []
                    for batch_idx, batch in enumerate(it):
                        if hasattr(batch, 'wikisql_id'):
                            a = from_all_answers(batch.wikisql_id.data.cpu())
                        elif hasattr(batch, 'squad_id'):
                            a = from_all_answers(batch.squad_id.data.cpu())
                        elif hasattr(batch, 'woz_id'):
                            a = from_all_answers(batch.woz_id.data.cpu())
                        else:
                            a = field.reverse(batch.answer.data)
                        for aa in a:
                            answers.append(aa) 
                            answer_file.write(json.dumps(aa) + '\n')
            else:
                with open(answer_file_name) as answer_file:
                    answers = [json.loads(x.strip()) for x in answer_file.readlines()] 
    
            if len(answers) > 0:
                if not os.path.exists(results_file_name) or args.overwrite:
                    metrics, answers = compute_metrics(predictions, answers, bleu='iwslt' in task or 'multi30k' in task or args.bleu, dialogue='woz' in task,
                        rouge='cnn' in task or 'dailymail' in task or args.rouge, logical_form='sql' in task, corpus_f1='zre' in task, args=args)
                    with open(results_file_name, 'w') as results_file:
                        results_file.write(json.dumps(metrics) + '\n')
                else:
                    with open(results_file_name) as results_file:
                        metrics = json.loads(results_file.readlines()[0])
    
                if not args.silent:
                    for i, (p, a) in enumerate(zip(predictions, answers)):
                        print(f'Prediction {i+1}: {p}\nAnswer {i+1}: {a}\n')
                    print(metrics)
                decaScore.append(metrics[args.task_to_metric[task]])

    print(f'Evaluated Tasks:\n')
    for i, (task, _) in enumerate(iters):
        print(f'{task}: {decaScore[i]}')
    print(f'-------------------')
    print(f'DecaScore:  {sum(decaScore)}\n')
    print(f'\nSummary: | {sum(decaScore)} | {" | ".join([str(x) for x in decaScore])} |\n')
Example #33
def run(args, field, val_sets, model):
    set_seed(args)
    print(f'Preparing iterators')
    iters = [(name, to_iter(x, bs)) for name, x, bs in zip(args.tasks, val_sets, args.val_batch_size)]
 
    def mult(ps):
        r = 0
        for p in ps:
            this_r = 1
            for s in p.size():
                this_r *= s
            r += this_r
        return r
    params = list(filter(lambda p: p.requires_grad, model.parameters()))
    num_param = mult(params)
    print(f'{args.model} has {num_param:,} parameters')
    if args.gpus > -1:
        model.cuda()

    model.eval()
    for task, it in iters:
        prediction_file_name = os.path.join(os.path.splitext(args.best_checkpoint)[0], args.evaluate, task + '.txt')
        answer_file_name = os.path.join(os.path.splitext(args.best_checkpoint)[0], args.evaluate, task + '.gold.txt')
        results_file_name = answer_file_name.replace('gold', 'results')
        if os.path.exists(prediction_file_name):
            print('** ', prediction_file_name, ' already exists**')
        if os.path.exists(answer_file_name):
            print('** ', answer_file_name, ' already exists**')
        if os.path.exists(results_file_name):
            print('** ', results_file_name, ' already exists**')
            with open(results_file_name) as results_file:
                for l in results_file:
                    print(l)
            continue
        for x in [prediction_file_name, answer_file_name, results_file_name]:
            os.makedirs(os.path.dirname(x), exist_ok=True)

        if not os.path.exists(prediction_file_name):
            with open(prediction_file_name, 'a') as prediction_file:
                predictions = []
                for batch_idx, batch in enumerate(it):
                    _, p = model(batch)
                    p = field.reverse(p)
                    for pp in p:
                        prediction_file.write(pp + '\n')
                        predictions.append(pp) 
        else:
            with open(prediction_file_name) as prediction_file:
                predictions = [x.strip() for x in prediction_file.readlines()] 

        def from_all_answers(an):
            return [it.dataset.all_answers[sid] for sid in an.tolist()] 

        if not os.path.exists(answer_file_name):
            with open(answer_file_name, 'a') as answer_file:
                answers = []
                for batch_idx, batch in enumerate(it):
                    if hasattr(batch, 'wikisql_id'):
                        a = from_all_answers(batch.wikisql_id.data.cpu())
                    elif hasattr(batch, 'squad_id'):
                        a = from_all_answers(batch.squad_id.data.cpu())
                    elif hasattr(batch, 'woz_id'):
                        a = from_all_answers(batch.woz_id.data.cpu())
                    else:
                        a = field.reverse(batch.answer.data)
                    for aa in a:
                        answers.append(aa) 
                        answer_file.write(json.dumps(aa) + '\n')
        else:
            with open(answer_file_name) as answer_file:
                answers = [json.loads(x.strip()) for x in answer_file.readlines()] 

        if len(answers) > 0:
            metrics, answers = compute_metrics(predictions, answers, bleu='iwslt' in task or 'multi30k' in task, dialogue='woz' in task,
                rouge='cnn' in task, logical_form='sql' in task, corpus_f1='zre' in task, args=args)

            print(metrics)
            with open(results_file_name, 'w') as results_file:
                results_file.write(json.dumps(metrics) + '\n')
Example #34
def train(sess, model, optimizer, log_dir, batch_size, num_sweeps_per_summary,
          num_sweeps_per_save, train_input_seqs, train_reset_seqs,
          train_label_seqs, test_input_seqs, test_reset_seqs, test_label_seqs):
    """ Train a model and export summaries.

    `log_dir` will be *replaced* if it already exists, so it certainly
    shouldn't be anything generic like `/home/user`.

    Args:
        sess: A TensorFlow `Session`.
        model: An `LSTMModel`.
        optimizer: An `Optimizer`.
        log_dir: A string. The full path to the log directory.
        batch_size: An integer. The number of sequences in a batch.
        num_sweeps_per_summary: An integer. The number of sweeps between
            summaries.
        num_sweeps_per_save: An integer. The number of sweeps between saves.
        train_input_seqs: A list of 2-D NumPy arrays, each with shape
            `[duration, input_size]`.
        train_reset_seqs: A list of 2-D NumPy arrays, each with shape
            `[duration, 1]`.
        train_label_seqs: A list of 2-D NumPy arrays, each with shape
            `[duration, 1]`.
        test_input_seqs: A list of 2-D NumPy arrays, each with shape
            `[duration, input_size]`.
        test_reset_seqs: A list of 2-D NumPy arrays, each with shape
            `[duration, 1]`.
        test_label_seqs: A list of 2-D NumPy arrays, each with shape
            `[duration, 1]`.
    """

    ema = tf.train.ExponentialMovingAverage(decay=0.5)
    update_train_loss_ema = ema.apply([model.loss])
    train_loss_ema = ema.average(model.loss)
    tf.scalar_summary('train_loss_ema', train_loss_ema)

    train_accuracy = tf.placeholder(tf.float32, name='train_accuracy')
    train_edit_dist = tf.placeholder(tf.float32, name='train_edit_dist')
    test_accuracy = tf.placeholder(tf.float32, name='test_accuracy')
    test_edit_dist = tf.placeholder(tf.float32, name='test_edit_dist')
    values = [train_accuracy, train_edit_dist, test_accuracy, test_edit_dist]
    tags = [value.op.name for value in values]
    tf.scalar_summary('learning_rate', optimizer.learning_rate)
    tf.scalar_summary(tags, tf.pack(values))

    summary_op = tf.merge_all_summaries()

    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)
    summary_writer = tf.train.SummaryWriter(logdir=log_dir, graph=sess.graph)
    saver = tf.train.Saver()

    sess.run(tf.initialize_all_variables())

    num_sweeps_visited = 0
    start_time = time.time()
    train_gen = data.sweep_generator(
        [train_input_seqs, train_reset_seqs, train_label_seqs],
        batch_size=batch_size, shuffle=True, num_sweeps=None)
    while num_sweeps_visited <= optimizer.num_train_sweeps:

        if num_sweeps_visited % num_sweeps_per_summary == 0:

            train_prediction_seqs = models.predict(
                sess, model, train_input_seqs, train_reset_seqs)
            train_accuracy_, train_edit_dist_ = metrics.compute_metrics(
                train_prediction_seqs, train_label_seqs)
            test_prediction_seqs = models.predict(
                sess, model, test_input_seqs, test_reset_seqs)
            test_accuracy_, test_edit_dist_ = metrics.compute_metrics(
                test_prediction_seqs, test_label_seqs)
            summary = sess.run(summary_op,
                               feed_dict={train_accuracy: train_accuracy_,
                                          train_edit_dist: train_edit_dist_,
                                          test_accuracy: test_accuracy_,
                                          test_edit_dist: test_edit_dist_})
            summary_writer.add_summary(summary, global_step=num_sweeps_visited)

            status_path = os.path.join(log_dir, 'status.txt')
            with open(status_path, 'w') as f:
                line = '%05.1f      ' % ((time.time() - start_time)/60)
                line += '%04d      ' % num_sweeps_visited
                line += '%.6f  %08.3f     ' % (train_accuracy_,
                                               train_edit_dist_)
                line += '%.6f  %08.3f     ' % (test_accuracy_,
                                               test_edit_dist_)
                print(line, file=f)

            label_path = os.path.join(log_dir, 'test_label_seqs.pkl')
            with open(label_path, 'w') as f:
                cPickle.dump(test_label_seqs, f)

            pred_path = os.path.join(log_dir, 'test_prediction_seqs.pkl')
            with open(pred_path, 'w') as f:
                cPickle.dump(test_prediction_seqs, f)

            vis_filename = 'test_visualizations_%06d.png' % num_sweeps_visited
            vis_path = os.path.join(log_dir, vis_filename)
            fig, axes = data.visualize_predictions(test_prediction_seqs,
                                                   test_label_seqs,
                                                   model.target_size)
            axes[0].set_title(line)
            plt.tight_layout()
            plt.savefig(vis_path)
            plt.close(fig)

        if num_sweeps_visited % num_sweeps_per_save == 0:
            saver.save(sess, os.path.join(log_dir, 'model.ckpt'))

        train_inputs, train_resets, train_labels = train_gen.next()
        # We squeeze here because otherwise the targets would have shape
        # [batch_size, duration, 1, num_classes].
        train_targets = data.one_hot(train_labels, model.target_size)
        train_targets = train_targets.squeeze(axis=2)

        _, _, num_sweeps_visited = sess.run(
            [optimizer.optimize_op,
             update_train_loss_ema,
             optimizer.num_sweeps_visited],
            feed_dict={model.inputs: train_inputs,
                       model.resets: train_resets,
                       model.targets: train_targets,
                       model.training: True})