Beispiel #1
0
    def run(self):
        """Fine-tune the hourglass (HG) and the PDM end-to-end.

        Steps: load the initial HG and PDM, evaluate them with ``run_e2e``,
        train via ``pdm.end2end_training``, save a loss plot and both models,
        then evaluate again.

        Returns:
            In gridsearch mode, a dict made of ``self.config`` plus
            ``min_loss``/``last_loss`` and the before/after evaluation
            numbers for HG and PDM. Otherwise ``None``; the results are
            printed instead.
        """
        torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

        self.config["model_dir"] = self.model_dir

        make_deterministic(self.config['random_seed'])

        location = 'cpu' if self.gpu_id is None else "cuda:%d" % self.gpu_id
        # Compare by value: `is not` on a string literal tests object identity,
        # which is not guaranteed to match for equal strings.
        on_gpu = location != 'cpu'
        if on_gpu:
            # This fixes the problem that pytorch is always allocating memory on GPU 0 even if this is not included
            # in the list of GPUs to use
            torch.cuda.set_device(torch.device(location))
            # NOTE: cudnn.benchmark is deliberately left disabled here because
            # it can cause determinism problems.

        hg, hg_config = self.load_hg(self.config["initial_hg"], location)
        pdm, pdm_config = self.load_pdm(self.config["initial_pdm"], location)

        pdm.verbose = not self.is_gridsearch
        pdm.print_losses = False
        pdm.listener = self.receive_pdm_output

        normalize = transforms.Normalize(FaceLandmarksTrainingData.TRAIN_MEAN,
                                         FaceLandmarksTrainingData.TRAIN_STD)
        color_jitter = transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)

        transform = transforms.Compose([
            ImageTransform(transforms.ToPILImage()),
            ImageTransform(color_jitter),
            ImageAndLabelTransform(RandomHorizontalFlip()),
            ImageTransform(transforms.ToTensor()),
            ImageTransform(normalize)
        ])

        with h5py.File(self.config["data"], 'r') as f:
            train_d = FaceLandmarksTrainingData(f, transform=transform)
            train_loader = DataLoader(dataset=train_d,
                                      shuffle=self.config["shuffle"],
                                      num_workers=8,
                                      pin_memory=on_gpu,
                                      batch_size=self.config["bs"])

        results_before = run_e2e(hg, pdm, self.config["data"], location, self.config["bs"], verbose=True)
        if not self.is_gridsearch:
            print("Before training")
            for model, res in results_before.items():
                print(model, res)

        # Only the per-epoch losses are used below; the other two return
        # values are ignored.
        _, _, losses = pdm.end2end_training(hg=hg,
                                            data_loader=train_loader,
                                            hg_opt_config=self.config["hg_optimizer"],
                                            pdm_weight_opt_config=self.config["pdm_weight_optimizer"],
                                            pdm_shape_opt_config=self.config["pdm_shape_optimizer"],
                                            training_schedule=self.config["training_schedule"],
                                            detach_confidence=self.config["detach_confidence"])

        plot_path = os.path.join(self.plot_dir, "losses_%d.png" % self.config["config_id"])
        if not self.is_gridsearch:
            print("save plot to %s" % plot_path)
        fig, ax = plt.subplots()
        ax.plot(losses)
        ax.set(xlabel='epoch', ylabel='loss', title='loss per epoch')
        ax.grid()
        fig.savefig(plot_path)
        # Release the figure; otherwise pyplot keeps every figure alive, which
        # adds up when many configurations run in the same process.
        plt.close(fig)

        if not self.is_gridsearch:
            print("save HG")
        torch.save({
            'model': 'pe_hourglass',
            'state_dict': hg.state_dict(),
            'config': hg_config
        }, os.path.join(self.model_dir, "%d_hg_e2e.torch" % self.config["config_id"]))

        if not self.is_gridsearch:
            print("save PDM")
        pdm.save_pdm(pdm.train_epochs, os.path.join(self.model_dir, "%d_pdm_e2e.torch" % self.config["config_id"]))

        results_after = run_e2e(hg, pdm, self.config["data"], location, self.config["bs"], verbose=False)

        if not self.is_gridsearch:
            print("Before training")
            for model, res in results_before.items():
                print(model, res)

            print("After training")
            for model, res in results_after.items():
                print(model, res)
            return None

        logpath = os.path.join(self.result_dir, "%d_log.json" % self.config["config_id"])
        # Use a context manager so the log file is flushed and closed.
        with open(logpath, "w") as log_file:
            json.dump({
                "gt": self.gts,
                "l2d": self.l2d_log,
                "hg": self.hg_coords_log,
                "losses": self.loss_log
            }, log_file)

        summary = {
            **self.config,
            "min_loss": min(self.loss_log),
            "last_loss": self.loss_log[-1],
        }
        # Flatten the nested evaluation results into keys such as
        # "hg_before_easy_with" -> results_before["hg"]["easy_woutline"].
        for phase, results in (("before", results_before), ("after", results_after)):
            for model in ("hg", "pdm"):
                for difficulty in ("easy", "hard"):
                    for suffix, outline in (("with", "woutline"), ("without", "noutline")):
                        key = "%s_%s_%s_%s" % (model, phase, difficulty, suffix)
                        summary[key] = results[model]["%s_%s" % (difficulty, outline)]
        return summary
def _load_hg_split(hg_out, split, confidence, location):
    """Build the HG prediction and ground-truth tensors for one split.

    Returns ``(pred, gt)``: ``pred`` holds ``[pred_x, pred_y, conf_x, conf_y]``
    per landmark, ``gt`` holds ``[gt_x, gt_y]`` per landmark.
    """
    samples = [x["coord_and_conf"] for x in hg_out[split]["results"]]
    gt = torch.tensor(
        [[[lm["gt_x"], lm["gt_y"]] for lm in sample] for sample in samples],
        device=location)
    pred = torch.tensor(
        [[[lm["pred_x"], lm["pred_y"],
           confidence(lm["var_x"]),
           confidence(lm["var_y"])] for lm in sample] for sample in samples],
        device=location)
    return pred, gt


def _rank_hg_vs_pdm(hg_coords, pdm_coords, gt):
    """Pick, per sample, whichever of HG and PDM has the lower mean squared error.

    Returns ``(winners, best, worst)``: a Counter of wins per model and the
    per-sample coordinate lists of the better and the worse prediction.
    Ties count as a win for the PDM.
    """
    hg_np = hg_coords.detach().cpu().numpy()
    pdm_np = pdm_coords.detach().cpu().numpy()
    gt_np = gt.detach().cpu().numpy()

    winners = Counter()
    best, worst = [], []
    for i in range(gt_np.shape[0]):
        hg_loss = np.mean((hg_np[i] - gt_np[i])**2)
        pdm_loss = np.mean((pdm_np[i] - gt_np[i])**2)
        if pdm_loss <= hg_loss:
            winners["pdm"] += 1
            best.append(pdm_np[i].tolist())
            worst.append(hg_np[i].tolist())
        else:
            winners["hg"] += 1
            best.append(hg_np[i].tolist())
            worst.append(pdm_np[i].tolist())
    return winners, best, worst


def _print_split_report(title, hg_coords, pdm_coords, best, worst, gt):
    """Evaluate HG, PDM, best-pick and worst-pick predictions and print them."""
    hg_eval = evaluate(hg_coords, gt)
    pdm_eval = evaluate(pdm_coords, gt)
    best_eval = evaluate(
        torch.tensor(best, dtype=torch.float32).cpu(), gt.cpu())
    worst_eval = evaluate(
        torch.tensor(worst, dtype=torch.float32).cpu(), gt.cpu())

    for heading, key in (("without outline", "without_outline"),
                         ("with outline", "with_outline")):
        print("\n---- %s %s----" % (title, heading))
        print("HG \t\t %0.4f" % hg_eval[key])
        print("best pick \t %0.4f" % best_eval[key])
        print("worst pick \t %0.4f" % worst_eval[key])
        print("all PDM \t %0.4f" % pdm_eval[key])


def run(pdm, hg_results, gpu):
    """Compare stored HG predictions against their PDM reconstruction.

    Loads a PDM checkpoint, feeds it the hourglass predictions of the "easy"
    and "hard" splits and prints the evaluation of the raw HG output, the
    PDM output, and the per-sample best/worst pick between the two.

    Args:
        pdm: Path to a checkpoint readable by ``torch.load`` that contains
            the keys ``'state_dict'`` and ``'config'``.
        hg_results: Path to a JSON file with ``"easy"`` and ``"hard"``
            entries, each holding ``"results"`` with ``"coord_and_conf"``.
        gpu: CUDA device index, or ``None`` to run on the CPU.
    """
    location = 'cpu' if gpu is None else "cuda:%d" % gpu

    data = torch.load(pdm, map_location='cpu')
    state_dict = data['state_dict']
    config = data['config']

    make_deterministic(config['random_seed'])

    net = ModelTrainer.create_net(config)
    net.model.load_state_dict(state_dict)
    net.model.eval()
    net.to(location)

    net.bs *= 256

    # Close the results file once it has been parsed.
    with open(hg_results, "r") as f:
        hg_out = json.load(f)

    def inverse_variance(var):
        # Confidence passed to the PDM: the reciprocal of the HG variance.
        return 1 / var

    easy_hg_pred, easy_gt = _load_hg_split(hg_out, "easy", inverse_variance, location)
    hard_hg_pred, hard_gt = _load_hg_split(hg_out, "hard", inverse_variance, location)

    # TODO test() takes pred and conf now separately
    zs, nr, *_ = net.test(easy_hg_pred, verbose=True)
    l2d_easy, _ = net.forward(zs, nr)
    easy_best, best_coords_easy, worst_coords_easy = _rank_hg_vs_pdm(
        easy_hg_pred[:, :, :2], l2d_easy, easy_gt)

    # TODO test() takes pred and conf now separately
    zs, nr, *_ = net.test(hard_hg_pred, verbose=True)
    l2d_hard, _ = net.forward(zs, nr)
    hard_best, best_coords_hard, worst_coords_hard = _rank_hg_vs_pdm(
        hard_hg_pred[:, :, :2], l2d_hard, hard_gt)

    _print_split_report("EASY", easy_hg_pred[:, :, :2], l2d_easy,
                        best_coords_easy, worst_coords_easy, easy_gt)
    print("easy best", easy_best)

    _print_split_report("HARD", hard_hg_pred[:, :, :2], l2d_hard,
                        best_coords_hard, worst_coords_hard, hard_gt)
    print("hard_best", hard_best)
def run(*, hg, pdm, data_src, location, hg_bs, encoder=None, verbose=True, random_seed=None, var_thresh=None, menpo=None):
    """Evaluate an HG + PDM pipeline on the easy and hard test sets.

    Args:
        hg: Hourglass model handed to ``E2E``.
        pdm: PDM model handed to ``E2E``.
        data_src: Path to the HDF5 file the easy/hard test sets are read from.
        location: Device string, ``'cpu'`` or a CUDA device such as ``'cuda:0'``.
        hg_bs: Hourglass batch size handed to ``E2E``.
        encoder: Optional encoder handed to ``E2E``; when ``None`` the
            ``pdm_encoder`` results are filled with a placeholder value.
        verbose: Forwarded to ``E2E``.
        random_seed: If given, passed to ``make_deterministic``.
        var_thresh: Forwarded to ``E2E``.
        menpo: Optional path to an HDF5 file with the Menpo data; when
            ``None`` the Menpo results are filled with a placeholder value.

    Returns:
        Dict with the error summaries under ``"hg"``, ``"pdm"`` and
        ``"pdm_encoder"``, plus the ``"gt"``, ``"hg_pred"``, ``"pdm_pred"``,
        ``"pdm_3d"`` and ``"pdm_applied"`` entries per split (and
        ``"pdm_encoder_pred"`` when the pipeline produced it).
    """
    torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

    # Placeholder error reported for evaluations that were not run.
    not_evaluated = 10000000.0

    # Compare by value: `is not` on a string literal tests object identity.
    on_gpu = location != 'cpu'
    if on_gpu:
        torch.cuda.set_device(torch.device(location))

    if random_seed is not None:
        make_deterministic(random_seed)

    normalize = transforms.Normalize(FaceLandmarksTrainingData.TRAIN_MEAN,
                                     FaceLandmarksTrainingData.TRAIN_STD)
    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normalize)
    ])

    loader_args = {"shuffle": False, "num_workers": 4, "pin_memory": on_gpu}

    with h5py.File(data_src, 'r') as f:
        easy_d = FaceLandmarksEasyTestData(f, transform=transform)
        hard_d = FaceLandmarksHardTestData(f, transform=transform)

    # Each split is evaluated as a single batch.
    easy_loader = DataLoader(dataset=easy_d, batch_size=len(easy_d), **loader_args)
    hard_loader = DataLoader(dataset=hard_d, batch_size=len(hard_d), **loader_args)
    pipeline = E2E(hg, pdm, hg_bs, max(len(easy_d), len(hard_d)), encoder=encoder, verbose=verbose, var_thresh=var_thresh)
    e2e_results = run_e2e(pipeline, easy_loader, hard_loader, location)

    def summarize(metric):
        # "68" is the error with outline, "49" the error without outline.
        return {
            "easy68": e2e_results["easy"][metric]["with_outline"],
            "hard68": e2e_results["hard"][metric]["with_outline"],
            "easy49": e2e_results["easy"][metric]["without_outline"],
            "hard49": e2e_results["hard"][metric]["without_outline"]
        }

    hg_results = summarize("eval_hg")
    pdm_results = summarize("eval_pdm")

    if encoder is not None:
        pdm_encoder_results = summarize("eval_pdm_encoder")
    else:
        pdm_encoder_results = {k: not_evaluated for k in ["easy68", "hard68", "easy49", "hard49"]}

    if menpo is not None:
        # Open the file named by the `menpo` argument; the previous code read
        # `args.menpo`, a name that does not exist inside this function.
        with h5py.File(menpo, 'r') as f:
            menpo_d = Menpo(f, transform=transform)
            menpo_loader = DataLoader(dataset=menpo_d, batch_size=len(menpo_d), **loader_args)

            pipeline = E2E(hg, pdm, hg_bs, len(menpo_d), encoder=encoder, verbose=verbose, var_thresh=var_thresh)
            menpo_res = run_e2e_split(pipeline, menpo_loader, location)
            menpo_gt = menpo_res["gt"]

            menpo_hg_error = evaluate_menpo(menpo_res["hg_pred"], menpo_gt)
            menpo_pdm_error = evaluate_menpo(menpo_res["pdm_pred"], menpo_gt)

            hg_results["menpo68"] = menpo_hg_error[0]
            hg_results["menpo49"] = menpo_hg_error[1]
            pdm_results["menpo68"] = menpo_pdm_error[0]
            pdm_results["menpo49"] = menpo_pdm_error[1]
    else:
        hg_results["menpo68"] = not_evaluated
        hg_results["menpo49"] = not_evaluated
        pdm_results["menpo68"] = not_evaluated
        pdm_results["menpo49"] = not_evaluated

    def per_split(key):
        return {
            "easy": e2e_results["easy"][key],
            "hard": e2e_results["hard"][key]
        }

    res = {
        "hg": hg_results,
        "pdm": pdm_results,
        "pdm_encoder": pdm_encoder_results,
    }
    for key in ("gt", "hg_pred", "pdm_pred", "pdm_3d", "pdm_applied"):
        res[key] = per_split(key)

    if "pdm_encoder_pred" in e2e_results["easy"]:
        res["pdm_encoder_pred"] = per_split("pdm_encoder_pred")
    return res
    def run(self):
        """Evaluate one HG + PDM configuration and report error factors.

        Calls ``load_and_run`` (retrying on CUDA out-of-memory errors),
        derives the HG/PDM error ratios and prints them.

        Returns:
            In gridsearch mode, ``self.config`` merged with the results dict.
            Note that this return happens before the predictions are written,
            so ``prediction_target`` is only honoured outside gridsearch mode.
            Otherwise ``None``.

        Raises:
            RuntimeError: Any non-OOM error from ``load_and_run``, or the
                OOM error itself once ``maxtries`` retries are exhausted.
        """
        torch.autograd.set_detect_anomaly(
            True)  # This makes debugging much easier
        make_deterministic(self.config['random_seed'])

        encoders = None
        if self.config["encoder"]:
            # This assumes that encoders have already been trained for the PDM.
            # Example: pdm path is my/dir/models/pdm_4.torch
            # Then the encoders are loaded from my/dir/encoders/encoder_49_4.torch
            # and my/dir/encoders/encoder_68_4.torch.
            pdm_filename = os.path.basename(self.config["pdm"])
            if "final" in pdm_filename:
                pdm_id = int(pdm_filename.split(".")[0].split("_")[-1])
            else:
                pdm_id = int(pdm_filename.split("_")[0])
            encoder_dir = os.path.join(
                os.path.dirname(os.path.dirname(self.config["pdm"])),
                "encoders")
            encoders = {
                49: os.path.join(encoder_dir, "encoder_49_%d.torch" % pdm_id),
                68: os.path.join(encoder_dir, "encoder_68_%d.torch" % pdm_id),
            }

        if not self.is_gridsearch:
            print("encoder", encoders)

        pred_target = self.config.get("prediction_target")
        if pred_target is not None:
            pred_target_dir = os.path.dirname(pred_target)
            # A bare filename has no directory component; nothing to create.
            if pred_target_dir:
                mkdir_if_not_exists(pred_target_dir)

        tries = 0
        maxtries = 75
        while True:
            tries += 1
            try:
                res, hg_config, pdm_config = load_and_run(
                    hg_src=self.config["hg"],
                    pdm_src=self.config["pdm"],
                    data_src=self.data,
                    gpu_id=self.gpu_id,
                    random_seed=self.config["random_seed"],
                    pdm_configurator=self.configure_pdm,
                    verbose=not self.is_gridsearch,
                    var_thresh=self.config["variance_threshold"],
                    encoders=encoders)
                break
            except RuntimeError as e:
                # Only out-of-memory errors are worth retrying.
                if "out of memory" not in str(e):
                    raise
                if tries > maxtries:
                    print("ERROR! maxtries (%d) exceeded" % maxtries)
                    raise
                # Back off for a randomised, growing interval before retrying.
                waittime = tries * random.randint(1, 5)
                print(
                    "ERROR! There was a OOM error, wait %d seconds and try again. Try nr. %d"
                    % (waittime, tries))
                time.sleep(waittime)

        subsets = ("easy49", "hard49", "easy68", "hard68")
        use_encoder = self.config["encoder"]

        results = {}
        for model in ("hg", "pdm", "pdm_encoder"):
            for subset in subsets:
                results["%s_%s" % (model, subset)] = res[model][subset]
        # Factor > 1 means the PDM error is lower than the HG error.
        for subset in subsets:
            results["%s_factor" % subset] = (res["hg"][subset] /
                                             res["pdm"][subset])
        for subset in subsets:
            results["enc_%s_factor" % subset] = (
                res["hg"][subset] /
                res["pdm_encoder"][subset] if use_encoder else 0.0)

        print(
            "Config: %d | factor e49: %0.4f | factor h49: %0.4f | factor e68: %0.4f | factor h68: %0.4f"
            % (self.config["config_id"], results["easy49_factor"],
               results["hard49_factor"], results["easy68_factor"],
               results["hard68_factor"]))
        if self.is_gridsearch:
            return {**self.config, **results}

        for k, v in results.items():
            print(k, v)

        if pred_target:

            def as_list(tensor):
                # Convert a tensor into nested Python lists for JSON output.
                return tensor.cpu().detach().numpy().tolist()

            predictions = {}
            for split in ("easy", "hard"):
                predictions[split] = {
                    key: as_list(res[key][split])
                    for key in ("gt", "pdm_pred", "hg_pred", "pdm_3d")
                }
                if "pdm_encoder_pred" in res:
                    predictions[split]["pdm_encoder_pred"] = as_list(
                        res["pdm_encoder_pred"][split])

            output = {
                "meta": {
                    "hg_model": self.config["hg"],
                    "pdm_model": self.config["pdm"],
                    "hg_config": hg_config,
                    "pdm_config": pdm_config,
                    "gapsearch_config": self.config
                },
                "results": results,
                "predictions": predictions
            }

            # Use a context manager so the file is flushed and closed.
            with open(pred_target, "w") as out_file:
                json.dump(output, out_file, indent=2)
            print("Predictions written to", pred_target)
    def run(self):
        """Train a PDM on the landmark data and store its results.

        Trains on ``300W/train_y`` (plus ``multipie/train_y`` when
        ``add_multipie`` is set), tests on ``300W/test_y``, writes the latent
        values and reconstructions to a JSON file and saves the model.

        Returns:
            In gridsearch mode, ``self.config`` merged with loss statistics,
            the metrics log, best epochs/errors and the train/test errors.
            Otherwise ``None``; the metrics from ``pdm.eval_on_alpha_hg`` are
            printed instead.
        """
        self.config["model_dir"] = self.model_dir

        make_deterministic(self.config['random_seed'])

        pdm = ModelTrainer.create_net(self.config)
        self.to_gpu(pdm)
        pdm.verbose = not self.is_gridsearch
        pdm.listener = self.receive_pdm_output

        # torch.tensor() copies the data, so the HDF5 file can be closed as
        # soon as the tensors have been built.
        with h5py.File(self.data, "r") as dt:
            data_tr = self.to_gpu(
                torch.tensor(dt["300W"]["train_y"], dtype=torch.float32))
            data_te = self.to_gpu(
                torch.tensor(dt["300W"]["test_y"], dtype=torch.float32))

            if self.config["add_multipie"]:
                multipie = self.to_gpu(
                    torch.tensor(dt["multipie"]["train_y"],
                                 dtype=torch.float32))
                data_tr = torch.cat((data_tr, multipie))

        zs_tr, nr_tr, loss_tr = pdm.train(data=data_tr)
        train_reconstructed, _ = pdm.forward(zs_tr, nr_tr)

        zs_te, nr_te, loss_te, *_ = pdm.test(data=data_te, confidence=None)
        test_reconstructed, _ = pdm.forward(zs_te, nr_te)

        def as_list(tensor):
            # Convert a tensor into nested Python lists for JSON output.
            return tensor.detach().cpu().numpy().tolist()

        target_file = os.path.join(
            self.result_dir, "zs_and_nr_%d.json" % self.config["config_id"])

        # Use a context manager so the file is flushed and closed.
        with open(target_file, "w") as out_file:
            json.dump(
                {
                    "train": {
                        "zs": as_list(zs_tr),
                        "nr": as_list(nr_tr),
                        "reconstructed": as_list(train_reconstructed),
                        "coords": as_list(data_tr)
                    },
                    "test": {
                        "zs": as_list(zs_te),
                        "nr": as_list(nr_te),
                        "reconstructed": as_list(test_reconstructed),
                        "coords": as_list(data_te)
                    }
                }, out_file)

        pdm.save_pdm(
            pdm.train_epochs,
            os.path.join(self.model_dir,
                         "final_pdm_%d.torch" % self.config["config_id"]))

        # TODO train ENCODERS DIRECTLY HERE

        if not self.is_gridsearch:
            # evaluate PDM
            metrics = pdm.eval_on_alpha_hg()
            print(metrics["easy_metrics_last"])
            print(metrics["hard_metrics_last"])
            return None

        train_losses = self.loss_log["train"]
        last_train_loss = train_losses[-1]
        lowest_train_loss = min(train_losses)
        # First epoch at which the lowest training loss was reached.
        best_train_epoch = min(
            i for i, loss in enumerate(train_losses)
            if loss == lowest_train_loss)

        train_error = evaluate(train_reconstructed, data_tr)
        test_error = evaluate(test_reconstructed, data_te)

        best_epochs = {
            "best_%s_epoch" % k: v
            for k, v in self.best_epoch.items()
        }
        best_errors = {
            "best_%s" % k: v
            for k, v in self.lowest_error.items()
        }

        return {
            **self.config,
            "last_train_loss": last_train_loss,
            "lowest_train_loss": lowest_train_loss,
            "best_train_epoch": best_train_epoch,
            "metrics_log": self.metrics_log,
            **best_epochs,
            **best_errors,
            "train_error_49": train_error["without_outline"],
            "train_error_68": train_error["with_outline"],
            "test_error_49": test_error["without_outline"],
            "test_error_68": test_error["with_outline"]
        }
    def run(self):
        torch.cuda.empty_cache()

        starttime = time.time()

        if self.gpu_id is not None:
            # cudnn.benchmark improves training speed when input sizes do not change
            # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
            # It selects the best algorithms as the training iterates over the dataset
            # I found no big difference between True and False, but it also doesn't hurt, so enable it
            #cudnn.benchmark = True # disable for deterministic behavior
            pass

        config = self.config
        config_id = config["config_id"]
        n_lm = config["n_lm"]

        make_deterministic(config['random_seed'])
        torch.autograd.set_detect_anomaly(
            True)  # This makes debugging much easier

        jitterTransform = transforms.ColorJitter(brightness=0.4,
                                                 contrast=0.4,
                                                 saturation=0.4,
                                                 hue=0.1)

        # TODO store these values in h5 files
        normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
        normTransform = transforms.Normalize(normMean, normStd)

        rot_angle = float(config['augment_rotation'])
        rotation_augmentation = RandomRotation(min_angle=-1 * rot_angle,
                                               max_angle=rot_angle,
                                               retain_scale=False,
                                               rotate_landmarks="same")

        trainTransform = transforms.Compose([
            ImageTransform(transforms.ToPILImage()),
            ImageTransform(jitterTransform),
            ImageAndLabelTransform(RandomHorizontalFlip()),
            ImageAndLabelTransform(rotation_augmentation),
            ImageTransform(transforms.ToTensor()),
            ImageTransform(normTransform)
        ])

        testTransform = transforms.Compose([
            ImageTransform(transforms.ToPILImage()),
            ImageTransform(transforms.ToTensor()),
            ImageTransform(normTransform)
        ])

        # Note: Reading takes only ~0.2s, so it is okay to do this again whenever main.py is called
        # No need to read in trainer.py and pass results here
        with h5py.File(self.data, 'r') as f:
            train_dataset = FaceLandmarksTrainingData(f,
                                                      transform=trainTransform,
                                                      n_lm=n_lm)
            val_dataset = FaceLandmarksAllTestData(f,
                                                   transform=testTransform,
                                                   n_lm=n_lm)
            easy_d = FaceLandmarksEasyTestData(f,
                                               transform=testTransform,
                                               n_lm=n_lm)
            hard_d = FaceLandmarksHardTestData(f,
                                               transform=testTransform,
                                               n_lm=n_lm)

        print("GPU %d.%d" % (self.gpu_id, self.sub_gpu_id),
              "Data: %s" % self.data,
              "Train %d Test %d" % (len(train_dataset), len(val_dataset)))

        dataloader_params = {
            'batch_size': config['batch_size'],
            'pin_memory': self.gpu_id is not None,
            'num_workers': 8
        }

        train_loader = DataLoader(train_dataset,
                                  shuffle=True,
                                  **dataloader_params)
        val_loader = DataLoader(val_dataset,
                                shuffle=False,
                                **dataloader_params)
        easy = DataLoader(easy_d, shuffle=False, **dataloader_params)
        hard = DataLoader(hard_d, shuffle=False, **dataloader_params)

        net = self.create_net(config)
        _, trainable_parameters, _ = count_parameters(net)
        self.to_gpu(net)
        net.train()  # Put net into train mode

        params = [
            {
                "params": net.hourglass.parameters()
            },
            {
                "params": net.regressor.parameters()
            },
        ]

        if config["predict_distances_weight"] > 0:
            # generate ground truth distances
            y = torch.stack([x["landmarks"] for x in train_dataset])
            bs = y.shape[0]
            n_lm = y.shape[1]
            dist_gt = torch.zeros(bs, n_lm, n_lm, 2)
            dist_gt[:, :, :, 0] = y[:, :, 0].view(bs, 1, -1) - y[:, :, 0].view(
                bs, -1, 1)
            dist_gt[:, :, :, 1] = y[:, :, 1].view(bs, 1, -1) - y[:, :, 1].view(
                bs, -1, 1)

        optimizer = optim.Adam(params, lr=config['lr'])

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            'min',
            patience=config['lr_scheduler_patience'],
            verbose=True,
            factor=config['lr_decay_factor'])

        early_stopping_patience = config['lr_scheduler_patience'] * 2 + 1
        early_stopping_max_ratio = 0.975
        should_stop = EarlyStopping(patience=early_stopping_patience,
                                    max_ratio=early_stopping_max_ratio,
                                    verbose=False)

        loss_function = self.get_loss_function(config['regression'],
                                               config['loss_function'])

        category_calculator = {
            "e49":
            lambda metrics: metrics["e49"],
            "h49":
            lambda metrics: metrics["h49"],
            "e68":
            lambda metrics: metrics["e68"],
            "h68":
            lambda metrics: metrics["h68"],
            "49":
            lambda metrics: (metrics["e49"] + metrics["h49"]) / 2,
            "68":
            lambda metrics: (metrics["e68"] + metrics["h68"]) / 2,
            "e":
            lambda metrics: (metrics["e49"] + metrics["e68"]) / 2,
            "h":
            lambda metrics: (metrics["h49"] + metrics["h68"]) / 2,
            "all":
            lambda metrics: (metrics["e49"] + metrics["h49"] + metrics["e68"] +
                             metrics["h68"]) / 4
        }
        categories = category_calculator.keys()
        best_epoch = {k: 0 for k in categories}
        lowest_error = {k: np.Inf for k in categories}
        epoch_train_losses = []
        epoch_val_losses = []

        # Only store models that are better than these values to save storage
        storage_thresholds = {"e49": 2.1, "h49": 3.4, "e68": 2.7, "h68": 4.5}
        storage_thresholds["49"] = category_calculator["49"](
            storage_thresholds)
        storage_thresholds["68"] = category_calculator["68"](
            storage_thresholds)
        storage_thresholds["e"] = category_calculator["e"](storage_thresholds)
        storage_thresholds["h"] = category_calculator["h"](storage_thresholds)
        storage_thresholds["all"] = category_calculator["all"](
            storage_thresholds)

        loss_history = {}
        metric_history = []

        dist_loss_fct = nn.L1Loss()

        epochs = config['n_epoch']
        for epoch in range(epochs):
            epoch_start_time = time.time()

            net.train()
            epoch_train_loss = 0
            epoch_sample_count = 0

            for sample in train_loader:
                x = self.to_gpu(sample['image'].float())
                y = self.to_gpu(sample['landmarks'].float())
                if config["predict_distances_weight"] > 0:
                    indices = self.to_gpu(sample['index'])
                    dist_y = self.to_gpu(dist_gt[indices])
                epoch_sample_count += x.shape[0]

                optimizer.zero_grad()

                coords, heatmaps, var, unnormalized_heatmaps = net(x)

                loss = loss_function(coords, heatmaps, y)
                epoch_train_loss += loss.float().data.item()
                if config["normalize_loss"]:
                    if loss.detach().data.item() > 0:
                        loss = loss / loss.detach()

                if config["predict_distances_weight"] > 0:
                    bs = x.shape[0]
                    distance_pred = torch.zeros(bs, n_lm, n_lm, 2)
                    distance_pred[:, :, :, 0] = coords[:, :, 0].view(
                        bs, 1, -1) - coords[:, :, 0].view(bs, -1, 1)
                    distance_pred[:, :, :, 1] = coords[:, :, 1].view(
                        bs, 1, -1) - coords[:, :, 1].view(bs, -1, 1)
                    distance_pred = self.to_gpu(distance_pred)
                    dist_loss = dist_loss_fct(distance_pred, dist_y)
                    loss = loss + config[
                        "predict_distances_weight"] * dist_loss / dist_loss.detach(
                        )
                else:
                    dist_loss = 0

                if torch.isnan(loss):
                    print_info(
                        "ERROR! Invalid loss (nan). Aborting training for config %d in epoch %d"
                        % (config_id, epoch))
                    raise LossException("loss was nan in config %d, epoch %d" %
                                        (config_id, epoch))
                if torch.isinf(loss):
                    print_info(
                        "ERROR! Invalid loss (inf). Aborting training for config %d in epoch %d"
                        % (config_id, epoch))
                    raise LossException("loss was inf in config %d, epoch %d" %
                                        (config_id, epoch))

                loss.backward()
                optimizer.step()

                #### end batch

            epoch_train_loss /= epoch_sample_count  # normalize loss by images that were processed

            val_loss = self.evaluate_model(val_loader, net, loss_function)
            scheduler.step(val_loss)

            epoch_train_losses.append(epoch_train_loss)
            epoch_val_losses.append(val_loss)
            loss_history[epoch] = {
                'train': epoch_train_losses[-1],
                'val': epoch_val_losses[-1]
            }

            epoch_end_time = time.time()
            epoch_duration = epoch_end_time - epoch_start_time

            metrics = benchmark(net, easy, hard, self.gpu_id)
            all_metrics = {}
            for category, calculator in category_calculator.items():
                error = calculator(metrics)
                all_metrics[category] = error

                if error < lowest_error[
                        category] and error < 1000:  # 100000 is the error for with outline when HG only has 49LM
                    lowest_error[category] = error
                    best_epoch[category] = epoch

                    if error < storage_thresholds[category]:
                        torch.save(
                            {
                                'model': 'pe_hourglass',
                                'epoch': epoch + 1,
                                'state_dict': net.state_dict(),
                                'val_loss': val_loss,
                                'config': config,
                                'category': category,
                                'metrics': all_metrics
                            },
                            os.path.join(
                                self.model_dir,
                                "%d_best_%s.torch" % (config_id, category)))
            metric_history.append(all_metrics)

            print(
                "GPU %d.%d" % (self.gpu_id, self.sub_gpu_id),
                "| conf",
                config_id,
                '| %03d/%03d' % (epoch + 1, epochs),
                '| %ds' % (int(epoch_duration)),
                '| train %0.6f' % epoch_train_losses[-1],
                '| val %0.6f' % epoch_val_losses[-1],
                '| dist %0.6f' % float(dist_loss),
                '| e68 %0.2f [B %0.2f]' %
                (metrics["e68"], lowest_error['e68']),
                '| h68 %0.2f [B %0.2f]' %
                (metrics["h68"], lowest_error['h68']),
                '| e49 %0.2f [B %0.2f]' %
                (metrics["e49"], lowest_error['e49']),
                '| h49 %0.2f [B %0.2f]' %
                (metrics["h49"], lowest_error['h49']),
            )

            if should_stop(val_loss):
                epochs = epoch + 1
                print_info(
                    "EarlyStopping (patience = %d, max_ratio=%f) criterion returned true in epoch %d. Stop training"
                    % (should_stop.patience, should_stop.max_ratio, epochs))
                break

        endtime = time.time()

        # Write a loss plot to CONFIG_ID_loss_plot.png in the plot directory (self.plot_dir)
        # TODO tensorboardX in addition to matplotlib?
        x = np.array(range(epochs))
        plt.plot(x, np.array(epoch_train_losses), 'r', label='Train Loss')
        plt.plot(x, np.array(epoch_val_losses), 'b', label='Val Loss')
        plt.xlabel("Epochs")
        plt.ylabel("Avg. Train and Val Loss")
        plt.title("Variation of train and Val loss with epochs")
        plt.legend(loc='best')
        plt.savefig(os.path.join(self.plot_dir,
                                 "%d_loss_plot.png" % config_id))
        plt.close()

        training_duration = int(endtime - starttime)

        best_epochs = {"best_%s_epoch" % k: v for k, v in best_epoch.items()}
        best_errors = {"best_%s" % k: v for k, v in lowest_error.items()}

        results = {
            "config_id": config_id,
            'dataset': self.data,
            "gpu_id": self.gpu_id,
            "duration_seconds": training_duration,
            "last_epoch":
            epochs,  # is different from n_epoch in case of early stopping
            "trainable_parameters": trainable_parameters,
            **self.config,
            "optimizer_name": optimizer.__class__.__name__,
            **best_epochs,
            "training_loss_last_epoch": epoch_train_losses[-1],
            **best_errors
        }

        # Write results to CONFIG_ID_result.json in the output directory
        with open(os.path.join(self.result_dir, "%d_result.json" % config_id),
                  "w") as f:
            to_write = {
                **results, 'loss_history': loss_history,
                'metric_history': metric_history
            }
            json.dump(to_write, f, indent=4)

        torch.cuda.empty_cache()

        return results
def run_pdm(pdm_path,
            hg_results,
            location=torch.device("cpu"),
            bs=512,
            encoder=None,
            history=False):
    """Run a stored PDM on hourglass predictions and return its 2D landmarks.

    The PDM checkpoint is loaded from ``pdm_path``, a network is created from
    the stored config via ``ModelTrainer.create_net`` and its weights are
    restored. The hourglass results are converted to tensors, passed through
    ``pdm.test`` and the resulting ``zs`` / ``nr`` values are fed to
    ``pdm.forward`` to obtain the 2D landmarks.

    NOTE: ``torch.load`` unpickles the given files, so only trusted
    checkpoints should be passed as ``pdm_path`` / ``encoder``.

    Args:
        pdm_path: Path of the PDM checkpoint. The loaded object must provide
            the keys ``'state_dict'`` and ``'config'`` (the config must
            contain ``'random_seed'``).
        hg_results: Sequence of samples. Each sample is indexed with
            ``"coord_and_conf"``, which yields one mapping per landmark with
            the keys ``pred_x``, ``pred_y``, ``gt_x``, ``gt_y``, ``var_x``
            and ``var_y``. It is iterated several times, so it must not be a
            one-shot generator.
        location: Device on which the checkpoints are loaded and on which all
            tensors are created.
        bs: Forwarded unchanged to ``pdm.test`` as ``bs`` (presumably the
            batch size -- confirm against ``test()``).
        encoder: Optional path of an encoder checkpoint providing the keys
            ``"zs_size"``, ``"nr_size"`` and ``"state_dict"``. ``None`` runs
            the PDM without an encoder.
        history: If truthy, ``pdm.test`` is called with
            ``return_history=True`` and the history it returns is passed on.

    Returns:
        Tuple ``(hg_coords, hg_coords_and_conf, gt, l2d, history)``:
        the predicted coordinates, the predicted coordinates extended by the
        confidences derived from the variances, the ground truth coordinates,
        the detached first output of ``pdm.forward`` and the history returned
        by ``pdm.test`` (``None`` if ``history`` was falsy).
    """
    data = torch.load(pdm_path, map_location=location)
    state_dict = data['state_dict']
    config = data['config']

    # Although the same random seed is used as in training, the results will
    # differ slightly. The reason is that the metrics are calculated after the
    # training has already run for a few epochs, so the random number
    # generator will be in a different state depending on the training before.
    make_deterministic(config['random_seed'])

    pdm = ModelTrainer.create_net(config)
    pdm.model.load_state_dict(state_dict)
    pdm = pdm.to(location)

    if encoder is not None:
        # map_location keeps this consistent with the PDM checkpoint above:
        # without it a checkpoint saved on a GPU cannot be deserialized on a
        # CPU-only host (or ends up on the wrong GPU) before .to() is reached.
        enc_data = torch.load(encoder, map_location=location)
        encoder = Encoder(zs_size=enc_data["zs_size"],
                          nr_size=enc_data["nr_size"])
        encoder.load_state_dict(enc_data["state_dict"])
        encoder = encoder.to(location)
    print("Encoder", encoder)

    # Predicted (x, y) per landmark and sample
    hg_coords = torch.tensor([[[lm["pred_x"], lm["pred_y"]]
                               for lm in sample["coord_and_conf"]]
                              for sample in hg_results],
                             device=location)
    # Ground truth (x, y) per landmark and sample
    gt = torch.tensor([[[lm["gt_x"], lm["gt_y"]]
                        for lm in sample["coord_and_conf"]]
                       for sample in hg_results],
                      device=location)
    # Predicted (x, y) followed by the confidences the PDM derives from the
    # per-axis variances
    hg_coords_and_conf = torch.tensor([[[
        lm["pred_x"], lm["pred_y"],
        pdm.variance2confidence(lm["var_x"]),
        pdm.variance2confidence(lm["var_y"])
    ] for lm in sample["coord_and_conf"]] for sample in hg_results],
                                      device=location)

    if history:
        # TODO test() takes pred and conf now separately
        zs, nr, _, history = pdm.test(hg_coords_and_conf,
                                      return_history=True,
                                      encoder=encoder,
                                      only_encoder=False,
                                      bs=bs)
    else:
        # TODO test() takes pred and conf now separately
        zs, nr, *_ = pdm.test(hg_coords_and_conf,
                              return_history=False,
                              encoder=encoder,
                              only_encoder=False,
                              bs=bs)
        history = None
    l2d, _ = pdm.forward(zs, nr)
    # Detach so the returned landmarks carry no autograd graph
    l2d = l2d.detach()

    return hg_coords, hg_coords_and_conf, gt, l2d, history