# Imports needed by this example; ModelTrainer, FaceLandmarksTrainingData,
# ImageTransform, and similar names are project-specific classes assumed to come
# from the surrounding repository.
import cv2
import torch
from torchvision import transforms


def run_hg(model, images, gpu):
    # Load model
    location = "cpu" if gpu is None else "cuda:%d" % gpu
    data = torch.load(model, map_location=location)
    state_dict = data['state_dict']
    config = data['config']
    net = ModelTrainer.create_net(config, verbose=False)
    net.load_state_dict(state_dict)
    net.eval()
    net.to(location)

    # Define transformations that normalize the image
    normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
    normTransform = transforms.Normalize(normMean, normStd)
    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])
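    # ImageTransform presumably adapts a plain torchvision transform to the dict
    # samples used here ({"image": ...}); a minimal sketch of such a wrapper:
    #
    #   class ImageTransform:
    #       def __init__(self, t):
    #           self.t = t
    #       def __call__(self, sample):
    #           sample["image"] = self.t(sample["image"])
    #           return sample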

    # Read images and apply transformations to them
    imgs = []
    #origimg = []
    for img_fn in images:
        img = cv2.imread(img_fn)[:, :, ::-1]  # BGR -> RGB
        img = cv2.resize(img, (128, 128))
        #origimg.append(img)
        imgs.append(
            transform({"image": torch.tensor(img).permute(2, 0, 1)})["image"])

    # Run the HG on all images at once (may run out of GPU memory if too many are used)
    imgs = torch.stack(imgs).to(location)
    with torch.no_grad():
        predictions, *_ = net(imgs)
    """
    for i in range(len(origimg)):
        d = draw_landmarks(origimg[i][:,:,::-1], res[i].cpu().detach().numpy())
        cv2.imwrite("/tmp/pred_%d.jpg" %i, d)
    """

    return predictions.detach()
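
# Hypothetical usage of run_hg above (checkpoint path and image names are assumptions):
#
#   preds = run_hg("models/hg.torch", ["face_a.jpg", "face_b.jpg"], gpu=None)
#   # preds: one (x, y) coordinate row per landmark and image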
Example #2
def run(*, hg, pdm, data_src, location, hg_bs, encoder=None, verbose=True, random_seed=None, var_thresh=None, menpo=None):
    torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

    if location != 'cpu':
        torch.cuda.set_device(torch.device(location))

    if random_seed is not None:
        make_deterministic(random_seed)

    normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
    normTransform = transforms.Normalize(normMean, normStd)

    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])

    pin_memory = location != 'cpu'
    num_workers = 4

    with h5py.File(data_src, 'r') as f:
        easy_d = FaceLandmarksEasyTestData(f, transform=transform)
        hard_d = FaceLandmarksHardTestData(f, transform=transform)

    easy_loader = DataLoader(dataset=easy_d, shuffle=False, num_workers=num_workers, pin_memory=pin_memory, batch_size=len(easy_d))
    hard_loader = DataLoader(dataset=hard_d, shuffle=False, num_workers=num_workers, pin_memory=pin_memory, batch_size=len(hard_d))
    pipeline = E2E(hg, pdm, hg_bs, max(len(easy_d), len(hard_d)), encoder=encoder, verbose=verbose, var_thresh=var_thresh)
    e2e_results = run_e2e(pipeline, easy_loader, hard_loader, location)

    hg_results = {
        "easy68": e2e_results["easy"]["eval_hg"]["with_outline"],
        "hard68": e2e_results["hard"]["eval_hg"]["with_outline"],
        "easy49": e2e_results["easy"]["eval_hg"]["without_outline"],
        "hard49": e2e_results["hard"]["eval_hg"]["without_outline"]
    }

    pdm_results = {
        "easy68": e2e_results["easy"]["eval_pdm"]["with_outline"],
        "hard68": e2e_results["hard"]["eval_pdm"]["with_outline"],
        "easy49": e2e_results["easy"]["eval_pdm"]["without_outline"],
        "hard49": e2e_results["hard"]["eval_pdm"]["without_outline"]
    }

    if encoder is not None:
        pdm_encoder_results = {
            "easy68": e2e_results["easy"]["eval_pdm_encoder"]["with_outline"],
            "hard68": e2e_results["hard"]["eval_pdm_encoder"]["with_outline"],
            "easy49": e2e_results["easy"]["eval_pdm_encoder"]["without_outline"],
            "hard49": e2e_results["hard"]["eval_pdm_encoder"]["without_outline"]
        }
    else:
        pdm_encoder_results = {k: 10000000.0 for k in ["easy68", "hard68", "easy49", "hard49"]}

    if menpo is not None:
        with h5py.File(menpo, 'r') as f:
            menpo_d = Menpo(f, transform=transform)
            menpo_loader = DataLoader(dataset=menpo_d, shuffle=False, num_workers=num_workers, pin_memory=pin_memory, batch_size=len(menpo_d))

            pipeline = E2E(hg, pdm, hg_bs, len(menpo_d), encoder=encoder, verbose=verbose, var_thresh=var_thresh)
            menpo_res = run_e2e_split(pipeline, menpo_loader, location)
            menpo_gt = menpo_res["gt"]
            menpo_hg_pred = menpo_res["hg_pred"]
            menpo_pdm_pred = menpo_res["pdm_pred"]

            menpo_hg_error = evaluate_menpo(menpo_hg_pred, menpo_gt)
            menpo_pdm_error = evaluate_menpo(menpo_pdm_pred, menpo_gt)

            hg_results["menpo68"] = menpo_hg_error[0]
            hg_results["menpo49"] = menpo_hg_error[1]
            pdm_results["menpo68"] = menpo_pdm_error[0]
            pdm_results["menpo49"] = menpo_pdm_error[1]
    else:
        hg_results["menpo68"] = 10000000.0
        hg_results["menpo49"] = 10000000.0
        pdm_results["menpo68"] = 10000000.0
        pdm_results["menpo49"] = 10000000.0


    res = {
        "hg": hg_results,
        "pdm": pdm_results,
        "pdm_encoder": pdm_encoder_results,
        "gt": {
            "easy": e2e_results["easy"]["gt"],
            "hard": e2e_results["hard"]["gt"]
        },
        "hg_pred": {
            "easy": e2e_results["easy"]["hg_pred"],
            "hard": e2e_results["hard"]["hg_pred"]
        },
        "pdm_pred": {
            "easy": e2e_results["easy"]["pdm_pred"],
            "hard": e2e_results["hard"]["pdm_pred"]
        },
        "pdm_3d": {
            "easy": e2e_results["easy"]["pdm_3d"],
            "hard": e2e_results["hard"]["pdm_3d"]
        },
        "pdm_applied": {
            "easy": e2e_results["easy"]["pdm_applied"],
            "hard": e2e_results["hard"]["pdm_applied"]
        }
    }

    if "pdm_encoder_pred" in e2e_results["easy"]:
        res["pdm_encoder_pred"] = {
            "easy": e2e_results["easy"]["pdm_encoder_pred"],
            "hard": e2e_results["hard"]["pdm_encoder_pred"]
        }
    return res
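
# Hypothetical call, assuming hg and pdm were loaded from checkpoints beforehand
# and data_src points to the 300-W h5 file used throughout these examples:
#
#   res = run(hg=hg, pdm=pdm, data_src="data/300w.h5", location="cuda:0",
#             hg_bs=64, random_seed=42)
#   print(res["hg"]["easy68"], res["pdm"]["hard68"])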
Example #3
    def run(self):
        torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

        self.config["model_dir"] = self.model_dir

        make_deterministic(self.config['random_seed'])

        location = 'cpu' if self.gpu_id is None else "cuda:%d" % self.gpu_id
        if location != 'cpu':
            # This fixes the problem that pytorch is always allocating memory on GPU 0 even if this is not included
            # in the list of GPUs to use
            torch.cuda.set_device(torch.device(location))

            # cudnn.benchmark improves training speed when input sizes do not change
            # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
            # It selects the best algorithms as the training iterates over the dataset
            #cudnn.benchmark = True # but it can cause determinism problems, so disable

        hg, hg_config = self.load_hg(self.config["initial_hg"], location)
        pdm, pdm_config = self.load_pdm(self.config["initial_pdm"], location)

        pdm.verbose = not self.is_gridsearch
        pdm.print_losses = False
        pdm.listener = self.receive_pdm_output

        normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
        normTransform = transforms.Normalize(normMean, normStd)

        jitterTransform = transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)

        transform = transforms.Compose([
            ImageTransform(transforms.ToPILImage()),
            ImageTransform(jitterTransform),
            ImageAndLabelTransform(RandomHorizontalFlip()),
            ImageTransform(transforms.ToTensor()),
            ImageTransform(normTransform)
        ])

        bs = self.config["bs"]
        pin_memory = location != 'cpu'
        num_workers = 8

        with h5py.File(self.config["data"], 'r') as f:
            train_d = FaceLandmarksTrainingData(f, transform=transform)
            train_loader = DataLoader(dataset=train_d, shuffle=self.config["shuffle"], num_workers=num_workers, pin_memory=pin_memory, batch_size=bs)

        results_before = run_e2e(hg, pdm, self.config["data"], location, self.config["bs"], verbose=True)
        if not self.is_gridsearch:
            print("Before training")
            for model, res in results_before.items():
                print(model, res)

        zs, nr, losses = pdm.end2end_training(hg=hg,
                                              data_loader=train_loader,
                                              hg_opt_config=self.config["hg_optimizer"],
                                              pdm_weight_opt_config=self.config["pdm_weight_optimizer"],
                                              pdm_shape_opt_config=self.config["pdm_shape_optimizer"],
                                              training_schedule=self.config["training_schedule"],
                                              detach_confidence=self.config["detach_confidence"])

        plot_path = os.path.join(self.plot_dir, "losses_%d.png" % self.config["config_id"])
        if not self.is_gridsearch: print("save plot to %s" % plot_path)
        fig, ax = plt.subplots()
        ax.plot(losses)
        ax.set(xlabel='epoch', ylabel='loss', title='loss per epoch')
        ax.grid()
        fig.savefig(plot_path)

        if not self.is_gridsearch: print("save HG")
        torch.save({
            'model': 'pe_hourglass',
            'state_dict': hg.state_dict(),
            'config': hg_config
        }, os.path.join(self.model_dir, "%d_hg_e2e.torch" % self.config["config_id"]))

        if not self.is_gridsearch: print("save PDM")
        pdm.save_pdm(pdm.train_epochs, os.path.join(self.model_dir, "%d_pdm_e2e.torch" % self.config["config_id"]))

        results_after = run_e2e(hg, pdm, self.config["data"], location, self.config["bs"], verbose=False)

        if not self.is_gridsearch:
            print("Before training")
            for model, res in results_before.items():
                print(model, res)

            print("After training")
            for model, res in results_after.items():
                print(model, res)

        if self.is_gridsearch:
            logpath = os.path.join(self.result_dir, "%d_log.json" % self.config["config_id"])
            json.dump({
                "gt": self.gts,
                "l2d": self.l2d_log,
                "hg": self.hg_coords_log,
                "losses": self.loss_log
            }, open(logpath, "w"))

            return {
                **self.config,
                "min_loss": min(self.loss_log),
                "last_loss" : self.loss_log[-1],
                "hg_before_easy_with" : results_before["hg"]["easy_woutline"],
                "hg_before_easy_without": results_before["hg"]["easy_noutline"],
                "hg_before_hard_with": results_before["hg"]["hard_woutline"],
                "hg_before_hard_without": results_before["hg"]["hard_noutline"],
                "pdm_before_easy_with": results_before["pdm"]["easy_woutline"],
                "pdm_before_easy_without": results_before["pdm"]["easy_noutline"],
                "pdm_before_hard_with": results_before["pdm"]["hard_woutline"],
                "pdm_before_hard_without": results_before["pdm"]["hard_noutline"],
                "hg_after_easy_with": results_after["hg"]["easy_woutline"],
                "hg_after_easy_without": results_after["hg"]["easy_noutline"],
                "hg_after_hard_with": results_after["hg"]["hard_woutline"],
                "hg_after_hard_without": results_after["hg"]["hard_noutline"],
                "pdm_after_easy_with": results_after["pdm"]["easy_woutline"],
                "pdm_after_easy_without": results_after["pdm"]["easy_noutline"],
                "pdm_after_hard_with": results_after["pdm"]["hard_woutline"],
                "pdm_after_hard_without": results_after["pdm"]["hard_noutline"],
            }
def run_hg(model, image, bb=False):
    location = torch.device("cpu")

    data = torch.load(model, map_location=location)
    state_dict = data['state_dict']
    config = data['config']

    net = ModelTrainer.create_net(config, verbose=False)
    net.load_state_dict(state_dict)
    net.eval()
    net.to(location)

    normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
    normTransform = transforms.Normalize(normMean, normStd)

    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])

    img = cv2.imread(image)
    imgs = []
    positions = []

    # TODO find a way to detect the whole head
    dnnFaceDetector = dlib.cnn_face_detection_model_v1(
        "../other/mmod_human_face_detector.dat")
    faceRects = dnnFaceDetector(img, 0)

    for rect in faceRects:
        x1 = rect.rect.left()
        y1 = rect.rect.top()
        x2 = rect.rect.right()
        y2 = rect.rect.bottom()

        w = x2 - x1
        h = y2 - y1

        # make the crop square by shrinking the longer side
        if h > w:
            diff = h - w
            y2 -= diff
        if w > h:
            diff = w - h
            x2 -= diff

        positions.append((x1, y1, x2, y2))

        face = cv2.resize(img[y1:y2, x1:x2], (128, 128))[:, :, ::-1]
        imgs.append(
            transform({"image": torch.tensor(face).permute(2, 0, 1)})["image"])

        if bb:
            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)

    imgs = torch.stack(imgs)
    out = net(imgs)[0]

    for coords, (x1, y1, x2, y2) in zip(out.detach().numpy(), positions):
        img[y1:y2, x1:x2] = draw_landmarks(img[y1:y2, x1:x2], coords, size=1)

    cv2.imshow("landmarks", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
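
# Hypothetical usage (model path and image are assumptions; opens a window, so a
# display is required):
#
#   run_hg("models/hg.torch", "group_photo.jpg", bb=True)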
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--dataset",
        type=str,
        default="/home/simon/Desktop/InterACT/Masterarbeit/Code/facial_landmarks_from_holmes_ceclm_68_split.h5",
        help="Path to dataset h5 file")
    args = parser.parse_args()

    normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
    normTransform = transforms.Normalize(normMean, normStd)

    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        #ImageAndLabelTransform(RandomHorizontalFlip()),
        #ImageAndLabelTransform(NormalizeRotation()),
        ImageAndLabelTransform(
            RandomRotation(min_angle=-30,
                           max_angle=30,
                           retain_scale=False,
                           rotate_landmarks="neutral")),
        ImageTransform(transforms.ToTensor()),
        #ImageTransform(normTransform)
    ])

    with h5py.File(args.dataset, 'r') as f:
        easy_d = FaceLandmarksEasyTestData(f, transform=transform)
        hard_d = FaceLandmarksHardTestData(f, transform=transform)
        train = FaceLandmarksTrainingData(f, transform=transform)
def visualize(model,
              dataset,
              target,
              gpu=None,
              splits=["easy", "hard"],
              landmarks_in_heatmaps=True):
    location = 'cpu' if gpu is None else "cuda:%d" % gpu
    if location != 'cpu':
        # This fixes the problem that pytorch is always allocating memory on GPU 0 even if this is not included
        # in the list of GPUs to use
        torch.cuda.set_device(torch.device(location))

        # cudnn.benchmark improves training speed when input sizes do not change
        # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
        # It selects the best algorithms as the training iterates over the dataset
        cudnn.benchmark = True

    print("Location: ", location)

    data = torch.load(model, map_location=location)
    state_dict = data['state_dict']
    config = data['config']

    num_workers = multiprocessing.cpu_count()
    batch_size = config['batch_size'] if gpu is not None else num_workers
    pin_memory = gpu is not None

    print("Workers: ", num_workers)
    print("Batchsize: ", batch_size)

    net = ModelTrainer.create_net(config, verbose=False)
    net.load_state_dict(state_dict)
    net.eval()

    net = net.to(location)

    mkdir_if_not_exists(target)

    normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
    normTransform = transforms.Normalize(normMean, normStd)

    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        #ImageAndLabelTransform(RandomHorizontalFlip()),
        #ImageAndLabelTransform(RandomRotation(min_angle=-0, max_angle=0, retain_scale=False)),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])

    with h5py.File(dataset, 'r') as f:
        if "easy" in splits:
            print("Run on easy")
            easy_d = FaceLandmarksEasyTestData(f, transform=transform)
            #print(len(easy_d))
            easy_loader = DataLoader(dataset=easy_d,
                                     shuffle=False,
                                     num_workers=num_workers,
                                     pin_memory=pin_memory,
                                     batch_size=batch_size)
            visualize_split(net,
                            easy_loader,
                            os.path.join(target, "easy"),
                            location,
                            landmarks_in_heatmaps=landmarks_in_heatmaps)

        if "hard" in splits:
            print("Run on hard")
            hard_d = FaceLandmarksHardTestData(f, transform=transform)
            #print(len(hard_d))
            hard_loader = DataLoader(dataset=hard_d,
                                     shuffle=False,
                                     num_workers=num_workers,
                                     pin_memory=pin_memory,
                                     batch_size=batch_size)
            visualize_split(net,
                            hard_loader,
                            os.path.join(target, "hard"),
                            location,
                            landmarks_in_heatmaps=landmarks_in_heatmaps)

        if "train" in splits:
            print("Run on train")
            train = FaceLandmarksTrainingData(f, transform=transform)
            #print(len(train))
            train_loader = DataLoader(dataset=train,
                                      shuffle=False,
                                      num_workers=num_workers,
                                      pin_memory=pin_memory,
                                      batch_size=batch_size)
            visualize_split(net,
                            train_loader,
                            os.path.join(target, "train"),
                            location,
                            landmarks_in_heatmaps=landmarks_in_heatmaps)
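
# Hypothetical usage sketch (paths are assumptions):
#
#   visualize("models/hg.torch", "data/300w.h5", "/tmp/landmark_vis",
#             gpu=0, splits=["easy", "hard", "train"])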
def run(model,
        src_300w,
        src_menpo,
        target,
        gpu=None,
        override_norm_params=False,
        bs_factor=1):
    location = 'cpu' if gpu is None else "cuda:%d" % gpu
    if location != 'cpu':
        # This fixes the problem that pytorch is always allocating memory on GPU 0 even if this is not included
        # in the list of GPUs to use
        torch.cuda.set_device(torch.device(location))

        # cudnn.benchmark improves training speed when input sizes do not change
        # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
        # It selects the best algorithms as the training iterates over the dataset
        #cudnn.benchmark = True # disable for deterministic behavior

    print("Location: ", location)

    data = torch.load(model, map_location=location)
    state_dict = data['state_dict']
    config = data['config']
    n_lm = config["n_lm"]

    if n_lm == 49:
        print("WARNING! THIS IS A 49 LM model!!!!", n_lm)

    num_workers = multiprocessing.cpu_count()
    batch_size = config['batch_size'] * bs_factor if gpu is not None else num_workers
    pin_memory = gpu is not None

    print("Workers: ", num_workers)
    print("Batchsize: ", batch_size)

    net = ModelTrainer.create_net(config, verbose=False)
    net.load_state_dict(state_dict)
    net.eval()

    net.to(location)

    mkdir_if_not_exists(os.path.dirname(target))

    normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD

    if override_norm_params:
        normMean = tuple(
            np.array([133.0255852472676, 101.61684197664563, 87.4134193236219])
            / 255.0)
        normStd = tuple(
            np.array([71.91047346327116, 62.94368776888253, 61.56865329427311])
            / 255.0)

    normTransform = transforms.Normalize(normMean, normStd)

    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])

    with h5py.File(src_300w, 'r') as f:
        print("Run on easy")
        easy_d = FaceLandmarksEasyTestData(f, transform=transform, n_lm=n_lm)
        easy_loader = DataLoader(dataset=easy_d,
                                 shuffle=False,
                                 num_workers=num_workers,
                                 pin_memory=pin_memory,
                                 batch_size=batch_size)
        easy_results = evaluate_split(net,
                                      easy_loader,
                                      location=location,
                                      n_lm=n_lm)

        print("Run on hard")
        hard_d = FaceLandmarksHardTestData(f, transform=transform, n_lm=n_lm)
        hard_loader = DataLoader(dataset=hard_d,
                                 shuffle=False,
                                 num_workers=num_workers,
                                 pin_memory=pin_memory,
                                 batch_size=batch_size)
        hard_results = evaluate_split(net,
                                      hard_loader,
                                      location=location,
                                      n_lm=n_lm)

        print("Run on train")
        train = FaceLandmarksTrainingData(f, transform=transform, n_lm=n_lm)
        train_loader = DataLoader(dataset=train,
                                  shuffle=False,
                                  num_workers=num_workers,
                                  pin_memory=pin_memory,
                                  batch_size=batch_size)
        train_results = evaluate_split(net,
                                       train_loader,
                                       location=location,
                                       n_lm=n_lm)

    with h5py.File(src_menpo, "r") as f:
        print("Run on menpo")
        menpo = Menpo(f, transform=transform, n_lm=n_lm)
        menpo_loader = DataLoader(dataset=menpo,
                                  shuffle=False,
                                  num_workers=num_workers,
                                  pin_memory=pin_memory,
                                  batch_size=batch_size)
        menpo_results = evaluate_split(net,
                                       menpo_loader,
                                       location=location,
                                       n_lm=n_lm)

    res = {
        "easy": easy_results,
        "hard": hard_results,
        "train": train_results,
        "menpo": menpo_results,
        "model_src": model,
        "config": config
    }

    if target is not None:
        json.dump(res, open(target, "w"))
    else:
        return res
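
# Hypothetical usage sketch (paths are assumptions); with target=None the results
# dict is returned instead of being written to JSON:
#
#   run("models/hg.torch", "data/300w.h5", "data/menpo.h5",
#       target="results/eval.json", gpu=0)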
Example #8
    def run(self):
        torch.cuda.empty_cache()

        starttime = time.time()

        if self.gpu_id is not None:
            # cudnn.benchmark improves training speed when input sizes do not change
            # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
            # It selects the best algorithms as the training iterates over the dataset
            # I found no big difference between True and False, but it also doesn't hurt, so enable it
            #cudnn.benchmark = True # disable for deterministic behavior
            pass

        config = self.config
        config_id = config["config_id"]
        n_lm = config["n_lm"]

        make_deterministic(config['random_seed'])
        torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

        jitterTransform = transforms.ColorJitter(brightness=0.4,
                                                 contrast=0.4,
                                                 saturation=0.4,
                                                 hue=0.1)

        # TODO store these values in h5 files
        normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
        normTransform = transforms.Normalize(normMean, normStd)

        rot_angle = float(config['augment_rotation'])
        rotation_augmentation = RandomRotation(min_angle=-1 * rot_angle,
                                               max_angle=rot_angle,
                                               retain_scale=False,
                                               rotate_landmarks="same")

        trainTransform = transforms.Compose([
            ImageTransform(transforms.ToPILImage()),
            ImageTransform(jitterTransform),
            ImageAndLabelTransform(RandomHorizontalFlip()),
            ImageAndLabelTransform(rotation_augmentation),
            ImageTransform(transforms.ToTensor()),
            ImageTransform(normTransform)
        ])

        testTransform = transforms.Compose([
            ImageTransform(transforms.ToPILImage()),
            ImageTransform(transforms.ToTensor()),
            ImageTransform(normTransform)
        ])

        # Note: Reading takes only ~0.2s, so it is okay to do this again whenever main.py is called
        # No need to read in trainer.py and pass results here
        with h5py.File(self.data, 'r') as f:
            train_dataset = FaceLandmarksTrainingData(f,
                                                      transform=trainTransform,
                                                      n_lm=n_lm)
            val_dataset = FaceLandmarksAllTestData(f,
                                                   transform=testTransform,
                                                   n_lm=n_lm)
            easy_d = FaceLandmarksEasyTestData(f,
                                               transform=testTransform,
                                               n_lm=n_lm)
            hard_d = FaceLandmarksHardTestData(f,
                                               transform=testTransform,
                                               n_lm=n_lm)

        print("GPU %d.%d" % (self.gpu_id, self.sub_gpu_id),
              "Data: %s" % self.data,
              "Train %d Test %d" % (len(train_dataset), len(val_dataset)))

        dataloader_params = {
            'batch_size': config['batch_size'],
            'pin_memory': self.gpu_id is not None,
            'num_workers': 8
        }

        train_loader = DataLoader(train_dataset,
                                  shuffle=True,
                                  **dataloader_params)
        val_loader = DataLoader(val_dataset,
                                shuffle=False,
                                **dataloader_params)
        easy = DataLoader(easy_d, shuffle=False, **dataloader_params)
        hard = DataLoader(hard_d, shuffle=False, **dataloader_params)

        net = self.create_net(config)
        _, trainable_parameters, _ = count_parameters(net)
        self.to_gpu(net)
        net.train()  # Put net into train mode

        params = [
            {
                "params": net.hourglass.parameters()
            },
            {
                "params": net.regressor.parameters()
            },
        ]

        if config["predict_distances_weight"] > 0:
            # generate ground truth distances
            y = torch.stack([x["landmarks"] for x in train_dataset])
            bs = y.shape[0]
            n_lm = y.shape[1]
            dist_gt = torch.zeros(bs, n_lm, n_lm, 2)
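            # dist_gt[b, i, j] = y[b, j] - y[b, i]: pairwise offsets between all
            # landmark pairs, built by broadcasting (bs, 1, n_lm) against (bs, n_lm, 1)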
            dist_gt[:, :, :, 0] = y[:, :, 0].view(bs, 1, -1) - y[:, :, 0].view(bs, -1, 1)
            dist_gt[:, :, :, 1] = y[:, :, 1].view(bs, 1, -1) - y[:, :, 1].view(bs, -1, 1)

        optimizer = optim.Adam(params, lr=config['lr'])

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            'min',
            patience=config['lr_scheduler_patience'],
            verbose=True,
            factor=config['lr_decay_factor'])

        early_stopping_patience = config['lr_scheduler_patience'] * 2 + 1
        early_stopping_max_ratio = 0.975
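        # patience is presumably chosen so the LR scheduler can reduce the learning
        # rate at least twice before early stopping triggers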
        should_stop = EarlyStopping(patience=early_stopping_patience,
                                    max_ratio=early_stopping_max_ratio,
                                    verbose=False)

        loss_function = self.get_loss_function(config['regression'],
                                               config['loss_function'])

        category_calculator = {
            "e49": lambda metrics: metrics["e49"],
            "h49": lambda metrics: metrics["h49"],
            "e68": lambda metrics: metrics["e68"],
            "h68": lambda metrics: metrics["h68"],
            "49": lambda metrics: (metrics["e49"] + metrics["h49"]) / 2,
            "68": lambda metrics: (metrics["e68"] + metrics["h68"]) / 2,
            "e": lambda metrics: (metrics["e49"] + metrics["e68"]) / 2,
            "h": lambda metrics: (metrics["h49"] + metrics["h68"]) / 2,
            "all": lambda metrics: (metrics["e49"] + metrics["h49"] + metrics["e68"] + metrics["h68"]) / 4
        }
        categories = category_calculator.keys()
        best_epoch = {k: 0 for k in categories}
        lowest_error = {k: np.inf for k in categories}
        epoch_train_losses = []
        epoch_val_losses = []

        # Only store models that are better than these values to save storage
        storage_thresholds = {"e49": 2.1, "h49": 3.4, "e68": 2.7, "h68": 4.5}
        storage_thresholds["49"] = category_calculator["49"](
            storage_thresholds)
        storage_thresholds["68"] = category_calculator["68"](
            storage_thresholds)
        storage_thresholds["e"] = category_calculator["e"](storage_thresholds)
        storage_thresholds["h"] = category_calculator["h"](storage_thresholds)
        storage_thresholds["all"] = category_calculator["all"](
            storage_thresholds)
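        # e.g. storage_thresholds["49"] = (2.1 + 3.4) / 2 = 2.75 and
        # storage_thresholds["all"] = (2.1 + 3.4 + 2.7 + 4.5) / 4 = 3.175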

        loss_history = {}
        metric_history = []

        dist_loss_fct = nn.L1Loss()

        epochs = config['n_epoch']
        for epoch in range(epochs):
            epoch_start_time = time.time()

            net.train()
            epoch_train_loss = 0
            epoch_sample_count = 0

            for sample in train_loader:
                x = self.to_gpu(sample['image'].float())
                y = self.to_gpu(sample['landmarks'].float())
                if config["predict_distances_weight"] > 0:
                    indices = self.to_gpu(sample['index'])
                    dist_y = self.to_gpu(dist_gt[indices])
                epoch_sample_count += x.shape[0]

                optimizer.zero_grad()

                coords, heatmaps, var, unnormalized_heatmaps = net(x)

                loss = loss_function(coords, heatmaps, y)
                epoch_train_loss += loss.float().data.item()
                if config["normalize_loss"]:
                    if loss.detach().data.item() > 0:
                        loss = loss / loss.detach()

                if config["predict_distances_weight"] > 0:
                    bs = x.shape[0]
                    distance_pred = torch.zeros(bs, n_lm, n_lm, 2)
                    distance_pred[:, :, :, 0] = coords[:, :, 0].view(bs, 1, -1) - coords[:, :, 0].view(bs, -1, 1)
                    distance_pred[:, :, :, 1] = coords[:, :, 1].view(bs, 1, -1) - coords[:, :, 1].view(bs, -1, 1)
                    distance_pred = self.to_gpu(distance_pred)
                    dist_loss = dist_loss_fct(distance_pred, dist_y)
                    loss = loss + config["predict_distances_weight"] * dist_loss / dist_loss.detach()
                else:
                    dist_loss = 0

                if torch.isnan(loss):
                    print_info(
                        "ERROR! Invalid loss (nan). Aborting training for config %d in epoch %d"
                        % (config_id, epoch))
                    raise LossException("loss was nan in config %d, epoch %d" %
                                        (config_id, epoch))
                if torch.isinf(loss):
                    print_info(
                        "ERROR! Invalid loss (inf). Aborting training for config %d in epoch %d"
                        % (config_id, epoch))
                    raise LossException("loss was inf in config %d, epoch %d" %
                                        (config_id, epoch))

                loss.backward()
                optimizer.step()

                #### end batch

            epoch_train_loss /= epoch_sample_count  # normalize loss by images that were processed

            val_loss = self.evaluate_model(val_loader, net, loss_function)
            scheduler.step(val_loss)

            epoch_train_losses.append(epoch_train_loss)
            epoch_val_losses.append(val_loss)
            loss_history[epoch] = {
                'train': epoch_train_losses[-1],
                'val': epoch_val_losses[-1]
            }

            epoch_end_time = time.time()
            epoch_duration = epoch_end_time - epoch_start_time

            metrics = benchmark(net, easy, hard, self.gpu_id)
            all_metrics = {}
            for category, calculator in category_calculator.items():
                error = calculator(metrics)
                all_metrics[category] = error

                # huge sentinel errors are reported for 68-LM metrics when the HG only predicts 49 LM
                if error < lowest_error[category] and error < 1000:
                    lowest_error[category] = error
                    best_epoch[category] = epoch

                    if error < storage_thresholds[category]:
                        torch.save(
                            {
                                'model': 'pe_hourglass',
                                'epoch': epoch + 1,
                                'state_dict': net.state_dict(),
                                'val_loss': val_loss,
                                'config': config,
                                'category': category,
                                'metrics': all_metrics
                            },
                            os.path.join(
                                self.model_dir,
                                "%d_best_%s.torch" % (config_id, category)))
            metric_history.append(all_metrics)

            print(
                "GPU %d.%d" % (self.gpu_id, self.sub_gpu_id),
                "| conf",
                config_id,
                '| %03d/%03d' % (epoch + 1, epochs),
                '| %ds' % (int(epoch_duration)),
                '| train %0.6f' % epoch_train_losses[-1],
                '| val %0.6f' % epoch_val_losses[-1],
                '| dist %0.6f' % float(dist_loss),
                '| e68 %0.2f [B %0.2f]' %
                (metrics["e68"], lowest_error['e68']),
                '| h68 %0.2f [B %0.2f]' %
                (metrics["h68"], lowest_error['h68']),
                '| e49 %0.2f [B %0.2f]' %
                (metrics["e49"], lowest_error['e49']),
                '| h49 %0.2f [B %0.2f]' %
                (metrics["h49"], lowest_error['h49']),
            )

            if should_stop(val_loss):
                epochs = epoch + 1
                print_info(
                    "EarlyStopping (patience = %d, max_ratio=%f) criterion returned true in epoch %d. Stop training"
                    % (should_stop.patience, should_stop.max_ratio, epochs))
                break

        endtime = time.time()

        # Write a loss plot to CONFIG_ID_loss_plot.txt in the output directory
        # TODO tensorboardX in addition to matplotlib?
        x = np.array(range(epochs))
        plt.plot(x, np.array(epoch_train_losses), 'r', label='Train Loss')
        plt.plot(x, np.array(epoch_val_losses), 'b', label='Val Loss')
        plt.xlabel("Epochs")
        plt.ylabel("Avg. Train and Val Loss")
        plt.title("Train and validation loss per epoch")
        plt.legend(loc='best')
        plt.savefig(os.path.join(self.plot_dir,
                                 "%d_loss_plot.png" % config_id))
        plt.close()

        training_duration = int(endtime - starttime)

        best_epochs = {"best_%s_epoch" % k: v for k, v in best_epoch.items()}
        best_errors = {"best_%s" % k: v for k, v in lowest_error.items()}

        results = {
            "config_id": config_id,
            'dataset': self.data,
            "gpu_id": self.gpu_id,
            "duration_seconds": training_duration,
            "last_epoch":
            epochs,  # is different from n_epoch in case of early stopping
            "trainable_parameters": trainable_parameters,
            **self.config,
            "optimizer_name": optimizer.__class__.__name__,
            **best_epochs,
            "training_loss_last_epoch": epoch_train_losses[-1],
            **best_errors
        }

        # Write results to CONFIG_ID_result.json in the output directory
        with open(os.path.join(self.result_dir, "%d_result.json" % config_id),
                  "w") as f:
            to_write = {
                **results, 'loss_history': loss_history,
                'metric_history': metric_history
            }
            json.dump(to_write, f, indent=4)

        torch.cuda.empty_cache()

        return results