def run(self):
    """Fine-tune the hourglass (HG) and the PDM end to end, then save and evaluate both.

    The models named by ``config["initial_hg"]`` and ``config["initial_pdm"]`` are
    loaded, evaluated with ``run_e2e``, trained via ``pdm.end2end_training`` and
    evaluated again. The loss plot and both trained models are written to
    ``self.plot_dir`` / ``self.model_dir``.

    Returns:
        In grid-search mode a dict made of ``self.config`` plus ``min_loss``,
        ``last_loss`` and the HG/PDM errors before and after training; otherwise
        ``None``.
    """
    torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

    self.config["model_dir"] = self.model_dir
    make_deterministic(self.config['random_seed'])
    config_id = self.config["config_id"]
    verbose = not self.is_gridsearch

    location = 'cpu' if self.gpu_id is None else "cuda:%d" % self.gpu_id
    # Compare by value: `is not 'cpu'` tests object identity, which only works by
    # accident of string interning and triggers a SyntaxWarning on Python >= 3.8.
    if location != 'cpu':
        # This fixes the problem that pytorch is always allocating memory on GPU 0 even if this is not included
        # in the list of GPUs to use
        torch.cuda.set_device(torch.device(location))

        # cudnn.benchmark would improve training speed when input sizes do not change
        # (https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936),
        # but it can cause determinism problems, so it stays disabled.

    hg, hg_config = self.load_hg(self.config["initial_hg"], location)
    pdm, _ = self.load_pdm(self.config["initial_pdm"], location)
    pdm.verbose = verbose
    pdm.print_losses = False
    pdm.listener = self.receive_pdm_output

    normalize = transforms.Normalize(FaceLandmarksTrainingData.TRAIN_MEAN,
                                     FaceLandmarksTrainingData.TRAIN_STD)
    jitter = transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)
    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(jitter),
        ImageAndLabelTransform(RandomHorizontalFlip()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normalize)
    ])

    # NOTE(review): assumes the dataset copies what it needs while the file is open - confirm
    with h5py.File(self.config["data"], 'r') as f:
        train_d = FaceLandmarksTrainingData(f, transform=transform)
    train_loader = DataLoader(dataset=train_d,
                              shuffle=self.config["shuffle"],
                              num_workers=8,
                              pin_memory=location != 'cpu',
                              batch_size=self.config["bs"])

    results_before = run_e2e(hg, pdm, self.config["data"], location, self.config["bs"], verbose=True)
    if verbose:
        print("Before training")
        for model, res in results_before.items():
            print(model, res)

    _, _, losses = pdm.end2end_training(hg=hg,
                                        data_loader=train_loader,
                                        hg_opt_config=self.config["hg_optimizer"],
                                        pdm_weight_opt_config=self.config["pdm_weight_optimizer"],
                                        pdm_shape_opt_config=self.config["pdm_shape_optimizer"],
                                        training_schedule=self.config["training_schedule"],
                                        detach_confidence=self.config["detach_confidence"])

    plot_path = os.path.join(self.plot_dir, "losses_%d.png" % config_id)
    if verbose:
        print("save plot to %s" % plot_path)
    fig, ax = plt.subplots()
    ax.plot(losses)
    ax.set(xlabel='epoch', ylabel='loss', title='loss per epoch')
    ax.grid()
    fig.savefig(plot_path)
    plt.close(fig)  # release the figure; otherwise every run keeps one alive

    if verbose:
        print("save HG")
    torch.save({
        'model': 'pe_hourglass',
        'state_dict': hg.state_dict(),
        'config': hg_config
    }, os.path.join(self.model_dir, "%d_hg_e2e.torch" % config_id))

    if verbose:
        print("save PDM")
    pdm.save_pdm(pdm.train_epochs, os.path.join(self.model_dir, "%d_pdm_e2e.torch" % config_id))

    results_after = run_e2e(hg, pdm, self.config["data"], location, self.config["bs"], verbose=False)

    if verbose:
        print("Before training")
        for model, res in results_before.items():
            print(model, res)

        print("After training")
        for model, res in results_after.items():
            print(model, res)

    # NOTE(review): the return is assumed to belong to the grid-search branch - confirm
    if self.is_gridsearch:
        logpath = os.path.join(self.result_dir, "%d_log.json" % config_id)
        with open(logpath, "w") as log_file:
            json.dump({
                "gt": self.gts,
                "l2d": self.l2d_log,
                "hg": self.hg_coords_log,
                "losses": self.loss_log
            }, log_file)

        summary = {
            **self.config,
            "min_loss": min(self.loss_log),
            "last_loss": self.loss_log[-1],
        }
        # Keys look like "hg_before_easy_with"; insertion order matches the
        # before/after -> hg/pdm -> easy/hard -> with/without layout.
        for phase, phase_results in (("before", results_before), ("after", results_after)):
            for model in ("hg", "pdm"):
                for difficulty in ("easy", "hard"):
                    for suffix, outline in (("with", "woutline"), ("without", "noutline")):
                        key = "%s_%s_%s_%s" % (model, phase, difficulty, suffix)
                        summary[key] = phase_results[model]["%s_%s" % (difficulty, outline)]
        return summary
def _compare_hg_with_pdm(net, samples, location):
    """Fit the PDM to one split of HG predictions and rank HG vs. PDM per sample.

    Args:
        net: PDM wrapper providing ``test`` and ``forward``.
        samples: one list per image holding one dict per landmark with the keys
            ``gt_x``, ``gt_y``, ``pred_x``, ``pred_y``, ``var_x`` and ``var_y``.
        location: device string on which the tensors are created.

    Returns:
        dict with the tensors ``gt``, ``hg_pred`` (x, y, conf_x, conf_y) and ``l2d``
        (PDM output), the ``winners`` Counter, and the per-sample ``best_coords`` /
        ``worst_coords`` lists (lower mean squared error wins, ties go to the PDM).
    """
    gt = torch.tensor([[[lm["gt_x"], lm["gt_y"]] for lm in sample] for sample in samples],
                      device=location)
    # The confidence handed to the PDM is the inverse of the HG variance.
    hg_pred = torch.tensor(
        [[[lm["pred_x"], lm["pred_y"], 1 / lm["var_x"], 1 / lm["var_y"]] for lm in sample]
         for sample in samples],
        device=location)

    n_samples = gt.shape[0]
    gt_np = gt.cpu().numpy()
    hg_np = hg_pred[:, :, :2].cpu().numpy()
    hg_losses = [np.mean((hg_np[i] - gt_np[i])**2) for i in range(n_samples)]

    # TODO test() takes pred and conf now separately
    zs, nr, *_ = net.test(hg_pred, verbose=True)
    l2d, _ = net.forward(zs, nr)
    l2d_np = l2d.detach().cpu().numpy()
    pdm_losses = [np.mean((l2d_np[i] - gt_np[i])**2) for i in range(n_samples)]

    winners = Counter()
    best_coords = []
    worst_coords = []
    for i, (pdm_loss, hg_loss) in enumerate(zip(pdm_losses, hg_losses)):
        pdm_coords = l2d_np[i].tolist()
        hg_coords = hg_np[i].tolist()
        if pdm_loss <= hg_loss:
            winners["pdm"] += 1
            best_coords.append(pdm_coords)
            worst_coords.append(hg_coords)
        else:
            winners["hg"] += 1
            best_coords.append(hg_coords)
            worst_coords.append(pdm_coords)

    return {
        "gt": gt,
        "hg_pred": hg_pred,
        "l2d": l2d,
        "winners": winners,
        "best_coords": best_coords,
        "worst_coords": worst_coords
    }


def _report_split(title, winners_label, split):
    """Evaluate and print the HG / best pick / worst pick / PDM errors of one split."""
    gt = split["gt"]
    hg_eval = evaluate(split["hg_pred"][:, :, :2], gt)
    all_pdm_eval = evaluate(split["l2d"], gt)
    best_pick = evaluate(torch.tensor(split["best_coords"], dtype=torch.float32).cpu(), gt.cpu())
    worst_pick = evaluate(torch.tensor(split["worst_coords"], dtype=torch.float32).cpu(), gt.cpu())

    for outline in ("without_outline", "with_outline"):
        print("\n---- %s %s----" % (title, outline.replace("_", " ")))
        print("HG \t\t %0.4f" % hg_eval[outline])
        print("best pick \t %0.4f" % best_pick[outline])
        print("worst pick \t %0.4f" % worst_pick[outline])
        print("all PDM \t %0.4f" % all_pdm_eval[outline])

    print(winners_label, split["winners"])


def run(pdm, hg_results, gpu):
    """Compare stored HG predictions with their PDM reconstruction on the easy and hard split.

    Args:
        pdm: path of a PDM checkpoint containing ``state_dict`` and ``config``.
        hg_results: path of a JSON file with ``easy`` and ``hard`` entries, each
            providing ``results`` whose items carry ``coord_and_conf``.
        gpu: CUDA device index, or ``None`` to run on the CPU.

    Prints the errors of the HG, the PDM and the per-sample best/worst pick for
    both splits; returns nothing.
    """
    location = 'cpu' if gpu is None else "cuda:%d" % gpu

    data = torch.load(pdm, map_location='cpu')
    state_dict = data['state_dict']
    config = data['config']

    make_deterministic(config['random_seed'])

    net = ModelTrainer.create_net(config)
    net.model.load_state_dict(state_dict)
    net.model.eval()
    net.to(location)
    net.bs *= 256

    # Read through a context manager so the file handle is closed deterministically.
    with open(hg_results, "r") as f:
        hg_out = json.load(f)

    easy = [x["coord_and_conf"] for x in hg_out["easy"]["results"]]
    hard = [x["coord_and_conf"] for x in hg_out["hard"]["results"]]

    # Run both splits through the PDM first and report afterwards, so the verbose
    # output of net.test() is not interleaved with the result tables.
    easy_split = _compare_hg_with_pdm(net, easy, location)
    hard_split = _compare_hg_with_pdm(net, hard, location)

    _report_split("EASY", "easy best", easy_split)
    _report_split("HARD", "hard_best", hard_split)
def run(*, hg, pdm, data_src, location, hg_bs, encoder=None, verbose=True, random_seed=None, var_thresh=None, menpo=None):
    """Evaluate an HG + PDM pipeline on the easy and hard test sets (and optionally Menpo).

    Args:
        hg: hourglass model passed on to ``E2E``.
        pdm: PDM passed on to ``E2E``.
        data_src: path of the HDF5 file holding the easy/hard test data.
        location: device string, ``'cpu'`` or e.g. ``'cuda:0'``.
        hg_bs: hourglass batch size passed on to ``E2E``.
        encoder: optional encoder; enables the ``eval_pdm_encoder`` results.
        verbose: passed on to ``E2E``.
        random_seed: if given, seeds the RNGs via ``make_deterministic``.
        var_thresh: passed on to ``E2E``.
        menpo: optional path of the Menpo HDF5 file.

    Returns:
        dict with the error dicts ``hg``, ``pdm`` and ``pdm_encoder`` (keys
        ``easy68``, ``hard68``, ``easy49``, ``hard49``; ``hg``/``pdm`` additionally
        ``menpo68``/``menpo49``) and the per-split entries ``gt``, ``hg_pred``,
        ``pdm_pred``, ``pdm_3d``, ``pdm_applied`` and, if available,
        ``pdm_encoder_pred``. Results that were not computed are 10000000.0.
    """
    not_computed = 10000000.0  # placeholder error for results that were not evaluated

    torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

    # Compare by value; `is not 'cpu'` relied on string interning.
    if location != 'cpu':
        torch.cuda.set_device(torch.device(location))

    if random_seed is not None:
        make_deterministic(random_seed)

    normalize = transforms.Normalize(FaceLandmarksTrainingData.TRAIN_MEAN,
                                     FaceLandmarksTrainingData.TRAIN_STD)
    transform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normalize)
    ])

    loader_args = {
        "shuffle": False,
        "num_workers": 4,
        "pin_memory": location != 'cpu'
    }

    # NOTE(review): assumes the datasets copy what they need while the file is open - confirm
    with h5py.File(data_src, 'r') as f:
        easy_d = FaceLandmarksEasyTestData(f, transform=transform)
        hard_d = FaceLandmarksHardTestData(f, transform=transform)

    # Each split is processed as one single batch.
    easy_loader = DataLoader(dataset=easy_d, batch_size=len(easy_d), **loader_args)
    hard_loader = DataLoader(dataset=hard_d, batch_size=len(hard_d), **loader_args)

    pipeline = E2E(hg, pdm, hg_bs, max(len(easy_d), len(hard_d)),
                   encoder=encoder, verbose=verbose, var_thresh=var_thresh)
    e2e_results = run_e2e(pipeline, easy_loader, hard_loader, location)

    def errors_of(eval_key):
        # 68 = with outline landmarks, 49 = without
        return {
            "easy68": e2e_results["easy"][eval_key]["with_outline"],
            "hard68": e2e_results["hard"][eval_key]["with_outline"],
            "easy49": e2e_results["easy"][eval_key]["without_outline"],
            "hard49": e2e_results["hard"][eval_key]["without_outline"]
        }

    hg_results = errors_of("eval_hg")
    pdm_results = errors_of("eval_pdm")
    if encoder is not None:
        pdm_encoder_results = errors_of("eval_pdm_encoder")
    else:
        pdm_encoder_results = {k: not_computed for k in ["easy68", "hard68", "easy49", "hard49"]}

    if menpo is not None:
        # Open the path given by the `menpo` parameter; the previous `args.menpo`
        # referred to a global that this function never receives.
        with h5py.File(menpo, 'r') as f:
            menpo_d = Menpo(f, transform=transform)
        menpo_loader = DataLoader(dataset=menpo_d, batch_size=len(menpo_d), **loader_args)

        pipeline = E2E(hg, pdm, hg_bs, len(menpo_d),
                       encoder=encoder, verbose=verbose, var_thresh=var_thresh)
        menpo_res = run_e2e_split(pipeline, menpo_loader, location)
        menpo_gt = menpo_res["gt"]
        menpo_hg_error = evaluate_menpo(menpo_res["hg_pred"], menpo_gt)
        menpo_pdm_error = evaluate_menpo(menpo_res["pdm_pred"], menpo_gt)

        hg_results["menpo68"] = menpo_hg_error[0]
        hg_results["menpo49"] = menpo_hg_error[1]
        pdm_results["menpo68"] = menpo_pdm_error[0]
        pdm_results["menpo49"] = menpo_pdm_error[1]
    else:
        hg_results["menpo68"] = not_computed
        hg_results["menpo49"] = not_computed
        pdm_results["menpo68"] = not_computed
        pdm_results["menpo49"] = not_computed

    res = {
        "hg": hg_results,
        "pdm": pdm_results,
        "pdm_encoder": pdm_encoder_results
    }
    for key in ("gt", "hg_pred", "pdm_pred", "pdm_3d", "pdm_applied"):
        res[key] = {
            "easy": e2e_results["easy"][key],
            "hard": e2e_results["hard"][key]
        }

    if "pdm_encoder_pred" in e2e_results["easy"]:
        res["pdm_encoder_pred"] = {
            "easy": e2e_results["easy"]["pdm_encoder_pred"],
            "hard": e2e_results["hard"]["pdm_encoder_pred"]
        }

    return res
def run(self):
    """Evaluate one HG/PDM combination and report how the PDM changes the HG error.

    ``load_and_run`` is retried with a growing, randomized pause while it fails
    with a CUDA out-of-memory ``RuntimeError``; any other ``RuntimeError``, or
    an out-of-memory error after the retry budget is used up, is re-raised.

    Returns:
        In grid-search mode ``self.config`` merged with the result dict. Otherwise
        ``None``: the results are printed and, if ``config["prediction_target"]``
        is set, written to that path as JSON together with the predictions.
    """
    torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier
    make_deterministic(self.config['random_seed'])

    encoders = None
    if self.config["encoder"]:
        # This assumes that the encoders have already been trained for the PDM
        # Example: pdm path is my/dir/models/pdm_4.torch
        # Then the encoders are loaded from my/dir/encoders/encoder_49_4.torch and
        # my/dir/encoders/encoder_68_4.torch (if they do not exist, the code crashes)
        pdm_filename = os.path.basename(self.config["pdm"])
        if "final" in pdm_filename:
            pdm_id = int(pdm_filename.split(".")[0].split("_")[-1])
        else:
            pdm_id = int(pdm_filename.split("_")[0])

        encoder_dir = os.path.join(os.path.dirname(os.path.dirname(self.config["pdm"])), "encoders")
        encoders = {
            n_lm: os.path.join(encoder_dir, "encoder_%d_%d.torch" % (n_lm, pdm_id))
            for n_lm in (49, 68)
        }
        if not self.is_gridsearch:
            print("encoder", encoders)

    pred_target = self.config.get("prediction_target")
    if pred_target is not None:
        mkdir_if_not_exists(os.path.dirname(pred_target))

    tries = 0
    maxtries = 75
    while True:
        tries += 1
        try:
            res, hg_config, pdm_config = load_and_run(
                hg_src=self.config["hg"],
                pdm_src=self.config["pdm"],
                data_src=self.data,
                gpu_id=self.gpu_id,
                random_seed=self.config["random_seed"],
                pdm_configurator=self.configure_pdm,
                verbose=not self.is_gridsearch,
                var_thresh=self.config["variance_threshold"],
                encoders=encoders)
            break
        except RuntimeError as e:
            if "out of memory" not in str(e):
                raise
            if tries > maxtries:
                print("ERROR! maxtries (%d) exceeded" % maxtries)
                raise
            # Other jobs may share the GPU: back off for a randomized, growing time.
            waittime = tries * random.randint(1, 5)
            print("ERROR! There was a OOM error, wait %d seconds and try again. Try nr. %d" %
                  (waittime, tries))
            time.sleep(waittime)

    categories = ("easy49", "hard49", "easy68", "hard68")
    results = {}
    for model in ("hg", "pdm", "pdm_encoder"):
        for category in categories:
            results["%s_%s" % (model, category)] = res[model][category]
    # factor > 1 means that the PDM output has a lower error than the HG
    for category in categories:
        results["%s_factor" % category] = res["hg"][category] / res["pdm"][category]
    for category in categories:
        results["enc_%s_factor" % category] = (
            res["hg"][category] / res["pdm_encoder"][category] if self.config["encoder"] else 0.0)

    print("Config: %d | factor e49: %0.4f | factor h49: %0.4f | factor e68: %0.4f | factor h68: %0.4f" %
          (self.config["config_id"], results["easy49_factor"], results["hard49_factor"],
           results["easy68_factor"], results["hard68_factor"]))

    if self.is_gridsearch:
        return {**self.config, **results}

    for k, v in results.items():
        print(k, v)

    if pred_target:

        def to_list(tensor):
            return tensor.cpu().detach().numpy().tolist()

        predictions = {
            split: {key: to_list(res[key][split]) for key in ("gt", "pdm_pred", "hg_pred", "pdm_3d")}
            for split in ("easy", "hard")
        }
        if "pdm_encoder_pred" in res:
            for split in ("easy", "hard"):
                predictions[split]["pdm_encoder_pred"] = to_list(res["pdm_encoder_pred"][split])

        output = {
            "meta": {
                "hg_model": self.config["hg"],
                "pdm_model": self.config["pdm"],
                "hg_config": hg_config,
                "pdm_config": pdm_config,
                "gapsearch_config": self.config
            },
            "results": results,
            "predictions": predictions
        }

        # Context manager guarantees that the JSON is flushed and the handle closed.
        with open(pred_target, "w") as f:
            json.dump(output, f, indent=2)
        print("Predictions written to", pred_target)
def run(self):
    """Train a PDM on the 300W landmarks (optionally plus Multi-PIE) and store the result.

    The fitted shape parameters, the reconstructions and the input coordinates of
    the train and test data are written to ``zs_and_nr_<config_id>.json`` in
    ``self.result_dir``; the model goes to ``final_pdm_<config_id>.torch`` in
    ``self.model_dir``.

    Returns:
        In grid-search mode a dict made of ``self.config`` plus loss statistics,
        the logged best epochs/errors and the train/test reconstruction errors.
        Otherwise ``None``; the metrics of ``pdm.eval_on_alpha_hg()`` are printed.
    """
    self.config["model_dir"] = self.model_dir
    make_deterministic(self.config['random_seed'])

    pdm = ModelTrainer.create_net(self.config)
    self.to_gpu(pdm)
    pdm.verbose = not self.is_gridsearch
    pdm.listener = self.receive_pdm_output

    # torch.tensor() copies the data, so the file can be closed right after reading.
    with h5py.File(self.data, "r") as dt:
        data_tr = self.to_gpu(torch.tensor(dt["300W"]["train_y"], dtype=torch.float32))
        data_te = self.to_gpu(torch.tensor(dt["300W"]["test_y"], dtype=torch.float32))

        if self.config["add_multipie"]:
            multipie = self.to_gpu(torch.tensor(dt["multipie"]["train_y"], dtype=torch.float32))
            data_tr = torch.cat((data_tr, multipie))

    zs_tr, nr_tr, _ = pdm.train(data=data_tr)
    train_reconstructed, _ = pdm.forward(zs_tr, nr_tr)

    zs_te, nr_te, *_ = pdm.test(data=data_te, confidence=None)
    test_reconstructed, _ = pdm.forward(zs_te, nr_te)

    def to_list(tensor):
        return tensor.detach().cpu().numpy().tolist()

    target_file = os.path.join(self.result_dir, "zs_and_nr_%d.json" % self.config["config_id"])
    with open(target_file, "w") as f:
        json.dump(
            {
                "train": {
                    "zs": to_list(zs_tr),
                    "nr": to_list(nr_tr),
                    "reconstructed": to_list(train_reconstructed),
                    "coords": to_list(data_tr)
                },
                "test": {
                    "zs": to_list(zs_te),
                    "nr": to_list(nr_te),
                    "reconstructed": to_list(test_reconstructed),
                    "coords": to_list(data_te)
                }
            }, f)

    pdm.save_pdm(pdm.train_epochs,
                 os.path.join(self.model_dir, "final_pdm_%d.torch" % self.config["config_id"]))

    # TODO train ENCODERS DIRECTLY HERE

    if self.is_gridsearch:
        train_losses = self.loss_log["train"]
        last_train_loss = train_losses[-1]
        lowest_train_loss = min(train_losses)
        # first epoch that reached the lowest loss
        best_train_epoch = min(i for i, loss in enumerate(train_losses) if loss == lowest_train_loss)

        train_error = evaluate(train_reconstructed, data_tr)
        test_error = evaluate(test_reconstructed, data_te)

        best_epochs = {"best_%s_epoch" % k: v for k, v in self.best_epoch.items()}
        best_errors = {"best_%s" % k: v for k, v in self.lowest_error.items()}

        return {
            **self.config,
            "last_train_loss": last_train_loss,
            "lowest_train_loss": lowest_train_loss,
            "best_train_epoch": best_train_epoch,
            "metrics_log": self.metrics_log,
            **best_epochs,
            **best_errors,
            "train_error_49": train_error["without_outline"],
            "train_error_68": train_error["with_outline"],
            "test_error_49": test_error["without_outline"],
            "test_error_68": test_error["with_outline"]
        }
    else:
        # evaluate PDM
        metrics = pdm.eval_on_alpha_hg()
        print(metrics["easy_metrics_last"])
        print(metrics["hard_metrics_last"])
def run(self):
    """Train the hourglass network for one configuration and log its results.

    Per epoch the network is trained on the augmented training data, validated
    with ``self.evaluate_model`` (the value drives the LR scheduler and early
    stopping) and benchmarked on the easy/hard test sets. A model is stored as
    ``<config_id>_best_<category>.torch`` whenever it sets a new best error for a
    category and also beats that category's storage threshold. Finally a loss plot
    and ``<config_id>_result.json`` are written.

    Returns:
        dict with the configuration, training duration, last epoch, number of
        trainable parameters, last training loss and the best epoch/error per
        category.

    Raises:
        LossException: if the training loss becomes nan or inf.
    """
    torch.cuda.empty_cache()
    starttime = time.time()

    # cudnn.benchmark would improve training speed when input sizes do not change
    # (https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936),
    # but it stays disabled for deterministic behavior.

    config = self.config
    config_id = config["config_id"]
    n_lm = config["n_lm"]

    # %s instead of %d: gpu_id is None when running on the CPU and "%d" % None raises
    gpu_label = "GPU %s.%s" % (self.gpu_id, self.sub_gpu_id)

    make_deterministic(config['random_seed'])
    torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

    jitterTransform = transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1)

    # TODO store these values in h5 files
    normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
    normTransform = transforms.Normalize(normMean, normStd)

    rot_angle = float(config['augment_rotation'])
    rotation_augmentation = RandomRotation(min_angle=-1 * rot_angle,
                                           max_angle=rot_angle,
                                           retain_scale=False,
                                           rotate_landmarks="same")

    trainTransform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(jitterTransform),
        ImageAndLabelTransform(RandomHorizontalFlip()),
        ImageAndLabelTransform(rotation_augmentation),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])

    testTransform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])

    # Note: Reading takes only ~0.2s, so it is okay to do this again whenever main.py is called
    # No need to read in trainer.py and pass results here
    # NOTE(review): assumes the datasets copy what they need while the file is open - confirm
    with h5py.File(self.data, 'r') as f:
        train_dataset = FaceLandmarksTrainingData(f, transform=trainTransform, n_lm=n_lm)
        val_dataset = FaceLandmarksAllTestData(f, transform=testTransform, n_lm=n_lm)
        easy_d = FaceLandmarksEasyTestData(f, transform=testTransform, n_lm=n_lm)
        hard_d = FaceLandmarksHardTestData(f, transform=testTransform, n_lm=n_lm)

    print(gpu_label, "Data: %s" % self.data,
          "Train %d Test %d" % (len(train_dataset), len(val_dataset)))

    dataloader_params = {
        'batch_size': config['batch_size'],
        'pin_memory': self.gpu_id is not None,
        'num_workers': 8
    }

    train_loader = DataLoader(train_dataset, shuffle=True, **dataloader_params)
    val_loader = DataLoader(val_dataset, shuffle=False, **dataloader_params)
    easy = DataLoader(easy_d, shuffle=False, **dataloader_params)
    hard = DataLoader(hard_d, shuffle=False, **dataloader_params)

    net = self.create_net(config)
    _, trainable_parameters, _ = count_parameters(net)
    self.to_gpu(net)
    net.train()  # Put net into train mode

    params = [
        {"params": net.hourglass.parameters()},
        {"params": net.regressor.parameters()},
    ]

    if config["predict_distances_weight"] > 0:
        # generate ground truth distances: pairwise x/y offsets between all landmarks
        y = torch.stack([x["landmarks"] for x in train_dataset])
        bs = y.shape[0]
        n_lm = y.shape[1]
        dist_gt = torch.zeros(bs, n_lm, n_lm, 2)
        dist_gt[:, :, :, 0] = y[:, :, 0].view(bs, 1, -1) - y[:, :, 0].view(bs, -1, 1)
        dist_gt[:, :, :, 1] = y[:, :, 1].view(bs, 1, -1) - y[:, :, 1].view(bs, -1, 1)

    optimizer = optim.Adam(params, lr=config['lr'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        'min',
        patience=config['lr_scheduler_patience'],
        verbose=True,
        factor=config['lr_decay_factor'])

    early_stopping_patience = config['lr_scheduler_patience'] * 2 + 1
    early_stopping_max_ratio = 0.975
    should_stop = EarlyStopping(patience=early_stopping_patience,
                                max_ratio=early_stopping_max_ratio,
                                verbose=False)

    loss_function = self.get_loss_function(config['regression'], config['loss_function'])

    # e/h = easy/hard test set, 49/68 = number of landmarks; the remaining
    # categories are averages over the four basic metrics.
    category_calculator = {
        "e49": lambda metrics: metrics["e49"],
        "h49": lambda metrics: metrics["h49"],
        "e68": lambda metrics: metrics["e68"],
        "h68": lambda metrics: metrics["h68"],
        "49": lambda metrics: (metrics["e49"] + metrics["h49"]) / 2,
        "68": lambda metrics: (metrics["e68"] + metrics["h68"]) / 2,
        "e": lambda metrics: (metrics["e49"] + metrics["e68"]) / 2,
        "h": lambda metrics: (metrics["h49"] + metrics["h68"]) / 2,
        "all": lambda metrics: (metrics["e49"] + metrics["h49"] + metrics["e68"] + metrics["h68"]) / 4
    }
    categories = category_calculator.keys()
    best_epoch = {k: 0 for k in categories}
    # np.inf instead of np.Inf: the capitalized alias was removed in NumPy 2.0
    lowest_error = {k: np.inf for k in categories}
    epoch_train_losses = []
    epoch_val_losses = []

    # Only store models that are better than these values to save storage
    storage_thresholds = {"e49": 2.1, "h49": 3.4, "e68": 2.7, "h68": 4.5}
    for derived in ("49", "68", "e", "h", "all"):
        storage_thresholds[derived] = category_calculator[derived](storage_thresholds)

    loss_history = {}
    metric_history = []

    dist_loss_fct = nn.L1Loss()
    epochs = config['n_epoch']
    for epoch in range(epochs):
        epoch_start_time = time.time()

        net.train()
        epoch_train_loss = 0
        epoch_sample_count = 0

        for sample in train_loader:
            x = self.to_gpu(sample['image'].float())
            y = self.to_gpu(sample['landmarks'].float())
            if config["predict_distances_weight"] > 0:
                indices = self.to_gpu(sample['index'])
                dist_y = self.to_gpu(dist_gt[indices])
            epoch_sample_count += x.shape[0]

            optimizer.zero_grad()

            coords, heatmaps, var, unnormalized_heatmaps = net(x)

            loss = loss_function(coords, heatmaps, y)
            epoch_train_loss += loss.float().data.item()
            if config["normalize_loss"]:
                # Scale the loss to 1 so that only the gradient direction is kept
                if loss.detach().data.item() > 0:
                    loss = loss / loss.detach()

            if config["predict_distances_weight"] > 0:
                bs = x.shape[0]
                distance_pred = torch.zeros(bs, n_lm, n_lm, 2)
                distance_pred[:, :, :, 0] = coords[:, :, 0].view(bs, 1, -1) - coords[:, :, 0].view(bs, -1, 1)
                distance_pred[:, :, :, 1] = coords[:, :, 1].view(bs, 1, -1) - coords[:, :, 1].view(bs, -1, 1)
                distance_pred = self.to_gpu(distance_pred)
                dist_loss = dist_loss_fct(distance_pred, dist_y)
                loss = loss + config["predict_distances_weight"] * dist_loss / dist_loss.detach()
            else:
                dist_loss = 0

            if torch.isnan(loss):
                print_info("ERROR! Invalid loss (nan). Aborting training for config %d in epoch %d" %
                           (config_id, epoch))
                raise LossException("loss was nan in config %d, epoch %d" % (config_id, epoch))
            if torch.isinf(loss):
                print_info("ERROR! Invalid loss (inf). Aborting training for config %d in epoch %d" %
                           (config_id, epoch))
                raise LossException("loss was inf in config %d, epoch %d" % (config_id, epoch))

            loss.backward()
            optimizer.step()
            #### end batch

        epoch_train_loss /= epoch_sample_count  # normalize loss by images that were processed

        val_loss = self.evaluate_model(val_loader, net, loss_function)
        scheduler.step(val_loss)

        epoch_train_losses.append(epoch_train_loss)
        epoch_val_losses.append(val_loss)
        loss_history[epoch] = {
            'train': epoch_train_losses[-1],
            'val': epoch_val_losses[-1]
        }

        epoch_end_time = time.time()
        epoch_duration = epoch_end_time - epoch_start_time

        metrics = benchmark(net, easy, hard, self.gpu_id)
        all_metrics = {}
        for category, calculator in category_calculator.items():
            error = calculator(metrics)
            all_metrics[category] = error

            # 100000 is the error for with outline when HG only has 49LM
            if error < lowest_error[category] and error < 1000:
                lowest_error[category] = error
                best_epoch[category] = epoch

                if error < storage_thresholds[category]:
                    torch.save(
                        {
                            'model': 'pe_hourglass',
                            'epoch': epoch + 1,
                            'state_dict': net.state_dict(),
                            'val_loss': val_loss,
                            'config': config,
                            'category': category,
                            'metrics': all_metrics
                        },
                        os.path.join(self.model_dir, "%d_best_%s.torch" % (config_id, category)))
        metric_history.append(all_metrics)

        print(
            gpu_label,
            "| conf", config_id,
            '| %03d/%03d' % (epoch + 1, epochs),
            '| %ds' % (int(epoch_duration)),
            '| train %0.6f' % epoch_train_losses[-1],
            '| val %0.6f' % epoch_val_losses[-1],
            '| dist %0.6f' % float(dist_loss),
            '| e68 %0.2f [B %0.2f]' % (metrics["e68"], lowest_error['e68']),
            '| h68 %0.2f [B %0.2f]' % (metrics["h68"], lowest_error['h68']),
            '| e49 %0.2f [B %0.2f]' % (metrics["e49"], lowest_error['e49']),
            '| h49 %0.2f [B %0.2f]' % (metrics["h49"], lowest_error['h49']),
        )

        if should_stop(val_loss):
            epochs = epoch + 1
            print_info(
                "EarlyStopping (patience = %d, max_ratio=%f) criterion returned true in epoch %d. Stop training"
                % (should_stop.patience, should_stop.max_ratio, epochs))
            break

    endtime = time.time()

    # Write a loss plot to CONFIG_ID_loss_plot.png in the plot directory
    # TODO tensorboardX in addition to matplotlib?
    x = np.array(range(epochs))
    plt.plot(x, np.array(epoch_train_losses), 'r', label='Train Loss')
    plt.plot(x, np.array(epoch_val_losses), 'b', label='Val Loss')
    plt.xlabel("Epochs")
    plt.ylabel("Avg. Train and Val Loss")
    plt.title("Variation of train and Val loss with epochs")
    plt.legend(loc='best')
    plt.savefig(os.path.join(self.plot_dir, "%d_loss_plot.png" % config_id))
    plt.close()

    training_duration = int(endtime - starttime)

    best_epochs = {"best_%s_epoch" % k: v for k, v in best_epoch.items()}
    best_errors = {"best_%s" % k: v for k, v in lowest_error.items()}

    results = {
        "config_id": config_id,
        'dataset': self.data,
        "gpu_id": self.gpu_id,
        "duration_seconds": training_duration,
        "last_epoch": epochs,  # is different from n_epoch in case of early stopping
        "trainable_parameters": trainable_parameters,
        **self.config,
        "optimizer_name": optimizer.__class__.__name__,
        **best_epochs,
        "training_loss_last_epoch": epoch_train_losses[-1],
        **best_errors
    }

    # Write results to CONFIG_ID_result.json in the output directory
    with open(os.path.join(self.result_dir, "%d_result.json" % config_id), "w") as f:
        to_write = {
            **results,
            'loss_history': loss_history,
            'metric_history': metric_history
        }
        json.dump(to_write, f, indent=4)

    torch.cuda.empty_cache()

    return results
def run_pdm(pdm_path, hg_results, location=torch.device("cpu"), bs=512, encoder=None, history=False):
    """Load a stored PDM and fit it to a list of hourglass (HG) predictions.

    Args:
        pdm_path: path of a PDM checkpoint containing ``state_dict`` and ``config``.
        hg_results: iterable of samples; each sample's ``coord_and_conf`` lists one
            dict per landmark with the keys ``pred_x``, ``pred_y``, ``gt_x``,
            ``gt_y``, ``var_x`` and ``var_y``.
        location: device on which the models and tensors are placed.
        bs: batch size passed on to ``pdm.test``.
        encoder: optional path of an encoder checkpoint (``zs_size``, ``nr_size``,
            ``state_dict``) that is handed to ``pdm.test``.
        history: if true, ``pdm.test`` is asked to return its history.

    Returns:
        Tuple ``(hg_coords, hg_coords_and_conf, gt, l2d, history)``: the HG
        coordinates, the HG coordinates extended by the confidences derived from
        the variances, the ground truth, the detached PDM output, and the history
        (``None`` unless requested).
    """
    data = torch.load(pdm_path, map_location=location)
    state_dict = data['state_dict']
    config = data['config']

    # Although the same random seed is used as in training, the results will slightly differ
    # The reason is that the metrics are calculated after the training already run for a few epochs, so the
    # random number generator will be in a different state depending on the training before
    make_deterministic(config['random_seed'])

    pdm = ModelTrainer.create_net(config)
    pdm.model.load_state_dict(state_dict)
    pdm = pdm.to(location)

    if encoder is not None:
        # Map to `location` like the PDM checkpoint above, so an encoder that was
        # saved on a GPU can also be loaded on a machine without that device.
        enc_data = torch.load(encoder, map_location=location)
        encoder = Encoder(zs_size=enc_data["zs_size"], nr_size=enc_data["nr_size"])
        encoder.load_state_dict(enc_data["state_dict"])
        encoder = encoder.to(location)
        print("Encoder", encoder)

    landmarks_per_sample = [sample["coord_and_conf"] for sample in hg_results]

    hg_coords = torch.tensor(
        [[[lm["pred_x"], lm["pred_y"]] for lm in landmarks] for landmarks in landmarks_per_sample],
        device=location)
    gt = torch.tensor(
        [[[lm["gt_x"], lm["gt_y"]] for lm in landmarks] for landmarks in landmarks_per_sample],
        device=location)
    hg_coords_and_conf = torch.tensor(
        [[[
            lm["pred_x"], lm["pred_y"],
            pdm.variance2confidence(lm["var_x"]),
            pdm.variance2confidence(lm["var_y"])
        ] for lm in landmarks] for landmarks in landmarks_per_sample],
        device=location)

    # TODO test() takes pred and conf now separately
    if history:
        zs, nr, _, fit_history = pdm.test(hg_coords_and_conf,
                                          return_history=True,
                                          encoder=encoder,
                                          only_encoder=False,
                                          bs=bs)
    else:
        zs, nr, *_ = pdm.test(hg_coords_and_conf,
                              return_history=False,
                              encoder=encoder,
                              only_encoder=False,
                              bs=bs)
        fit_history = None

    l2d, _ = pdm.forward(zs, nr)
    l2d = l2d.detach()

    return hg_coords, hg_coords_and_conf, gt, l2d, fit_history