def fit(self, config, train_dl, val_dl):
    # Try loading the latest checkpoint of this run
    checkpoint = self.load_checkpoint(self.checkpoints_dirpath)
    if checkpoint is None and self.init_checkpoints_dirpath is not None:
        # Fall back to the init run's checkpoints:
        checkpoint = self.load_checkpoint(self.init_checkpoints_dirpath)
    if checkpoint is None:
        checkpoint = {
            "epoch": -1,
        }
    start_epoch = checkpoint["epoch"] + 1  # Start at the next epoch

    fit_pbar = tqdm(range(start_epoch, config["max_epoch"]), desc="Fitting: ",
                    initial=start_epoch, total=config["max_epoch"])
    train_loss = None
    val_loss = None
    for epoch_index, epoch in enumerate(fit_pbar):
        self.model.train()
        train_loss = self.run_epoch("Train", train_dl, config["train_log_step"], opt=self.optimizer)
        self.model.eval()
        with torch.no_grad():
            val_loss = self.run_epoch("Val", val_dl, max(len(val_dl) // 4, 1))
        fit_pbar.set_postfix(train_loss="{:.4f}".format(train_loss), val_loss="{:.4f}".format(val_loss))
        if epoch_index % config["checkpoint_epoch"] == 0:
            self.save_checkpoint(epoch, train_loss, val_loss)

    # Save the final checkpoint and losses
    self.save_checkpoint(epoch, train_loss, val_loss)
    python_utils.save_json(os.path.join(self.logs_dirpath, "final_losses.json"), {
        'train_loss': train_loss,
        'val_loss': val_loss,
    })
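# `fit` delegates the per-epoch work to `run_epoch`, which is not shown here.
# Below is a minimal sketch of such a Trainer method, assuming batches are
# (input, target) pairs, that `self.loss_func` returns a scalar tensor, and
# that a NaN loss raises the ValueError caught by the retry loop in `train`
# further down; none of these details are confirmed by the original code.
import torch
from tqdm import tqdm

def run_epoch(self, mode_name, dl, log_step, opt=None):
    total_loss = 0.0
    for batch_index, (x, y) in enumerate(dl):
        pred = self.model(x)
        loss = self.loss_func(pred, y)
        if torch.isnan(loss):
            raise ValueError("NaN loss in {} epoch".format(mode_name))
        if opt is not None:  # Only backpropagate and step when training
            opt.zero_grad()
            loss.backward()
            opt.step()
        total_loss += loss.item()
        if batch_index % log_step == 0:
            tqdm.write("{}: batch {}, loss {:.4f}".format(mode_name, batch_index, loss.item()))
    return total_loss / max(len(dl), 1)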
def main(_):
    working_dir = os.path.dirname(os.path.abspath(__file__))

    # Print FLAGS
    print("#--- FLAGS: ---#")
    print("config: {}".format(FLAGS.config))
    print("new_run: {}".format(FLAGS.new_run))
    print("init_run_name: {}".format(FLAGS.init_run_name))
    print("run_name: {}".format(FLAGS.run_name))
    print("batch_size: {}".format(FLAGS.batch_size))

    # Load config file
    config = run_utils.load_config(FLAGS.config)

    # Check config settings for coherence
    assert len(config["level_loss_coefs_params"]) == config["pool_count"], \
        "level_loss_coefs_params has {} elements but must have pool_count ({}) elements".format(
            len(config["level_loss_coefs_params"]), config["pool_count"])

    tfrecords_dirpath_list = [
        os.path.join(working_dir, tfrecords_dirpath)
        for tfrecords_dirpath in config["tfrecords_partial_dirpath_list"]
    ]
    ds_repeat_list = config["ds_repeat_list"]

    # Setup init run directory if one is specified:
    if FLAGS.init_run_name is not None:
        init_run_dirpath = run_utils.setup_run_dir(config["runs_dirname"], FLAGS.init_run_name)
    else:
        init_run_dirpath = None

    # Setup run directory:
    runs_dir = os.path.join(working_dir, config["runs_dirname"])
    current_run_dirpath = run_utils.setup_run_dir(runs_dir, FLAGS.run_name, FLAGS.new_run)

    # Save config in logs directory
    run_utils.save_config(config, current_run_dirpath)

    # Save FLAGS
    FLAGS_filepath = os.path.join(current_run_dirpath, "FLAGS.json")
    python_utils.save_json(
        FLAGS_filepath, {
            "run_name": FLAGS.run_name,
            "new_run": FLAGS.new_run,
            "batch_size": FLAGS.batch_size,
        })

    train(config, tfrecords_dirpath_list, init_run_dirpath, current_run_dirpath,
          FLAGS.batch_size, ds_repeat_list)
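# The `main(_)` signature and the module-level FLAGS suggest absl-style flags.
# A minimal sketch of flag definitions consistent with the fields printed and
# saved above; the defaults are illustrative assumptions, and in a real module
# these definitions would sit at the top of the file:
from absl import app, flags

flags.DEFINE_string("config", "config", "Name of the config file to load")
flags.DEFINE_boolean("new_run", False, "Start a fresh run instead of resuming")
flags.DEFINE_string("init_run_name", None, "Run to initialize checkpoints from")
flags.DEFINE_string("run_name", None, "Name of the current run")
flags.DEFINE_integer("batch_size", 8, "Batch size")
FLAGS = flags.FLAGS

if __name__ == "__main__":
    app.run(main)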
def main(_):
    # Print flags
    print("#--- Flags: ---#")
    print("new_run: {}".format(FLAGS.new_run))
    print("init_run_name: {}".format(FLAGS.init_run_name))
    print("run_name: {}".format(FLAGS.run_name))
    print("batch_size: {}".format(FLAGS.batch_size))
    print("ds_fac: {}".format(FLAGS.ds_fac))

    # Overwrite the config's downsampling factors if the ds_fac flag is set
    if FLAGS.ds_fac is not None:
        ds_fac_list = [FLAGS.ds_fac]
        ds_repeat_list = [1]
    else:
        ds_fac_list = config.DS_FAC_LIST
        ds_repeat_list = config.DS_REPEAT_LIST

    # Setup init run directory if one is specified:
    if FLAGS.init_run_name is not None:
        init_run_dirpath = run_utils.setup_run_dir(config.RUNS_DIR, FLAGS.init_run_name)
    else:
        init_run_dirpath = None

    # Setup run directory:
    current_run_dirpath = run_utils.setup_run_dir(config.RUNS_DIR, FLAGS.run_name, FLAGS.new_run)

    # Save config.py in logs directory
    run_utils.save_config(config.PROJECT_DIR, current_run_dirpath)

    # Save flags
    flags_filepath = os.path.join(current_run_dirpath, "flags.json")
    python_utils.save_json(
        flags_filepath, {
            "run_name": FLAGS.run_name,
            "new_run": FLAGS.new_run,
            "batch_size": FLAGS.batch_size,
            "ds_fac": FLAGS.ds_fac,
        })

    train(init_run_dirpath, current_run_dirpath, FLAGS.batch_size, ds_fac_list, ds_repeat_list)
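# `run_utils.setup_run_dir` is a project helper that is not shown. A plausible
# sketch of the create-or-resume behavior the call sites above rely on; the
# timestamped naming and the exact new_run semantics are assumptions:
import os
import time

def setup_run_dir(runs_dir, run_name=None, new_run=False):
    if run_name is None:
        # No name given: create a fresh timestamped run
        run_name = time.strftime("%Y%m%d-%H%M%S")
    run_dirpath = os.path.join(runs_dir, run_name)
    if new_run and os.path.isdir(run_dirpath):
        # Avoid silently resuming when a fresh run was requested
        raise FileExistsError("Run '{}' already exists".format(run_name))
    os.makedirs(run_dirpath, exist_ok=True)
    return run_dirpath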
def launch_experiments(config, exp_dirpath, new_exp, recompute_stats, params, stats_params):
    config = config.copy()
    config["runs_dirpath"] = exp_dirpath

    # Setup progress filepath
    finished_exps_filepath = os.path.join(exp_dirpath, 'finished_exps.json')

    # Build the full list of experiment parameter combinations
    remaining_exp_list = []
    for n, f in itertools.product(params["sample_count"], params["frequency"]):
        for run in range(params["run_count"]):
            all_params = {
                "run": run,
                "sample_count": n,
                "frequency": f,
                "noise_std": params["noise_std"],
                "distribution": params["distribution"],
            }
            remaining_exp_list.append(all_params)

    # Start a new experiment or recompute stats from an existing one
    if new_exp or recompute_stats:
        finished_exp_list = []
        python_utils.save_json(finished_exps_filepath, finished_exp_list)
    else:
        # Continue a previous experiment: load the list of finished experiments
        finished_exp_list = python_utils.load_json(finished_exps_filepath)
        # Remove finished experiments from remaining_exp_list:
        remaining_exp_list = [item for item in remaining_exp_list if item not in finished_exp_list]

    finished_exp_count = len(finished_exp_list)
    total_exp_count = len(remaining_exp_list) + len(finished_exp_list)
    exp_pbar = tqdm(remaining_exp_list, desc="Running exps: ",
                    initial=finished_exp_count, total=total_exp_count)
    for all_params in exp_pbar:
        launch_one_experiment(config, all_params, stats_params)
        # Record progress after each experiment so the run can be resumed
        finished_exp_list.append(all_params)
        python_utils.save_json(finished_exps_filepath, finished_exp_list)
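# The resume logic above filters finished runs with a plain `not in` test on
# parameter dicts. That works because Python dicts compare by value and the
# JSON save/load round-trip preserves the str/int/float values stored here.
# A self-contained illustration of that assumption:
import json

params_a = {"run": 0, "sample_count": 100, "frequency": 2.0}
params_b = {"run": 1, "sample_count": 100, "frequency": 2.0}
finished = json.loads(json.dumps([params_a]))  # Simulate save then load
remaining = [p for p in [params_a, params_b] if p not in finished]
assert remaining == [params_b]  # Only the unfinished combination remains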
def train(config, run_params, dataset_params):
    run_name = run_params["run_name"]
    new_run = run_params["new_run"]
    init_run_name = run_params["init_run_name"]

    working_dir = os.path.dirname(os.path.abspath(__file__))

    # Find data_dir
    data_dirpath = python_utils.choose_first_existing_path(config["data_dir_candidates"])
    if data_dirpath is None:
        print_utils.print_error("ERROR: Data directory not found!")
        exit()
    root_dir = os.path.join(data_dirpath, config["data_root_partial_dirpath"])

    # Setup init checkpoints directory path if one is specified:
    if init_run_name is not None:
        init_run_dirpath = run_utils.setup_run_dir(config["runs_dirpath"], init_run_name)
        _, init_checkpoints_dirpath = run_utils.setup_run_subdirs(init_run_dirpath)
    else:
        init_checkpoints_dirpath = None

    # Setup run directory:
    runs_dir = os.path.join(working_dir, config["runs_dirpath"])
    run_dirpath = run_utils.setup_run_dir(runs_dir, run_name, new_run)

    # Save config in logs directory
    run_utils.save_config(config, run_dirpath)

    # Save args
    args_filepath = os.path.join(run_dirpath, "args.json")
    python_utils.save_json(
        args_filepath, {
            "run_name": run_name,
            "new_run": new_run,
            "init_run_name": init_run_name,
            "batch_size": config["batch_size"],
        })

    # Choose device
    # dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dev = "cpu"  # For small networks and experiments, cpu is much faster

    # Instantiate datasets. The test dataset is generated here as well because,
    # if using Sobol numbers, all datasets should share the same SobolGenerator
    # so that they do not generate the same samples.
    # sobol_generator = rand_utils.SobolGenerator()
    sobol_generator = None
    transform = torchvision.transforms.Compose([
        transforms.ToTensor(),
        transforms.ToDevice(device=dev),
    ])
    train_ds = Synthetic1DDataset(root_dir=root_dir, params=dataset_params, split_name="train",
                                  sobol_generator=sobol_generator, transform=transform)
    val_ds = Synthetic1DDataset(root_dir=root_dir, params=dataset_params, split_name="val",
                                sobol_generator=sobol_generator, transform=transform)
    test_ds = Synthetic1DDataset(root_dir=root_dir, params=dataset_params, split_name="test",
                                 sobol_generator=sobol_generator, transform=transform)
    train_dl = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=config["batch_size"])

    success = False
    while not success:
        try:
            model = Simple1DInputNet(config)
            model.to(dev)
            optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"],
                                         weight_decay=config["weight_decay"])
            loss_func = measures.l1_loss
            trainer = Trainer(config, model, optimizer, loss_func,
                              init_checkpoints_dirpath, run_dirpath)
            trainer.fit(config, train_dl, val_dl)
            success = True
        except ValueError:  # Catches NaN errors
            # Wipe this run's outputs and retry from scratch
            run_utils.wipe_run_subdirs(run_dirpath)
            print("\nTry again\n")
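# `python_utils.choose_first_existing_path` is a small project helper; its
# behavior is strongly implied by the call above, so here is a minimal sketch
# (return the first candidate that exists on disk, else None):
import os

def choose_first_existing_path(path_candidates):
    for path in path_candidates:
        if os.path.exists(path):
            return path
    return None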
def main(_):
    working_dir = os.path.dirname(os.path.abspath(__file__))
    config_dir = os.path.dirname(os.path.realpath(__file__))

    # Print FLAGS
    print("#--- FLAGS: ---#")
    print("config: {}".format(FLAGS.config))
    print("new_run: {}".format(FLAGS.new_run))
    print("init_run_name: {}".format(FLAGS.init_run_name))
    print("run_name: {}".format(FLAGS.run_name))
    print("batch_size: {}".format(FLAGS.batch_size))
    print("ds_fac: {}".format(FLAGS.ds_fac))

    # Load config file
    config = run_utils.load_config(FLAGS.config, config_dir)

    # Check config settings for coherence
    assert len(config["level_loss_coefs_params"]) == config["pool_count"], \
        "level_loss_coefs_params has {} elements but must have pool_count ({}) elements".format(
            len(config["level_loss_coefs_params"]), config["pool_count"])

    # Find data_dir
    data_dir = python_utils.choose_first_existing_path(config["data_dir_candidates"])
    if data_dir is None:
        print("ERROR: Data directory not found!")
        exit()
    else:
        print("Using data from {}".format(data_dir))

    # Setup dataset dirpaths
    tfrecords_dirpath_list = [
        os.path.join(data_dir, tfrecords_dirpath)
        for tfrecords_dirpath in config["tfrecords_partial_dirpath_list"]
    ]

    # Overwrite the config's ds_fac if the flag specifies it
    if FLAGS.ds_fac is not None:
        ds_fac_list = [FLAGS.ds_fac]
        ds_repeat_list = [1]
    else:
        ds_fac_list = config["ds_fac_list"]
        ds_repeat_list = config["ds_repeat_list"]

    # Setup init run directory if one is specified:
    if FLAGS.init_run_name is not None:
        init_run_dirpath = run_utils.setup_run_dir(config["runs_dirname"], FLAGS.init_run_name)
    else:
        init_run_dirpath = None

    # Setup run directory:
    runs_dir = os.path.join(working_dir, config["runs_dirname"])
    current_run_dirpath = run_utils.setup_run_dir(runs_dir, FLAGS.run_name, FLAGS.new_run)

    # Save config in logs directory
    run_utils.save_config(config, current_run_dirpath)

    # Save FLAGS
    FLAGS_filepath = os.path.join(current_run_dirpath, "FLAGS.json")
    python_utils.save_json(
        FLAGS_filepath, {
            "run_name": FLAGS.run_name,
            "new_run": FLAGS.new_run,
            "batch_size": FLAGS.batch_size,
            "ds_fac": FLAGS.ds_fac,
        })

    train(config, tfrecords_dirpath_list, init_run_dirpath, current_run_dirpath,
          FLAGS.batch_size, ds_fac_list, ds_repeat_list)
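# `run_utils.load_config` is a project helper whose implementation is not
# shown. A plausible sketch consistent with both call sites above (one
# positional name, with an optional directory), assuming configs are stored
# as JSON; the extension handling is an assumption:
import json
import os

def load_config(config_name, config_dirpath=""):
    config_filepath = os.path.join(config_dirpath, config_name)
    if not config_filepath.endswith(".json"):
        config_filepath += ".json"
    if not os.path.exists(config_filepath):
        return None
    with open(config_filepath, "r") as f:
        return json.load(f)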