Code example #1
File: trainer.py Project: vineet1992/netsimilarity
    def fit(self, config, train_dl, val_dl):

        # Try loading previous model
        checkpoint = self.load_checkpoint(self.checkpoints_dirpath)  # Try last checkpoint
        if checkpoint is None and self.init_checkpoints_dirpath is not None:
            # Try with init_checkpoints_dirpath:
            checkpoint = self.load_checkpoint(self.init_checkpoints_dirpath)
        if checkpoint is None:
            checkpoint = {
                "epoch": -1,
            }
        start_epoch = checkpoint["epoch"] + 1  # Start at next epoch

        fit_pbar = tqdm(range(start_epoch, config["max_epoch"]), desc="Fitting: ", initial=start_epoch, total=config["max_epoch"])
        train_loss = None
        val_loss = None
        for epoch_index, epoch in enumerate(fit_pbar):  # epoch_index restarts at 0 when resuming; it only drives checkpoint cadence
            self.model.train()
            train_loss = self.run_epoch("Train", train_dl, config["train_log_step"], opt=self.optimizer)

            self.model.eval()
            with torch.no_grad():
                val_loss = self.run_epoch("Val", val_dl, max(len(val_dl) // 4, 1))

            fit_pbar.set_postfix(train_loss="{:.4f}".format(train_loss), val_loss="{:.4f}".format(val_loss))

            if epoch_index % config["checkpoint_epoch"] == 0:
                self.save_checkpoint(epoch, train_loss, val_loss)
        self.save_checkpoint(epoch, train_loss, val_loss)  # Final checkpoint (assumes at least one epoch ran)

        python_utils.save_json(os.path.join(self.logs_dirpath, "final_losses.json"), {
            'train_loss': train_loss,
            'val_loss': val_loss,
        })
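
Every example on this page ends by calling python_utils.save_json(filepath, data). The project's actual implementation is not shown here; a minimal sketch consistent with how the calls above use it might look like the following (the directory creation and the indent setting are assumptions):

import json
import os

def save_json(filepath, data):
    # Hypothetical sketch of the helper used above; not the project's actual code.
    dirpath = os.path.dirname(filepath)
    if dirpath:
        # Create the target directory if it does not exist yet
        os.makedirs(dirpath, exist_ok=True)
    with open(filepath, "w") as f:
        json.dump(data, f, indent=4)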
Code example #2
def main(_):
    working_dir = os.path.dirname(os.path.abspath(__file__))

    # print FLAGS
    print("#--- FLAGS: ---#")
    print("config: {}".format(FLAGS.config))
    print("new_run: {}".format(FLAGS.new_run))
    print("init_run_name: {}".format(FLAGS.init_run_name))
    print("run_name: {}".format(FLAGS.run_name))
    print("batch_size: {}".format(FLAGS.batch_size))

    # load config file
    config = run_utils.load_config(FLAGS.config)

    # Check config settings for coherence
    assert len(config["level_loss_coefs_params"]) == config["pool_count"], \
        "level_loss_coefs_params ({} elements) must have pool_count ({}) elements".format(
            len(config["level_loss_coefs_params"]), config["pool_count"])

    tfrecords_dirpath_list = [
        os.path.join(working_dir, tfrecords_dirpath)
        for tfrecords_dirpath in config["tfrecords_partial_dirpath_list"]
    ]

    ds_repeat_list = config["ds_repeat_list"]

    # setup init run directory if one is specified:
    if FLAGS.init_run_name is not None:
        init_run_dirpath = run_utils.setup_run_dir(config["runs_dirname"],
                                                   FLAGS.init_run_name)
    else:
        init_run_dirpath = None

    # setup run directory:
    runs_dir = os.path.join(working_dir, config["runs_dirname"])
    current_run_dirpath = run_utils.setup_run_dir(runs_dir, FLAGS.run_name,
                                                  FLAGS.new_run)

    # save config in logs directory
    run_utils.save_config(config, current_run_dirpath)

    # save FLAGS
    FLAGS_filepath = os.path.join(current_run_dirpath, "FLAGS.json")
    python_utils.save_json(
        FLAGS_filepath, {
            "run_name": FLAGS.run_name,
            "new_run": FLAGS.new_run,
            "batch_size": FLAGS.batch_size
        })

    train(config, tfrecords_dirpath_list, init_run_dirpath,
          current_run_dirpath, FLAGS.batch_size, ds_repeat_list)
Code example #3
File: 1_train.py Project: dtekeshe/ml
def main(_):
    # Print flags
    print("#--- Flags: ---#")
    print("new_run: {}".format(FLAGS.new_run))
    print("init_run_name: {}".format(FLAGS.init_run_name))
    print("run_name: {}".format(FLAGS.run_name))
    print("batch_size: {}".format(FLAGS.batch_size))
    print("ds_fac: {}".format(FLAGS.ds_fac))

    if FLAGS.ds_fac is not None:
        ds_fac_list = [FLAGS.ds_fac]
        ds_repeat_list = [1]
    else:
        ds_fac_list = config.DS_FAC_LIST
        ds_repeat_list = config.DS_REPEAT_LIST

    # Setup init run directory if one is specified:
    if FLAGS.init_run_name is not None:
        init_run_dirpath = run_utils.setup_run_dir(config.RUNS_DIR,
                                                   FLAGS.init_run_name)
    else:
        init_run_dirpath = None

    # Setup run directory:
    current_run_dirpath = run_utils.setup_run_dir(config.RUNS_DIR,
                                                  FLAGS.run_name,
                                                  FLAGS.new_run)

    # Save config.py in logs directory
    run_utils.save_config(config.PROJECT_DIR, current_run_dirpath)

    # Save flags
    flags_filepath = os.path.join(current_run_dirpath, "flags.json")
    python_utils.save_json(
        flags_filepath, {
            "run_name": FLAGS.run_name,
            "new_run": FLAGS.new_run,
            "batch_size": FLAGS.batch_size,
            "ds_fac": FLAGS.ds_fac,
        })

    train(init_run_dirpath, current_run_dirpath, FLAGS.batch_size, ds_fac_list,
          ds_repeat_list)
Code example #4
def launch_experiments(config, exp_dirpath, new_exp, recompute_stats, params, stats_params):
    config = config.copy()
    config["runs_dirpath"] = exp_dirpath

    # Setup progress filepaths
    finished_exps_filepath = os.path.join(exp_dirpath, 'finished_exps.json')

    # Start a new experiment or recompute stats from an existing experiment
    remaining_exp_list = []
    for n, f in itertools.product(params["sample_count"], params["frequency"]):
        for run in range(params["run_count"]):
            all_param = {
                "run": run,
                "sample_count": n,
                "frequency": f,
                "noise_std": params["noise_std"],
                "distribution": params["distribution"],
            }
            remaining_exp_list.append(all_param)

    if new_exp or recompute_stats:
        finished_exp_list = []
        python_utils.save_json(finished_exps_filepath, finished_exp_list)
    else:
        # Continue a previous experiment. Load exp_param_lists
        finished_exp_list = python_utils.load_json(finished_exps_filepath)

    # Remove finished experiments from remaining_exp_list:
    remaining_exp_list = [item for item in remaining_exp_list if item not in finished_exp_list]

    finished_exp_count = len(finished_exp_list)
    total_exp_count = len(remaining_exp_list) + len(finished_exp_list)

    # Kept in sync during the loop below; note this snippet never writes the remaining list to disk
    remaining_exp_list_to_save = remaining_exp_list.copy()

    exp_pbar = tqdm(remaining_exp_list, desc="Running exps: ", initial=finished_exp_count, total=total_exp_count)
    for all_params in exp_pbar:
        launch_one_experiment(config, all_params, stats_params)
        remaining_exp_list_to_save.remove(all_params)
        finished_exp_list.append(all_params)
        python_utils.save_json(finished_exps_filepath, finished_exp_list)
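
Example #4 also resumes progress by reading finished_exps.json back with python_utils.load_json. A matching sketch of that loader, under the same assumption as above that it is a thin wrapper around the standard json module:

import json

def load_json(filepath):
    # Hypothetical counterpart to the save_json sketch above; returns the decoded JSON value.
    with open(filepath, "r") as f:
        return json.load(f)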
Code example #5
def train(config, run_params, dataset_params):
    # print("# --- Starting training --- #")

    run_name = run_params["run_name"]
    new_run = run_params["new_run"]
    init_run_name = run_params["init_run_name"]

    working_dir = os.path.dirname(os.path.abspath(__file__))

    # Find data_dir
    data_dirpath = python_utils.choose_first_existing_path(
        config["data_dir_candidates"])
    if data_dirpath is None:
        print_utils.print_error("ERROR: Data directory not found!")
        exit()
    # print_utils.print_info("Using data from {}".format(data_dirpath))
    root_dir = os.path.join(data_dirpath, config["data_root_partial_dirpath"])

    # setup init checkpoints directory path if one is specified:
    if init_run_name is not None:
        init_run_dirpath = run_utils.setup_run_dir(config["runs_dirpath"],
                                                   init_run_name)
        _, init_checkpoints_dirpath = run_utils.setup_run_subdirs(
            init_run_dirpath)
    else:
        init_checkpoints_dirpath = None

    # setup run directory:
    runs_dir = os.path.join(working_dir, config["runs_dirpath"])
    run_dirpath = run_utils.setup_run_dir(runs_dir, run_name, new_run)

    # save config in logs directory
    run_utils.save_config(config, run_dirpath)

    # save args
    args_filepath = os.path.join(run_dirpath, "args.json")
    python_utils.save_json(
        args_filepath, {
            "run_name": run_name,
            "new_run": new_run,
            "init_run_name": init_run_name,
            "batch_size": config["batch_size"],
        })

    # Choose device
    # dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    dev = "cpu"  # For small networks and experiments, cpu is much faster

    # Instantiate dataset
    # sobol_generator = rand_utils.SobolGenerator()
    sobol_generator = None
    train_ds = Synthetic1DDataset(root_dir=root_dir,
                                  params=dataset_params,
                                  split_name="train",
                                  sobol_generator=sobol_generator,
                                  transform=torchvision.transforms.Compose([
                                      transforms.ToTensor(),
                                      transforms.ToDevice(device=dev)
                                  ]))
    val_ds = Synthetic1DDataset(root_dir=root_dir,
                                params=dataset_params,
                                split_name="val",
                                sobol_generator=sobol_generator,
                                transform=torchvision.transforms.Compose([
                                    transforms.ToTensor(),
                                    transforms.ToDevice(device=dev)
                                ]))

    # print(train_ds.alpha)
    # print(val_ds.alpha)
    # exit()

    # Generate the test dataset here because, if using Sobol numbers, all datasets should share the same
    # SobolGenerator so that they do not generate the same samples.
    test_ds = Synthetic1DDataset(root_dir=root_dir,
                                 params=dataset_params,
                                 split_name="test",
                                 sobol_generator=sobol_generator,
                                 transform=torchvision.transforms.Compose([
                                     transforms.ToTensor(),
                                     transforms.ToDevice(device=dev)
                                 ]))
    train_dl = DataLoader(train_ds,
                          batch_size=config["batch_size"],
                          shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=config["batch_size"])

    success = False
    while not success:
        try:
            model = Simple1DInputNet(config)
            model.to(dev)
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=config["lr"],
                                         weight_decay=config["weight_decay"])
            loss_func = measures.l1_loss

            trainer = Trainer(config, model, optimizer, loss_func,
                              init_checkpoints_dirpath, run_dirpath)
            trainer.fit(config, train_dl, val_dl)
            success = True
        except ValueError:  # Catches NaN errors
            # Wipe this run's outputs and try again with a freshly initialized model
            run_utils.wipe_run_subdirs(run_dirpath)
            print("\nTry again\n")
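
The retry loop in example #5 depends on Trainer.fit raising ValueError when the loss becomes NaN; that check itself does not appear on this page. A minimal sketch of such a guard (hypothetical, not the project's code):

import torch

def guard_loss(loss):
    # Raise ValueError so an outer retry loop can wipe the run and re-initialize the model.
    if torch.isnan(loss).any():
        raise ValueError("NaN loss encountered")
    return loss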
Code example #6
File: 1_train.py Project: savoga/mapalignment
def main(_):
    working_dir = os.path.dirname(os.path.abspath(__file__))
    config_dir = os.path.dirname(os.path.realpath(__file__))

    # print FLAGS
    print("#--- FLAGS: ---#")
    print("config: {}".format(FLAGS.config))
    print("new_run: {}".format(FLAGS.new_run))
    print("init_run_name: {}".format(FLAGS.init_run_name))
    print("run_name: {}".format(FLAGS.run_name))
    print("batch_size: {}".format(FLAGS.batch_size))
    print("ds_fac: {}".format(FLAGS.ds_fac))

    # load config file
    config = run_utils.load_config(FLAGS.config, config_dir)

    # Check config settings for coherence
    assert len(config["level_loss_coefs_params"]) == config["pool_count"], \
        "level_loss_coefs_params ({} elements) must have pool_count ({}) elements".format(
            len(config["level_loss_coefs_params"]), config["pool_count"])

    # Find data_dir
    data_dir = python_utils.choose_first_existing_path(
        config["data_dir_candidates"])
    if data_dir is None:
        print("ERROR: Data directory not found!")
        exit()
    else:
        print("Using data from {}".format(data_dir))

    # Setup dataset dirpaths
    tfrecords_dirpath_list = [
        os.path.join(data_dir, tfrecords_dirpath)
        for tfrecords_dirpath in config["tfrecords_partial_dirpath_list"]
    ]

    # Overwrite config ds_fac_list and ds_repeat_list if FLAGS specifies ds_fac
    if FLAGS.ds_fac is not None:
        ds_fac_list = [FLAGS.ds_fac]
        ds_repeat_list = [1]
    else:
        ds_fac_list = config["ds_fac_list"]
        ds_repeat_list = config["ds_repeat_list"]

    # setup init run directory if one is specified:
    if FLAGS.init_run_name is not None:
        init_run_dirpath = run_utils.setup_run_dir(config["runs_dirname"],
                                                   FLAGS.init_run_name)
    else:
        init_run_dirpath = None

    # setup run directory:
    runs_dir = os.path.join(working_dir, config["runs_dirname"])
    current_run_dirpath = run_utils.setup_run_dir(runs_dir, FLAGS.run_name,
                                                  FLAGS.new_run)

    # save config in logs directory
    run_utils.save_config(config, current_run_dirpath)

    # save FLAGS
    FLAGS_filepath = os.path.join(current_run_dirpath, "FLAGS.json")
    python_utils.save_json(
        FLAGS_filepath, {
            "run_name": FLAGS.run_name,
            "new_run": FLAGS.new_run,
            "batch_size": FLAGS.batch_size,
            "ds_fac": FLAGS.ds_fac,
        })

    train(config, tfrecords_dirpath_list, init_run_dirpath,
          current_run_dirpath, FLAGS.batch_size, ds_fac_list, ds_repeat_list)