Example 1
def train(cfg):
    """Train model.

    Parameters
    ----------
    cfg : Dict
        Dictionary containing the run config
    """
    # fix random seeds
    random.seed(cfg["seed"])
    np.random.seed(cfg["seed"])
    torch.cuda.manual_seed(cfg["seed"])
    torch.manual_seed(cfg["seed"])

    basins = cfg["basins"]

    # create folder structure for this run
    cfg = _setup_run(cfg)

    # prepare data for training
    cfg = _prepare_data(cfg=cfg, basins=basins)

    # prepare PyTorch DataLoader
    ds = CamelsH5(h5_file=cfg["train_file"],
                  basins=basins,
                  db_path=cfg["db_path"],
                  concat_static=cfg["concat_static"],
                  cache=cfg["cache_data"],
                  no_static=cfg["no_static"])
    loader = DataLoader(ds,
                        batch_size=cfg["batch_size"],
                        shuffle=True,
                        num_workers=cfg["num_workers"])

    # create model and optimizer
    input_size_stat = 0 if cfg["no_static"] else 27  # 27 static attributes
    input_size_dyn = 5 if (cfg["no_static"] or not cfg["concat_static"]) else 32  # 5 dynamic forcings (+27 static attributes when concatenated)
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=cfg["hidden_size"],
                  initial_forget_bias=cfg["initial_forget_gate_bias"],
                  dropout=cfg["dropout"],
                  concat_static=cfg["concat_static"],
                  no_static=cfg["no_static"]).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg["learning_rate"])

    # define loss function
    if cfg["use_mse"]:
        loss_func = nn.MSELoss()
    else:
        loss_func = NSELoss()

    # reduce learning rate after epochs 10 and 20
    learning_rates = {11: 5e-4, 21: 1e-4}

    for epoch in range(1, cfg["epochs"] + 1):
        # set new learning rate
        if epoch in learning_rates.keys():
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rates[epoch]

        train_epoch(model, optimizer, loss_func, loader, cfg, epoch, cfg["use_mse"])

        model_path = cfg["run_dir"] / f"model_epoch{epoch}.pt"
        torch.save(model.state_dict(), str(model_path))
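The function above reads everything it needs from the cfg dictionary. A minimal sketch of such a dictionary is shown below; the keys are the ones train() accesses, while the values are illustrative placeholders rather than settings from the original project ("run_dir", "train_file" and "db_path" are assumed to be filled in by _setup_run() and _prepare_data()).

# Hypothetical run configuration; values are placeholders, not from the original project.
cfg = {
    "seed": 42,
    "basins": ["01013500", "01022500"],  # example basin IDs
    "concat_static": True,               # append static attributes to the dynamic inputs
    "cache_data": True,
    "no_static": False,
    "batch_size": 256,
    "num_workers": 4,
    "hidden_size": 256,
    "initial_forget_gate_bias": 5,
    "dropout": 0.4,
    "learning_rate": 1e-3,
    "use_mse": False,                    # False -> NSELoss
    "epochs": 30,
}
train(cfg)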
Example 2
def train(cfg):
    """Train XGBoost model.

    Parameters
    ----------
    cfg : Dict
        Dictionary containing the run config
    """
    # fix random seeds
    random.seed(cfg["seed"])
    np.random.seed(cfg["seed"])
    torch.cuda.manual_seed(cfg["seed"])
    torch.manual_seed(cfg["seed"])

    basins = cfg["basins"]

    # create folder structure for this run
    cfg = _setup_run(cfg)

    # prepare data for training
    cfg = _prepare_data(cfg=cfg, basins=basins)

    # prepare dataset (static attributes are appended to x separately below)
    ds = CamelsH5(h5_file=cfg["train_file"],
                  basins=basins,
                  db_path=cfg["db_path"],
                  concat_static=False,
                  cache=True,
                  no_static=cfg["no_static"])

    # Create train/val sets
    x = ds.x.reshape(len(ds.x), -1)
    y = ds.y.reshape(len(ds.y))
    if not cfg["no_static"]:
        attr_indices = np.searchsorted(ds.df.index, ds.sample_2_basin)
        attributes = ds.df.iloc[attr_indices].values
        x = np.concatenate([x, attributes], axis=1)

    # define loss function
    if not cfg["use_mse"]:
        # slight hack to enable NSE on XGBoost: replace the target with a unique id
        # so we can figure out the corresponding q_std during the loss calculation.
        y_actual = y.copy()
        y = np.arange(len(y))
        loss = NSEObjective(y, y_actual, ds.q_stds)
        objective = loss.nse_objective
        eval_metric = loss.nse_metric
        scoring = loss.neg_nse_metric_sklearn
    else:
        objective = 'reg:squarederror'
        eval_metric = 'rmse'
        scoring = 'neg_mean_squared_error'

    num_val_samples = int(len(x) * 0.1)
    val_indices = np.random.choice(range(len(x)),
                                   size=num_val_samples,
                                   replace=False)
    train_indices = np.setdiff1d(range(len(x)), val_indices)

    val = [(x[train_indices], y[train_indices]),
           (x[val_indices], y[val_indices])]

    if cfg["model_dir"] is None:

        def param_search(param_dist, n_iter):
            model = xgb.XGBRegressor(
                n_estimators=cfg["param_search_n_estimators"],
                objective=objective,
                n_jobs=1,
                random_state=cfg["seed"])
            model = model_selection.RandomizedSearchCV(
                model,
                param_dist,
                n_iter=n_iter,
                cv=cfg["n_cv"],
                return_train_score=True,
                scoring=scoring,
                n_jobs=cfg["num_workers"],
                random_state=cfg["seed"],
                refit=False,
                verbose=5)
            model.fit(x[train_indices],
                      y[train_indices],
                      eval_set=val,
                      eval_metric=eval_metric,
                      early_stopping_rounds=cfg[
                          "param_search_early_stopping_rounds"],
                      verbose=False)
            return model

        best_params = param_search(cfg["param_dist"],
                                   cfg["param_search_n_iter"]).best_params_
        print(f"Best parameters: {best_params}")

        # Find regularization parameters in separate search
        for k, v in best_params.items():
            cfg["reg_param_dist"][k] = [v]
        model = param_search(cfg["reg_param_dist"], cfg["reg_search_n_iter"])
        print(f"Best regularization parameters: {model.best_params_}")

        cv_results = pd.DataFrame(model.cv_results_).sort_values(
            by='mean_test_score', ascending=False)
        print(
            cv_results.filter(regex='param_|mean_test_score|mean_train_score',
                              axis=1).head())
        print(cv_results.loc[model.best_index_,
                             ['mean_train_score', 'mean_test_score']])

        xgb_params = model.best_params_

    else:
        print('Using model parameters from {}'.format(cfg["model_dir"]))
        model = pickle.load(open(cfg["model_dir"] / "model.pkl", "rb"))
        xgb_params = model.get_xgb_params()

    xgb_params['learning_rate'] = cfg["final_learning_rate"]
    xgb_params['n_estimators'] = cfg["final_n_estimators"]
    model = xgb.XGBRegressor()
    model.set_params(**xgb_params)
    model.objective = objective
    model.random_state = cfg["seed"]
    model.n_jobs = cfg["num_workers"]
    print(model.get_xgb_params())

    model.fit(x[train_indices],
              y[train_indices],
              eval_set=val,
              eval_metric=eval_metric,
              early_stopping_rounds=cfg["final_early_stopping_rounds"],
              verbose=True)

    model_path = cfg["run_dir"] / "model.pkl"
    pickle.dump(model, open(str(model_path), 'wb'))
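NSEObjective is used above but not defined in this excerpt. Assuming it implements the basin-scaled squared-error (NSE-style) loss used by the LSTM examples, a minimal sketch of the custom objective for XGBoost's scikit-learn API could look like the following; the class layout, the eps value and the exact scaling are assumptions, and the metric methods (nse_metric, neg_nse_metric_sklearn) are omitted.

import numpy as np

class NSEObjective:
    """Hypothetical reconstruction of the NSE objective helper."""

    def __init__(self, y_ids, y_true, q_stds, eps=0.1):
        # y_ids are the surrogate targets (unique sample indices, see the hack above),
        # y_true the actual discharge values, q_stds the per-sample basin std.
        self.y_true = np.asarray(y_true)
        self.q_stds = np.asarray(q_stds)
        self.eps = eps

    def nse_objective(self, y_ids, y_pred):
        # Custom objective for the sklearn API: gradient and hessian of
        # sum((pred - obs)^2 / (q_std + eps)^2) with respect to the predictions.
        idx = np.asarray(y_ids, dtype=int)
        weight = 1.0 / (self.q_stds[idx] + self.eps) ** 2
        grad = 2.0 * (y_pred - self.y_true[idx]) * weight
        hess = 2.0 * weight
        return grad, hess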
Example 3
def evaluate(user_cfg: Dict):
    """Train model for a single epoch.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user-entered evaluation config
    """
    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    if user_cfg["split_file"] is not None:
        with Path(user_cfg["split_file"]).open('rb') as fp:
            splits = pickle.load(fp)
        basins = splits[run_cfg["split"]]["test"]
    else:
        basins = get_basin_list()

    # get attribute means/stds from training dataset
    train_file = user_cfg["run_dir"] / "data/train/train_data.h5"
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    ds_train = CamelsH5(h5_file=train_file,
                        db_path=db_path,
                        basins=basins,
                        concat_static=run_cfg["concat_static"])
    means = ds_train.get_attribute_means()
    stds = ds_train.get_attribute_stds()

    # create model
    input_size_dyn = 5 if (run_cfg["no_static"]
                           or not run_cfg["concat_static"]) else 32
    model = Model(input_size_dyn=input_size_dyn,
                  hidden_size=run_cfg["hidden_size"],
                  dropout=run_cfg["dropout"],
                  concat_static=run_cfg["concat_static"],
                  no_static=run_cfg["no_static"]).to(DEVICE)

    # load trained model
    weight_file = user_cfg["run_dir"] / 'model_epoch30.pt'
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    date_range = pd.date_range(start=GLOBAL_SETTINGS["val_start"],
                               end=GLOBAL_SETTINGS["val_end"])
    results = {}
    for basin in tqdm(basins):
        ds_test = CamelsTXT(
            camels_root=user_cfg["camels_root"],
            basin=basin,
            dates=[GLOBAL_SETTINGS["val_start"], GLOBAL_SETTINGS["val_end"]],
            is_train=False,
            seq_length=run_cfg["seq_length"],
            with_attributes=True,
            attribute_means=means,
            attribute_stds=stds,
            concat_static=run_cfg["concat_static"],
            db_path=db_path)
        loader = DataLoader(ds_test,
                            batch_size=1024,
                            shuffle=False,
                            num_workers=4)

        preds, obs = evaluate_basin(model, loader)

        df = pd.DataFrame(data={
            'qobs': obs.flatten(),
            'qsim': preds.flatten()
        },
                          index=date_range)

        results[basin] = df

    _store_results(user_cfg, run_cfg, results)
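The per-basin DataFrames collected in results make it straightforward to score a run. A short sketch, assuming access to that dictionary (e.g. by having evaluate() return it, or by reading back whatever _store_results() writes): it computes the Nash-Sutcliffe efficiency for every basin and prints the median.

import numpy as np

def nse(qobs, qsim):
    # Nash-Sutcliffe efficiency: 1 - SSE / variance of the observations
    return 1.0 - np.sum((qsim - qobs) ** 2) / np.sum((qobs - qobs.mean()) ** 2)

scores = {basin: nse(df["qobs"].values, df["qsim"].values) for basin, df in results.items()}
print(f"Median NSE over {len(scores)} basins: {np.median(list(scores.values())):.3f}")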
Example 4
def dist_train(rank, world_size, cfg):
    """Train model.

    Parameters
    ----------
    cfg : Dict
        Dictionary containing the run config
    """

    print(f"Running basic DDP example on rank {rank}. {world_size}")
    setup(rank, world_size)

    # fix random seeds
    random.seed(cfg["seed"])
    np.random.seed(cfg["seed"])
    torch.cuda.manual_seed(cfg["seed"])
    torch.manual_seed(cfg["seed"])

    basins = get_basin_list()

    if rank == 0:
        # create folder structure for this run
        cfg = _setup_run(cfg)

        # prepare data for training
        cfg = _prepare_data(cfg=cfg, basins=basins)

        with open(str(cfg["camels_root"]) + '/cfg.pkl', 'wb') as f:
            pickle.dump(cfg, f, pickle.HIGHEST_PROTOCOL)

    dist.barrier()

    with open(str(cfg["camels_root"]) + '/cfg.pkl', 'rb') as f:
        cfg = pickle.load(f)

    # prepare PyTorch DataLoader
    ds = CamelsH5(h5_file=cfg["train_file"],
                  basins=basins,
                  db_path=cfg["db_path"],
                  concat_static=cfg["concat_static"],
                  cache=cfg["cache_data"],
                  no_static=cfg["no_static"])

    sampler = torch.utils.data.distributed.DistributedSampler(
        ds,
        num_replicas=world_size,
        rank=rank
    )

    loader = DataLoader(ds,
                        batch_size=cfg["batch_size"],
                        shuffle=False,
                        num_workers=cfg["num_workers"],
                        sampler=sampler,
                        pin_memory=True)

    # create model and optimizer
    input_size_stat = 0 if cfg["no_static"] else 27
    input_size_dyn = 5 if (cfg["no_static"] or not cfg["concat_static"]) else 32

    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=cfg["hidden_size"],
                  initial_forget_bias=cfg["initial_forget_gate_bias"],
                  dropout=cfg["dropout"],
                  concat_static=cfg["concat_static"],
                  no_static=cfg["no_static"])

    # if cfg["initial_forget_gate_bias"] != 0:
    #     model.bias.shape

    ddp_model = DDP(model.to(rank), device_ids=[rank])

    optimizer = torch.optim.Adam(ddp_model.parameters(),
                                 lr=cfg["learning_rate"])

    # define loss function
    if cfg["use_mse"]:
        loss_func = nn.MSELoss()
    else:
        loss_func = NSELoss()

    # reduce learning rate after epochs 10 and 20
    learning_rates = {11: 5e-4, 21: 1e-4}

    CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint"

    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}

    for epoch in range(1, math.ceil(cfg["epochs"] / world_size) + 1):
        # set new learning rate
        if epoch in learning_rates.keys():
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rates[epoch]

        if rank == 0:
            # rank 0 writes a checkpoint; the barrier below makes every process wait
            # for it, and all ranks then reload identical weights onto their own GPU
            torch.save(ddp_model.state_dict(), CHECKPOINT_PATH)

        dist.barrier()

        ddp_model.load_state_dict(
            torch.load(CHECKPOINT_PATH, map_location=map_location))

        train_epoch(ddp_model, optimizer, loss_func, loader, cfg, epoch,
                    cfg["use_mse"], rank)

    #         model_path = cfg["run_dir"] / f"model_epoch{epoch}.pt"
    #         torch.save(ddp_model.state_dict(), str(model_path))

    #     if rank == 0:
    #         os.remove(CHECKPOINT_PATH)

    cleanup()
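setup() and cleanup() are called above but not defined in this excerpt. A typical implementation, following the standard single-node PyTorch DDP pattern this example appears to be modeled on (address, port and the NCCL backend are assumptions), together with the usual mp.spawn launch:

import os
import torch.distributed as dist
import torch.multiprocessing as mp

def setup(rank, world_size):
    # assumed single-node defaults; adjust address/port for a real cluster
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "12355")
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

# typical launch: one process per GPU
# world_size = torch.cuda.device_count()
# mp.spawn(dist_train, args=(world_size, cfg), nprocs=world_size, join=True)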