def train(cfg):
    """Train model.

    Parameters
    ----------
    cfg : Dict
        Dictionary containing the run config
    """
    # fix random seeds
    random.seed(cfg["seed"])
    np.random.seed(cfg["seed"])
    torch.cuda.manual_seed(cfg["seed"])
    torch.manual_seed(cfg["seed"])

    basins = cfg["basins"]

    # create folder structure for this run
    cfg = _setup_run(cfg)

    # prepare data for training
    cfg = _prepare_data(cfg=cfg, basins=basins)

    # prepare PyTorch DataLoader
    ds = CamelsH5(h5_file=cfg["train_file"],
                  basins=basins,
                  db_path=cfg["db_path"],
                  concat_static=cfg["concat_static"],
                  cache=cfg["cache_data"],
                  no_static=cfg["no_static"])
    loader = DataLoader(ds,
                        batch_size=cfg["batch_size"],
                        shuffle=True,
                        num_workers=cfg["num_workers"])

    # create model and optimizer
    input_size_stat = 0 if cfg["no_static"] else 27
    input_size_dyn = 5 if (cfg["no_static"] or not cfg["concat_static"]) else 32
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=cfg["hidden_size"],
                  initial_forget_bias=cfg["initial_forget_gate_bias"],
                  dropout=cfg["dropout"],
                  concat_static=cfg["concat_static"],
                  no_static=cfg["no_static"]).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=cfg["learning_rate"])

    # define loss function
    if cfg["use_mse"]:
        loss_func = nn.MSELoss()
    else:
        loss_func = NSELoss()

    # reduce the learning rate after epochs 10 and 20
    learning_rates = {11: 5e-4, 21: 1e-4}

    for epoch in range(1, cfg["epochs"] + 1):
        # set new learning rate
        if epoch in learning_rates.keys():
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rates[epoch]

        train_epoch(model, optimizer, loss_func, loader, cfg, epoch, cfg["use_mse"])

        # store a model checkpoint after each epoch
        model_path = cfg["run_dir"] / f"model_epoch{epoch}.pt"
        torch.save(model.state_dict(), str(model_path))
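# Example usage -- a hypothetical helper sketching the config keys `train`
# reads above. The concrete values and basin IDs are illustrative assumptions,
# not project defaults; "train_file", "db_path" and "run_dir" are filled in by
# _setup_run/_prepare_data before they are used.
def _example_train_run():
    example_cfg = {
        "seed": 42,
        "basins": ["01013500", "01022500"],  # hypothetical CAMELS basin IDs
        "concat_static": False,
        "cache_data": True,
        "no_static": False,
        "batch_size": 256,
        "num_workers": 4,
        "hidden_size": 256,
        "initial_forget_gate_bias": 5,
        "dropout": 0.4,
        "learning_rate": 1e-3,
        "use_mse": False,  # use the NSE-based loss instead of MSE
        "epochs": 30,
    }
    train(example_cfg)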
def train(cfg):
    """Train XGBoost model.

    Parameters
    ----------
    cfg : Dict
        Dictionary containing the run config
    """
    # fix random seeds
    random.seed(cfg["seed"])
    np.random.seed(cfg["seed"])
    torch.cuda.manual_seed(cfg["seed"])
    torch.manual_seed(cfg["seed"])

    basins = cfg["basins"]

    # create folder structure for this run
    cfg = _setup_run(cfg)

    # prepare data for training
    cfg = _prepare_data(cfg=cfg, basins=basins)

    # prepare Dataset
    ds = CamelsH5(h5_file=cfg["train_file"],
                  basins=basins,
                  db_path=cfg["db_path"],
                  concat_static=False,
                  cache=True,
                  no_static=cfg["no_static"])

    # create train/val sets
    x = ds.x.reshape(len(ds.x), -1)
    y = ds.y.reshape(len(ds.y))
    if not cfg["no_static"]:
        attr_indices = np.searchsorted(ds.df.index, ds.sample_2_basin)
        attributes = ds.df.iloc[attr_indices].values
        x = np.concatenate([x, attributes], axis=1)

    # define loss function
    if not cfg["use_mse"]:
        # slight hack to enable NSE on XGBoost: replace the target with a unique id
        # so we can figure out the corresponding q_std during the loss calculation.
        y_actual = y.copy()
        y = np.arange(len(y))
        loss = NSEObjective(y, y_actual, ds.q_stds)
        objective = loss.nse_objective
        eval_metric = loss.nse_metric
        scoring = loss.neg_nse_metric_sklearn
    else:
        objective = 'reg:squarederror'
        eval_metric = 'rmse'
        scoring = 'neg_mean_squared_error'

    num_val_samples = int(len(x) * 0.1)
    val_indices = np.random.choice(range(len(x)), size=num_val_samples, replace=False)
    train_indices = np.setdiff1d(range(len(x)), val_indices)
    val = [(x[train_indices], y[train_indices]), (x[val_indices], y[val_indices])]

    if cfg["model_dir"] is None:

        def param_search(param_dist, n_iter):
            model = xgb.XGBRegressor(n_estimators=cfg["param_search_n_estimators"],
                                     objective=objective,
                                     n_jobs=1,
                                     random_state=cfg["seed"])
            model = model_selection.RandomizedSearchCV(model,
                                                       param_dist,
                                                       n_iter=n_iter,
                                                       cv=cfg["n_cv"],
                                                       return_train_score=True,
                                                       scoring=scoring,
                                                       n_jobs=cfg["num_workers"],
                                                       random_state=cfg["seed"],
                                                       refit=False,
                                                       verbose=5)
            model.fit(x[train_indices],
                      y[train_indices],
                      eval_set=val,
                      eval_metric=eval_metric,
                      early_stopping_rounds=cfg["param_search_early_stopping_rounds"],
                      verbose=False)
            return model

        best_params = param_search(cfg["param_dist"], cfg["param_search_n_iter"]).best_params_
        print(f"Best parameters: {best_params}")

        # find regularization parameters in a separate search
        for k, v in best_params.items():
            cfg["reg_param_dist"][k] = [v]
        model = param_search(cfg["reg_param_dist"], cfg["reg_search_n_iter"])
        print(f"Best regularization parameters: {model.best_params_}")

        cv_results = pd.DataFrame(model.cv_results_).sort_values(by='mean_test_score',
                                                                  ascending=False)
        print(cv_results.filter(regex='param_|mean_test_score|mean_train_score', axis=1).head())
        print(cv_results.loc[model.best_index_, ['mean_train_score', 'mean_test_score']])

        xgb_params = model.best_params_
    else:
        print('Using model parameters from {}'.format(cfg["model_dir"]))
        model = pickle.load(open(cfg["model_dir"] / "model.pkl", "rb"))
        xgb_params = model.get_xgb_params()

    # train the final model with the selected parameters
    xgb_params['learning_rate'] = cfg["final_learning_rate"]
    xgb_params['n_estimators'] = cfg["final_n_estimators"]
    model = xgb.XGBRegressor()
    model.set_params(**xgb_params)
    model.objective = objective
    model.random_state = cfg["seed"]
    model.n_jobs = cfg["num_workers"]
    print(model.get_xgb_params())

    model.fit(x[train_indices],
              y[train_indices],
              eval_set=val,
              eval_metric=eval_metric,
              early_stopping_rounds=cfg["final_early_stopping_rounds"],
              verbose=True)

    model_path = cfg["run_dir"] / "model.pkl"
    pickle.dump(model, open(str(model_path), 'wb'))
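# A minimal sketch of how the id-for-target hack above can feed per-sample
# q_stds into a custom objective for the scikit-learn XGBoost wrapper.
# `make_nse_objective`, the eps constant and the exact weighting are
# illustrative assumptions, not necessarily the NSEObjective implementation.
def make_nse_objective(y_true_lookup, q_stds_lookup, eps=0.1):
    """Return an objective for targets that hold sample ids instead of values."""

    def nse_objective(y_ids, y_pred):
        ids = y_ids.astype(int)
        y_true = y_true_lookup[ids]
        weights = 1.0 / (q_stds_lookup[ids] + eps) ** 2
        # gradient and Hessian of 0.5 * w * (pred - obs)^2 w.r.t. the prediction
        grad = weights * (y_pred - y_true)
        hess = weights
        return grad, hess

    return nse_objective

# hypothetical usage: xgb.XGBRegressor(objective=make_nse_objective(y_actual, ds.q_stds))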
def evaluate(user_cfg: Dict):
    """Evaluate trained model.

    Parameters
    ----------
    user_cfg : Dict
        Dictionary containing the user-entered evaluation config
    """
    with open(user_cfg["run_dir"] / 'cfg.json', 'r') as fp:
        run_cfg = json.load(fp)

    if user_cfg["split_file"] is not None:
        with Path(user_cfg["split_file"]).open('rb') as fp:
            splits = pickle.load(fp)
        basins = splits[run_cfg["split"]]["test"]
    else:
        basins = get_basin_list()

    # get attribute means/stds from the training dataset
    train_file = user_cfg["run_dir"] / "data/train/train_data.h5"
    db_path = str(user_cfg["run_dir"] / "attributes.db")
    ds_train = CamelsH5(h5_file=train_file,
                        db_path=db_path,
                        basins=basins,
                        concat_static=run_cfg["concat_static"])
    means = ds_train.get_attribute_means()
    stds = ds_train.get_attribute_stds()

    # create model
    input_size_dyn = 5 if (run_cfg["no_static"] or not run_cfg["concat_static"]) else 32
    model = Model(input_size_dyn=input_size_dyn,
                  hidden_size=run_cfg["hidden_size"],
                  dropout=run_cfg["dropout"],
                  concat_static=run_cfg["concat_static"],
                  no_static=run_cfg["no_static"]).to(DEVICE)

    # load trained model weights
    weight_file = user_cfg["run_dir"] / 'model_epoch30.pt'
    model.load_state_dict(torch.load(weight_file, map_location=DEVICE))

    date_range = pd.date_range(start=GLOBAL_SETTINGS["val_start"],
                               end=GLOBAL_SETTINGS["val_end"])
    results = {}
    for basin in tqdm(basins):
        ds_test = CamelsTXT(camels_root=user_cfg["camels_root"],
                            basin=basin,
                            dates=[GLOBAL_SETTINGS["val_start"], GLOBAL_SETTINGS["val_end"]],
                            is_train=False,
                            seq_length=run_cfg["seq_length"],
                            with_attributes=True,
                            attribute_means=means,
                            attribute_stds=stds,
                            concat_static=run_cfg["concat_static"],
                            db_path=db_path)
        loader = DataLoader(ds_test, batch_size=1024, shuffle=False, num_workers=4)

        preds, obs = evaluate_basin(model, loader)

        df = pd.DataFrame(data={'qobs': obs.flatten(), 'qsim': preds.flatten()},
                          index=date_range)

        results[basin] = df

    _store_results(user_cfg, run_cfg, results)
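# Example usage -- a hypothetical helper showing the keys `evaluate` reads from
# the user config; the paths are placeholders, not real locations.
def _example_evaluate_run():
    user_cfg = {
        "run_dir": Path("runs/example_run"),  # directory created by train()
        "split_file": None,                   # or a pickle with train/test splits
        "camels_root": Path("data/CAMELS"),   # root of the CAMELS data set
    }
    evaluate(user_cfg)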
def dist_train(rank, world_size, cfg):
    """Train model with DistributedDataParallel.

    Parameters
    ----------
    rank : int
        Rank of the current process
    world_size : int
        Total number of processes
    cfg : Dict
        Dictionary containing the run config
    """
    print(f"Running DDP training on rank {rank} of {world_size}.")
    setup(rank, world_size)

    # fix random seeds
    random.seed(cfg["seed"])
    np.random.seed(cfg["seed"])
    torch.cuda.manual_seed(cfg["seed"])
    torch.manual_seed(cfg["seed"])

    basins = get_basin_list()

    if rank == 0:
        # create folder structure for this run
        cfg = _setup_run(cfg)

        # prepare data for training
        cfg = _prepare_data(cfg=cfg, basins=basins)

        # share the updated config with the other processes
        with open(str(cfg["camels_root"]) + '/cfg.pkl', 'wb') as f:
            pickle.dump(cfg, f, pickle.HIGHEST_PROTOCOL)

    dist.barrier()
    with open(str(cfg["camels_root"]) + '/cfg.pkl', 'rb') as f:
        cfg = pickle.load(f)

    # prepare PyTorch DataLoader with a DistributedSampler
    ds = CamelsH5(h5_file=cfg["train_file"],
                  basins=basins,
                  db_path=cfg["db_path"],
                  concat_static=cfg["concat_static"],
                  cache=cfg["cache_data"],
                  no_static=cfg["no_static"])
    sampler = torch.utils.data.distributed.DistributedSampler(ds,
                                                              num_replicas=world_size,
                                                              rank=rank)
    loader = DataLoader(ds,
                        batch_size=cfg["batch_size"],
                        shuffle=False,
                        num_workers=cfg["num_workers"],
                        sampler=sampler,
                        pin_memory=True)

    # create model and optimizer
    input_size_stat = 0 if cfg["no_static"] else 27
    input_size_dyn = 5 if (cfg["no_static"] or not cfg["concat_static"]) else 32
    model = Model(input_size_dyn=input_size_dyn,
                  input_size_stat=input_size_stat,
                  hidden_size=cfg["hidden_size"],
                  initial_forget_bias=cfg["initial_forget_gate_bias"],
                  dropout=cfg["dropout"],
                  concat_static=cfg["concat_static"],
                  no_static=cfg["no_static"])
    ddp_model = DDP(model.to(rank), device_ids=[rank])
    optimizer = torch.optim.Adam(ddp_model.parameters(), lr=cfg["learning_rate"])

    # define loss function
    if cfg["use_mse"]:
        loss_func = nn.MSELoss()
    else:
        loss_func = NSELoss()

    # reduce the learning rate after epochs 10 and 20
    learning_rates = {11: 5e-4, 21: 1e-4}

    CHECKPOINT_PATH = tempfile.gettempdir() + "/model.checkpoint"
    map_location = {'cuda:%d' % 0: 'cuda:%d' % rank}

    for epoch in range(1, math.ceil(cfg["epochs"] / world_size) + 1):
        # set new learning rate
        if epoch in learning_rates.keys():
            for param_group in optimizer.param_groups:
                param_group["lr"] = learning_rates[epoch]

        # rank 0 writes the current weights, then all ranks reload them so every
        # process starts the epoch from the same state
        if rank == 0:
            torch.save(ddp_model.state_dict(), CHECKPOINT_PATH)
        dist.barrier()
        ddp_model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=map_location))

        train_epoch(ddp_model, optimizer, loss_func, loader, cfg, epoch, cfg["use_mse"], rank)

        # per-epoch checkpointing to the run directory is currently disabled:
        # model_path = cfg["run_dir"] / f"model_epoch{epoch}.pt"
        # torch.save(ddp_model.state_dict(), str(model_path))

    # if rank == 0:
    #     os.remove(CHECKPOINT_PATH)
    cleanup()
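# Launch sketch -- assumes `setup`/`cleanup` wrap the usual torch.distributed
# init_process_group/destroy_process_group calls and that one process per
# visible GPU is wanted. `run_distributed` is a hypothetical helper; mp.spawn
# passes the process rank as the first argument, which matches dist_train's
# signature.
import torch.multiprocessing as mp

def run_distributed(cfg):
    world_size = torch.cuda.device_count()
    mp.spawn(dist_train, args=(world_size, cfg), nprocs=world_size, join=True)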