def get_simulations(data_dir, standardized=True, DOY=False, to_numpy=True,
                    drop_parameters=False):
    """Load the simulated input/output data from sims_in.csv and sims_out.csv."""
    path_in = os.path.join(data_dir, "sims_in.csv")
    path_out = os.path.join(data_dir, "sims_out.csv")

    X = pd.read_csv(path_in, sep=";")

    # encode day of year as sine and cosine
    X["DOY_sin"], X["DOY_cos"] = utils.encode_doy(X["DOY"])
    if DOY:
        X = X.drop(columns=["sample", "year", "CO2"])
    else:
        X = X.drop(columns=["DOY", "sample", "year", "CO2"])

    # optionally drop the model parameter columns
    if drop_parameters:
        X = X.drop(columns=["beta", "X0", "gamma", "alpha", "chi"])

    if standardized:
        X = utils.minmax_scaler(X)

    Y = pd.read_csv(path_out, sep=";")

    if to_numpy:
        return X.to_numpy(), Y.to_numpy()
    else:
        return X, Y
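
# Usage sketch: load simulations from a directory that holds sims_in.csv and
# sims_out.csv (the path below is hypothetical):
# X_sims, Y_sims = get_simulations(os.path.join("data", "simulations"),
#                                  standardized=True, drop_parameters=False)
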
def get_borealsites(year, preles=False, site=["hyytiala"],
                    colnames=["PAR", "TAir", "VPD", "Precip", "fAPAR",
                              "DOY_sin", "DOY_cos"]):
    """Load the boreal sites data; the first 365 rows are the training year."""
    data_dir = r"OneDrive\Dokumente\Sc_Master\Masterthesis\Project\DomAdapt\data\borealsites"

    X = pd.read_csv(os.path.join(data_dir, "borealsites_in"), sep=";")
    Y = pd.read_csv(os.path.join(data_dir, "borealsites_out"), sep=";")
    if preles:
        preles_preds = pd.read_csv(os.path.join(data_dir, "preles_out"), sep=";")

    # encode day of year as sine and cosine
    X["DOY_sin"], X["DOY_cos"] = utils.encode_doy(X["DOY"])

    # keep only the requested sites
    row_ind = X["site"].isin(site)
    print(f"Returns {site} from \n", X["site"].unique())
    X, Y = X[row_ind], Y[row_ind]
    if preles:
        preles_preds = preles_preds[row_ind]

    X[colnames] = utils.minmax_scaler(X[colnames])
    X = X[colnames]

    # first year for training, second year for testing
    if year == "train":
        X, Y = X[:365], Y[:365]
        if preles:
            preles_preds = preles_preds[:365]
    elif year == "test":
        X, Y = X[365:], Y[365:]
        if preles:
            preles_preds = preles_preds[365:]

    Y = Y.drop(columns=["ET"])

    if preles:
        preles_preds = preles_preds.drop(columns=["ET", "SW"])
        return preles_preds.to_numpy()
    else:
        return X.to_numpy(), Y.to_numpy()
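
# Usage sketch: rows [:365] form the training year, rows [365:] the test year;
# with preles=True only the PRELES predictions are returned.
# X_train, Y_train = get_borealsites(year="train")
# preles_test = get_borealsites(year="test", preles=True)
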
def random_forest_CV(X, Y, splits, shuffled, n_trees, depth,
                     eval_set=None, selected=True):
    """k-fold cross-validation of a random forest regressor."""
    X = minmax_scaler(X)

    # divide into training and validation folds
    kf = KFold(n_splits=splits, shuffle=shuffled)

    regressor = RandomForestRegressor(n_estimators=n_trees, max_depth=depth,
                                      criterion="mse")  # renamed to "squared_error" in scikit-learn >= 1.0

    rmse_train = np.zeros(splits)
    mae_train = np.zeros(splits)
    rmse_val = np.zeros(splits)
    mae_val = np.zeros(splits)

    y_preds = []
    y_trains = []
    y_tests = []

    for i, (train_index, test_index) in enumerate(kf.split(X)):

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]

        # optionally validate on an external evaluation set instead of the fold
        if eval_set is not None:
            X_test = eval_set["X_test"]
            y_test = eval_set["Y_test"]

        regressor.fit(X_train, y_train.ravel())

        y_pred_test = regressor.predict(X_test)
        y_pred_train = regressor.predict(X_train)

        # evaluate the algorithm
        rmse_val[i] = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
        mae_val[i] = metrics.mean_absolute_error(y_test, y_pred_test)
        rmse_train[i] = np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
        mae_train[i] = metrics.mean_absolute_error(y_train, y_pred_train)

        y_preds.append(y_pred_test)
        y_tests.append(y_test)
        y_trains.append(y_train)

    if selected:
        # per-fold errors
        errors = [rmse_train, rmse_val, mae_train, mae_val]
    else:
        # errors averaged over the folds
        errors = [np.mean(rmse_train), np.mean(rmse_val),
                  np.mean(mae_train), np.mean(mae_val)]

    return y_preds, y_tests, errors
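
# Minimal usage sketch with synthetic data (shapes and hyperparameters are
# illustrative, not the thesis configuration):
if __name__ == "__main__":
    X_demo = np.random.rand(730, 7)  # two years of daily data, seven features
    Y_demo = np.random.rand(730, 1)
    preds, tests, errors = random_forest_CV(X_demo, Y_demo, splits=6,
                                            shuffled=False, n_trees=200,
                                            depth=6, selected=False)
    print("RMSE train, RMSE val, MAE train, MAE val:", errors)
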
def get_splits(sites, years, datadir, dataset="profound", simulations=None,
               drop_cols=False, standardized=True,
               colnames=["PAR", "TAir", "VPD", "Precip", "fAPAR",
                         "DOY_sin", "DOY_cos"],
               to_numpy=True):
    """Select sites and years from a dataset and return inputs and targets."""
    datadir = os.path.join(datadir, dataset)
    X, Y = load_data(dataset=dataset, data_dir=datadir, simulations=simulations)

    X["date"] = X["date"].str[:4].astype(int)  # get years as integers
    # encode day of year as sine and cosine
    X["DOY_sin"], X["DOY_cos"] = utils.encode_doy(X["DOY"])

    if all(site in X["site"].values for site in sites):
        row_ind = X["site"].isin(sites)
        print(f"Returns {sites} from \n", X["site"].unique())
        X, Y = X[row_ind], Y[row_ind]
    else:
        print("Not all sites in dataset!")

    if standardized:
        X[colnames] = utils.minmax_scaler(X[colnames])

    try:
        row_ind = X["date"].isin(years)
        print(f"Returns valid years from {years} in \n", X["date"].unique())
        X, Y = X[row_ind], Y[row_ind]
    except TypeError:
        print("Years specification invalid. Returns all years.")

    try:
        X = X[colnames]
    except KeyError:
        print("Columns are missing!")

    if simulations is not None:
        if drop_cols:
            Y = Y.drop(columns=["ET", "SW"])
        else:
            # move evapotranspiration and soil water from targets to inputs
            X["ET"] = Y["ET"]
            X["SW"] = Y["SW"]
            Y = Y.drop(columns=["ET", "SW"])
    else:
        if "ET" in Y.columns:
            Y = Y.drop(columns=["ET"])

    if to_numpy:
        X, Y = X.to_numpy(), Y.to_numpy()

    return X, Y
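
# Usage sketch for the simulations branch (the simulations value is forwarded
# to load_data, which is not shown here, so "preles" is an assumption): with
# drop_cols=False, ET and SW move from the targets into the inputs.
# X_sim, Y_sim = get_splits(sites=["le_bray"], years=[2001], datadir="data",
#                           dataset="profound", simulations="preles",
#                           drop_cols=False)
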
import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

import preprocessing
import utils
#import models

#%%
datadir = r"OneDrive\Dokumente\Sc_Master\Masterthesis\Project\DomAdapt"

X, Y = preprocessing.get_splits(sites=['le_bray'],
                                years=[2001],
                                datadir=os.path.join(datadir, "data"),
                                dataset="profound",
                                simulations=None,
                                colnames=["PAR", "TAir", "VPD", "Precip",
                                          "fAPAR", "DOY_sin", "DOY_cos"],
                                to_numpy=True)

#%% Train
X = utils.minmax_scaler(X)

X = torch.tensor(X).type(dtype=torch.float)
Y = torch.tensor(Y).type(dtype=torch.float)

#model = models.MLP([X.shape[1], 12, 1], nn.ReLU)
#model = models.LSTM(X.shape[1], 12, 1, 10, F.relu)

x, target = utils.create_batches(X, Y, 128, 0)
#x_test, target_test = utils.create_batches(X, Y, 128, 0)

x = torch.tensor(x).type(dtype=torch.float)
target = torch.tensor(target).type(dtype=torch.float)
#x_test = torch.tensor(x_test).type(dtype=torch.float)
#target_test = torch.tensor(target_test).type(dtype=torch.float)

#%%
hiddensize = [16, 32, 64, 128, 256]
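
#%% Hyperparameter sweep (a sketch, not the thesis training code): the thesis
# models (models.MLP / models.LSTM) are commented out above, so this uses a
# plain nn.Sequential stand-in with the same input and output sizes; Adam, the
# learning rate, and the epoch count are assumptions.
for size in hiddensize:
    model = nn.Sequential(nn.Linear(X.shape[1], size), nn.ReLU(),
                          nn.Linear(size, 1))
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()
    for epoch in range(200):
        optimizer.zero_grad()
        loss = criterion(model(X), Y)
        loss.backward()
        optimizer.step()
    print(f"hiddensize {size}: final training MSE {loss.item():.4f}")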