def plot_train_test_error_nocv_vs_n_trn_exs(f_ppi, o_ppi, max_z=40000.0,
                                            training_expt='abs1.0_norf_spinup',
                                            n_trees=10, min_samples_leaf=10,
                                            rain_only=False, no_cos=True,
                                            use_rh=False):
    datadir, trainfile, testfile, _ = ml_load.GetDataPath(training_expt)
    f_test, o_test, _, z, _, _ = ml_load.LoadData(testfile, max_z,
                                                  n_trn_exs=None,
                                                  rain_only=rain_only,
                                                  no_cos=no_cos, use_rh=use_rh)
    rf = RandomForestRegressor(n_estimators=n_trees,
                               min_samples_leaf=min_samples_leaf,
                               random_state=123, warm_start=False)
    n_trn_exs = np.array([1, 5, 10, 30, 50, 70, 80, 90]) * 10000
    test_error = np.zeros(len(n_trn_exs))
    train_error = np.zeros(len(n_trn_exs))
    for i in range(len(n_trn_exs)):
        f, o, _, z, rho, _ = ml_load.LoadData(trainfile, max_z,
                                              n_trn_exs=n_trn_exs[i],
                                              rain_only=rain_only,
                                              no_cos=no_cos, use_rh=use_rh)
        f_pp = ml_load.init_pp(f_ppi, f)
        o_pp = ml_load.init_pp(o_ppi, o)
        f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
        o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)
        rf.fit(f_scl, o_scl)
        f_test_scl = ml_load.transform_data(f_ppi, f_pp, f_test, z)
        o_test_scl = ml_load.transform_data(o_ppi, o_pp, o_test, z)
        test_error[i] = 1.0 - rf.score(f_test_scl, o_test_scl)
        train_error[i] = 1.0 - rf.score(f_scl, o_scl)
        print(str(n_trn_exs[i]) + ': ' + str(test_error[i]))
        print(str(n_trn_exs[i]) + ': ' + str(train_error[i]))
    print(test_error)
    print(train_error)
    fig = plt.figure()
    fscale = 100000.0
    plt.plot(n_trn_exs / fscale, test_error, '-o', label='test')
    plt.plot(n_trn_exs / fscale, train_error, '-o', label='train')
    plt.xlim(-0.15, 9.5)
    plt.ylim(0, 0.51)
    plt.xlabel("n_trn_exs")
    plt.ylabel("Error")
    # A second plt.legend() call replaces the first legend, so set both
    # options in a single call.
    plt.legend(loc="upper right", frameon=False)
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout()  # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_test_train_nocv_vs_n_trn_exs.eps',
                bbox_inches='tight')
    plt.close()
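# For context: RandomForestRegressor.score returns the coefficient of
# determination R^2, so the test_error and train_error curves above plot
# 1 - R^2, i.e. the fraction of output variance the forest fails to explain.
# A minimal self-contained sketch of the same metric on synthetic data
# (shapes and values are illustrative, not from the original experiments):
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(123)
X_demo = rng.randn(1000, 5)                               # stand-in scaled features
Y_demo = X_demo @ rng.randn(5, 3) + 0.1 * rng.randn(1000, 3)  # stand-in targets

rf_demo = RandomForestRegressor(n_estimators=10, min_samples_leaf=10,
                                random_state=123)
rf_demo.fit(X_demo[:800], Y_demo[:800])
print(1.0 - rf_demo.score(X_demo[:800], Y_demo[:800]))    # training error (1 - R^2)
print(1.0 - rf_demo.score(X_demo[800:], Y_demo[800:]))    # test error (1 - R^2)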
def plot_error_vs_n_trn_exs(f_ppi, o_ppi, max_z=40000.0,
                            training_expt='abs1.0_norf_spinup', n_trees=10,
                            min_samples_leaf=10, rain_only=False, no_cos=True,
                            use_rh=False, load_results=True,
                            save_results=False):
    if load_results:
        print('loading results')
        n_trn_exs, cv_error, cv_error_std = pickle.load(
            open('figs_errors/error_vs_n_trn_exs.pkl', 'rb'))
    else:
        datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)
        rf = RandomForestRegressor(n_estimators=n_trees,
                                   min_samples_leaf=min_samples_leaf,
                                   max_features=1.0 / 3.0, random_state=123,
                                   warm_start=False)
        n_trn_exs = np.array([1, 5, 10, 30, 50, 70, 80, 90]) * 10000
        cv_error = np.zeros(len(n_trn_exs))
        # standard deviation of error estimate across folds
        cv_error_std = np.zeros(len(n_trn_exs))
        for i in range(len(n_trn_exs)):
            f, o, _, z, rho, _ = ml_load.LoadData(trainfile, max_z,
                                                  n_trn_exs=n_trn_exs[i],
                                                  rain_only=rain_only,
                                                  no_cos=no_cos,
                                                  use_rh=use_rh)
            f_pp = ml_load.init_pp(f_ppi, f)
            o_pp = ml_load.init_pp(o_ppi, o)
            f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
            o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)
            scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
            cv_error[i] = 1 - scores.mean()
            cv_error_std[i] = scores.std()
            print(str(n_trn_exs[i]) + ': ' + str(cv_error[i]))
        if save_results:
            print('saving results')
            pickle.dump([n_trn_exs, cv_error, cv_error_std],
                        open('figs_errors/error_vs_n_trn_exs.pkl', 'wb'))
    print(n_trn_exs)
    print(cv_error)
    print(np.max(cv_error_std / np.sqrt(n_cv)))
    fig = plt.figure(figsize=(3.0, 2.25))
    fscale = 100000.0
    plt.plot(n_trn_exs / fscale, cv_error, '-o')
    plt.xlim(-0.15, 9.5)
    plt.ylim(0, 0.51)
    plt.xlabel("Number of training examples ($10^5$)")
    plt.ylabel("Error")
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout()  # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_n_trn_exs.eps', bbox_inches='tight')
    plt.close()
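# The cross-validated sweeps in this file rely on module-level globals that
# are not defined in this excerpt. A plausible setup (the values here are
# assumptions, not the authors' choices):
n_cv = 5       # number of cross-validation folds (hypothetical)
n_jobs = n_cv  # parallel workers for cross_val_score, one per fold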
def train_wrapper(f_ppi, o_ppi, training_expt, input_vert_dim, output_vert_dim,
                  input_vert_vars, output_vert_vars, flag_dict, do_nn=False,
                  n_iter=None, do_train=True, no_cos=True, use_rh=False,
                  max_z=40000.0, rain_only=False, n_trn_exs=None,
                  plot_training_results=False, n_trees=100, min_samples_leaf=10,
                  max_depth=25, n_layers=2, n_hid_neur=10, n_stable=None,
                  weight_decay=0.0, do_wind_input=False, do_diffusion=True,
                  scale_level=False, rewight_outputs=False, weight_list=[1, 1],
                  is_cheyenne=False, only_plot=False):
    """Loads training data and trains and stores estimator

    Args:
        f_ppi (dict): The type of preprocessing to do to the features (inputs)
        o_ppi (dict): The type of preprocessing to do to the targets (outputs)
        n_layers (int): Number of layers in the NN
        n_hid_neur (int): Number of hidden neurons in each layer
        n_iter (int): Number of iterations
        n_stable (int): Number of iterations after stability reached
        max_z (float): Don't train on data above this level
        weight_decay (float): Regularization strength. 0 is no regularization
        rain_only (bool): Only train on precipitating examples
        n_trn_exs (int): Number of training examples to learn on
        do_nn (bool): Use an ANN instead of a random forest
        no_cos (bool): If True, don't weight by cosine(latitude)
        min_samples_leaf (int): Minimum samples per leaf
        plot_training_results (bool): Whether to also plot the model on
            training data
        use_rh (bool): Use generalized relative humidity instead of total
            non-precipitating water as a feature
        do_train (bool): Whether to train (just plot the results if False)

    Returns:
        str: String id of trained NN
    """
    # Load data (note LoadData seeds the random number generator)
    if not only_plot:
        datadir, trainfile, testfile, pp_str = ml_load.GetDataPath(
            training_expt, wind_input=do_wind_input, is_cheyenne=is_cheyenne)
        f, o, y, z, rho, p = ml_load.LoadData(
            trainfile, max_z, input_vert_vars=input_vert_vars,
            output_vert_vars=output_vert_vars, rain_only=rain_only,
            n_trn_exs=n_trn_exs, no_cos=no_cos, use_rh=use_rh,
            wind_input=do_wind_input,
            exclusion_flag=flag_dict['exclusion_flag'])
        # Load test data
        tf, to, ty, tz, trho, tp = ml_load.LoadData(
            testfile, max_z, input_vert_vars=input_vert_vars,
            output_vert_vars=output_vert_vars, rain_only=rain_only,
            n_trn_exs=n_trn_exs, no_cos=no_cos, use_rh=use_rh)
        # Scale data (both train and test)
        f_pp, f_scl, tf_scl, o_pp, o_scl, to_scl, pp_str = PreprocessData_tr_ts(
            f_ppi, f, tf, o_ppi, o, to, pp_str, n_trn_exs, z, input_vert_dim,
            input_vert_vars, output_vert_dim, output_vert_vars, scale_level,
            rewight_outputs=rewight_outputs,
            weight_list=weight_list)  # Yani TO DO!!!
        # Scale test data (superseded by PreprocessData_tr_ts above)
        # tf_pp, tf_scl, to_pp, to_scl, tpp_str = PreprocessData(
        #     f_ppi, tf, o_ppi, to, pp_str, n_trn_exs, tz)
        # Scale data
        # f_pp, f_scl, o_pp, o_scl, pp_str = PreprocessData(
        #     f_ppi, f, o_ppi, o, pp_str, n_trn_exs, z)

        # Either build a random forest or a neural network
        if do_nn:
            regularize = CatchRegularization(weight_decay)
            est, est_str = BuildNN(max_z, n_layers, 'Rectifier', n_hid_neur,
                                   'momentum', pp_str, batch_size=100,
                                   n_stable=n_stable, n_iter=n_iter,
                                   learning_momentum=0.9, learning_rate=0.01,
                                   regularize=regularize,
                                   weight_decay=weight_decay, valid_size=0.2)
        else:
            est, est_str = BuildRandomForest(max_z, n_trees, min_samples_leaf,
                                             pp_str, max_depth, do_diffusion)
        est_str = UpdateName(no_cos, use_rh, rain_only, est_str)

        # Print details about the ML algorithm we are using
        print(est_str + ' Using ' + str(f.shape[0]) +
              ' training examples with ' + str(f.shape[1]) +
              ' input features and ' + str(o.shape[1]) + ' output targets')

        # Train the estimator
        if do_train:
            est, est_errors, train_score, test_score = train_est(
                est, est_str, f_scl, o_scl, tf_scl, to_scl, do_nn)
            est_str = est_str + 'te' + str(int(str(test_score)[2:4])) + \
                '_tr' + str(int(str(train_score)[2:4]))
            # Save the estimator to access it later
            save_est(est, est_str, est_errors, f_ppi, o_ppi, f_pp, o_pp, y, z,
                     p, rho, train_score, test_score, is_cheyenne)

        # Write a netcdf file for the gcm
        if do_nn:
            write_netcdf_nn(est_str, trainfile, rain_only, no_cos, use_rh,
                            is_cheyenne)
        else:
            write_netcdf_rf(est_str, trainfile, output_vert_vars,
                            output_vert_dim, rain_only, no_cos, use_rh,
                            scale_level, rewight_outputs=rewight_outputs,
                            weight_list=weight_list, is_cheyenne=is_cheyenne)

    # Plot figures with testing data using all of it
    if only_plot:
        # Several hard-coded configurations; each assignment below overrides
        # the previous one, so only the last set is actually used.
        trainfile = '/glade/work/janniy/mldata/training_data/qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF40FFTFTTF4848_training_x_no_subsampling.pkl'
        testfile = '/glade/work/janniy/mldata/training_data/qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF40FFTFTTF4848_testing_x_no_subsampling.pkl'
        est_str = 'qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF40FFTFTTF4848_F-NoSc_O-Stan_Ntr5000000_Nte972360_F_Tin_qin_qpin_latin_O_Tout_qout_qpout_RF_NTr10_MinS20max_d27_maxzinf_nocos_te50_tr54'
        trainfile = '/glade/work/janniy/mldata/training_data/qobsFFTFTFTFF0FFTFTF15FFFFFTFFFFTF815FFTFTTF00_training.pkl'
        testfile = '/glade/work/janniy/mldata/training_data/qobsFFTFTFTFF0FFTFTF15FFFFFTFFFFTF815FFTFTTF00_testing.pkl'
        est_str = 'qobsFFTFTFTFF0FFTFTF15FFFFFTFFFFTF815FFTFTTF00_F-NoSc_O-Stan_Ntr5000002_Nte972360_F_Tin_qin_uin_vinMinusSH_usurf_latin_O_tsurfCorr_qsurfCorr_tkz_RF_NTr10_MinS20max_d27_maxzinf_nocos_te70_tr75'
        trainfile = '/glade/scratch/janniy/mldata_tmp/training_data/qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF320FFTFTTF4848_training_x_no_subsampling.pkl'
        testfile = '/glade/scratch/janniy/mldata_tmp/training_data/qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF320FFTFTTF4848_testing_x_no_subsampling.pkl'
        est_str = 'qobsTTFFFFFTF26TTTFTF48TFFFFFTFTFFF320FFTFTTF4848_F-NoSc_O-Stan_Ntr1969020_Nte218790_F_Tin_qin_qpin_latin_O_Tout_qout_qpout_RF_NTr10_MinS7max_d27_maxzinf_nocos_te85_tr88'

    if only_plot:
        figpath = '/glade/scratch/janniy/figs_tmp_xy' + est_str + '/'
    else:
        figpath = './figs/' + est_str + '/'
    ml_plot.PlotAllFigs(est_str, testfile, do_nn, figpath, input_vert_vars,
                        output_vert_vars, input_vert_dim, output_vert_dim,
                        rain_only=rain_only, n_trn_exs=n_trn_exs,
                        no_cos=no_cos, use_rh=use_rh,
                        wind_input=do_wind_input,
                        scale_per_column=scale_level,
                        rewight_outputs=rewight_outputs,
                        weight_list=weight_list, is_cheyenne=is_cheyenne)

    if plot_training_results:  # note: uses n_trn_exs here as training data
        figpath = figpath + 'training_data/'
        ml_plot.PlotAllFigs(est_str, trainfile, do_nn, figpath,
                            input_vert_vars, output_vert_vars, input_vert_dim,
                            output_vert_dim, rain_only=rain_only,
                            n_trn_exs=n_trn_exs, no_cos=no_cos, use_rh=use_rh,
                            wind_input=do_wind_input,
                            rewight_outputs=rewight_outputs,
                            weight_list=weight_list, is_cheyenne=is_cheyenne)

    return est_str
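# A hedged usage sketch for train_wrapper. The variable names mirror those
# embedded in the estimator strings above (Tin/qin/qpin/latin inputs,
# Tout/qout/qpout outputs); the vertical dimensions and flag_dict contents
# are assumptions for illustration only.
f_ppi_demo = {'name': 'NoScaler'}        # features unscaled (fine for an RF)
o_ppi_demo = {'name': 'StandardScaler'}  # targets standardized
est_str_demo = train_wrapper(
    f_ppi_demo, o_ppi_demo, 'qobs',
    input_vert_dim=[48, 48, 48, 1],      # assumed levels per input variable
    output_vert_dim=[48, 48, 48],        # assumed levels per output variable
    input_vert_vars=['Tin', 'qin', 'qpin', 'latin'],
    output_vert_vars=['Tout', 'qout', 'qpout'],
    flag_dict={'exclusion_flag': False},
    do_nn=False, n_trees=10, min_samples_leaf=20, max_depth=27)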
min_samples_leaf = 10
n_trees = 11
n_trn_exs = 10000000000  # effectively use all available examples
use_rh = False
no_cos = True
training_expt = 'qobs'
# training_expt2 = 'qobs4K'
rain_only = False

# max_z, f_ppi, and o_ppi are assumed to be defined earlier in this script.
datadir, trainfile, testfile, pp_str = ml_load.GetDataPath(training_expt)

# Train data
f, o, y, z, rho, p = ml_load.LoadData(trainfile, max_z, rain_only=rain_only,
                                      n_trn_exs=n_trn_exs, no_cos=no_cos,
                                      use_rh=use_rh)
f_pp, f_scl, o_pp, o_scl, pp_str = PreprocessData(f_ppi, f, o_ppi, o, pp_str,
                                                  n_trn_exs, z)
print('read training data')

# Test data
tf, to, ty, tz, trho, tp = ml_load.LoadData(testfile, max_z,
                                            rain_only=rain_only,
                                            n_trn_exs=n_trn_exs,
                                            no_cos=no_cos, use_rh=use_rh)
tf_pp, tf_scl, to_pp, to_scl, tpp_str = PreprocessData(f_ppi, tf, o_ppi, to,
                                                       pp_str, n_trn_exs, tz)
print('read test data')
def write_netcdf_nn(est_str, datasource, rain_only=False, no_cos=False,
                    use_rh=False, is_cheyenne=False):
    # Set output filename
    if is_cheyenne == False:  # On aimsir/esker
        base_dir = '/net/aimsir/archive1/janniy/'
    else:
        base_dir = '/glade/work/janniy/'
    output_filename = base_dir + 'mldata/gcm_regressors/' + est_str + '.nc'

    # Load the estimator and preprocessors
    est, _, errors, f_ppi, o_ppi, f_pp, o_pp, y, z, p, rho = \
        pickle.load(open(base_dir + 'mldata/regressors/' + est_str + '.pkl',
                         'rb'))

    # Need to transform some data for preprocessors to be able to export params
    f, o, _, _, _, _ = ml_load.LoadData(datasource, max_z=max(z),
                                        rain_only=rain_only, no_cos=no_cos,
                                        use_rh=use_rh)
    f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
    _ = ml_load.transform_data(o_ppi, o_pp, o, z)

    # Also need to use the predict method to be able to export ANN params
    _ = est.predict(f_scl)

    # Grab weights
    w1 = est.get_parameters()[0].weights
    w2 = est.get_parameters()[1].weights
    b1 = est.get_parameters()[0].biases
    b2 = est.get_parameters()[1].biases

    # Grab input and output normalization
    if f_ppi['name'] == 'StandardScaler':
        fscale_mean = f_pp.mean_
        fscale_stnd = f_pp.scale_
    else:
        raise ValueError('Incorrect scaler name')

    if o_ppi['name'] == 'SimpleO':
        Nlev = len(z)
        oscale = np.zeros(b2.shape)
        oscale[:Nlev] = 1.0 / o_pp[0]
        oscale[Nlev:] = 1.0 / o_pp[1]
    elif o_ppi['name'] == 'StandardScaler':
        oscale_mean = o_pp.mean_
        oscale_stnd = o_pp.scale_
    else:
        raise ValueError('Incorrect scaler name')

    # Write weights to file
    ncfile = Dataset(output_filename, 'w', format="NETCDF3_CLASSIC")

    # Write the dimensions
    ncfile.createDimension('N_in', w1.shape[0])
    ncfile.createDimension('N_h1', w1.shape[1])
    ncfile.createDimension('N_out', w2.shape[1])

    # Create variable entries in the file (dims reversed for Fortran)
    nc_w1 = ncfile.createVariable('w1', np.dtype('float32').char,
                                  ('N_h1', 'N_in'))
    nc_w2 = ncfile.createVariable('w2', np.dtype('float32').char,
                                  ('N_out', 'N_h1'))
    nc_b1 = ncfile.createVariable('b1', np.dtype('float32').char, ('N_h1'))
    nc_b2 = ncfile.createVariable('b2', np.dtype('float32').char, ('N_out'))
    nc_fscale_mean = ncfile.createVariable('fscale_mean',
                                           np.dtype('float32').char, ('N_in'))
    nc_fscale_stnd = ncfile.createVariable('fscale_stnd',
                                           np.dtype('float32').char, ('N_in'))
    if o_ppi['name'] == 'SimpleO':
        nc_oscale = ncfile.createVariable('oscale', np.dtype('float32').char,
                                          ('N_out'))
    else:
        nc_oscale_mean = ncfile.createVariable('oscale_mean',
                                               np.dtype('float32').char,
                                               ('N_out'))
        nc_oscale_stnd = ncfile.createVariable('oscale_stnd',
                                               np.dtype('float32').char,
                                               ('N_out'))

    # Write variables and close file - transpose weights because Fortran
    # reads them in "backwards"
    nc_w1[:] = w1.T
    nc_w2[:] = w2.T
    nc_b1[:] = b1
    nc_b2[:] = b2
    nc_fscale_mean[:] = fscale_mean
    nc_fscale_stnd[:] = fscale_stnd
    if o_ppi['name'] == 'SimpleO':
        nc_oscale[:] = oscale
    else:
        nc_oscale_mean[:] = oscale_mean
        nc_oscale_stnd[:] = oscale_stnd

    # Write global file attributes
    ncfile.description = est_str
    ncfile.close()
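# A hedged sanity check for the export above: read the file back and apply
# the network by hand. This assumes a single hidden layer with the
# 'Rectifier' (ReLU) activation used in BuildNN, a linear output layer, and
# StandardScaler preprocessing on both inputs and outputs; none of these are
# guaranteed by this excerpt alone.
import numpy as np
from netCDF4 import Dataset

nc = Dataset(output_filename)                # filename as constructed above
w1, w2 = nc.variables['w1'][:], nc.variables['w2'][:]  # stored transposed
b1, b2 = nc.variables['b1'][:], nc.variables['b2'][:]
x_scl = ((f[0, :] - nc.variables['fscale_mean'][:]) /
         nc.variables['fscale_stnd'][:])     # scale one input column
hidden = np.maximum(0.0, w1 @ x_scl + b1)    # ReLU hidden layer
y_scl = w2 @ hidden + b2                     # linear output layer (assumed)
y_out = (y_scl * nc.variables['oscale_stnd'][:] +
         nc.variables['oscale_mean'][:])     # undo output standardization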
# Define preprocessors. At the moment only 'NoScaler' is handled for the
# features (for an RF, feature scaling doesn't matter).
f_ppi = {'name': 'NoScaler'}
# f_ppi = {'name': 'StandardScaler'}
# o_ppi = {'name': 'SimpleO'}
o_ppi = {'name': 'StandardScaler'}

rewight_outputs = False  # set True to give more weight to certain outputs
weight_list = [1, 1]
rain_only = False

datadir, trainfile, testfile, pp_str = ml_load.GetDataPath(
    training_expt1, is_cheyenne=is_cheyenne)
f, o, y, z, rho, p = ml_load.LoadData(trainfile, max_z,
                                      input_vert_vars=input_vert_vars,
                                      output_vert_vars=output_vert_vars,
                                      rain_only=rain_only,
                                      n_trn_exs=n_trn_exs, no_cos=no_cos,
                                      use_rh=use_rh,
                                      wind_input=do_wind_input)
print('read train data')

# Load test data
tf, to, ty, tz, trho, tp = ml_load.LoadData(testfile, max_z,
                                            input_vert_vars=input_vert_vars,
                                            output_vert_vars=output_vert_vars,
                                            rain_only=rain_only,
                                            n_trn_exs=n_trn_exs,
                                            no_cos=no_cos, use_rh=use_rh)
print('read test data')

# Scale data (both train and test)
f_pp, f_scl, tf_scl, o_pp, o_scl, to_scl, pp_str = ml_train.PreprocessData_tr_ts(
    f_ppi, f, tf, o_ppi, o, to, pp_str, n_trn_exs, z, input_vert_dim,
    input_vert_vars, output_vert_dim, output_vert_vars, scale_level,
    rewight_outputs=rewight_outputs, weight_list=weight_list)  # Yani TO DO!!!
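# The key property of PreprocessData_tr_ts is that scaling statistics are
# fit on the training set only and then reused unchanged on the test set.
# A minimal sketch of that convention with scikit-learn's StandardScaler
# (the concrete helpers here differ; this only illustrates the pattern):
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(o)    # statistics from the training targets only
o_scl_demo = scaler.transform(o)    # scaled training targets
to_scl_demo = scaler.transform(to)  # test targets scaled with *training* stats,
                                    # so errors are measured in the units the
                                    # model was trained in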
def plot_error_vs_min_samples_leaf(f_ppi, o_ppi, max_z=20000.0,
                                   training_expt='abs1.0_norf_spinup',
                                   n_trees=10, n_trn_exs=None, rain_only=False,
                                   no_cos=True, use_rh=False,
                                   load_results=True, save_results=False):
    if load_results:
        print('loading results')
        min_samples_leaf, cv_error, cv_error_std = pickle.load(
            open('figs_errors/error_vs_min_samples_leaf.pkl', 'rb'))
    else:
        datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)
        f, o, _, z, rho, _ = ml_load.LoadData(trainfile, max_z,
                                              n_trn_exs=n_trn_exs,
                                              rain_only=rain_only,
                                              no_cos=no_cos, use_rh=use_rh)
        # Scale data
        f_pp = ml_load.init_pp(f_ppi, f)
        o_pp = ml_load.init_pp(o_ppi, o)
        f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
        o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)
        rf = RandomForestRegressor(n_estimators=n_trees, random_state=123,
                                   max_features=1.0 / 3.0, warm_start=False)
        min_min_samples_leaf = 1
        max_min_samples_leaf = 16
        step_min_samples_leaf = 3
        min_samples_leaf = range(min_min_samples_leaf,
                                 max_min_samples_leaf + 1,
                                 step_min_samples_leaf)
        cv_error = np.zeros(len(min_samples_leaf))
        cv_error_std = np.zeros(len(min_samples_leaf))
        for i in range(len(min_samples_leaf)):
            print(min_samples_leaf[i])
            rf.set_params(min_samples_leaf=min_samples_leaf[i])
            scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
            cv_error[i] = 1 - scores.mean()
            cv_error_std[i] = scores.std()
            print(str(min_samples_leaf[i]) + ': ' + str(cv_error[i]))
        if save_results:
            print('saving results')
            pickle.dump([min_samples_leaf, cv_error, cv_error_std],
                        open('figs_errors/error_vs_min_samples_leaf.pkl',
                             'wb'))
    print(list(min_samples_leaf))
    print(cv_error)
    print(np.max(cv_error_std / np.sqrt(n_cv)))
    fig = plt.figure(figsize=(3.0, 2.25))
    plt.plot(min_samples_leaf, cv_error, '-o')
    plt.xlim(0, 16.5)
    plt.ylim(0, 0.51)
    plt.xlabel("Minimum sample size for a leaf")
    plt.ylabel("Error")
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout()  # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_min_samples_leaf.eps',
                bbox_inches='tight')
    plt.close()
def plot_error_vs_n_trees(f_ppi, o_ppi, max_z=20000.0,
                          training_expt='abs1.0_norf_ras',
                          min_samples_leaf=10, n_trn_exs=None,
                          rain_only=False, no_cos=True, use_rh=False,
                          load_results=True, save_results=False):
    if load_results:
        print('loading results')
        n_trees, cv_error, cv_error_std = pickle.load(
            open('figs_errors/error_vs_n_trees.pkl', 'rb'))
    else:
        datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)
        f, o, y, z, rho, p = ml_load.LoadData(trainfile, max_z,
                                              n_trn_exs=n_trn_exs,
                                              rain_only=rain_only,
                                              no_cos=no_cos, use_rh=use_rh)
        # Scale data
        f_pp = ml_load.init_pp(f_ppi, f)
        o_pp = ml_load.init_pp(o_ppi, o)
        f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
        o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)
        rf = RandomForestRegressor(min_samples_leaf=min_samples_leaf,
                                   max_features=1.0 / 3.0, random_state=123,
                                   warm_start=False)
        min_n_trees = 1
        max_n_trees = 21
        n_trees = range(min_n_trees, max_n_trees + 1, 2)
        cv_error = np.zeros(len(n_trees))
        cv_error_std = np.zeros(len(n_trees))  # standard deviation across folds
        for i in range(len(n_trees)):
            rf.set_params(n_estimators=n_trees[i])
            scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
            cv_error[i] = 1 - scores.mean()
            cv_error_std[i] = scores.std()
            print(str(n_trees[i]) + ': ' + str(cv_error[i]))
        if save_results:
            print('saving results')
            pickle.dump([n_trees, cv_error, cv_error_std],
                        open('figs_errors/error_vs_n_trees.pkl', 'wb'))
    print(list(n_trees))
    print(cv_error)
    # There is some disagreement in the literature as to whether the standard
    # error of a CV estimate should include the 1/sqrt(n_cv) factor
    print(np.max(cv_error_std / np.sqrt(n_cv)))  # max of standard error
    fig = plt.figure(figsize=(3.0, 2.25))
    plt.plot(n_trees, cv_error, 'o-')
    plt.xlim(0, 22.8)
    plt.ylim(0, 0.51)
    plt.xlabel("Number of trees")
    plt.ylabel("Error")
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout()  # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_ntrees.eps', bbox_inches='tight')
    plt.close()
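# On the standard-error comment above: if the n_cv fold scores were
# independent, the standard error of the CV mean would be
# std(scores) / sqrt(n_cv); because the folds share training data, this
# estimate tends to be optimistic. A sketch with hypothetical fold scores:
import numpy as np

scores_demo = np.array([0.52, 0.55, 0.50, 0.54, 0.53])  # R^2 per fold (made up)
cv_error_demo = 1.0 - scores_demo.mean()                # mean 1 - R^2
std_err_demo = scores_demo.std() / np.sqrt(len(scores_demo))
print(cv_error_demo, std_err_demo)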