def plot_error_vs_n_trn_exs(f_ppi, o_ppi, max_z=40000.0,
                            training_expt = 'abs1.0_norf_spinup',
                            n_trees=10, min_samples_leaf=10,
                            rain_only=False,
                            no_cos = True,
                            use_rh=False,
                            load_results=True,
                            save_results=False):
    """Plot cross-validation error versus number of training examples.

    Either loads cached results from
    'figs_errors/error_vs_n_trn_exs.pkl' (load_results=True) or trains a
    random forest for each training-set size and estimates the error
    (1 - mean CV score) with cross_val_score. The figure is saved to
    'figs_errors/error_vs_n_trn_exs.eps'.

    Args:
        f_ppi, o_ppi: feature/output preprocessor specifications passed
            to ml_load.init_pp and ml_load.transform_data.
        max_z: maximum height (presumably meters — confirm) of data used.
        training_expt: experiment name used by ml_load.GetDataPath.
        n_trees: number of trees in the random forest.
        min_samples_leaf: minimum samples per leaf.
        rain_only, no_cos, use_rh: options forwarded to ml_load.LoadData.
        load_results: if True, read cached results instead of recomputing.
        save_results: if True (and recomputing), cache results to disk.

    NOTE: relies on module-level n_cv and n_jobs for cross_val_score.
    """
    if load_results:
        print('loading results')
        # Context manager closes the handle promptly; the original left
        # an anonymous open() for the GC to collect.
        with open('figs_errors/error_vs_n_trn_exs.pkl', 'rb') as fh:
            n_trn_exs, cv_error, cv_error_std = pickle.load(fh)
    else:
        datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)

        rf = RandomForestRegressor(n_estimators=n_trees,
                                   min_samples_leaf=min_samples_leaf,
                                   max_features=1.0 / 3.0,
                                   random_state=123, warm_start=False)

        n_trn_exs = np.array([1, 5, 10, 30, 50, 70, 80, 90]) * 10000

        cv_error = np.zeros(len(n_trn_exs))
        # Standard deviation of the error estimate across CV folds.
        cv_error_std = np.zeros(len(n_trn_exs))

        for i in range(len(n_trn_exs)):
            # Preprocessors are refitted on each training subset so the
            # scaling always matches the data actually used.
            f, o, _, z, rho, _ = ml_load.LoadData(
                trainfile, max_z, n_trn_exs=n_trn_exs[i],
                rain_only=rain_only, no_cos=no_cos, use_rh=use_rh)
            f_pp = ml_load.init_pp(f_ppi, f)
            o_pp = ml_load.init_pp(o_ppi, o)
            f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
            o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)
            scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
            cv_error[i] = 1 - scores.mean()
            cv_error_std[i] = scores.std()
            print(str(n_trn_exs[i]) + ': ' + str(cv_error[i]))

        if save_results:
            print('saving results')
            with open('figs_errors/error_vs_n_trn_exs.pkl', 'wb') as fh:
                pickle.dump([n_trn_exs, cv_error, cv_error_std], fh)

    print(n_trn_exs)
    print(cv_error)
    # Max standard error of the CV estimate across training-set sizes.
    print(np.max(cv_error_std / np.sqrt(n_cv)))

    fig = plt.figure(figsize=(3.0, 2.25))
    fscale = 100000.0  # x-axis in units of 10^5 examples
    plt.plot(n_trn_exs / fscale, cv_error, '-o')
    plt.xlim(-0.15, 9.5)
    plt.ylim(0, 0.51)
    plt.xlabel("Number of training examples ($10^5$)")
    plt.ylabel("Error")
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout()  # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_n_trn_exs.eps', bbox_inches='tight')
    plt.close()
# Example #2 (source-listing artifact; original marker read "Beispiel #2" / "0")
def PreprocessData(f_ppi, f, o_ppi, o, pp_str, n_trn_exs, z):
    """Fit the preprocessors, transform the data, and extend the
    descriptive string used when saving results.

    Returns:
        (f_pp, f, o_pp, o, pp_str): fitted feature/output preprocessors,
        the transformed feature and output arrays, and the extended
        preprocessor string.
    """
    # Fit and apply the feature preprocessor, then the output one.
    f_pp = ml_load.init_pp(f_ppi, f)
    f = ml_load.transform_data(f_ppi, f_pp, f, z)
    o_pp = ml_load.init_pp(o_ppi, o)
    o = ml_load.transform_data(o_ppi, o_pp, o, z)
    # Record the preprocessor names and the number of training examples
    # in the string used for save filenames.
    pp_str += 'F-' + f_ppi['name'] + '_'
    pp_str += 'O-' + o_ppi['name'] + '_'
    pp_str += 'Ntrnex' + str(n_trn_exs) + '_'
    return f_pp, f, o_pp, o, pp_str
def plot_train_test_error_nocv_vs_n_trn_exs(f_ppi, o_ppi, max_z=40000.0,
                            training_expt = 'abs1.0_norf_spinup',
                            n_trees=10, min_samples_leaf=10,
                            rain_only=False,
                            no_cos=True,
                            use_rh=False):
    """Plot train and test error (no cross validation) versus number of
    training examples.

    Trains a random forest on subsets of increasing size and evaluates
    1 - R^2 on both the training subset and a fixed test set. The figure
    is saved to 'figs_errors/error_test_train_nocv_vs_n_trn_exs.eps'.

    Args:
        f_ppi, o_ppi: feature/output preprocessor specifications.
        max_z: maximum height of data used.
        training_expt: experiment name used by ml_load.GetDataPath.
        n_trees: number of trees in the random forest.
        min_samples_leaf: minimum samples per leaf.
        rain_only, no_cos, use_rh: options forwarded to ml_load.LoadData.
    """
    datadir, trainfile, testfile, _ = ml_load.GetDataPath(training_expt)

    # The test set is loaded once; it is rescaled each iteration with the
    # preprocessors fitted on that iteration's training subset.
    f_test, o_test, _, z, _, _ = ml_load.LoadData(
        testfile, max_z, n_trn_exs=None, rain_only=rain_only,
        no_cos=no_cos, use_rh=use_rh)

    rf = RandomForestRegressor(n_estimators=n_trees,
                               min_samples_leaf=min_samples_leaf,
                               random_state=123, warm_start=False)

    n_trn_exs = np.array([1, 5, 10, 30, 50, 70, 80, 90]) * 10000

    test_error = np.zeros(len(n_trn_exs))
    train_error = np.zeros(len(n_trn_exs))

    for i in range(len(n_trn_exs)):
        f, o, _, z, rho, _ = ml_load.LoadData(
            trainfile, max_z, n_trn_exs=n_trn_exs[i],
            rain_only=rain_only, no_cos=no_cos, use_rh=use_rh)
        f_pp = ml_load.init_pp(f_ppi, f)
        o_pp = ml_load.init_pp(o_ppi, o)
        f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
        o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)
        rf.fit(f_scl, o_scl)
        f_test_scl = ml_load.transform_data(f_ppi, f_pp, f_test, z)
        o_test_scl = ml_load.transform_data(o_ppi, o_pp, o_test, z)
        # Error defined as 1 - score (score is R^2 for regressors).
        test_error[i] = 1.0 - rf.score(f_test_scl, o_test_scl)
        train_error[i] = 1.0 - rf.score(f_scl, o_scl)
        print(str(n_trn_exs[i]) + ': ' + str(test_error[i]))
        print(str(n_trn_exs[i]) + ': ' + str(train_error[i]))

    print(test_error)
    print(train_error)
    fig = plt.figure()
    fscale = 100000.0  # x-axis in units of 10^5 examples
    plt.plot(n_trn_exs / fscale, test_error, '-o', label='test')
    plt.plot(n_trn_exs / fscale, train_error, '-o', label='train')
    plt.xlim(-0.15, 9.5)
    plt.ylim(0, 0.51)
    plt.xlabel("n_trn_exs")
    plt.ylabel("Error")
    # Single legend call: the original's second plt.legend(frameon=False)
    # replaced the legend created with loc="upper right", discarding it.
    plt.legend(loc="upper right", frameon=False)
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout()  # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_test_train_nocv_vs_n_trn_exs.eps', bbox_inches='tight')
    plt.close()
def write_netcdf_nn(est_str,
                    datasource,
                    rain_only=False,
                    no_cos=False,
                    use_rh=False,
                    is_cheyenne=False):
    """Export a trained single-hidden-layer neural net and its
    normalization to a NetCDF file for use by the GCM.

    Loads the pickled estimator named est_str, runs one transform/predict
    pass (needed so preprocessors and estimator expose their fitted
    parameters), then writes the two weight matrices, two bias vectors,
    and input/output scaling to
    <base_dir>/mldata/gcm_regressors/<est_str>.nc (NETCDF3_CLASSIC).
    Weights are transposed on write because the Fortran reader expects
    reversed dimensions.

    Args:
        est_str: name (filename stem) of the saved estimator pickle.
        datasource: data file used to exercise the preprocessors.
        rain_only, no_cos, use_rh: options forwarded to ml_load.LoadData.
        is_cheyenne: selects the archive base directory.

    Raises:
        ValueError: if the feature scaler is not 'StandardScaler', or the
            output scaler is neither 'SimpleO' nor 'StandardScaler'.
    """
    # Set output filename based on which machine we are on.
    if not is_cheyenne:  # On aimsir/esker
        base_dir = '/net/aimsir/archive1/janniy/'
    else:
        base_dir = '/glade/work/janniy/'

    output_filename = base_dir + 'mldata/gcm_regressors/' + est_str + '.nc'
    # Load estimator and preprocessors; the context manager closes the
    # handle promptly (the original left an anonymous open() to the GC).
    with open(base_dir + 'mldata/regressors/' + est_str + '.pkl', 'rb') as fh:
        est, _, errors, f_ppi, o_ppi, f_pp, o_pp, y, z, p, rho = pickle.load(fh)
    # Need to transform some data for preprocessors to be able to export params
    f, o, _, _, _, _, = ml_load.LoadData(datasource,
                                         max_z=max(z),
                                         rain_only=rain_only,
                                         no_cos=no_cos,
                                         use_rh=use_rh)
    f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
    _ = ml_load.transform_data(o_ppi, o_pp, o, z)
    # Also need to use the predict method to be able to export ANN params
    _ = est.predict(f_scl)

    # Grab weights (layer 0 = input->hidden, layer 1 = hidden->output).
    w1 = est.get_parameters()[0].weights
    w2 = est.get_parameters()[1].weights
    b1 = est.get_parameters()[0].biases
    b2 = est.get_parameters()[1].biases

    # Grab input normalization (mean/std of the StandardScaler).
    if f_ppi['name'] == 'StandardScaler':
        fscale_mean = f_pp.mean_
        fscale_stnd = f_pp.scale_
    else:
        raise ValueError('Incorrect scaler name')

    # Grab output normalization. 'SimpleO' stores two per-half scale
    # factors (first/second Nlev outputs); StandardScaler stores mean/std.
    if o_ppi['name'] == 'SimpleO':
        Nlev = len(z)
        oscale = np.zeros(b2.shape)
        oscale[:Nlev] = 1.0 / o_pp[0]
        oscale[Nlev:] = 1.0 / o_pp[1]
    elif o_ppi['name'] == 'StandardScaler':
        oscale_mean = o_pp.mean_
        oscale_stnd = o_pp.scale_
    else:
        raise ValueError('Incorrect scaler name')

    # Write weights to file
    ncfile = Dataset(output_filename, 'w', format="NETCDF3_CLASSIC")
    # Write the dimensions
    ncfile.createDimension('N_in', w1.shape[0])
    ncfile.createDimension('N_h1', w1.shape[1])
    ncfile.createDimension('N_out', w2.shape[1])
    # Create variable entries in the file
    nc_w1 = ncfile.createVariable('w1',
                                  np.dtype('float32').char,
                                  ('N_h1', 'N_in'))  # Reverse dims
    nc_w2 = ncfile.createVariable('w2',
                                  np.dtype('float32').char, ('N_out', 'N_h1'))
    nc_b1 = ncfile.createVariable('b1', np.dtype('float32').char, ('N_h1'))
    nc_b2 = ncfile.createVariable('b2', np.dtype('float32').char, ('N_out'))
    nc_fscale_mean = ncfile.createVariable('fscale_mean',
                                           np.dtype('float32').char, ('N_in'))
    nc_fscale_stnd = ncfile.createVariable('fscale_stnd',
                                           np.dtype('float32').char, ('N_in'))
    if o_ppi['name'] == 'SimpleO':
        nc_oscale = ncfile.createVariable('oscale',
                                          np.dtype('float32').char, ('N_out'))
    else:
        nc_oscale_mean = ncfile.createVariable('oscale_mean',
                                               np.dtype('float32').char,
                                               ('N_out'))
        nc_oscale_stnd = ncfile.createVariable('oscale_stnd',
                                               np.dtype('float32').char,
                                               ('N_out'))

    # Write variables and close file - transpose because fortran reads it in
    # "backwards"
    nc_w1[:] = w1.T
    nc_w2[:] = w2.T
    nc_b1[:] = b1
    nc_b2[:] = b2
    nc_fscale_mean[:] = fscale_mean
    nc_fscale_stnd[:] = fscale_stnd

    if o_ppi['name'] == 'SimpleO':
        nc_oscale[:] = oscale
    else:
        nc_oscale_mean[:] = oscale_mean
        nc_oscale_stnd[:] = oscale_stnd

    # Write global file attributes
    ncfile.description = est_str
    ncfile.close()
def plot_error_vs_min_samples_leaf(f_ppi, o_ppi, max_z=20000.0,
                                   training_expt = 'abs1.0_norf_spinup',
                                   n_trees=10,
                                   n_trn_exs=None, rain_only=False,
                                   no_cos = True, use_rh=False,
                                   load_results=True,
                                   save_results=False):
    """Plot cross-validation error versus min_samples_leaf.

    Either loads cached results from
    'figs_errors/error_vs_min_samples_leaf.pkl' (load_results=True) or
    sweeps min_samples_leaf over 1..16 in steps of 3 and estimates the
    error (1 - mean CV score) for each value. The figure is saved to
    'figs_errors/error_vs_min_samples_leaf.eps'.

    Args:
        f_ppi, o_ppi: feature/output preprocessor specifications.
        max_z: maximum height of data used.
        training_expt: experiment name used by ml_load.GetDataPath.
        n_trees: number of trees in the random forest.
        n_trn_exs: number of training examples to load (None = all).
        rain_only, no_cos, use_rh: options forwarded to ml_load.LoadData.
        load_results: if True, read cached results instead of recomputing.
        save_results: if True (and recomputing), cache results to disk.

    NOTE: relies on module-level n_cv and n_jobs for cross_val_score.
    """
    if load_results:
        print('loading results')
        # Context manager closes the handle promptly.
        with open('figs_errors/error_vs_min_samples_leaf.pkl', 'rb') as fh:
            min_samples_leaf, cv_error, cv_error_std = pickle.load(fh)
    else:
        datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)

        f, o, _, z, rho, _ = ml_load.LoadData(
            trainfile, max_z, n_trn_exs=n_trn_exs, rain_only=rain_only,
            no_cos=no_cos, use_rh=use_rh)

        # Scale the data once; only the hyperparameter changes per step.
        f_pp = ml_load.init_pp(f_ppi, f)
        o_pp = ml_load.init_pp(o_ppi, o)
        f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
        o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)

        rf = RandomForestRegressor(n_estimators=n_trees, random_state=123,
                                   max_features=1.0 / 3.0, warm_start=False)

        min_min_samples_leaf = 1
        max_min_samples_leaf = 16
        step_min_samples_leaf = 3

        # Yields 1, 4, 7, 10, 13, 16.
        min_samples_leaf = range(min_min_samples_leaf,
                                 max_min_samples_leaf + 1,
                                 step_min_samples_leaf)

        cv_error = np.zeros(len(min_samples_leaf))
        # Standard deviation of the error estimate across CV folds.
        cv_error_std = np.zeros(len(min_samples_leaf))

        for i in range(len(min_samples_leaf)):
            print(min_samples_leaf[i])
            rf.set_params(min_samples_leaf=min_samples_leaf[i])
            scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
            cv_error[i] = 1 - scores.mean()
            cv_error_std[i] = scores.std()
            print(str(min_samples_leaf[i]) + ': ' + str(cv_error[i]))

        if save_results:
            print('saving results')
            with open('figs_errors/error_vs_min_samples_leaf.pkl', 'wb') as fh:
                pickle.dump([min_samples_leaf, cv_error, cv_error_std], fh)

    print(list(min_samples_leaf))
    print(cv_error)
    # Max standard error of the CV estimate across leaf sizes.
    print(np.max(cv_error_std / np.sqrt(n_cv)))
    fig = plt.figure(figsize=(3.0, 2.25))
    plt.plot(min_samples_leaf, cv_error, '-o')
    plt.xlim(0, 16.5)
    plt.ylim(0, 0.51)
    plt.xlabel("Minimum sample size for a leaf")
    plt.ylabel("Error")
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout()  # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_min_samples_leaf.eps', bbox_inches='tight')
    plt.close()
def plot_error_vs_n_trees(f_ppi, o_ppi, max_z=20000.0,
                     training_expt = 'abs1.0_norf_ras',
                     min_samples_leaf = 10,
                     n_trn_exs=None, rain_only=False,
                     no_cos = True,
                     use_rh=False,
                     load_results=True,
                     save_results=False):
    """Plot cross-validation error versus number of trees.

    Either loads cached results from 'figs_errors/error_vs_n_trees.pkl'
    (load_results=True) or sweeps n_estimators over 1, 3, ..., 21 and
    estimates the error (1 - mean CV score) for each value. The figure is
    saved to 'figs_errors/error_vs_ntrees.eps'.

    Args:
        f_ppi, o_ppi: feature/output preprocessor specifications.
        max_z: maximum height of data used.
        training_expt: experiment name used by ml_load.GetDataPath.
        min_samples_leaf: minimum samples per leaf.
        n_trn_exs: number of training examples to load (None = all).
        rain_only, no_cos, use_rh: options forwarded to ml_load.LoadData.
        load_results: if True, read cached results instead of recomputing.
        save_results: if True (and recomputing), cache results to disk.

    NOTE: relies on module-level n_cv and n_jobs for cross_val_score.
    """
    if load_results:
        print('loading results')
        # Context manager closes the handle promptly.
        with open('figs_errors/error_vs_n_trees.pkl', 'rb') as fh:
            n_trees, cv_error, cv_error_std = pickle.load(fh)
    else:
        datadir, trainfile, _, _ = ml_load.GetDataPath(training_expt)

        f, o, y, z, rho, p = ml_load.LoadData(
            trainfile, max_z, n_trn_exs=n_trn_exs, rain_only=rain_only,
            no_cos=no_cos, use_rh=use_rh)

        # Scale the data once; only the hyperparameter changes per step.
        f_pp = ml_load.init_pp(f_ppi, f)
        o_pp = ml_load.init_pp(o_ppi, o)
        f_scl = ml_load.transform_data(f_ppi, f_pp, f, z)
        o_scl = ml_load.transform_data(o_ppi, o_pp, o, z)

        rf = RandomForestRegressor(min_samples_leaf=min_samples_leaf,
                                   max_features=1.0 / 3.0,
                                   random_state=123, warm_start=False)

        min_n_trees = 1
        max_n_trees = 21

        # Yields 1, 3, 5, ..., 21.
        n_trees = range(min_n_trees, max_n_trees + 1, 2)
        cv_error = np.zeros(len(n_trees))
        # Standard deviation of the error estimate across CV folds.
        cv_error_std = np.zeros(len(n_trees))

        for i in range(len(n_trees)):
            rf.set_params(n_estimators=n_trees[i])
            scores = cross_val_score(rf, f_scl, o_scl, cv=n_cv, n_jobs=n_jobs)
            cv_error[i] = 1 - scores.mean()
            cv_error_std[i] = scores.std()
            print(str(n_trees[i]) + ': ' + str(cv_error[i]))

        if save_results:
            print('saving results')
            with open('figs_errors/error_vs_n_trees.pkl', 'wb') as fh:
                pickle.dump([n_trees, cv_error, cv_error_std], fh)

    print(list(n_trees))
    print(cv_error)
    # some confusion in literature as to whether should include 1/sqrt(n_cv) in standard error
    print(np.max(cv_error_std / np.sqrt(n_cv)))  # max of standard error
    fig = plt.figure(figsize=(3.0, 2.25))
    plt.plot(n_trees, cv_error, 'o-')
    plt.xlim(0, 22.8)
    plt.ylim(0, 0.51)
    plt.xlabel("Number of trees")
    plt.ylabel("Error")
    ax = plt.gca()
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    plt.tight_layout()  # avoid overlap
    plt.show()
    fig.savefig('figs_errors/error_vs_ntrees.eps', bbox_inches='tight')
    plt.close()