Ejemplo n.º 1
0
def test(out_dir, x_train, y_train, max_trees, step, min_trees=None, silent=False):
    """Plot out-of-bag error against forest size and return the curve.

    One model is trained with ``forest.train_rf_regressor`` (``njobs=1``) for
    every tree count in ``range(min_trees, max_trees + 1, step)`` and
    ``1 - oob_score_`` of each model is collected. The curve is saved as
    ``ntrees_vs_oob_error.png`` inside ``out_dir``; the figure is then
    cleared with ``plt.clf()``.

    Args:
        out_dir: Directory the PNG is written to.
        x_train: Passed unchanged to ``forest.train_rf_regressor``.
        y_train: Passed unchanged to ``forest.train_rf_regressor``.
        max_trees: Upper bound of the tree counts (also the x-axis limit).
        step: Increment between successive tree counts.
        min_trees: First tree count; any falsy value (None, 0) means ``step``.
        silent: When true, no progress messages are printed.

    Returns:
        A two-item list ``[tree_counts, error_rates]``.
    """
    verbose = not silent
    if verbose:
        print('Testing OOB error rate per number of trees...')

    first_count = min_trees or step
    tree_counts = range(first_count, max_trees + 1, step)
    total = len(tree_counts)
    error_rates = []

    position = 0
    for count in tree_counts:
        position += 1
        model = forest.train_rf_regressor(x_train, y_train, ntrees=count, njobs=1)
        score = model.oob_score_
        error_rates.append(1 - score)
        if verbose:
            # The progress line reports the OOB score itself, not the error
            print('Testing %s of %s models with %s trees: %.3f' % (position, total, count, score))

    # Draw the curve on fixed axes: x spans 0..max_trees, y spans 0..1
    plt.plot(tree_counts, error_rates, '-')
    plt.axis([0, max_trees, 0, 1])
    plt.title('Number of Decision Trees vs. OOB Error Rate')
    plt.xlabel('Number of Trees')
    plt.ylabel('Out of Bag Error Rate')
    out_png = os.path.join(out_dir, 'ntrees_vs_oob_error.png')
    plt.savefig(out_png)
    plt.clf()

    if verbose:
        print(' '.join(['Plot PNG written to : ', out_png, '\n']))

    return [tree_counts, error_rates]
Ejemplo n.º 2
0
def test(out_dir, x_train, y_train, max_trees, step, min_trees=50):
    """Plot out-of-bag error against the number of trees in the forest.

    One model is trained with ``forest.train_rf_regressor`` for every tree
    count in ``range(min_trees, max_trees + 1, step)`` and ``1 - oob_score_``
    of each model is collected. The curve is saved as
    ``ntrees_vs_oob_error.png`` inside ``out_dir``; the figure is then
    cleared with ``plt.clf()``.

    Args:
        out_dir: Directory the PNG is written to.
        x_train: Passed unchanged to ``forest.train_rf_regressor``.
        y_train: Passed unchanged to ``forest.train_rf_regressor``.
        max_trees: Upper bound of the tree counts (also the x-axis limit).
        step: Increment between successive tree counts.
        min_trees: First tree count to try. Defaults to 50.
    """
    print('Testing OOB error rate per number of trees...')
    oob_errors = []
    n_trees = list(range(min_trees, max_trees + 1, step))
    # Count the models that are actually trained; max_trees / step ignores the
    # start offset and over- or under-reports the total.
    n_tests = len(n_trees)

    for i, n in enumerate(n_trees):
        rf_model = forest.train_rf_regressor(x_train, y_train, ntrees=n)
        oob_errors.append(1 - rf_model.oob_score_)
        # The progress line reports the OOB score itself, not the error
        print('Testing %s of %s models with %s trees: %.3f' % (
            i + 1, n_tests, n, rf_model.oob_score_))

    # Draw the curve on fixed axes: x spans 0..max_trees, y spans 0..1
    plt.plot(n_trees, oob_errors, '-')
    plt.axis([0, max_trees, 0, 1])
    plt.xlabel('Number of Trees')
    plt.ylabel('Out of Bag Error Rate')
    plt.title('Number of Decision Trees vs. OOB Error Rate')
    out_png = os.path.join(out_dir, 'ntrees_vs_oob_error.png')
    plt.savefig(out_png)
    plt.clf()

    # Single-argument print gives the same output under Python 2 and 3
    print('Plot PNG written to :  %s \n' % out_png)
def main(params):
    """Train a random forest regressor from a parameter file and log the run.

    The parameter file is read with ``forest.read_params``. The entries
    ``sample_txt``, ``target_col``, ``var_txt`` and ``out_dir`` are required;
    ``n_trees`` (default 50), ``n_jobs`` (default 12), ``max_depth`` (default
    None) and ``inventory_txt`` are optional.

    The function
      1. creates ``<out_dir>/susceptibility_<YYYYMMDD>_<HHMM>`` and copies the
         parameter file into it,
      2. reads the tab-separated variable table (indexed by ``var_name``) and
         training samples (indexed by ``obs_id``),
      3. trains the model on the sorted predictor columns, saves it with
         ``forest.save_rfmodel`` and writes the variable table with an added
         ``importance`` column into the run directory,
      4. when ``inventory_txt`` is given, adds a row for this run and drops
         rows whose stamp has no matching ``susc*`` entry next to the run
         directory.

    Args:
        params: Path to the parameter file.

    Raises:
        NameError: A required parameter is missing, or a variable listed in
            ``var_txt`` is not a column of ``sample_txt``.
        IOError: ``var_txt`` does not exist.
    """
    # Read params and evaluate every value into a plain dict of strings.
    # NOTE(security): each value is evaluated as a Python expression, so a
    # parameter file can run arbitrary code; only load trusted files.
    # Parameters evaluated earlier are visible to later expressions.
    inputs = forest.read_params(params)
    settings = {}
    for name in inputs:
        settings[name] = eval('str({0})'.format(inputs[name]), globals(),
                              settings)

    # Check that the required variables were specified in params
    for required in ('sample_txt', 'target_col', 'var_txt', 'out_dir'):
        if required not in settings:
            print('')
            msg = "Variable '%s' not specified in param file:\n%s" % (required,
                                                                      params)
            raise NameError(msg)
    sample_txt = settings['sample_txt']
    target_col = settings['target_col']
    var_txt = settings['var_txt']
    out_dir = settings['out_dir']

    # Optional arguments arrive as strings; make the numeric ones numeric
    n_trees = int(settings['n_trees']) if 'n_trees' in settings else 50
    n_jobs = int(settings['n_jobs']) if 'n_jobs' in settings else 12
    max_depth = int(settings['max_depth']) if 'max_depth' in settings else None
    inventory_txt = settings.get('inventory_txt')

    # Raise an error if var_txt doesn't exist. Otherwise, just read it in
    if not os.path.exists(var_txt):
        print('')
        msg = 'var_text path specified does not exist:\n%s\n\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make the output directory
    now = datetime.now()
    stamp = '{0}_{1}_{2}'.format('susceptibility', now.strftime('%Y%m%d'),
                                 now.strftime('%H%M'))
    out_dir = os.path.join(out_dir, stamp)
    # The stamp only has minute resolution: a second run within the same
    # minute makes os.makedirs raise rather than reuse the earlier directory.
    os.makedirs(out_dir)
    # Copy the params so the parameters used are saved
    shutil.copy2(params, out_dir)

    # Read in training samples
    df_train = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')

    # Check that every variable listed in df_var is a column of df_train
    train_columns = set(df_train.columns)
    unmatched_vars = [v for v in df_var.index if v not in train_columns]
    if unmatched_vars:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)

    # Sort the predictors in alphabetical order so that train columns can be
    # in the same order as the predict array when predicting later on.
    # A column is selected when any variable name is a substring of it.
    predict_cols = sorted(set(
        c for c in df_train.columns if any(v in c for v in df_var.index)))
    df_var = df_var.sort_index()

    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    rf_model = forest.train_rf_regressor(x_train,
                                         y_train,
                                         ntrees=n_trees,
                                         njobs=n_jobs,
                                         max_depth=max_depth)

    # NOTE(review): positional assignment; assumes exactly one predictor
    # column per df_var row, in the same sorted order -- confirm.
    df_var['importance'] = rf_model.feature_importances_
    rf_path = os.path.join(out_dir, 'regressor_model_%s' % stamp)
    forest.save_rfmodel(rf_model, rf_path)
    oob_score = round(rf_model.oob_score_, 3)
    out_var_txt = os.path.join(out_dir, os.path.basename(var_txt))
    df_var.to_csv(out_var_txt, sep='\t')

    # Record params in inventory text file (skipped when none was specified)
    if inventory_txt is not None:
        df_inv = pd.read_csv(inventory_txt, sep='\t')
        col_str = re.sub(r'[\]\[\'"]', '', ', '.join(predict_cols))
        # Second-to-last '_'-separated token of sample_txt with 'm' removed;
        # presumably a raster resolution such as '30m' -- confirm.
        raster_res = sample_txt.split('_')[-2].replace('m', '')
        # The values are positional, so the inventory file must have exactly
        # these nine columns in this order.
        new_row = pd.DataFrame([[
            stamp, oob_score, '', '', '', '',
            len(df_train), raster_res, col_str
        ]],
                               columns=df_inv.columns)
        df_inv = pd.concat([df_inv, new_row], ignore_index=True)
        # Keep only rows whose stamp still matches a 'susc*' entry on disk
        existing_models = fnmatch.filter(
            os.listdir(os.path.dirname(out_dir)), 'susc*')
        df_inv = df_inv[df_inv.stamp.isin(existing_models)]
        df_inv.to_csv(inventory_txt, sep='\t', index=False)

    # Single-argument print gives the same output under Python 2 and 3
    print('Random Forest Regressor model written to:\n%s' % rf_path)
    print('\nOOB score:  %s' % oob_score)
    print('Relative importance:')
    print(df_var.importance)
Ejemplo n.º 4
0
def main(params):
    """Train a random forest regressor from a parameter file and log the run.

    The parameter file is read with ``forest.read_params``. The entries
    ``sample_txt``, ``target_col``, ``var_txt`` and ``out_dir`` are required.
    Optional entries: ``n_trees`` (default 200), ``n_jobs`` (default 1),
    ``max_depth`` (default None), ``out_dirname`` (default ``target_col``),
    ``constant_vars`` (comma-separated extra predictor columns) and
    ``inventory_txt``.

    The function
      1. creates ``<out_dir>/<out_dirname>_<YYYYMMDD>_<HHMM>`` and copies the
         parameter file and the sample file into it,
      2. reads the tab-separated variable table (indexed by ``var_name``) and
         training samples (indexed by ``obs_id``),
      3. trains the model on the sorted predictor columns followed by the
         constant variables, saves it with ``forest.save_rfmodel`` and writes
         the variable table with added ``importance`` and ``rank`` columns
         into the run directory,
      4. when ``inventory_txt`` is given, adds a row for this run and drops
         rows whose stamp has no matching ``*res*`` entry next to the run
         directory.

    Args:
        params: Path to the parameter file.

    Raises:
        NameError: A required parameter is missing, or a variable listed in
            ``var_txt`` or ``constant_vars`` is not a column of ``sample_txt``.
        IOError: ``var_txt`` does not exist.
    """
    # Read params and evaluate every value into a plain dict of strings.
    # NOTE(security): each value is evaluated as a Python expression, so a
    # parameter file can run arbitrary code; only load trusted files.
    # Parameters evaluated earlier are visible to later expressions.
    inputs = forest.read_params(params)
    settings = {}
    for name in inputs:
        settings[name] = eval('str({0})'.format(inputs[name]), globals(),
                              settings)

    # Check that the required variables were specified in params
    for required in ('sample_txt', 'target_col', 'var_txt', 'out_dir'):
        if required not in settings:
            print('')
            msg = "Variable '%s' not specified in param file:\n%s" % (required,
                                                                      params)
            raise NameError(msg)
    sample_txt = settings['sample_txt']
    target_col = settings['target_col']
    var_txt = settings['var_txt']
    out_dir = settings['out_dir']

    # Optional arguments arrive as strings; make the numeric ones numeric
    n_trees = int(settings['n_trees']) if 'n_trees' in settings else 200
    n_jobs = int(settings['n_jobs']) if 'n_jobs' in settings else 1
    max_depth = int(settings['max_depth']) if 'max_depth' in settings else None

    # Raise an error if var_txt doesn't exist. Otherwise, just read it in
    if not os.path.exists(var_txt):
        print('')
        msg = 'var_text path specified does not exist:\n%s\n\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make the output directory
    now = datetime.now()
    out_dirname = settings.get('out_dirname', target_col)
    stamp = '{0}_{1}_{2}'.format(out_dirname, now.strftime('%Y%m%d'),
                                 now.strftime('%H%M'))
    out_dir = os.path.join(out_dir, stamp)
    # The stamp only has minute resolution: a second run within the same
    # minute makes os.makedirs raise rather than reuse the earlier directory.
    os.makedirs(out_dir)
    # Copy the params and samples so the inputs used are saved with the model
    shutil.copy2(params, out_dir)
    shutil.copy2(sample_txt, out_dir)

    # Read in training samples
    df_train = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')

    # Check that every variable listed in df_var is a column of df_train
    train_columns = set(df_train.columns)
    unmatched_vars = [v for v in df_var.index if v not in train_columns]
    if unmatched_vars:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)

    # Sort the predictors in alphabetical order so that train columns can be
    # in the same order as the predict array when predicting later on
    predict_cols = sorted(
        set(c for c in df_train.columns if c in df_var.index))
    if target_col in predict_cols:
        predict_cols.remove(target_col)
    df_var = df_var.sort_index()

    # Constant variables go after the sorted predictors, themselves sorted
    constant_vars = []
    if 'constant_vars' in settings:
        constant_vars = sorted(
            v.strip() for v in settings['constant_vars'].split(','))
        unmatched_vars = [v for v in constant_vars if v not in train_columns]
        if unmatched_vars:
            unmatched_str = '\n'.join(unmatched_vars)
            msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
            raise NameError(msg)
        predict_cols += constant_vars

    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    rf_model = forest.train_rf_regressor(x_train,
                                         y_train,
                                         ntrees=n_trees,
                                         njobs=n_jobs,
                                         max_depth=max_depth)

    # Add one all-NaN row per constant variable so df_var has a row for
    # every predictor the model was trained on.
    if constant_vars:
        blank_rows = pd.DataFrame(np.nan,
                                  index=pd.Index(constant_vars,
                                                 name=df_var.index.name),
                                  columns=df_var.columns)
        df_var = pd.concat([df_var, blank_rows])

    # NOTE(review): positional assignment. If target_col is itself listed in
    # var_txt it was removed from predict_cols but not from df_var, so the
    # lengths differ and this assignment raises -- confirm intended inputs.
    df_var['importance'] = rf_model.feature_importances_
    # Rank 1 is the most important; ties are broken by row order
    df_var['rank'] = [
        int(r) for r in df_var.importance.rank(method='first', ascending=False)
    ]

    rf_path = os.path.join(out_dir, 'regressor_model_%s' % stamp)
    forest.save_rfmodel(rf_model, rf_path)
    oob_score = round(rf_model.oob_score_, 3)
    out_var_txt = os.path.join(out_dir, os.path.basename(var_txt))
    df_var.to_csv(out_var_txt, sep='\t')

    # Record params in inventory text file
    if 'inventory_txt' in settings:
        inventory_txt = settings['inventory_txt']
        df_inv = pd.read_csv(inventory_txt, sep='\t')
        cols = df_inv.columns
        # First run of one or two digits in out_dirname, or None without one
        digits = re.search(r'[0-9]{1,2}', out_dirname)
        res = int(digits.group()) if digits else None
        new_row = pd.DataFrame([{
            'stamp': stamp,
            'temporal_res': res,
            'oob_score': oob_score,
            'auc': None,
            'rmse': None,
            'rmse_n': None,
            'rmse_p': None,
            'n_samples': len(df_train),
            'n_trees': n_trees
        }])
        df_inv = pd.concat([df_inv, new_row], ignore_index=True)
        # Restore the inventory's own column set and order
        df_inv = df_inv.reindex(columns=cols)
        # Keep only rows whose stamp still matches a '*res*' entry on disk
        existing_models = fnmatch.filter(
            os.listdir(os.path.dirname(out_dir)), '*res*')
        df_inv = df_inv[df_inv.stamp.isin(existing_models)]
        df_inv.to_csv(inventory_txt, sep='\t', index=False)

    # Single-argument print gives the same output under Python 2 and 3
    print('Random Forest Regressor model written to:\n%s' % rf_path)
    print('\nOOB score:  %s' % oob_score)
    print('Relative importance:')
    print(df_var.importance.sort_values(ascending=False))