Exemple #1
0
def main(params):
    """Run the tree-count test for the samples named in a parameter file.

    The parameter file is parsed with ``forest.read_params`` and every entry
    becomes a local string variable. ``sample_txt``, ``target_col``,
    ``var_txt``, ``max_trees`` and ``step`` must all be present. The result
    of the test is not returned.

    Parameters
    ----------
    params : str
        Parameter file handed to ``forest.read_params``.

    Raises
    ------
    NameError
        If a required variable is missing from the parameter file.
    IOError
        If the ``var_txt`` path does not exist.
    """
    # Every entry of the param file becomes a local string variable.
    # NOTE: exec runs text taken from the param file, so only trusted files
    # should be used.
    inputs = forest.read_params(params)
    for name in inputs:
        exec("{0} = str({1})".format(name, inputs[name]))

    # Touching each required name raises NameError if the file never set it
    try:
        str_check = sample_txt, target_col, var_txt
        max_trees = int(max_trees)
        step = int(step)
    except NameError as err:
        print('')
        missing_var = str(err).split("'")[1]
        raise NameError("Variable '%s' not specified in param file:\n%s" %
                        (missing_var, params))

    # The predictor table has to exist before anything is read
    if not os.path.exists(var_txt):
        print('')
        raise IOError(
            'var_text path specified does not exist:\n%s\n\n' % var_txt)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')
    df_train = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')

    # Keep each sample column whose name contains any predictor name
    matched = []
    for col in df_train.columns:
        for var_name in df_var.index:
            if var_name in col:
                matched.append(col)
    predict_cols = sorted(np.unique(matched))

    y_train = df_train[target_col]
    x_train = df_train.reindex(columns=predict_cols)

    # Results are written next to the sample file
    test(os.path.dirname(sample_txt), x_train, y_train, max_trees, step)
Exemple #2
0
def main(params, constant_vars=None, silent=False, return_results=True):
    """Run the tree-count test, optionally adding constant predictor columns.

    The parameter file is parsed with ``forest.read_params`` and every entry
    becomes a local string variable. ``sample_txt``, ``target_col``,
    ``var_txt``, ``max_trees`` and ``step`` must all be present. After the
    test, ``var_txt`` is copied into the directory of ``sample_txt``.

    Parameters
    ----------
    params : str
        Parameter file handed to ``forest.read_params``.
    constant_vars : sequence of str, optional
        Extra column names to use as predictors. A ``constant_vars`` entry in
        the parameter file (comma-separated names) replaces this argument.
    silent : bool
        Passed through to ``forest.read_params`` and ``test``.
    return_results : bool
        When true, the value returned by ``test`` is returned.

    Returns
    -------
    The result of ``test`` if ``return_results`` is true, otherwise None.

    Raises
    ------
    NameError
        If a required variable is missing from the parameter file.
    IOError
        If the ``var_txt`` path does not exist.
    """
    # Read params and make variables from text.
    # NOTE: exec runs text taken from the param file, so only trusted files
    # should be used.
    inputs = forest.read_params(params, silent=silent)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    # Check that variables were specified in params
    try:
        str_check = sample_txt, target_col, var_txt
        max_trees = int(max_trees)
        step = int(step)
    except NameError as e:
        print('')
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Try to read var_txt
    if not os.path.exists(var_txt):
        print('')
        msg = 'var_text path specified does not exist:\n%s\n\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    if 'constant_vars' in inputs:
        # The param file value wins over the argument
        constant_vars = sorted([v.strip() for v in constant_vars.split(',')])
    elif constant_vars is None:
        constant_vars = []
    else:
        # Accept any sequence; a list is needed for the concatenation below
        constant_vars = list(constant_vars)

    df_train = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
    # Sample columns whose name contains any predictor name, plus constants
    matched_cols = [c for c in df_train.columns for v in df_var.index if v in c]
    predict_cols = sorted(np.unique(matched_cols + constant_vars))

    y_train = df_train[target_col]
    x_train = df_train.reindex(columns=predict_cols)

    out_dir = os.path.dirname(sample_txt)
    scores = test(out_dir, x_train, y_train, max_trees, step, silent=silent)
    shutil.copy2(var_txt, out_dir)

    if return_results:
        return scores
Exemple #3
0
def main(params,
         n_pieces=False,
         ydims=None,
         constant_vars=None,
         year='',
         agg_method=None):
    """Predict a raster tile by tile with a saved Random Forest model.

    Every entry of the parameter file becomes a local string variable.
    ``nodata``, ``train_params``, ``rf_path``, ``mask_path`` and ``out_dir``
    are required. The mask raster is split into tiles, each tile is predicted
    and written to ``<out_dir>/predict_tiles``, the tiles are assembled into
    one array that is written to ``<out_dir>/<stamp>_rf_vote.tif`` (where
    ``stamp`` is the base name of ``out_dir``), and the tile directory is
    removed. If the parameter file names ``test_params``, the prediction is
    then evaluated and the scores are written to ``out_dir``.

    Parameters
    ----------
    params : str
        Path of the prediction parameter file; a copy is saved in ``out_dir``.
    n_pieces, ydims :
        Not used by this function.
    constant_vars :
        When truthy, ``_yr<year>`` is added to the output raster name. A
        ``constant_vars`` entry in the parameter file replaces this value.
    year :
        Suffix used in the names of copied parameter files and outputs.
        During evaluation it is replaced by the ``year`` entry of the test
        parameter file, or by None if there is none.
    agg_method : str or None
        ``'mode'`` predicts with every tree separately and takes the mode of
        the tree predictions; anything else uses ``rf_model.predict``. An
        ``agg_method`` entry in the parameter file takes precedence.

    Returns
    -------
    str
        Path of the prediction raster.

    Raises
    ------
    NameError
        If a required variable is missing from the parameter file.
    IOError
        If ``var_txt`` or ``rf_path`` does not exist.
    SystemExit
        If the model's feature count differs from the number of variables.
    """
    t0 = time.time()
    print('Predicting Random Forest... %s\n' % time.ctime(t0))

    # No constant predictors are derived from `constant_vars` at present, so
    # none are added to the expected feature count below.
    # NOTE(review): restore the parsing of constant_vars here if models
    # trained with constant predictors must be supported.
    pred_constants = []

    # Read params and make variables from text.
    # NOTE: exec runs text taken from the param file, so only trusted files
    # should be used.
    inputs = forest.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    # Check that variables were specified in params
    try:
        nodata = int(nodata)
        str_check = train_params, rf_path, mask_path, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Unless the param file gives var_txt, look next to the model for a file
    # with the same name as the var_txt used for training.
    train_dict = forest.read_params(train_params)
    # [:-1] drops the last character of the stored value (presumably a
    # closing quote -- TODO confirm against read_params)
    train_txt_bn = os.path.basename(train_dict['var_txt'][:-1])
    if 'var_txt' not in locals():
        var_txt = os.path.join(os.path.dirname(rf_path), train_txt_bn)
    if not os.path.exists(var_txt):
        print('')
        msg = 'Could not find var_txt:\n%s\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make sure vars are sorted alphabetically since they were for training
    pred_vars = sorted(df_var.index)
    df_var = df_var.reindex(pred_vars)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print(('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir)
    new_params = os.path.join(out_dir, os.path.basename(params))
    shutil.copy2(params, new_params.replace('.txt', '_%s.txt' % year))

    # Load the Random Forest model
    print('Loading the RandomForest model from \n%s... \n%s\n' % (
        rf_path, time.ctime(time.time())))
    if not os.path.exists(rf_path):
        raise IOError('%s does not exist' % rf_path)
    # Pickles must be read in binary mode.
    # NOTE: unpickling can run arbitrary code; only load trusted model files.
    with open(rf_path, 'rb') as f:
        rf_model = pickle.load(f)
    n_features = rf_model.n_features_
    n_vars = len(df_var.index.tolist())
    if 'constant_vars' in inputs:
        n_vars += len(pred_constants)
    if n_features != n_vars:
        print(df_var.index.tolist() + pred_constants)
        sys.exit((
            '\nKeyError: Number of features of the random forest model does not match the number of variables in df_var.'
            '\nNumber of features of the model: {0} \nNumber of variables in var_txt: {1}'
            '\nCheck that all predictors for used in var_txt to train the model are in this var_txt '
            '\nPath of Random Forest model: {2}\nPath of var_txt: {3}'
        ).format(n_features, n_vars, rf_path, var_txt))
    if 'agg_method' in inputs:
        agg_method = inputs['agg_method']

    # Get mask and raster info
    ds = gdal.Open(mask_path)
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = gdal.GetDriverByName('gtiff')
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx

    # Prediction is always tiled; the default grid applies when the param
    # file does not give n_tiles.
    if 'n_tiles' not in inputs:
        print('n_tiles not specified. Using default: 25 x 15 ...\n')
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]

    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(
        n_tiles, xsize, ysize, tx)
    # Tiles that are skipped below keep the nodata value
    ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
    tile_dir = os.path.join(out_dir, 'predict_tiles')
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    for i, (ind, tile_coords) in enumerate(df_tiles.iterrows()):
        print('Predicting for tile %s of %s...' % (i + 1, len(df_tiles)))
        t1 = time.time()
        coords = tile_coords[['ul_x', 'ul_y', 'lr_x', 'lr_y']].tolist()
        tsa_ar, tsa_off = mosaic.extract_kernel(ds,
                                                1,
                                                coords,
                                                tx,
                                                xsize,
                                                ysize,
                                                nodata=nodata)
        tsa_mask = tsa_ar == 0
        if tsa_mask.all():
            print('Tile %s empty. Skipping...' % ind)
            continue
        tsa_ar[tsa_mask] = nodata
        # Get the ids of TSAs this kernel covers
        tsa_ids = np.unique(tsa_ar)
        tsa_strs = [str(tsa) for tsa in tsa_ids if tsa != nodata]
        array_shape = tsa_ar.shape

        # Get an array of predictors where each column is a flattened 2D
        #   array of a single predictor variable
        temp_nodata = -9999
        ar_predictors = stem.get_predictors(df_var, tx, tsa_strs, tsa_ar,
                                            coords, tsa_mask, temp_nodata,
                                            1)
        # True for rows in which no predictor holds the temporary nodata value
        nodata_mask = ~np.any(ar_predictors == temp_nodata, axis=1)
        predictors = ar_predictors[nodata_mask]
        t2 = time.time()
        if agg_method == 'mode':
            # One task per tree; the mode over trees is the prediction
            args = [[dt, predictors] for dt in rf_model.estimators_]
            pool = Pool(rf_model.n_jobs)
            t3 = time.time()
            dt_predictions = np.vstack(
                pool.map(forest.par_predict_from_dt, args, 1))
            print('Prediction time: %.1f minutes' % (
                (time.time() - t3) / 60))
            t3 = time.time()
            predictions = stem.mode(dt_predictions, axis=0)
            print('Aggregation time:  %.1f minutes' % (
                (time.time() - t3) / 60))
            del dt_predictions
            t3 = time.time()
            pool.close()
            pool.join()
            print('Closing time:  %.1f minutes' % ((time.time() - t3) / 60))
        else:
            predictions = rf_model.predict(predictors)
        print('Prediction time: %.1f minutes' % ((time.time() - t2) / 60))

        # Rows without a prediction stay nodata
        ar_tile = np.full(ar_predictors.shape[0], nodata, dtype=np.uint8)
        ar_tile[nodata_mask] = predictions.astype(np.uint8)
        ul_r, lr_r, ul_c, lr_c = df_tiles_rc.loc[ind]
        ar_out[ul_r:lr_r, ul_c:lr_c] = ar_tile.reshape(array_shape)
        tx_tile = tile_coords.ul_x, x_res, x_rot, tile_coords.ul_y, y_rot, y_res
        mosaic.array_to_raster(ar_tile.reshape(array_shape),
                               tx_tile,
                               prj,
                               driver,
                               os.path.join(tile_dir, 'tile_%s.tif' % ind),
                               dtype=gdal.GDT_Byte,
                               nodata=nodata)
        print('Total time for this piece: %.1f minutes\n' % (
            (time.time() - t1) / 60))

    # The evaluation below works on the assembled raster array
    ar_prediction = ar_out

    # Save the prediction array to disk
    stamp = os.path.basename(out_dir)
    out_path = os.path.join(out_dir, '%s_rf_vote.tif' % stamp)
    if constant_vars:
        out_path = out_path.replace('.tif', '_yr%s.tif' % year)
    forest.array_to_raster(ar_out, tx, prj, driver, out_path, gdal.GDT_Byte,
                           nodata)
    # Delete the tiles
    shutil.rmtree(tile_dir)
    ds = None

    if 'test_params' in inputs:
        print('\nEvaluating the model...')
        t1 = time.time()
        # NOTE: exec runs text taken from the test param file as well
        test_dict = forest.read_params(test_params)
        for i in test_dict:
            exec("{0} = str({1})".format(i, test_dict[i]))

        if 'n_trials' in test_dict:
            n_trials = int(n_trials)
        else:
            print('n_trials not specified. Setting default to 50...\n')
            n_trials = 50
        if 'year' in test_dict:
            year = int(year)
        else:
            year = None
        cell_size = [int(i) for i in cell_size.split(',')]
        n_per_cell = int(n_per_cell)
        param_bn = os.path.basename(test_params)
        shutil.copy2(
            test_params,
            os.path.join(out_dir, param_bn.replace('.txt', '_%s.txt' % year)))

        df, samples, roc_curves = evaluate_ebird(sample_txt, ar_prediction, tx,
                                                 cell_size, target_col,
                                                 n_per_cell, n_trials, year)
        if len(roc_curves) > 0:
            for fpr, tpr, thresholds in roc_curves:
                plt.plot(fpr, tpr, 'k', alpha=.1)
            out_png = os.path.join(out_dir,
                                   '{0}_roc_curve_{1}.png'.format(stamp, year))
            plt.savefig(out_png)

        # Land-cover evaluation needs lc_path, which may come from either
        # parameter file
        if 'lc_path' in test_dict or 'lc_path' in inputs:
            df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join(
                out_dir,
                '{0}_eval_{1}_land_cover_all_samples.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')
        if 'inventory_txt' in test_dict:
            # Record the mean of every score column under this run's stamp
            score_cols = sorted(df.columns)
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            for col in score_cols:
                score_mean = df[col].mean()
                df_inv.loc[stamp, col] = score_mean
                print('Average %s: %2.3f' % (col.upper(), score_mean))
            df_inv.to_csv(inventory_txt, sep='\t')
        out_txt = os.path.join(out_dir, '{0}_eval_{1}.txt'.format(stamp, year))
        df.to_csv(out_txt, sep='\t', index=False)
        samples.to_csv(out_txt.replace('.txt', '_samples.txt'), sep='\t')
        print('\nTotal eval time: %.1f minutes\n' % ((time.time() - t1) / 60))
    else:
        print('\n"test_params" was not specified.'
              ' This model will not be evaluated...')

    print('\nTotal runtime: %.1f minutes' % ((time.time() - t0) / 60))

    return out_path
def main(params):
    """Train a Random Forest regressor from a parameter file and save it.

    Every entry of the parameter file becomes a local string variable.
    ``sample_txt``, ``target_col``, ``var_txt`` and ``out_dir`` are required;
    ``n_trees`` (default 50), ``n_jobs`` (default 12), ``max_depth``
    (default None) and ``inventory_txt`` are optional. Outputs go to a new
    ``susceptibility_<date>_<time>`` directory inside ``out_dir``: a copy of
    the parameter file, the saved model, and the variable table with an
    added ``importance`` column. When ``inventory_txt`` is given, a row for
    this model is appended to that file.

    Parameters
    ----------
    params : str
        Path of the parameter file.

    Raises
    ------
    NameError
        If a required variable is missing from the parameter file, or a
        variable listed in ``var_txt`` is not a column of ``sample_txt``.
    IOError
        If the ``var_txt`` path does not exist.
    """
    # Read params and make variables from text.
    # NOTE: exec runs text taken from the param file, so only trusted files
    # should be used.
    inputs = forest.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    # Check that variables were specified in params
    try:
        str_check = sample_txt, target_col, var_txt, out_dir
    except NameError as e:
        print('')
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Make optional numeric arguments numeric
    if 'n_trees' in locals():
        n_trees = int(n_trees)
    else:
        n_trees = 50
    if 'n_jobs' in locals():
        n_jobs = int(n_jobs)
    else:
        n_jobs = 12
    if 'max_depth' in locals():
        max_depth = int(max_depth)
    else:
        max_depth = None

    # Raise an error if var_txt doesn't exist. Otherwise, just read it in
    if not os.path.exists(var_txt):
        print('')
        msg = 'var_text path specified does not exist:\n%s\n\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make the output directory
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    stamp = '{0}_{1}_{2}'.format('susceptibility', date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    # With a timestamp in dir, no need to check if it already exists
    os.makedirs(out_dir)
    # Copy the params so the parameters used are saved
    shutil.copy2(params, out_dir)

    # Read in training samples
    df_train = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')

    # Check that every variable in df_var is a column of df_train
    train_columns = df_train.columns.tolist()
    unmatched_vars = [v for v in df_var.index if v not in train_columns]
    if unmatched_vars:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)

    # Sort the predictors in alphabetical order so that train columns can be
    #   in the same order as the predict array when predicting later on.
    # NOTE(review): columns are matched by substring, so a column whose name
    #   merely contains a variable name is also selected; the importance
    #   assignment below then fails on a length mismatch -- confirm intent.
    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.sort_index()

    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    rf_model = forest.train_rf_regressor(x_train,
                                         y_train,
                                         ntrees=n_trees,
                                         njobs=n_jobs,
                                         max_depth=max_depth)

    df_var['importance'] = rf_model.feature_importances_
    rf_path = os.path.join(out_dir, 'regressor_model_%s' % stamp)
    forest.save_rfmodel(rf_model, rf_path)
    oob_score = round(rf_model.oob_score_, 3)
    out_var_txt = os.path.join(out_dir, os.path.basename(var_txt))
    df_var.to_csv(out_var_txt, sep='\t')

    # Record params in inventory text file (only when one was specified)
    if 'inventory_txt' in inputs:
        df_inv = pd.read_csv(inventory_txt, sep='\t')
        # Predictor names as a plain comma-separated string
        col_str = re.sub(r"[\]\['\"]", '', str(predict_cols))
        # Second-to-last underscore-separated piece of the sample path with
        # 'm' removed (presumably the raster resolution -- TODO confirm)
        raster_res = sample_txt.split('_')[-2].replace('m', '')
        # The row is positional, so the inventory must have nine columns
        df_inv = df_inv.append(pd.DataFrame([[
            stamp, oob_score, '', '', '', '',
            len(df_train), raster_res, col_str
        ]],
                                            columns=df_inv.columns),
                               ignore_index=True)
        # Keep only rows whose model directory still exists
        existing_models = fnmatch.filter(os.listdir(os.path.dirname(out_dir)),
                                         'susc*')
        df_inv = df_inv[df_inv.stamp.isin(existing_models)]
        df_inv.to_csv(inventory_txt, sep='\t', index=False)

    print('Random Forest Regressor model written to:\n%s' % rf_path)
    print('\nOOB score:  %s' % oob_score)
    print('Relative importance:')
    print(df_var.importance)
Exemple #5
0
def main(params):
    """Draw a stratified sample from a raster and write it to text files.

    Every entry of the parameter file becomes a local string variable.
    ``bins`` (comma-separated ``min:max`` pairs), ``n_samples``, ``nodata``,
    ``raster_path``, ``col_name`` and ``out_txt`` are required;
    ``data_band`` (default 1), ``pct_train``, ``var_txt`` and
    ``test_n_trees_params`` are optional. The training samples are written
    to a new directory named after the stamped output file. If ``var_txt``
    is given, predictors are sampled and written too; if ``pct_train`` is
    given, the test samples are written; if ``test_n_trees_params`` is given
    and predictors were sampled, the tree-count test is run.

    Parameters
    ----------
    params : str
        Parameter file handed to ``read_params``.

    Raises
    ------
    NameError
        If a required variable is missing from the parameter file.
    """
    t0 = time.time()
    # NOTE: exec runs text taken from the param file, so only trusted files
    # should be used.
    inputs = read_params(params)
    for var in inputs:
        exec("{0} = str({1})".format(var, inputs[var]))

    # Check that all required params were specified before touching the disk
    try:
        bin_list = [b.split(':') for b in bins.split(',')]
        bins = [(int(mn), int(mx)) for mn, mx in bin_list]
        n_samples = int(n_samples)
        nodata = int(nodata)
        str_check = raster_path, col_name, out_txt
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    out_dir = os.path.dirname(out_txt)
    if not os.path.exists(out_dir):
        print('WARNING: output directory does not exist. Creating new directory:\n%s' % out_dir)
        os.makedirs(out_dir)

    # Make optional numeric arguments numeric
    if 'data_band' in locals():
        data_band = int(data_band)
    else:
        data_band = 1
    if 'pct_train' in locals():
        pct_train = float(pct_train)
    else:
        pct_train = None

    # Get training and testing samples
    df_train, df_test, raster_res = forest.get_stratified_sample(
        raster_path, col_name, data_band, n_samples, bins, pct_train, nodata)
    df_train['obs_id'] = df_train.index

    # Write samples to text file inside a directory named after the stamp
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    stamp = '{0}_{1}_{2}_{3}m'.format(len(df_train), date_str, time_str,
                                      int(raster_res))
    out_txt = out_txt.replace('.txt', stamp + '.txt')
    bn = os.path.basename(out_txt)
    out_dir = os.path.join(os.path.dirname(out_txt), bn[:-4])
    out_txt = os.path.join(out_dir, bn)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    df_train.to_csv(out_txt, sep='\t', index=False)
    print('Samples written to:\n%s \n' % out_txt)

    if 'var_txt' in locals():
        df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')
        df_predictors = df_train.copy()
        df_predictors = forest.sample_predictors(df_predictors, df_var, nodata)
        df_predictors.to_csv(out_txt.replace('.txt', '_predictors.txt'),
                             sep='\t',
                             index=False)

    # If pct train was specified, then there should be some testing samples
    #   so write them to disk
    if pct_train:
        df_test['obs_id'] = df_test.index
        test_txt = out_txt.replace('%s.txt' % stamp, '_test_%s.txt' % stamp)
        df_test.to_csv(test_txt, sep='\t', index=False)
        print('Test samples written to:\n%s \n' % test_txt)

    if 'test_n_trees_params' in locals():
        if 'var_txt' not in locals():
            # Without var_txt no predictors were sampled, so there is
            # nothing to train on
            print('Cannot test number of trees because no predictors were sampled. Try specifying a var_txt path.')
        else:
            x_train = df_predictors[df_var.index]
            y_train = df_predictors[col_name]
            test_params = forest.read_params(test_n_trees_params)
            # Stored values carry double quotes that must be stripped
            max_trees = int(test_params['max_trees'].replace('"', ''))
            step = int(test_params['step'].replace('"', ''))
            test.test(out_dir, x_train, y_train, max_trees, step)

    print('Total time for sampling predictors: %.1f seconds' % (time.time() -
                                                                t0))
Exemple #6
0
def main(params):
    """Train a Random Forest regressor, optionally with constant predictors.

    Every entry of the parameter file becomes a local string variable.
    ``sample_txt``, ``target_col``, ``var_txt`` and ``out_dir`` are required;
    ``n_trees`` (default 200), ``n_jobs`` (default 1), ``max_depth``
    (default None), ``out_dirname`` (default ``target_col``),
    ``constant_vars`` (comma-separated column names) and ``inventory_txt``
    are optional. Outputs go to a new ``<out_dirname>_<date>_<time>``
    directory inside ``out_dir``: copies of the parameter and sample files,
    the saved model, and the variable table with added ``importance`` and
    ``rank`` columns. When ``inventory_txt`` is given, a row for this model
    is appended to that file.

    Parameters
    ----------
    params : str
        Path of the parameter file.

    Raises
    ------
    NameError
        If a required variable is missing from the parameter file, or a
        variable or constant is not a column of ``sample_txt``.
    IOError
        If the ``var_txt`` path does not exist.
    """
    # Read params and make variables from text.
    # NOTE: exec runs text taken from the param file, so only trusted files
    # should be used.
    inputs = forest.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    # Check that variables were specified in params
    try:
        str_check = sample_txt, target_col, var_txt, out_dir
    except NameError as e:
        print('')
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Make optional numeric arguments numeric
    if 'n_trees' in locals():
        n_trees = int(n_trees)
    else:
        n_trees = 200
    if 'n_jobs' in locals():
        n_jobs = int(n_jobs)
    else:
        n_jobs = 1
    if 'max_depth' in locals():
        max_depth = int(max_depth)
    else:
        max_depth = None

    # Raise an error if var_txt doesn't exist. Otherwise, just read it in
    if not os.path.exists(var_txt):
        print('')
        msg = 'var_text path specified does not exist:\n%s\n\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make the output directory
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if 'out_dirname' not in locals():
        out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    # With a timestamp in dir, no need to check if it already exists
    os.makedirs(out_dir)
    # Copy the params and samples so the inputs used are saved
    shutil.copy2(params, out_dir)
    shutil.copy2(sample_txt, out_dir)

    # Read in training samples
    df_train = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')

    # Check that every variable in df_var is a column of df_train
    train_columns = df_train.columns.tolist()
    unmatched_vars = [v for v in df_var.index if v not in train_columns]
    if unmatched_vars:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)

    # Sort the predictors in alphabetical order so that train columns can be
    #   in the same order as the predict array when predicting later on
    predict_cols = sorted(
        np.unique([c for c in df_train.columns if c in df_var.index]))
    if target_col in predict_cols:
        predict_cols.remove(target_col)
    df_var = df_var.sort_index()
    if 'constant_vars' in inputs:
        constant_vars = sorted([c.strip() for c in constant_vars.split(',')])
        unmatched_vars = [v for v in constant_vars if v not in train_columns]
        if unmatched_vars:
            unmatched_str = '\n'.join(unmatched_vars)
            msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
            raise NameError(msg)
        # Constants go after the sorted predictors
        predict_cols += constant_vars

    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    rf_model = forest.train_rf_regressor(x_train,
                                         y_train,
                                         ntrees=n_trees,
                                         njobs=n_jobs,
                                         max_depth=max_depth)
    if 'constant_vars' in inputs:
        # Add an empty row per constant so each gets an importance value
        for v in constant_vars:
            df_var = df_var.append(pd.Series(name=v))
    importance = rf_model.feature_importances_
    df_var['importance'] = importance
    df_var['rank'] = [
        int(r) for r in df_var.importance.rank(method='first', ascending=False)
    ]

    rf_path = os.path.join(out_dir, 'regressor_model_%s' % stamp)
    forest.save_rfmodel(rf_model, rf_path)
    oob_score = round(rf_model.oob_score_, 3)
    out_var_txt = os.path.join(out_dir, os.path.basename(var_txt))
    df_var.to_csv(out_var_txt, sep='\t')

    # Record params in inventory text file
    if 'inventory_txt' in inputs:
        df_inv = pd.read_csv(inventory_txt, sep='\t')
        cols = df_inv.columns
        # First run of one or two digits in the directory name, if any
        res_match = re.search('[0-9]{1,2}', out_dirname)
        res = int(res_match.group()) if res_match else None
        df_inv = df_inv.append(pd.DataFrame([{
            'stamp': stamp,
            'temporal_res': res,
            'oob_score': oob_score,
            'auc': None,
            'rmse': None,
            'rmse_n': None,
            'rmse_p': None,
            'n_samples': len(df_train),
            'n_trees': n_trees
        }]),
                               ignore_index=True)
        # Restore the column order of the inventory file
        df_inv = df_inv.reindex(columns=cols)
        # Keep only rows whose model directory still exists
        existing_models = fnmatch.filter(os.listdir(os.path.dirname(out_dir)),
                                         '*res*')
        df_inv = df_inv[df_inv.stamp.isin(existing_models)]
        df_inv.to_csv(inventory_txt, sep='\t', index=False)

    print('Random Forest Regressor model written to:\n%s' % rf_path)
    print('\nOOB score:  %s' % oob_score)
    print('Relative importance:')
    print(df_var.importance.sort_values(ascending=False))