Example 1
def main(params,
         snap_coord=None,
         resolution=30,
         n_sizes=5,
         max_features=None,
         n_jobs=1):
    t0 = time.time()

    inputs, df_var = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])

    try:
        sets_per_cell = int(sets_per_cell)
        cell_size = [int(s) for s in cell_size.split(',')]
        min_size = int(min_size)
        max_size = int(max_size)
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Read in training samples and check that df_train has exactly the same
    #   columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        import pdb
        pdb.set_trace()
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' %
                        (target_col, sample_txt))
    if 'max_target_val' in inputs:
        max_target_val = int(max_target_val)
    else:
        max_target_val = df_train[target_col].max()
    if 'n_jobs' in inputs:
        n_jobs = int(n_jobs)

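    # Predictor columns are any training columns whose names contain one of the
    #   variable names in df_var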
    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    # Make sure predict_cols and df_var are in the same order
    df_var = df_var.reindex(df_var.index.sort_values())

    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]

    t1 = time.time()
    if model_type.lower() == 'classifier':
        model_func = stem.fit_tree_classifier
    else:
        model_func = stem.fit_tree_regressor

    # Make grid
    x_res = resolution
    y_res = -resolution
    tx, extent = stem.tx_from_shp(mosaic_path,
                                  x_res,
                                  y_res,
                                  snap_coord=snap_coord)
    min_x, max_x, min_y, max_y = [int(i) for i in extent]
    cells = stem.generate_gsrd_grid(cell_size, min_x, min_y, max_x, max_y,
                                    x_res, y_res)
    grid = pd.DataFrame(cells, columns=['ul_x', 'ul_y', 'lr_x', 'lr_y'])
    grid.to_csv(out_txt.replace('.txt', '_grid.txt'))
    #import pdb; pdb.set_trace()
    grid = intersecting_cells(grid, mosaic_path)
    stem.coords_to_shp(grid, '/vol/v2/stem/extent_shp/CAORWA.shp',
                       out_txt.replace('.txt', '_grid.shp'))

    if 'set_sizes' in inputs:
        set_sizes = np.sort([int(s) for s in set_sizes.split(',')])
    else:
        if 'n_sizes' in inputs:
            n_sizes = int(n_sizes)
        set_sizes = np.arange(min_size, max_size + 1,
                              (max_size - min_size) / n_sizes)

    # Sample grid
    dfs = []
    for i, cell in grid.iterrows():
        ul_x, ul_y, lr_x, lr_y = cell
        min_x, max_x = min(ul_x, lr_x), max(ul_x, lr_x)
        min_y, max_y = min(ul_y, lr_y), max(ul_y, lr_y)

        # Calculate support set centers
        x_centers = [
            int(stem.snap_coordinate(x, snap_coord[0], x_res))
            for x in random.sample(xrange(min_x, max_x + 1), sets_per_cell)
        ]
        y_centers = [
            int(stem.snap_coordinate(y, snap_coord[1], y_res))
            for y in random.sample(xrange(min_y, max_y + 1), sets_per_cell)
        ]

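        # For each candidate set size, sample sets_per_cell support sets in this
        #   cell, all centered on the snapped center coordinates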
        for size in set_sizes:
            df = stem.sample_gsrd_cell(sets_per_cell,
                                       cell,
                                       size,
                                       size,
                                       x_res,
                                       y_res,
                                       tx,
                                       snap_coord,
                                       center_coords=(zip(
                                           x_centers, y_centers)))
            df['set_size'] = size
            df['cell_id'] = i
            dfs.append(df)

    support_sets = pd.concat(dfs, ignore_index=True)
    n_sets = len(support_sets)
    #import pdb; pdb.set_trace()
    print 'Testing set sizes with %s jobs...\n' % n_jobs
    oob_metrics = _par_train_estimator(n_jobs, n_sets, df_train, predict_cols,
                                       target_col, support_sets, model_func,
                                       model_type, max_features,
                                       max_target_val)
    '''args = [[i, n_sets, start_time, df_train, predict_cols, target_col, support_set, model_func, model_type, max_features, max_target_val] for i, (si, support_set) in enumerate(support_sets.ix[:100].iterrows())]
    oob_metrics = []
    for arg in args:
        oob_metrics.append(par_train_estimator(arg))'''

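    # Attach the OOB metrics to their support sets by set_id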
    oob_metrics = pd.DataFrame(oob_metrics)
    oob_metrics.set_index('set_id', inplace=True)
    support_sets = pd.merge(support_sets,
                            oob_metrics,
                            left_index=True,
                            right_index=True)
    #import pdb; pdb.set_trace()
    support_sets.to_csv(out_txt)
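
Each of these scripts expands its parameter file into local variables with the same exec pattern. Below is a minimal sketch of how that expansion behaves, using a hypothetical inputs dict in place of the real read_params output (assumed here to return values already wrapped in quotes):

# Hypothetical stand-in for the dict returned by read_params(params); values
# are assumed to arrive already quoted so that exec produces string variables.
inputs = {'sample_txt': "'/tmp/samples.txt'", 'sets_per_cell': "'10'"}

for name in inputs:
    # Builds e.g. "sample_txt = str('/tmp/samples.txt')" and executes it,
    # creating a variable of that name in the current scope
    exec("{0} = str({1})".format(name, inputs[name]))

sets_per_cell = int(sets_per_cell)  # numeric params still need explicit casts

Numeric parameters arrive as strings, which is why each script casts them (int, float, split) immediately after the loop.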
Example 2
def main(params,
         data_band=1,
         nodata=None,
         sampling_scheme='proportional',
         data_type='continuous',
         kernel=False,
         boundary_shp=None,
         bin_scale=1,
         min_sample=None,
         max_sample=None,
         n_samples=None,
         n_per_tile=None):

    t0 = time.time()
    data_band = None
    nodata = None
    zero_inflation = None

    # Read params and make variables from each line
    inputs = read_params(params)
    for var in inputs:
        exec("{0} = str({1})").format(var, inputs[var])

    #out_dir = os.path.dirname(out_txt)
    '''if not os.path.exists(out_dir):
        print 'Warning: output directory does not exist. Creating directory...'
        os.makedirs(out_dir)'''

    # Integerize numeric params
    if 'data_band' in inputs: data_band = int(data_band)
    if 'nodata' in inputs: nodata = int(nodata)
    if 'pct_train' in locals():
        pct_train = float(pct_train)
    else:
        pct_train = None
    if zero_inflation: zero_inflation = int(zero_inflation)
    if 'bin_scale' in inputs:
        bin_scale = float(bin_scale)
    if 'min_sample' in inputs:
        min_sample = int(min_sample)
    if 'max_sample' in inputs:
        max_sample = int(max_sample)
    if 'n_per_tile' in inputs:
        n_per_tile = int(n_per_tile)
    if 'n_samples' in inputs:
        n_samples = int(n_samples)
    try:
        bins = parse_bins(bins)
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # If number of tiles not given, fall back to a default
    if 'n_tiles' in inputs:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    else:
        n_tiles = 3, 10
        print 'Using default tile size of %s x %s ....' % n_tiles

    # Generate samples
    df_train, df_test, df_tiles = get_stratified_sample_by_tile(
        raster_path,
        col_name,
        data_band,
        n_samples,
        bins,
        min_sample,
        max_sample,
        pct_train,
        nodata,
        sampling_scheme,
        zero_inflation,
        data_type,
        kernel,
        n_tiles,
        boundary_shp,
        bin_scale=bin_scale,
        n_per_tile=n_per_tile)
    df_train['obs_id'] = df_train.index

    # Write samples to text file
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    bn = '{0}_{1}_sample_{2}_{3}_{4}.txt'.format(col_name, sampling_scheme,
                                                 len(df_train), date_str,
                                                 time_str)
    #bn = os.path.basename(out_txt)
    stamp = bn[:-4]
    out_dir = os.path.join(out_dir, stamp)
    #if not os.path.exists(out_dir):
    os.makedirs(out_dir)
    out_txt = os.path.join(out_dir, bn)
    df_train.to_csv(out_txt, sep='\t', index=False)
    print 'Sample written to:\n%s\n' % out_txt

    shutil.copy2(params, out_dir)  #Copy the params for reference

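    # If a test split was produced, write the held-out samples alongside the
    #   training samples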
    if pct_train > 1:
        df_test['obs_id'] = df_test.index
        test_txt = out_txt.replace('%s.txt' % stamp, '%s_test.txt' % stamp)
        df_test.to_csv(test_txt, sep='\t', index=False)
        print 'Test samples written to directory:\n%s' % out_dir

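    # Record the sampling tiles, as a shapefile if a boundary was given,
    #   otherwise as a tab-delimited text file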
    if n_tiles != [1, 1]:
        if boundary_shp:
            out_shp = os.path.join(out_dir, 'sampling_tiles.shp')
            stem.coords_to_shp(df_tiles, boundary_shp, out_shp)
        else:
            tile_txt = os.path.join(out_dir, 'sampling_tiles.txt')
            df_tiles.to_csv(tile_txt, sep='\t', index=False)

    print '\nTotal time: %.1f minutes' % ((time.time() - t0) / 60)

    return out_txt
Example 3
def main(params,
         pct_train=None,
         min_oob=0,
         gsrd_shp=None,
         resolution=30,
         make_oob_map=False,
         snap_coord=None,
         oob_map_metric='oob_rate',
         n_jobs=1,
         oob_drop=None):
    t0 = time.time()

    inputs = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    try:
        if 'max_features' not in locals(): max_features = None
        if 'min_oob' in inputs: min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, out_dir, model_type
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)
    print(var_info)
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')

    # Read in training samples and check that df_train has exactly the same
    #   columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        import pdb
        pdb.set_trace()
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' %
                        (target_col, sample_txt))
    if 'max_target_val' in inputs:
        max_target_val = int(max_target_val)
    else:
        max_target_val = df_train[target_col].max()

    # Make a timestamped output directory if outdir not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals(): out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    # With a timestamp in the dir name, no need to check if it already exists
    os.makedirs(out_dir)
    shutil.copy2(params, out_dir)  # Copy the params for reference

    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    # Make sure predict_cols and df_var are in the same order
    df_var = df_var.reindex(df_var.index.sort_values())

    # If there are variables that should remain constant across the modeling
    #   region, get the names
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals(): gsrd_shp = None
    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_sets = stem.get_gsrd(mosaic_path,
                            cell_size,
                            support_size,
                            sets_per_cell,
                            df_train,
                            min_obs,
                            target_col,
                            predict_cols,
                            out_txt,
                            gsrd_shp,
                            pct_train,
                            snap_coord=snap_coord)
    n_sets = len(df_sets)

    # Create SQL DB and add train sample table
    '''print 'Dumping train_txt to database...'
    t1 = time.time()#'''
    db_path = os.path.join(out_dir, stamp + '.db')
    '''engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    #df_train.to_sql('train_sample', engine, chunksize=10000)
    print '%.1f minutes\n' % ((time.time() - t1)/60)#'''

    # Split x and y train
    t1 = time.time()
    print "'{0}'".format(model_type.lower())
    if model_type.lower().strip() == 'classifier':
        print 'Training STEM with classifier algorithm...'
        model_func = stem.fit_tree_classifier
    elif model_type.lower().strip() == 'zeroinflated':
        print 'Training STEM with zeroinflated regression algorithm...'
        model_func = stem.fit_tree_zeroinflated
    else:
        print 'Training STEM with regressor algorithm...'
        model_func = stem.fit_tree_regressor
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
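    # Initialize a per-predictor importance column in df_sets, to be filled in
    #   as each set's tree is trained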
    importance_cols = ['importance_%s' % c for c in predict_cols]
    for c in importance_cols:
        df_sets[c] = 0

    # Train estimators
    dropped_sets = pd.DataFrame(columns=df_sets.columns)
    dt_dir = os.path.join(out_dir, 'decisiontree_models')
    if not os.path.exists(dt_dir):
        os.mkdir(dt_dir)
    dt_path_template = os.path.join(dt_dir, stamp + '_decisiontree_%s.pkl')

    #oob_rates = [0]
    n_jobs = int(n_jobs)

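    # Train one decision tree per support set in parallel; each worker returns
    #   the updated set record and its sample membership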
    sets = _par_train_stem(n_jobs, n_sets, df_train, predict_cols, target_col,
                           min_obs, df_sets, model_func, model_type,
                           max_features, dt_path_template, db_path,
                           max_target_val)
    support_sets, samples = zip(*sets)
    df_sets = pd.DataFrame(list(support_sets))\
                .dropna(subset=['dt_file'])\
                .rename_axis('set_id')
    #print('the cols in the df at this point are: ', df_sets.columns)
    df_sets.to_csv(os.path.join(out_dir, 'support_sets.txt'), sep='\t')

    # Consider moving this back to train function by switching to DBMS with multithread support
    '''print '\n\nMaking relationship table for samples and sets...'
    t1 = time.time()
    set_samples = pd.concat(list(samples), ignore_index=True)
    set_samples.to_sql('set_samples', engine, chunksize=100000)
    print '%.1f minutes\n' % ((time.time() - t1)/60)'''

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates and dropping sets with high OOB error...'
    t1 = time.time()
    try:
        df_sets, low_oob, oob_metric = stem.get_oob_rates(
            df_sets,
            df_train,
            db_path,
            target_col,
            predict_cols,
            min_oob,
            model_type,
            drop_expression=oob_drop)
    except Exception as e:
        import pdb
        pdb.set_trace()
    if oob_drop and len(low_oob) > 0:
        df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'low_oob_sets.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    set_shp = os.path.join(out_dir, 'support_sets.shp')
    try:
        stem.coords_to_shp(df_sets.drop('dt_model', axis=1), gsrd_shp, set_shp)
    except Exception as e:
        import pdb
        pdb.set_trace()
        print e.message
    print 'Min OOB rate after dropping: ', df_sets[oob_metric].min()
    print 'Estimated average OOB score: ', int(df_sets[oob_metric].mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving support set info...'
    #set_txt = os.path.join(dt_dir, stamp + '_support_sets.txt')
    df_sets['set_id'] = df_sets.index
    df_sets = df_sets.drop('dt_model',
                           axis=1)  #.to_csv(set_txt, sep='\t', index=False)
    #df_sets.drop('dt_model', axis=1).to_sql('support_sets', engine)
    t1 = time.time()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)  #"""
    '''stamp = os.path.basename(out_dir)
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    with engine.connect() as con, con.begin():
        df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_brightness','delta_greenness','delta_nbr','delta_wetness', 'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
Example 4
def main(tile_shp,
         strata_shp,
         n_psu,
         out_shp,
         strata_id_field='NA_L2NAME',
         min_dist=173779,
         split_tags=['2001', '2011']):

    n_psu = int(n_psu)

    tiles = attributes_to_df(tile_shp)
    tile_ds = ogr.Open(tile_shp)
    tile_lyr = tile_ds.GetLayer()
    tiles['xctr'] = (tiles.xmax - tiles.xmin) / 2 + tiles.xmin
    tiles['yctr'] = (tiles.ymax - tiles.ymin) / 2 + tiles.ymin
    tiles['ul_x'] = tiles.xmin
    tiles['lr_x'] = tiles.xmax
    tiles['ul_y'] = tiles.ymax
    tiles['lr_y'] = tiles.ymin

    strata = attributes_to_df(strata_shp)
    #
    strata_ds = ogr.Open(strata_shp)
    strata_lyr = strata_ds.GetLayer()

    # Get areas and calculate proportions of total
    for feat in strata_lyr:
        fid = feat.GetFID()
        geom = feat.GetGeometryRef()
        area = geom.GetArea()
        strata.loc[fid, 'area'] = area

    # Features could be multipart, so calculate sums for all parts of the same stratum
    unique_names = strata[strata_id_field].unique()
    summed_areas = pd.Series({
        name: strata.loc[strata[strata_id_field] == name, 'area'].sum()
        for name in unique_names if name != 'WATER'
    })
    strata.drop_duplicates(strata_id_field, inplace=True)
    strata.set_index(strata_id_field, inplace=True)
    strata.drop('WATER', inplace=True)

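    # Convert areas to proportions of the total and allocate PSUs to each
    #   stratum proportionally, with a minimum of one per stratum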
    strata['area'] = summed_areas / summed_areas.sum()
    strata['n_psu'] = (strata.area * n_psu).round().astype(int)
    strata.loc[strata.n_psu == 0, 'n_psu'] = 1

    # Randomly shuffle strata so the same strata don't always influence
    #   available PSUs
    strata = strata.sample(frac=1)
    candidates = tiles.copy()
    fids = []
    strata_names = {}
    for i, (stratum_name, stratum) in enumerate(strata.iterrows()):
        print i, stratum_name, ':',
        strata_lyr.SetAttributeFilter("%s = '%s'" %
                                      (strata_id_field, stratum_name))
        strata_feat = strata_lyr.GetNextFeature()
        strata_geom = ogr.Geometry(ogr.wkbMultiPolygon)
        while strata_feat:
            g = strata_feat.GetGeometryRef()
            strata_geom = strata_geom.Union(g)
            strata_feat = strata_lyr.GetNextFeature()

        # find all tile features that intersect this stratum
        overlapping = []
        print 'getting overlapping...',
        for t_fid in candidates.index:
            tile_feature = tile_lyr.GetFeature(t_fid)
            tile_geom = tile_feature.GetGeometryRef()
            if strata_geom.Intersects(tile_geom):
                overlapping.append(t_fid)  #'''
        if len(overlapping) == 0:
            continue

        print 'selecting...\n'
        for j in range(stratum.n_psu):
            this_fid = random.sample(overlapping, 1)
            fids.extend(this_fid)
            selected = tiles.loc[fids]
            strata_names[this_fid[0]] = stratum_name
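            # Drop any remaining candidates within min_dist of a selected tile
            #   so selections stay spatially separated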
            for ti, c_tile in candidates.iterrows():
                if np.any(
                        np.sqrt((selected.xctr - c_tile.xctr)**2 +
                                (selected.yctr - c_tile.yctr)**2) <= min_dist):
                    candidates.drop(ti, inplace=True)
                    # Additionally remove tiles from overlapping list so they're not selected
                    if ti in overlapping:
                        # Might not be in the list, depending on search distance
                        overlapping.remove(ti)

    selected[strata_id_field] = pd.Series(strata_names)

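    # Optionally split the selected tiles into two random halves and write one
    #   shapefile per tag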
    if split_tags:
        #random_ids = random.sample(selected.index, strata.n_psu.sum()/2)
        selected1 = selected.sample(frac=.5)
        selected2 = selected.loc[~selected.index.isin(selected1.index)]
        coords_to_shp(selected1, tile_shp,
                      out_shp.replace('.shp', '_%s.shp') % split_tags[0])
        coords_to_shp(selected2, tile_shp,
                      out_shp.replace('.shp', '_%s.shp') % split_tags[1])
    else:
        #selected.to_csv(out_shp.replace('.shp', '.txt'))
        coords_to_shp(selected, tile_shp, out_shp)

    strata_ds, strata_lyr, strata_feat = None, None, None
    tile_ds, tile_lyr = None, None
Example 5
def main(params, n_pieces=False, ydims=None, constant_vars=None, year='', agg_method=None):

    t0 = time.time()
    print 'Predicting Random Forest... %s\n' % time.ctime(t0)

    # Set optional params to default:
    split_predictors = False

    # Read params and make variables from text
    inputs = forest.read_params(params)
    for i in inputs:
        exec ("{0} = str({1})").format(i, inputs[i])

    # Check that variables were specified in params
    try:
        nodata = int(nodata)
        str_check = train_params, rf_path, mask_path, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Raise an error if the var_txt path doesn't exist. Otherwise, just read it in
    train_dict = forest.read_params(train_params)
    train_txt_bn = os.path.basename(train_dict['var_txt'][:-1])
    if 'var_txt' not in locals():
        var_txt = os.path.join(os.path.dirname(rf_path), train_txt_bn)
    if not os.path.exists(var_txt):
        print ''
        msg = 'Could not find var_txt:\n%s\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make sure vars are sorted alphabetically, since they were sorted that way for training
    pred_vars  = sorted(df_var.index)
    df_var = df_var.reindex(pred_vars)
    '''if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        #year = constant_vars['YEAR']
        year = 2012
        pred_constants = sorted(constant_vars.keys())
    else:
        df_var.search_str = [s.format(2007) for s in df_var.search_str]'''

    #out_dir = os.path.dirname(out_raster)
    if not os.path.exists(out_dir): os.mkdir(out_dir)
    else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \
    'will be overwritten...\n') % out_dir
    new_params = os.path.join(out_dir, os.path.basename(params))
    shutil.copy2(params, new_params.replace('.txt', '_%s.txt' % year))

    # Load the Random Forest model
    print 'Loading the RandomForest model from \n%s... \n%s\n' % (rf_path, time.ctime(time.time()))
    if not os.path.exists(rf_path):
        raise IOError('%s does not exist' % rf_path)
    with open(rf_path, 'rb') as f:
        rf_model = pickle.load(f)
    n_features = rf_model.n_features_
    n_vars = len(df_var.index.tolist())
    if 'constant_vars' in inputs: 
        n_vars += len(pred_constants)
    if n_features != n_vars:
        print df_var.index.tolist() + pred_constants
        sys.exit(('\nKeyError: Number of features of the random forest model does not match the number of variables in df_var.' +\
            '\nNumber of features of the model: {0} \nNumber of variables in var_txt: {1}' + \
            '\nCheck that all predictors used in the var_txt to train the model are in this var_txt ' +\
            '\nPath of Random Forest model: {2}\nPath of var_txt: {3}').format(n_features, n_vars, rf_path, var_txt))
        #"""

    # Get mask and raster info
    ds = gdal.Open(mask_path)
    ar = ds.ReadAsArray()
    nodata_mask = ar != 0
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = gdal.GetDriverByName('gtiff')
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx
    

    # Predict
    #print 'Predicting with %s processors... %s' % (rf_model.n_jobs, time.ctime(time.time()))
    t1 = time.time()
    predict_pieces = []
    
    '''if n_pieces:
        # assumes predictors all line up and have same dimensions
        if 'mask_path' not in inputs: 
            raise NameError('mask_path not specified')
        # Figure out the y dimension of each piece
        n_pieces = int(n_pieces)
        piece_ysize = ysize/n_pieces
        upper_ydim = range(0, ysize, piece_ysize)
        lower_ydim = range(piece_ysize, ysize, piece_ysize)
        lower_ydim[-1] = ysize
        ydims = zip(upper_ydim, lower_ydim)
        
        for i, yd in enumerate(ydims):
            print 'Predicting for piece %s of %s...' % (i + 1, n_pieces)
            t1 = time.time()
            ar_predictors, nodata_mask = forest.get_predictors(df_var, nodata, yd, constant_vars)
            t2 = time.time()
            predictions = rf_model.predict(ar_predictors)
            print 'Prediction time: %.1f minutes' % ((time.time() - t2)/60)
            ar_prediction = np.full(nodata_mask.shape[0], nodata, dtype=np.uint8)
            ar_prediction[nodata_mask] = (predictions * 100).astype(np.uint8)
            predict_pieces.append(ar_prediction)
            print 'Total time for this piece: %.1f minutes\n' % ((time.time() - t1)/60)
            del ar_predictors, nodata_mask, ar_prediction
        ar_prediction = np.concatenate(predict_pieces)
        del predict_pieces'''
    
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        
    if 'n_tiles' in inputs:
        df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, tx)
        stem.coords_to_shp(df_tiles, '/vol/v2/stem/extent_shp/CAORWA.shp', os.path.join(out_dir, 'tile.shp'))
        empty_tiles = []
        ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        tile_dir = os.path.join(out_dir, 'predict_tiles')
        if not os.path.isdir(tile_dir):
            os.mkdir(tile_dir)
        for i, (ind, tile_coords) in enumerate(df_tiles.iterrows()):
            print 'Predicting for tile %s of %s...' % (i + 1, len(df_tiles))
            t1 = time.time()
            coords = tile_coords[['ul_x', 'ul_y', 'lr_x', 'lr_y']].tolist()
            tsa_ar, tsa_off = mosaic.extract_kernel(ds, 1, coords, tx, xsize, ysize, nodata=nodata)
            tsa_mask = tsa_ar == 0
            if tsa_mask.all():
                print 'Tile %s empty. Skipping...' % ind
                continue
            tsa_ar[tsa_mask] = nodata
            # Get the ids of TSAs this kernel covers
            tsa_ids = np.unique(tsa_ar)
            tsa_strs = ['0' + str(tsa) for tsa in tsa_ids if tsa!=nodata]
            array_shape = tsa_ar.shape
        
            # Get an array of predictors where each column is a flattened 2D array of a
            #   single predictor variable
            temp_nodata = -9999
            ar_predictors = stem.get_predictors(df_var, tx, tsa_strs, tsa_ar, coords, tsa_mask, temp_nodata, 1)
            nodata_mask = ~ np.any(ar_predictors==temp_nodata, axis=1)
            predictors = ar_predictors[nodata_mask]
            t2 = time.time()
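            # For 'mode' aggregation, predict with each tree in a process pool
            #   and take the per-pixel modal prediction; otherwise use the
            #   forest's built-in predict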
            if agg_method == 'mode':
                args = []
                for dt in rf_model.estimators_:
                    args.append([dt, predictors])
                pool = Pool(rf_model.n_jobs)
                t3 = time.time()
                dt_predictions = np.vstack(pool.map(forest.par_predict_from_dt, args, 1))
                print 'Prediction time: %.1f minutes' % ((time.time() - t3)/60)
                t3 = time.time()
                predictions = stem.mode(dt_predictions, axis=0)
                print 'Aggregation time:  %.1f minutes' % ((time.time() - t3)/60)
                del dt_predictions
                t3 = time.time()
                pool.close()
                pool.join()
                print 'Closing time:  %.1f minutes' % ((time.time() - t3)/60)
            else:
                predictions = rf_model.predict(ar_predictors[nodata_mask])
            print 'Prediction time: %.1f minutes' % ((time.time() - t2)/60)
            
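            # Insert this tile's predictions into the full output array and also
            #   write the tile to its own raster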
            ar_tile = np.full(ar_predictors.shape[0], nodata, dtype=np.uint8)
            ar_tile[nodata_mask] = predictions.astype(np.uint8)
            ul_r, lr_r, ul_c, lr_c = df_tiles_rc.ix[ind]
            ar_out[ul_r : lr_r, ul_c : lr_c] = ar_tile.reshape(array_shape)
            tx_tile = tile_coords.ul_x, x_res, x_rot, tile_coords.ul_y, y_rot, y_res
            mosaic.array_to_raster(ar_tile.reshape(array_shape), tx_tile, prj, driver, os.path.join(tile_dir, 'tile_%s.tif' % ind), dtype=gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this piece: %.1f minutes\n' % ((time.time() - t1)/60)
            #del ar_predictors, nodata_mask, ar_prediction'''
        #ar_prediction = np.concatenate(predict_pieces)
        #del predict_pieces
        '''ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        for ind, tile_coords in df_tiles_rc.iterrows():
            if ind in empty_tiles:
                continue
            ul_r, lr_r, ul_c, lr_c = tile_coords
            tile_file = os.path.join(tile_dir, 'tile_%s.tif' % ind)
            if not os.path.exists(tile_file):
                continue
            ds_t = gdal.Open(tile_file)
            ar_tile = ds_t.ReadAsArray()
            t_ulx = df_tiles.ix[ind, ['ul_x', 'ul_y']]
            ar_out[ul_r : lr_r, ul_c : lr_c] = ar_tile'''
        
    else:
        ar_predictors, nodata_mask = forest.get_predictors(df_var, nodata)
        # If the predictions are too large (i.e. cause memory errors), split the predictor array into pieces and predict
        #   separately, then stack them back together
        if split_predictors:
            split_predictors = int(split_predictors)
            predictions = []
            for i, p in enumerate(np.array_split(ar_predictors, split_predictors)):
                t1 = time.time()
                print '\nPredicting for %s of %s pieces of the final array...' % (i + 1, split_predictors)
                predictions.append(rf_model.predict(p))
                print '%.1f minutes' % ((time.time() - t1)/60)
            predictions = np.concatenate(predictions)
            print ''
        else:
            print 'Predicting in one chunk...'
            predictions = rf_model.predict(ar_predictors)
        ar_prediction = np.full(nodata_mask.shape[0], nodata, dtype=np.float32)
        ar_prediction[nodata_mask] = predictions
        del ar_predictors, predictions

    # Save the prediction array to disk
    stamp = os.path.basename(out_dir)
    out_path = os.path.join(out_dir, '%s_rf_mean.tif' % stamp)
    #ar_prediction = ar_prediction.reshape(ysize, xsize)
    if constant_vars: 
        out_path = out_path.replace('.tif', '_yr%s.tif' % year )
    forest.array_to_raster(ar_out, tx, prj, driver, out_path, gdal.GDT_Byte, nodata)#"""
    # Delete the tiles
    shutil.rmtree(tile_dir)
    ds = None
    '''stamp = os.path.basename(out_dir)
    path = os.path.join(out_dir, 'final_%s_yr2011.tif' % stamp) 
    stamp = os.path.basename(os.path.dirname(path))
    ds = gdal.Open(path)
    ar_prediction = ds.ReadAsArray()
    ds = None#'''
    

    if 'test_params' in inputs:
        #df_test = pd.read_csv(test_samples, sep='\t', index_col='obs_id')
        print '\nEvaluating the model...'
        t1 = time.time()
        test_dict = forest.read_params(test_params)
        for i in test_dict:
            exec ("{0} = str({1})").format(i, test_dict[i])
            
        if 'n_trials' in test_dict: 
            n_trials = int(n_trials)
        else:
            print 'n_trials not specified. Setting default to 50...\n'
            n_trials = 50
        if 'year' in test_dict: 
            year = int(year)
        else:
            year = None
        cell_size = [int(i) for i in cell_size.split(',')]
        n_per_cell = int(n_per_cell)
        param_bn = os.path.basename(test_params)
        shutil.copy2(test_params, 
                     os.path.join(out_dir, 
                                  param_bn.replace('.txt', '_%s.txt' % year))
                    )
        
        df, samples, roc_curves = evaluate_ebird(sample_txt, ar_prediction, tx,
                                                 cell_size, target_col, n_per_cell,
                                                 n_trials, year)
        if len(roc_curves) > 0:
            for fpr, tpr, thresholds in roc_curves:
                plt.plot(fpr, tpr, 'k', alpha=.1)
            out_png = os.path.join(out_dir, '{0}_roc_curve_{1}.png'.format(stamp, year))
            plt.savefig(out_png)
            
        if 'lc_path' in test_dict:
            '''df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join('/vol/v2/stem/ebird/results/performance_by_lc', '{0}_eval_{1}_land_cover.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')'''
        
        #df_samples = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
        df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
        out_txt = os.path.join(out_dir, '{0}_eval_{1}_land_cover_all_samples.txt'.format(stamp, year))
        df_lc.to_csv(out_txt, sep='\t')
        if 'inventory_txt' in test_dict:
            score_cols = sorted(df.columns)
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp') 
            for col in score_cols:
                score_mean = df[col].mean()
                df_inv.ix[stamp, col] = score_mean
                print 'Average %s: %2.3f' % (col.upper(), score_mean) 
            df_inv.to_csv(inventory_txt, sep='\t')
        out_txt = os.path.join(out_dir, '{0}_eval_{1}.txt'.format(stamp, year))
        df.to_csv(out_txt, sep='\t', index=False)
        samples.to_csv(out_txt.replace('.txt', '_samples.txt'), sep='\t')
        print '\nTotal eval time: %.1f minutes\n' % ((time.time() - t1)/60)
    else:
        print '\nEither "test_samples" or "inventory_txt" was not specified.' +\
            ' This model will not be evaluated...'

    print '\nTotal runtime: %.1f minutes' % ((time.time() - t0)/60)
Example 6
def main(params,
         pct_train=None,
         min_oob=0,
         gsrd_shp=None,
         resolution=30,
         make_oob_map=False,
         snap_coord=None,
         oob_map_metric='oob_rate'):
    t0 = time.time()

    inputs, df_var = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    try:
        if 'max_features' not in locals(): max_features = None
        if 'min_oob' in inputs: min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, out_dir, model_type
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    # Read in training samples and check that df_train has exactly the same
    #   columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        import pdb
        pdb.set_trace()
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' %
                        (target_col, sample_txt))

    # Make a timestamped output directory if outdir not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals(): out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    # With a timestamp in the dir name, no need to check if it already exists
    os.makedirs(out_dir)
    shutil.copy2(params, out_dir)  # Copy the params for reference

    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    # Make sure predict_cols and df_var are in the same order
    df_var = df_var.reindex(df_var.index.sort_values())

    # If there are variables that should remain constant across the modeling
    #   region, get the names
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals(): gsrd_shp = None
    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_sets = stem.get_gsrd(mosaic_path,
                            cell_size,
                            support_size,
                            sets_per_cell,
                            df_train,
                            min_obs,
                            target_col,
                            predict_cols,
                            out_txt,
                            gsrd_shp,
                            pct_train,
                            snap_coord=snap_coord)
    n_sets = len(df_sets)

    # Create SQL DB and add train sample table
    print 'Dumping train_txt to database...'
    t1 = time.time()
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    df_train.to_sql('train_sample', engine, chunksize=10000)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Train a tree for each support set
    t1 = time.time()
    if model_type.lower() == 'classifier':
        print 'Training STEM with classifier algorithm...'
        model_func = stem.fit_tree_classifier
    else:
        print 'Training STEM with regressor algorithm...'
        model_func = stem.fit_tree_regressor
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]
    importance_cols = ['importance_%s' % c for c in predict_cols]
    for c in importance_cols:
        df_sets[c] = 0

    # Train estimators
    dropped_sets = pd.DataFrame(columns=df_sets.columns)
    dt_dir = os.path.join(out_dir, 'decisiontree_models')
    if not os.path.exists(dt_dir):
        os.mkdir(dt_dir)
    dt_path_template = os.path.join(dt_dir, stamp + '_decisiontree_%s.pkl')

    # establish DB connection and create empty relationship table for sample inds
    cmd = (
        'CREATE TABLE set_samples (set_id INTEGER, sample_id INTEGER, in_bag INTEGER);'
    )
    with sqlite3.connect(db_path) as connection:
        connection.executescript(cmd)
        connection.commit()
    insert_cmd = 'INSERT INTO set_samples (set_id, sample_id, in_bag) VALUES (?,?,?);'

    oob_rates = [0]
    for i, (set_id, ss) in enumerate(df_sets.iterrows()):
        format_tuple = i + 1, n_sets, float(i) / n_sets * 100, (
            time.time() - t1) / 60, np.mean(oob_rates)
        sys.stdout.write(
            '\rTraining %s/%s DTs (%.1f%%) || %.1f minutes || Avg OOB: %d' %
            format_tuple)
        sys.stdout.flush()

        # Get all samples within support set
        sample_inds = df_train.index[
            (df_train['x'] > ss[['ul_x', 'lr_x']].min())
            & (df_train['x'] < ss[['ul_x', 'lr_x']].max()) &
            (df_train['y'] > ss[['ul_y', 'lr_y']].min()) &
            (df_train['y'] < ss[['ul_y', 'lr_y']].max())]

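        # Use ~63% of the samples in the set, roughly the expected in-bag
        #   fraction (1 - 1/e) of a bootstrap sample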
        n_samples = int(len(sample_inds) * .63)
        if n_samples < min_obs:
            df_sets.drop(set_id, inplace=True)
            continue

        this_x = x_train.ix[sample_inds]
        this_y = y_train.ix[sample_inds]
        support_set = df_sets.ix[set_id]
        dt_path = dt_path_template % set_id
        dt_model, train_inds, oob_inds, importance, oob_metrics = stem.train_estimator(
            support_set, n_samples, this_x, this_y, model_func, model_type,
            max_features, dt_path)
        oob_rates.append(oob_metrics['oob_rate'])
        df_sets.ix[set_id, importance_cols] = importance
        df_sets.ix[set_id, 'dt_model'] = dt_model
        df_sets.ix[set_id, 'dt_file'] = dt_path
        df_sets.ix[set_id, 'n_samples'] = n_samples
        for metric in oob_metrics:
            df_sets.ix[set_id, metric] = oob_metrics[metric]

        # Save oob and train inds
        n_train = len(train_inds)
        n_oob = len(oob_inds)
        train_records = zip(np.full(n_train, set_id, dtype=int), train_inds,
                            np.ones(n_train, dtype=int))
        oob_records = zip(np.full(n_oob, set_id, dtype=int), oob_inds,
                          np.zeros(n_oob, dtype=int))

        #try:
        with sqlite3.connect(db_path) as connection:
            connection.executemany(insert_cmd, train_records + oob_records)
            connection.commit()

    print '\n%.1f minutes\n' % ((time.time() - t1) / 60)

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates...'
    t1 = time.time()
    df_sets, low_oob = stem.get_oob_rates(df_sets, df_train, db_path,
                                          target_col, predict_cols, min_oob)
    if len(low_oob) > 0:
        #df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'low_oob_sets.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    set_shp = os.path.join(out_dir, 'support_sets.shp')
    try:
        stem.coords_to_shp(df_sets, gsrd_shp, set_shp)
    except Exception as e:
        print e.message
    print '%s sets dropped because OOB rate < %s' % (len(low_oob), min_oob)
    print 'Min OOB rate after dropping: ', df_sets.oob_rate.min()
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving support set info...'
    set_txt = os.path.join(dt_dir, stamp + '_support_sets.txt')
    df_sets['set_id'] = df_sets.index
    #df_sets = df_sets.drop('dt_model', axis=1)#.to_csv(set_txt, sep='\t', index=False)
    df_sets.drop('dt_model', axis=1).to_sql('support_sets', engine)
    t1 = time.time()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)  #"""
    '''stamp = os.path.basename(out_dir)
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    with engine.connect() as con, con.begin():
        df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_brightness','delta_greenness','delta_nbr','delta_wetness', 'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''
    if make_oob_map or 'oob_map_metric' in inputs:
        # Check if oob_map params were specified. If not, set to defaults
        if 'n_tiles' not in inputs:
            n_tiles = 40, 90
            print 'n_tiles not specified. Using default: %s x %s ...\n' % (
                n_tiles)

        else:
            n_tiles = tuple(int(i) for i in n_tiles.split(','))

        print 'Calculating OOB score and making OOB score map...'
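        # mosaic_path may be a raster or a vector dataset: try reading it with
        #   GDAL first, then fall back to deriving the raster geometry from the
        #   vector layer's extent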
        try:
            ds = gdal.Open(mosaic_path)
            ar = ds.ReadAsArray()
            mask = ar != 0
            del ar
            xsize = ds.RasterXSize
            ysize = ds.RasterYSize
            tx = ds.GetGeoTransform()
            prj = ds.GetProjection()
            driver = ds.GetDriver()
            ds = None
        except:
            mosaic_ds = ogr.Open(mosaic_path)
            if 'resolution' not in inputs:
                warnings.warn(
                    'Resolution not specified. Assuming default of 30...\n')
            mask = mosaic_ds.GetLayer()
            min_x, max_x, min_y, max_y = mask.GetExtent()
            ul_x = min_x - ((min_x - snap_coord[0]) % resolution)
            ul_y = max_y - ((max_y - snap_coord[1]) % resolution)
            xsize = int((max_x - ul_x) / resolution)
            ysize = int((ul_y - min_y) / resolution)
            prj = mask.GetSpatialRef().ExportToWkt()
            driver = gdal.GetDriverByName('gtiff')
            x_res = resolution
            y_res = -resolution
            tx = ul_x, x_res, 0, ul_y, 0, y_res

        avg_dict, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles, tx,
                                         support_size, db_path, df_sets,
                                         df_train, target_col, predict_cols,
                                         out_dir, stamp, prj, driver,
                                         oob_map_metric)
        df_sets.to_csv(set_txt, sep='\t')  #'''

        avg_oob = round(avg_dict[oob_map_metric], 1)
        avg_cnt = int(round(avg_dict['count'], 0))

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt

        print 'Time to make OOB score map: %.1f hours\n' % (
            (time.time() - t1) / 3600)

    # Record params in inventory text file
    if 'inventory_txt' in inputs:
        t1 = time.time()
        print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        n_sets = len(df_sets)
        inv_columns = df_inv.columns
        '''if 'sample' in sample_txt:
            n_samples = int(sample_txt.split('_')[1].replace('sample',''))
        inv_columns = df_inv.columns
        if 'n_sets' in inv_columns: df_inv.ix[stamp, 'n_sets'] = n_sets
        if 'n_samples' in inv_columns: df_inv.ix[stamp, 'n_samples'] = n_samples
        if 'support_size' in inv_columns: df_inv.ix[stamp, 'support_size'] = str(support_size)
        if 'sets_per_cell' in inv_columns: df_inv.ix[stamp, 'sets_per_cell'] = sets_per_cell
        if 'max_features' in inv_columns: df_inv.ix[stamp, 'max_features'] = max_features
        info_dir = os.path.dirname(inventory_txt)
        existing_models = fnmatch.filter(os.listdir(info_dir), '%s*' % target_col)
        if len(existing_models) > 0:
            df_inv = df_inv[df_inv.index.isin(existing_models)]#'''

        if 'avg_oob' in inv_columns and make_oob_map:
            df_inv.ix[stamp, 'avg_oob'] = avg_oob
        if 'avg_count' in inv_columns and make_oob_map:
            df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n'  #'''

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
def main(params, pct_train=None, min_oob=0, err_threshold=10):
    t0 = time.time()

    #read_params(params)
    inputs, df_var = stem.read_params(params)

    for i in inputs:
        exec("{0} = str({1})").format(i, inputs[i])
    try:
        if 'max_features' not in locals(): max_features = None
        if 'err_threshold' in inputs: err_threshold = float(err_threshold)
        if 'min_oob' in inputs: min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell,
                                        min_obs, max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, tsa_txt, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var,
                                                                  params)
        raise NameError(msg)

    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals(): out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    #import pdb; pdb.set_trace()
    # With a timestamp in the dir name, no need to check if it already exists
    os.makedirs(out_dir)
    #stamp = os.path.dirname(out_dir)
    shutil.copy2(params, out_dir)  #Copy the params for reference

    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    # Check that df_train has exactly the same columns as variables specified in df_vars
    unmatched_vars = [
        v for v in df_var.index if v not in [c for c in df_train]
    ]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)
    predict_cols = sorted(
        np.unique(
            [c for c in df_train.columns for v in df_var.index if v in c]))
    #import pdb; pdb.set_trace()
    # Make sure predict_cols and df_var are in the same order
    df_var = df_var.reindex(df_var.index.sort_values())

    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals(): gsrd_shp = None
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_train, df_sets, df_oob = stem.get_gsrd(mosaic_path, cell_size,
                                              support_size, sets_per_cell,
                                              df_train, min_obs, target_col,
                                              predict_cols, out_txt, gsrd_shp,
                                              pct_train)

    # Train a tree for each support set
    print 'Training models...'
    t1 = time.time()
    x_train = df_train.reindex(columns=predict_cols + ['set_id'])
    y_train = df_train[[target_col, 'set_id']]
    df_sets['dt_model'] = [stem.fit_tree_regressor(x_train.ix[x_train.set_id==s, predict_cols],\
    y_train.ix[y_train.set_id==s, target_col], max_features) for s in df_sets.index]
    del df_train
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates...'
    t1 = time.time()
    df_sets, low_oob = stem.get_oob_rates(df_sets, df_oob, err_threshold,
                                          target_col, predict_cols, min_oob)
    if len(low_oob) > 0:
        df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'gsrd_low_oob.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    print '%s sets dropped because OOB rate < %s' % (len(low_oob), min_oob)
    print 'Min OOB rate after dropping: ', df_sets.oob_rate.min()
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decision tree to disk
    print 'Saving models...'
    t1 = time.time()
    df_sets, set_txt = stem.write_model(out_dir, df_sets)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)  #'''

    #stamp = os.path.basename(out_dir)
    #set_txt = '/vol/v2/stem/{0}/models/{1}/decisiontree_models/{1}_support_sets.txt'.format(target_col, stamp)

    #predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_bright','delta_green','delta_nbr','delta_wet', 'elevation','greenness','mse','nbr','slope','time_since','wetness']#'''

    # Record params in inventory text file
    if 'inventory_txt' in locals():
        t1 = time.time()
        '''print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        if 'regressor' in params: 
            model_type = 'Regressor'
        else: 
            model_type = 'Classifier'
        n_sets = len(df_sets)
        if 'sample' in sample_txt:
            n_samples = int(sample_txt.split('_')[1].replace('sample',''))
        info = [model_type, None, None, None, None, None, None, None, None, n_sets, n_samples, str(support_size), sets_per_cell, max_features]
        df_inv.ix[stamp] = info
        info_dir = os.path.dirname(inventory_txt)
        existing_models = fnmatch.filter(os.listdir(os.path.dirname(info_dir)), '%s*' % target_col)
        if len(existing_models) > 0:
            df_inv = df_inv[df_inv.index.isin(existing_models)]'''

        # Check if oob_map params were specified. If not, set to defaults
        if 'err_threshold' not in locals():
            print 'err_threshold not specified. Using default: 10 ...\n'
            err_threshold = 10
        else:
            err_threshold = int(err_threshold)
        if 'n_tiles' not in locals():
            print 'n_tiles not specified. Using default: 25 x 15 ...\n'
            n_tiles = 25, 15
        else:
            n_tiles = tuple(int(i) for i in n_tiles.split(','))

        #t1 = time.time()
        print 'Calculating OOB score and making OOB score map...'
        ds = gdal.Open(mosaic_path)
        ar = ds.ReadAsArray()
        mask = ar != 0
        del ar
        xsize = ds.RasterXSize
        ysize = ds.RasterYSize
        tx = ds.GetGeoTransform()
        prj = ds.GetProjection()
        driver = ds.GetDriver()
        ds = None

        #import get_oob_map as oob
        ar_oob, ar_cnt, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles,
                                               tx, support_size, df_oob,
                                               df_sets, target_col,
                                               predict_cols, err_threshold,
                                               out_dir, stamp, prj, driver)
        df_sets.to_csv(set_txt, sep='\t')  #'''

        #if 'inventory_txt' in locals() :
        avg_oob = round(np.mean(ar_oob[mask]), 1)
        avg_cnt = int(round(np.mean(ar_cnt[mask]), 0))
        '''df_inv.ix[stamp, 'avg_oob'] = avg_oob
        #df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n' '''

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt

        print 'Time to make OOB score map: %.1f hours\n' % (
            (time.time() - t1) / 3600)

        #except Exception as e:
        #    print 'Problem getting oob map: ', e

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)
def main(n_tiles,
         tile_path=None,
         add_field=True,
         out_path=None,
         snap=True,
         clip=True):

    try:
        if add_field.lower() == 'false':
            add_field = False
    except:
        pass
    try:
        if snap.lower() == 'false':
            snap = False
    except:
        pass

    if tile_path is None:
        tile_path = TILE_PATH

    if not os.path.exists(tile_path):
        raise RuntimeError('tile_path does not exist: %s' % tile_path)

    try:
        n_tiles = tuple([int(i) for i in n_tiles.split(',')])
    except (AttributeError, ValueError):
        raise ValueError(
            'Could not parse n_tiles "%s". It must be given as "n_y_tiles, n_x_tiles"'
            % n_tiles)

    # Get processing tiles
    tx, (xmin, xmax, ymin, ymax) = tx_from_shp(tile_path, XRES, YRES)
    xsize = abs(int(xmax - xmin) / XRES)
    ysize = abs(int(ymax - ymin) / YRES)
    tiles, _, _ = get_tiles(n_tiles, xsize, ysize, tx=tx)
    tile_id_field = 'eetile%sx%s' % n_tiles
    tiles[tile_id_field] = tiles.index

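    # Snap each processing-tile corner to the nearest corner coordinate found
    #   in the existing tile shapefile so the processing grid lines up with the
    #   storage-tile grid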
    if snap:
        coords, _ = get_coords(tile_path, multipart='split')
        coords = np.array(coords)  #shape is (nfeatures, ncoords, 2)
        xcoords = np.unique(coords[:, :, 0])
        ycoords = np.unique(coords[:, :, 1])
        for i, processing_coords in tiles.iterrows():
            tiles.loc[i, 'ul_x'] = xcoords[np.argmin(
                np.abs(xcoords - processing_coords.ul_x))]
            tiles.loc[i, 'lr_x'] = xcoords[np.argmin(
                np.abs(xcoords - processing_coords.lr_x))]
            tiles.loc[i, 'ul_y'] = ycoords[np.argmin(
                np.abs(ycoords - processing_coords.ul_y))]
            tiles.loc[i, 'lr_y'] = ycoords[np.argmin(
                np.abs(ycoords - processing_coords.lr_y))]

    if not out_path:
        out_path = os.path.join(OUT_DIR,
                                'ee_processing_tiles_%sx%s.shp' % n_tiles)
    coords_to_shp(tiles, tile_path, out_path)
    descr = ('Tiles for processing data on Google Earth Engine. The tiles ' +
            'have %s row(s) and %s col(s) and are bounded by the extent of %s') %\
            (n_tiles[0], n_tiles[1], tile_path)
    '''if clip:
        ds = ogr.Open(tile_path)
        lyr = ds.GetLayer()
        geoms = ogr.Geometry(ogr.wkbMultiPolygon)
        for feature in lyr:
            g = feature.GetGeometryRef()
            geoms.AddGeometry(g)
        union = geoms.UnionCascaded()
        base_path, ext = os.path.splitext(tile_path)
        temp_file = tile_path.replace(ext, '_uniontemp' + ext)
        feature'''

    createMetadata(sys.argv, out_path, description=descr)
    print '\nNew processing tiles written to', out_path

    # Figure out which processing tile each CONUS storage tile touches
    #   (get_overlapping_sets() could be used to find the overlaps)
    # Read in the CONUS storage tiles
    if add_field:
        conus_tiles = attributes_to_df(tile_path)

        # Make a temporary copy of it
        base_path, ext = os.path.splitext(tile_path)
        temp_file = tile_path.replace(ext, '_temp' + ext)
        df_to_shp(conus_tiles, tile_path, temp_file, copy_fields=False)

        # Loop through each processing tile and find all overlapping storage tiles
        conus_tiles[tile_id_field] = -1
        ds = ogr.Open(tile_path)
        lyr = ds.GetLayer()
        for p_fid, processing_coords in tiles.iterrows():
            wkt = 'POLYGON (({0} {1}, {2} {1}, {2} {3}, {0} {3}, {0} {1}))'.format(
                processing_coords.ul_x, processing_coords.ul_y,
                processing_coords.lr_x, processing_coords.lr_y)
            p_geom = ogr.CreateGeometryFromWkt(wkt)
            p_geom.CloseRings()
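            # Any storage tile whose geometry overlaps this rectangle
            #   (positive intersection area) gets tagged with the
            #   processing-tile FID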
            for c_fid in conus_tiles.index:
                feature = lyr.GetFeature(c_fid)
                geom = feature.GetGeometryRef()
                if geom.Intersection(p_geom).GetArea() > 0:
                    conus_tiles.loc[c_fid, tile_id_field] = p_fid
        lyr, feature = None, None

        # re-write the CONUS tiles shapefile with the new field
        df_to_shp(conus_tiles, tile_path, tile_path, copy_fields=False)

        # delete temporary file
        driver = ds.GetDriver()
        driver.DeleteDataSource(temp_file)
        ds = None
        print '\nField with processing tile ID added to', tile_path

        # if the metadata text file exists, add a line about appending the field.
        #   otherwise, make a new metadata file.
        meta_file = tile_path.replace(ext, '_meta.txt')
        if os.path.exists(meta_file):
            with open(meta_file, 'a') as f:
                f.write(
                    '\n\nAppended field %s with IDs from the overlapping feature of %s'
                    % (tile_id_field, out_path))
        else:
            descr = 'Tile system with appended field %s with IDs from the overlapping feature of %s' % (
                tile_id_field, out_path)
            createMetadata(sys.argv, tile_path, description=descr)
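
# A hedged usage sketch (the shapefile path is hypothetical): split the storage
#   tile index into a 10 x 10 processing grid and tag each storage tile with
#   the ID of the processing tile it overlaps:
#   main('10,10', tile_path='/vol/v2/stem/extent_shp/conus_tiles.shp', add_field=True)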
Example no. 9
0
def main(region_path,
         tile_path,
         reference_path,
         out_dir,
         id_field='region_id',
         ref_basename='nlcd'):

    df = attributes_to_df(region_path)
    tile_info = attributes_to_df(tile_path)
    tile_info['ul_x'] = tile_info.xmin
    tile_info['lr_x'] = tile_info.xmax
    tile_info['ul_y'] = tile_info.ymax
    tile_info['lr_y'] = tile_info.ymin

    _, vector_ext = os.path.splitext(region_path)
    region_ids = df[id_field].unique()
    n_regions = len(region_ids)

    region_ds = ogr.Open(region_path)
    region_lyr = region_ds.GetLayer()

    for i, r_id in enumerate(region_ids):
        print 'Making region dir for %s (%s of %s)' % (r_id, i + 1, n_regions)
        df_r = df[df.region_id == r_id]
        id_str = ('0' + str(r_id))[-2:]

        fid = df_r.index[0]
        region_feature = region_lyr.GetFeature(fid)
        xmin, xmax, ymin, ymax = region_feature.GetGeometryRef().GetEnvelope()
        region_feature.Destroy()
        df_r['ul_x'] = xmin
        df_r['lr_x'] = xmax
        df_r['ul_y'] = ymax
        df_r['lr_y'] = ymin
        clip_coords = df_r.loc[fid, ['ul_x', 'lr_x', 'ul_y', 'lr_y']]
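        # clip_coords holds the region bounding box as (ul_x, lr_x, ul_y, lr_y);
        #   it defines the extent of every raster clipped below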

        region_dir = os.path.join(out_dir, 'region_%s' % id_str)
        if not os.path.exists(region_dir):
            os.mkdir(region_dir)

        # Make a shapefile of the tiles
        out_vector = os.path.join(region_dir,
                                  'tile_{0}{1}'.format(id_str, vector_ext))
        if not os.path.exists(out_vector):
            # TODO: switch to selection by min/max of coords
            region_tiles = tile_info[tile_info[id_field] == r_id]
            coords_to_shp(region_tiles, region_path, out_vector)

        # Make a map of reference NLCD
        ds = gdal.Open(out_vector.replace(vector_ext, '.tif'))
        mask = ds.ReadAsArray() == 255
        ds = None
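        # NOTE: this assumes the tile raster (written by the block at the end
        #   of this loop on an earlier run) already exists; on a completely
        #   fresh run gdal.Open() would return None here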
        nlcd_year = re.search(
            r'\d{4}',
            reference_path).group()  # takes the first 4-digit run in the path (potentially buggy)
        out_ref_map = os.path.join(
            region_dir, '%s_%s_%s.tif' % (ref_basename, nlcd_year, id_str))
        if not False:  # os.path.exists(out_ref_map): check disabled so the reference map is always rebuilt
            ref_ds = gdal.Open(reference_path)
            ref_tx = ref_ds.GetGeoTransform()
            ref_shape = ref_ds.RasterYSize, ref_ds.RasterXSize

            # Pixel offsets of the region's UL corner relative to the reference raster
            col_off = int((ref_tx[0] - clip_coords.ul_x) / ref_tx[1])
            row_off = int((ref_tx[3] - clip_coords.ul_y) / ref_tx[5])
            n_cols = int(abs((clip_coords.ul_x - clip_coords.lr_x) / ref_tx[1]))
            n_rows = int(abs((clip_coords.ul_y - clip_coords.lr_y) / ref_tx[5]))

            # get_offset_array_indices() appears to return matching
            #   (row_start, row_end, col_start, col_end) slices for the output
            #   array and for the reference raster
            ar_inds, ref_inds = get_offset_array_indices(
                (n_rows, n_cols), ref_shape, (row_off, col_off))
            ref_n_rows = ref_inds[1] - ref_inds[0]
            ref_n_cols = ref_inds[3] - ref_inds[2]

            ar_ref = ref_ds.ReadAsArray(ref_inds[2], ref_inds[0], ref_n_cols,
                                        ref_n_rows)
            ar = np.full((n_rows, n_cols), 255, dtype=np.uint8)  # uint8 assumed for NLCD codes with 255 nodata
            ar[ar_inds[0]:ar_inds[1], ar_inds[2]:ar_inds[3]] = ar_ref
            ar[mask] = 255

            tx = clip_coords.ul_x, 30, 0, clip_coords.ul_y, 0, -30
            prj = ref_ds.GetProjection()
            driver = gdal.GetDriverByName('gtiff')
            array_to_raster(ar, tx, prj, driver, out_ref_map, nodata=255)

        # Make a clipped raster of the tiles
        out_raster = out_vector.replace(vector_ext, '.tif')
        if not os.path.exists(out_raster):
            tiles = ogr.Open(tile_path)
            tile_lyr = tiles.GetLayer()
            tx = clip_coords.ul_x, 30, 0, clip_coords.ul_y, 0, -30
            tile_array, _ = kernel_from_shp(tile_lyr,
                                            clip_coords,
                                            tx,
                                            255,
                                            val_field='name')
            tile_array[ar == 255] = 255
            driver = gdal.GetDriverByName('gtiff')
            prj = tile_lyr.GetSpatialRef().ExportToWkt()
            array_to_raster(tile_array,
                            tx,
                            prj,
                            driver,
                            out_raster,
                            nodata=255)
            tiles.Destroy()
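
# A hedged usage sketch (all paths are hypothetical): build a per-region folder
#   with a tile shapefile, a tile-ID raster, and a clipped NLCD reference map
#   for each region in the region shapefile:
#   main('/vol/v2/stem/extent_shp/regions.shp',
#        '/vol/v2/stem/extent_shp/conus_tiles.shp',
#        '/vol/v2/stem/reference/nlcd_2001.tif',
#        '/vol/v2/stem/regions')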