def main(params, snap_coord=None, resolution=30, n_sizes=5, max_features=None, n_jobs=1):
    t0 = time.time()

    inputs, df_var = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    try:
        sets_per_cell = int(sets_per_cell)
        cell_size = [int(s) for s in cell_size.split(',')]
        min_size = int(min_size)
        max_size = int(max_size)
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Read in training samples and check that df_train has exactly the same
    # columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [v for v in df_var.index if v not in df_train.columns]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' % (target_col, sample_txt))
    if 'max_target_val' in inputs:
        max_target_val = int(max_target_val)
    else:
        max_target_val = df_train[target_col].max()
    if 'n_jobs' in inputs:
        n_jobs = int(n_jobs)

    predict_cols = sorted(np.unique([c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order

    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]

    t1 = time.time()
    if model_type.lower() == 'classifier':
        model_func = stem.fit_tree_classifier
    else:
        model_func = stem.fit_tree_regressor

    # Make grid
    x_res = resolution
    y_res = -resolution
    tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord)
    min_x, max_x, min_y, max_y = [int(i) for i in extent]
    cells = stem.generate_gsrd_grid(cell_size, min_x, min_y, max_x, max_y, x_res, y_res)
    grid = pd.DataFrame(cells, columns=['ul_x', 'ul_y', 'lr_x', 'lr_y'])
    grid.to_csv(out_txt.replace('.txt', '_grid.txt'))
    grid = intersecting_cells(grid, mosaic_path)
    stem.coords_to_shp(grid, '/vol/v2/stem/extent_shp/CAORWA.shp', out_txt.replace('.txt', '_grid.shp'))

    if 'set_sizes' in inputs:
        set_sizes = np.sort([int(s) for s in set_sizes.split(',')])
    else:
        if 'n_sizes' in inputs:
            n_sizes = int(n_sizes)
        set_sizes = np.arange(min_size, max_size + 1, (max_size - min_size) / n_sizes)

    # Sample grid
    dfs = []
    for i, cell in grid.iterrows():
        ul_x, ul_y, lr_x, lr_y = cell
        min_x, max_x = min(ul_x, lr_x), max(ul_x, lr_x)
        min_y, max_y = min(ul_y, lr_y), max(ul_y, lr_y)

        # Calculate support set centers
        x_centers = [int(stem.snap_coordinate(x, snap_coord[0], x_res))
                     for x in random.sample(xrange(min_x, max_x + 1), sets_per_cell)]
        y_centers = [int(stem.snap_coordinate(y, snap_coord[1], y_res))
                     for y in random.sample(xrange(min_y, max_y + 1), sets_per_cell)]

        for size in set_sizes:
            df = stem.sample_gsrd_cell(sets_per_cell, cell, size, size, x_res, y_res, tx,
                                       snap_coord, center_coords=zip(x_centers, y_centers))
            df['set_size'] = size
            df['cell_id'] = i
            dfs.append(df)

    support_sets = pd.concat(dfs, ignore_index=True)
    n_sets = len(support_sets)

    print 'Testing set sizes with %s jobs...\n' % n_jobs
    oob_metrics = _par_train_estimator(n_jobs, n_sets, df_train, predict_cols, target_col,
                                       support_sets, model_func, model_type, max_features,
                                       max_target_val)
    '''args = [[i, n_sets, start_time, df_train, predict_cols, target_col, support_set,
              model_func, model_type, max_features, max_target_val]
             for i, (si, support_set) in enumerate(support_sets.ix[:100].iterrows())]
    oob_metrics = []
    for arg in args:
        oob_metrics.append(par_train_estimator(arg))'''

    oob_metrics = pd.DataFrame(oob_metrics)
    oob_metrics.set_index('set_id', inplace=True)
    support_sets = pd.merge(support_sets, oob_metrics, left_index=True, right_index=True)
    support_sets.to_csv(out_txt)

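
# A minimal, assumed command-line entry point for the script above. The excerpt
# does not show how main() is invoked; this sketch only assumes the convention
# that the params text file path is passed as the first argument and that
# read_params() parses it into a {name: value} dict before the exec() loop
# binds each entry to a local variable. Not part of the original source.
if __name__ == '__main__':
    # sys.argv[1] is the path to the params file
    sys.exit(main(sys.argv[1]))
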
def main(params, data_band=1, nodata=None, sampling_scheme='proportional', data_type='continuous',
         kernel=False, boundary_shp=None, bin_scale=1, min_sample=None, max_sample=None,
         n_samples=None, n_per_tile=None):
    t0 = time.time()
    data_band = None
    nodata = None
    zero_inflation = None

    # Read params and make variables from each line
    inputs = read_params(params)
    for var in inputs:
        exec("{0} = str({1})".format(var, inputs[var]))

    #out_dir = os.path.dirname(out_txt)
    '''if not os.path.exists(out_dir):
        print 'Warning: output directory does not exist. Creating directory...'
        os.makedirs(out_dir)'''

    # Integerize numeric params
    if 'data_band' in inputs:
        data_band = int(data_band)
    if 'nodata' in inputs:
        nodata = int(nodata)
    if 'pct_train' in locals():
        pct_train = float(pct_train)
    else:
        pct_train = None
    if zero_inflation:
        zero_inflation = int(zero_inflation)
    if 'bin_scale' in inputs:
        bin_scale = float(bin_scale)
    if 'min_sample' in inputs:
        min_sample = int(min_sample)
    if 'max_sample' in inputs:
        max_sample = int(max_sample)
    if 'n_per_tile' in inputs:
        n_per_tile = int(n_per_tile)
    if 'n_samples' in inputs:
        n_samples = int(n_samples)
    try:
        bins = parse_bins(bins)
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # If number of tiles not given, need to calculate them
    if 'n_tiles' in inputs:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    else:
        n_tiles = 3, 10
        print 'Using default tile size of %s x %s ....' % n_tiles

    # Generate samples
    df_train, df_test, df_tiles = get_stratified_sample_by_tile(
        raster_path, col_name, data_band, n_samples, bins, min_sample, max_sample, pct_train,
        nodata, sampling_scheme, zero_inflation, data_type, kernel, n_tiles, boundary_shp,
        bin_scale=bin_scale, n_per_tile=n_per_tile)
    df_train['obs_id'] = df_train.index

    # Write samples to text file
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    bn = '{0}_{1}_sample_{2}_{3}_{4}.txt'.format(col_name, sampling_scheme, len(df_train), date_str, time_str)
    #bn = os.path.basename(out_txt)
    stamp = bn[:-4]
    out_dir = os.path.join(out_dir, stamp)
    #if not os.path.exists(out_dir):
    os.makedirs(out_dir)
    out_txt = os.path.join(out_dir, bn)
    df_train.to_csv(out_txt, sep='\t', index=False)
    print 'Sample written to:\n%s\n' % out_txt

    shutil.copy2(params, out_dir)  # Copy the params for reference

    if pct_train > 1:
        df_test['obs_id'] = df_test.index
        test_txt = out_txt.replace('%s.txt' % stamp, '%s_test.txt' % stamp)
        df_test.to_csv(test_txt, sep='\t', index=False)
        print 'Test samples written to directory:\n%s' % out_dir

    if n_tiles != [1, 1]:
        if boundary_shp:
            out_shp = os.path.join(out_dir, 'sampling_tiles.shp')
            stem.coords_to_shp(df_tiles, boundary_shp, out_shp)
        else:
            tile_txt = os.path.join(out_dir, 'sampling_tiles.txt')
            df_tiles.to_csv(tile_txt, sep='\t', index=False)

    print '\nTotal time: %.1f minutes' % ((time.time() - t0) / 60)

    return out_txt

def main(params, pct_train=None, min_oob=0, gsrd_shp=None, resolution=30, make_oob_map=False,
         snap_coord=None, oob_map_metric='oob_rate', n_jobs=1, oob_drop=None):
    t0 = time.time()
    inputs = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    try:
        if 'max_features' not in locals():
            max_features = None
        if 'min_oob' in inputs:
            min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell, min_obs,
                                        max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, out_dir, model_type
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    print(var_info)
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')

    # Read in training samples and check that df_train has exactly the same
    # columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [v for v in df_var.index if v not in df_train.columns]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' % (target_col, sample_txt))
    if 'max_target_val' in inputs:
        max_target_val = int(max_target_val)
    else:
        max_target_val = df_train[target_col].max()

    # Make a timestamped output directory if outdir not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals():
        out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir)  # With a timestamp in dir, no need to check if it already exists
    shutil.copy2(params, out_dir)  # Copy the params for reference

    predict_cols = sorted(np.unique([c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order

    # If there are variables that should remain constant across the modeling
    # region, get the names
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals():
        gsrd_shp = None
    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_sets = stem.get_gsrd(mosaic_path, cell_size, support_size, sets_per_cell, df_train,
                            min_obs, target_col, predict_cols, out_txt, gsrd_shp, pct_train,
                            snap_coord=snap_coord)
    n_sets = len(df_sets)

    # Create SQL DB and add train sample table
    '''print 'Dumping train_txt to database...'
    t1 = time.time()#'''
    db_path = os.path.join(out_dir, stamp + '.db')
    '''engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    #df_train.to_sql('train_sample', engine, chunksize=10000)
    print '%.1f minutes\n' % ((time.time() - t1)/60)#'''

    # Split x and y train
    t1 = time.time()
    print "'{0}'".format(model_type.lower())
    if model_type.lower().strip() == 'classifier':  # remove .trim() peter clary it was after lower
        print 'Training STEM with classifier algorithm...'
        model_func = stem.fit_tree_classifier
    elif model_type.lower().strip() == 'zeroinflated':
        print 'Training STEM with zeroinflated regression algorithm...'
        model_func = stem.fit_tree_zeroinflated
    else:
        print 'Training STEM with regressor algorithm...'
        model_func = stem.fit_tree_regressor

    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]

    importance_cols = ['importance_%s' % c for c in predict_cols]
    for c in importance_cols:
        df_sets[c] = 0

    # Train estimators
    dropped_sets = pd.DataFrame(columns=df_sets.columns)
    dt_dir = os.path.join(out_dir, 'decisiontree_models')
    if not os.path.exists(dt_dir):
        os.mkdir(dt_dir)
    dt_path_template = os.path.join(dt_dir, stamp + '_decisiontree_%s.pkl')

    #oob_rates = [0]
    n_jobs = int(n_jobs)
    sets = _par_train_stem(n_jobs, n_sets, df_train, predict_cols, target_col, min_obs,
                           df_sets, model_func, model_type, max_features, dt_path_template,
                           db_path, max_target_val)
    support_sets, samples = zip(*sets)
    df_sets = pd.DataFrame(list(support_sets))\
        .dropna(subset=['dt_file'])\
        .rename_axis('set_id')
    df_sets.to_csv(os.path.join(out_dir, 'support_sets.txt'), sep='\t')

    # Consider moving this back to train function by switching to DBMS with multithread support
    '''print '\n\nMaking relationship table for samples and sets...'
    t1 = time.time()
    set_samples = pd.concat(list(samples), ignore_index=True)
    set_samples.to_sql('set_samples', engine, chunksize=100000)
    print '%.1f minutes\n' % ((time.time() - t1)/60)'''

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates and dropping sets with high OOB error...'
    t1 = time.time()
    df_sets, low_oob, oob_metric = stem.get_oob_rates(df_sets, df_train, db_path, target_col,
                                                      predict_cols, min_oob, model_type,
                                                      drop_expression=oob_drop)
    if oob_drop and len(low_oob) > 0:
        df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'low_oob_sets.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    set_shp = os.path.join(out_dir, 'support_sets.shp')
    try:
        stem.coords_to_shp(df_sets.drop('dt_model', axis=1), gsrd_shp, set_shp)
    except Exception as e:
        print e.message
    print 'Min OOB rate after dropping: ', df_sets[oob_metric].min()
    print 'Estimated average OOB score: ', int(df_sets[oob_metric].mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decison tree to disk
    print 'Saving support set info...'
    #set_txt = os.path.join(dt_dir, stamp + '_support_sets.txt')
    df_sets['set_id'] = df_sets.index
    df_sets = df_sets.drop('dt_model', axis=1)  #.to_csv(set_txt, sep='\t', index=False)
    #df_sets.drop('dt_model', axis=1).to_sql('support_sets', engine)
    t1 = time.time()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    '''stamp = os.path.basename(out_dir)
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    with engine.connect() as con, con.begin():
        df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_brightness','delta_greenness',
                    'delta_nbr','delta_wetness','elevation','greenness','mse','nbr','slope',
                    'time_since','wetness']#'''

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)

def main(tile_shp, strata_shp, n_psu, out_shp, strata_id_field='NA_L2NAME', min_dist=173779,
         split_tags=['2001', '2011']):
    n_psu = int(n_psu)

    tiles = attributes_to_df(tile_shp)
    tile_ds = ogr.Open(tile_shp)
    tile_lyr = tile_ds.GetLayer()
    tiles['xctr'] = (tiles.xmax - tiles.xmin) / 2 + tiles.xmin
    tiles['yctr'] = (tiles.ymax - tiles.ymin) / 2 + tiles.ymin
    tiles['ul_x'] = tiles.xmin
    tiles['lr_x'] = tiles.xmax
    tiles['ul_y'] = tiles.ymax
    tiles['lr_y'] = tiles.ymin

    strata = attributes_to_df(strata_shp)
    strata_ds = ogr.Open(strata_shp)
    strata_lyr = strata_ds.GetLayer()

    # Get areas and calculate proportions of total
    for feat in strata_lyr:
        fid = feat.GetFID()
        geom = feat.GetGeometryRef()
        area = geom.GetArea()
        strata.loc[fid, 'area'] = area

    # Features could be multipart, so calculate sums for all parts of same stratum
    unique_names = strata[strata_id_field].unique()
    summed_areas = pd.Series({name: strata.loc[strata[strata_id_field] == name, 'area'].sum()
                              for name in unique_names if name != 'WATER'})
    strata.drop_duplicates(strata_id_field, inplace=True)
    strata.set_index(strata_id_field, inplace=True)
    strata.drop('WATER', inplace=True)
    strata['area'] = summed_areas / summed_areas.sum()
    strata['n_psu'] = (strata.area * n_psu).round().astype(int)
    strata.loc[strata.n_psu == 0, 'n_psu'] = 1

    # Randomly shuffle strata so the same strata don't always influence available psus
    strata = strata.sample(frac=1)

    candidates = tiles.copy()
    fids = []
    strata_names = {}
    for i, (stratum_name, stratum) in enumerate(strata.iterrows()):
        print i, stratum_name, ':',
        strata_lyr.SetAttributeFilter("%s = '%s'" % (strata_id_field, stratum_name))
        strata_feat = strata_lyr.GetNextFeature()
        strata_geom = ogr.Geometry(ogr.wkbMultiPolygon)
        while strata_feat:
            g = strata_feat.GetGeometryRef()
            strata_geom = strata_geom.Union(g)
            strata_feat = strata_lyr.GetNextFeature()

        # Find all tile features that intersect this stratum
        overlapping = []
        print 'getting overlapping...',
        for t_fid in candidates.index:
            tile_feature = tile_lyr.GetFeature(t_fid)
            tile_geom = tile_feature.GetGeometryRef()
            if strata_geom.Intersects(tile_geom):
                overlapping.append(t_fid)
        if len(overlapping) == 0:
            continue

        print 'selecting...\n'
        for j in range(stratum.n_psu):
            this_fid = random.sample(overlapping, 1)
            fids.extend(this_fid)
            selected = tiles.loc[fids]
            strata_names[this_fid[0]] = stratum_name
            for ti, c_tile in candidates.iterrows():
                if np.any(np.sqrt((selected.xctr - c_tile.xctr)**2 +
                                  (selected.yctr - c_tile.yctr)**2) <= min_dist):
                    candidates.drop(ti, inplace=True)
                    # Additionally remove tiles from overlapping list so they're not selected
                    if ti in overlapping:  # Might not be depending on search distance
                        overlapping.remove(ti)

    selected[strata_id_field] = pd.Series(strata_names)

    if split_tags:
        #random_ids = random.sample(selected.index, strata.n_psu.sum()/2)
        selected1 = selected.sample(frac=.5)
        selected2 = selected.loc[~selected.index.isin(selected1.index)]
        coords_to_shp(selected1, tile_shp, out_shp.replace('.shp', '_%s.shp') % split_tags[0])
        coords_to_shp(selected2, tile_shp, out_shp.replace('.shp', '_%s.shp') % split_tags[1])
    else:
        #selected.to_csv(out_shp.replace('.shp', '.txt'))
        coords_to_shp(selected, tile_shp, out_shp)

    strata_ds, strata_lyr, strata_feat = None, None, None
    tile_ds, tile_lyr = None, None

def main(params, n_pieces=False, ydims=None, constant_vars=None, year='', agg_method=None):
    t0 = time.time()
    print 'Predicting Random Forest... %s\n' % time.ctime(t0)

    # Set optional params to default:
    split_predictors = False

    # Read params and make variables from text
    inputs = forest.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    # Check that variables were specified in params
    try:
        nodata = int(nodata)
        str_check = train_params, rf_path, mask_path, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Raise an error if the var_txt path doesn't exist. Otherwise, just read it in
    train_dict = forest.read_params(train_params)
    train_txt_bn = os.path.basename(train_dict['var_txt'][:-1])
    if 'var_txt' not in locals():
        var_txt = os.path.join(os.path.dirname(rf_path), train_txt_bn)
    if not os.path.exists(var_txt):
        print ''
        msg = 'Could not find var_txt:\n%s\n' % var_txt
        raise IOError(msg)
    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')

    # Make sure vars are sorted alphabetically since they were for training
    pred_vars = sorted(df_var.index)
    df_var = df_var.reindex(pred_vars)

    '''if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        #year = constant_vars['YEAR']
        year = 2012
        pred_constants = sorted(constant_vars.keys())
    else:
        df_var.search_str = [s.format(2007) for s in df_var.search_str]'''

    #out_dir = os.path.dirname(out_raster)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    new_params = os.path.join(out_dir, os.path.basename(params))
    shutil.copy2(params, new_params.replace('.txt', '_%s.txt' % year))

    # Load the Random Forest model
    print 'Loading the RandomForest model from \n%s... \n%s\n' % (rf_path, time.ctime(time.time()))
    if not os.path.exists(rf_path):
        raise IOError('%s does not exist' % rf_path)
    with open(rf_path) as f:
        rf_model = pickle.load(f)
    n_features = rf_model.n_features_
    n_vars = len(df_var.index.tolist())
    if 'constant_vars' in inputs:
        n_vars += len(pred_constants)
    if n_features != n_vars:
        print df_var.index.tolist() + pred_constants
        sys.exit(('\nKeyError: Number of features of the random forest model does not match the number of variables in df_var.'
                  '\nNumber of features of the model: {0} \nNumber of variables in var_txt: {1}'
                  '\nCheck that all predictors used in var_txt to train the model are in this var_txt '
                  '\nPath of Random Forest model: {2}\nPath of var_txt: {3}').format(n_features, n_vars, rf_path, var_txt))

    # Get mask and raster info
    ds = gdal.Open(mask_path)
    ar = ds.ReadAsArray()
    nodata_mask = ar != 0
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = gdal.GetDriverByName('gtiff')
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx

    # Predict
    #print 'Predicting with %s processors... %s' % (rf_model.n_jobs, time.ctime(time.time()))
    t1 = time.time()
    predict_pieces = []

    '''if n_pieces:
        # assumes predictors all line up and have same dimensions
        if 'mask_path' not in inputs:
            raise NameError('mask_path not specified')
        # Figure out the y dimension of each piece
        n_pieces = int(n_pieces)
        piece_ysize = ysize/n_pieces
        upper_ydim = range(0, ysize, piece_ysize)
        lower_ydim = range(piece_ysize, ysize, piece_ysize)
        lower_ydim[-1] = ysize
        ydims = zip(upper_ydim, lower_ydim)
        for i, yd in enumerate(ydims):
            print 'Predicting for piece %s of %s...' % (i + 1, n_pieces)
            t1 = time.time()
            ar_predictors, nodata_mask = forest.get_predictors(df_var, nodata, yd, constant_vars)
            t2 = time.time()
            predictions = rf_model.predict(ar_predictors)
            print 'Prediction time: %.1f minutes' % ((time.time() - t2)/60)
            ar_prediction = np.full(nodata_mask.shape[0], nodata, dtype=np.uint8)
            ar_prediction[nodata_mask] = (predictions * 100).astype(np.uint8)
            predict_pieces.append(ar_prediction)
            print 'Total time for this piece: %.1f minutes\n' % ((time.time() - t1)/60)
            del ar_predictors, nodata_mask, ar_prediction
        ar_prediction = np.concatenate(predict_pieces)
        del predict_pieces'''

    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]

    if 'n_tiles' in inputs:
        df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, tx)
        stem.coords_to_shp(df_tiles, '/vol/v2/stem/extent_shp/CAORWA.shp',
                           os.path.join(out_dir, 'tile.shp'))
        empty_tiles = []
        ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        tile_dir = os.path.join(out_dir, 'predict_tiles')
        if not os.path.isdir(tile_dir):
            os.mkdir(tile_dir)
        for i, (ind, tile_coords) in enumerate(df_tiles.iterrows()):
            print 'Predicting for tile %s of %s...' % (i + 1, len(df_tiles))
            t1 = time.time()
            coords = tile_coords[['ul_x', 'ul_y', 'lr_x', 'lr_y']].tolist()
            tsa_ar, tsa_off = mosaic.extract_kernel(ds, 1, coords, tx, xsize, ysize, nodata=nodata)
            tsa_mask = tsa_ar == 0
            if tsa_mask.all():
                print 'Tile %s empty. Skipping...' % ind
                continue
            tsa_ar[tsa_mask] = nodata

            # Get the ids of TSAs this kernel covers
            tsa_ids = np.unique(tsa_ar)
            tsa_strs = ['0' + str(tsa) for tsa in tsa_ids if tsa != nodata]
            array_shape = tsa_ar.shape

            # Get an array of predictors where each column is a flattened 2D array of a
            # single predictor variable
            temp_nodata = -9999
            ar_predictors = stem.get_predictors(df_var, tx, tsa_strs, tsa_ar, coords, tsa_mask,
                                                temp_nodata, 1)
            nodata_mask = ~np.any(ar_predictors == temp_nodata, axis=1)
            predictors = ar_predictors[nodata_mask]
            t2 = time.time()
            if agg_method == 'mode':
                args = []
                for dt in rf_model.estimators_:
                    args.append([dt, predictors])
                pool = Pool(rf_model.n_jobs)
                t3 = time.time()
                dt_predictions = np.vstack(pool.map(forest.par_predict_from_dt, args, 1))
                print 'Prediction time: %.1f minutes' % ((time.time() - t3) / 60)
                t3 = time.time()
                predictions = stem.mode(dt_predictions, axis=0)
                print 'Aggregation time: %.1f minutes' % ((time.time() - t3) / 60)
                del dt_predictions
                t3 = time.time()
                pool.close()
                pool.join()
                print 'Closing time: %.1f minutes' % ((time.time() - t3) / 60)
            else:
                predictions = rf_model.predict(ar_predictors[nodata_mask])
                print 'Prediction time: %.1f minutes' % ((time.time() - t2) / 60)
            ar_tile = np.full(ar_predictors.shape[0], nodata, dtype=np.uint8)
            ar_tile[nodata_mask] = predictions.astype(np.uint8)
            ul_r, lr_r, ul_c, lr_c = df_tiles_rc.ix[ind]
            ar_out[ul_r:lr_r, ul_c:lr_c] = ar_tile.reshape(array_shape)
            tx_tile = tile_coords.ul_x, x_res, x_rot, tile_coords.ul_y, y_rot, y_res
            mosaic.array_to_raster(ar_tile.reshape(array_shape), tx_tile, prj, driver,
                                   os.path.join(tile_dir, 'tile_%s.tif' % ind),
                                   dtype=gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this piece: %.1f minutes\n' % ((time.time() - t1) / 60)
            #del ar_predictors, nodata_mask, ar_prediction

        #ar_prediction = np.concatenate(predict_pieces)
        #del predict_pieces
        '''ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        for ind, tile_coords in df_tiles_rc.iterrows():
            if ind in empty_tiles:
                continue
            ul_r, lr_r, ul_c, lr_c = tile_coords
            tile_file = os.path.join(tile_dir, 'tile_%s.tif' % ind)
            if not os.path.exists(tile_file):
                continue
            ds_t = gdal.Open(tile_file)
            ar_tile = ds_t.ReadAsArray()
            t_ulx = df_tiles.ix[ind, ['ul_x', 'ul_y']]
            ar_out[ul_r:lr_r, ul_c:lr_c] = ar_tile'''

    else:
        ar_predictors, nodata_mask = forest.get_predictors(df_var, nodata)
        # If the predictions are too large (i.e. cause memory errors), split the predictor
        # array into pieces and predict separately, then stack them back together
        if split_predictors:
            split_predictors = int(split_predictors)
            predictions = []
            for i, p in enumerate(np.array_split(ar_predictors, split_predictors)):
                t1 = time.time()
                print '\nPredicting for %s of %s pieces of the final array...' % (i + 1, split_predictors)
                predictions.append(rf_model.predict(p))
                print '%.1f minutes' % ((time.time() - t1) / 60)
            predictions = np.concatenate(predictions)
            print ''
        else:
            print 'Predicting in one chunk...'
            predictions = rf_model.predict(ar_predictors)
        ar_prediction = np.full(nodata_mask.shape[0], nodata, dtype=np.float32)
        ar_prediction[nodata_mask] = predictions
        del ar_predictors, predictions

    # Save the prediction array to disk
    stamp = os.path.basename(out_dir)
    out_path = os.path.join(out_dir, '%s_rf_mean.tif' % stamp)
    #ar_prediction = ar_prediction.reshape(ysize, xsize)
    if constant_vars:
        out_path = out_path.replace('.tif', '_yr%s.tif' % year)
    forest.array_to_raster(ar_out, tx, prj, driver, out_path, gdal.GDT_Byte, nodata)

    # Delete the tiles
    shutil.rmtree(tile_dir)
    ds = None

    '''stamp = os.path.basename(out_dir)
    path = os.path.join(out_dir, 'final_%s_yr2011.tif' % stamp)
    stamp = os.path.basename(os.path.dirname(path))
    ds = gdal.Open(path)
    ar_prediction = ds.ReadAsArray()
    ds = None#'''

    if 'test_params' in inputs:
        #df_test = pd.read_csv(test_samples, sep='\t', index_col='obs_id')
        print '\nEvaluating the model...'
        t1 = time.time()
        test_dict = forest.read_params(test_params)
        for i in test_dict:
            exec("{0} = str({1})".format(i, test_dict[i]))

        if 'n_trials' in test_dict:
            n_trials = int(n_trials)
        else:
            print 'n_trials not specified. Setting default to 50...\n'
            n_trials = 50
        if 'year' in test_dict:
            year = int(year)
        else:
            year = None
        cell_size = [int(i) for i in cell_size.split(',')]
        n_per_cell = int(n_per_cell)
        param_bn = os.path.basename(test_params)
        shutil.copy2(test_params, os.path.join(out_dir, param_bn.replace('.txt', '_%s.txt' % year)))

        df, samples, roc_curves = evaluate_ebird(sample_txt, ar_prediction, tx, cell_size,
                                                 target_col, n_per_cell, n_trials, year)
        if len(roc_curves) > 0:
            for fpr, tpr, thresholds in roc_curves:
                plt.plot(fpr, tpr, 'k', alpha=.1)
            out_png = os.path.join(out_dir, '{0}_roc_curve_{1}.png'.format(stamp, year))
            plt.savefig(out_png)

        if 'lc_path' in test_dict:
            '''df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join('/vol/v2/stem/ebird/results/performance_by_lc',
                                   '{0}_eval_{1}_land_cover.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')'''
            #df_samples = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
            df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join(out_dir, '{0}_eval_{1}_land_cover_all_samples.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')

        if 'inventory_txt' in test_dict:
            score_cols = sorted(df.columns)
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            for col in score_cols:
                score_mean = df[col].mean()
                df_inv.ix[stamp, col] = score_mean
                print 'Average %s: %2.3f' % (col.upper(), score_mean)
            df_inv.to_csv(inventory_txt, sep='\t')
        out_txt = os.path.join(out_dir, '{0}_eval_{1}.txt'.format(stamp, year))
        df.to_csv(out_txt, sep='\t', index=False)
        samples.to_csv(out_txt.replace('.txt', '_samples.txt'), sep='\t')
        print '\nTotal eval time: %.1f minutes\n' % ((time.time() - t1) / 60)

    else:
        print ('\nEither "test_samples" or "inventory_txt" was not specified. '
               'This model will not be evaluated...')

    print '\nTotal runtime: %.1f minutes' % ((time.time() - t0) / 60)

def main(params, pct_train=None, min_oob=0, gsrd_shp=None, resolution=30, make_oob_map=False,
         snap_coord=None, oob_map_metric='oob_rate'):
    t0 = time.time()
    inputs, df_var = stem.read_params(params)

    # Convert params to named variables and check for required vars
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    try:
        if 'max_features' not in locals():
            max_features = None
        if 'min_oob' in inputs:
            min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell, min_obs,
                                        max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, out_dir, model_type
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Read in training samples and check that df_train has exactly the same
    # columns as variables specified in df_vars
    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)
    unmatched_vars = [v for v in df_var.index if v not in df_train.columns]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n\t'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n\t' + unmatched_str
        raise NameError(msg)
    if target_col not in df_train.columns:
        raise NameError('target_col "%s" not in sample_txt: %s' % (target_col, sample_txt))

    # Make a timestamped output directory if outdir not specified
    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals():
        out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir)  # With a timestamp in dir, no need to check if it already exists
    shutil.copy2(params, out_dir)  # Copy the params for reference

    predict_cols = sorted(np.unique([c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order

    # If there are variables that should remain constant across the modeling
    # region, get the names
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals():
        gsrd_shp = None
    if snap_coord:
        snap_coord = [int(c) for c in snap_coord.split(',')]
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_sets = stem.get_gsrd(mosaic_path, cell_size, support_size, sets_per_cell, df_train,
                            min_obs, target_col, predict_cols, out_txt, gsrd_shp, pct_train,
                            snap_coord=snap_coord)
    n_sets = len(df_sets)

    # Create SQL DB and add train sample table
    print 'Dumping train_txt to database...'
    t1 = time.time()
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    df_train.to_sql('train_sample', engine, chunksize=10000)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Train a tree for each support set
    t1 = time.time()
    if model_type.lower() == 'classifier':
        print 'Training STEM with classifier algorithm...'
        model_func = stem.fit_tree_classifier
    else:
        print 'Training STEM with regressor algorithm...'
        model_func = stem.fit_tree_regressor
    x_train = df_train.reindex(columns=predict_cols)
    y_train = df_train[target_col]

    importance_cols = ['importance_%s' % c for c in predict_cols]
    for c in importance_cols:
        df_sets[c] = 0

    # Train estimators
    dropped_sets = pd.DataFrame(columns=df_sets.columns)
    dt_dir = os.path.join(out_dir, 'decisiontree_models')
    if not os.path.exists(dt_dir):
        os.mkdir(dt_dir)
    dt_path_template = os.path.join(dt_dir, stamp + '_decisiontree_%s.pkl')
    set_txt = os.path.join(dt_dir, stamp + '_support_sets.txt')

    # Establish DB connection and create empty relationship table for sample inds
    cmd = 'CREATE TABLE set_samples (set_id INTEGER, sample_id INTEGER, in_bag INTEGER);'
    with sqlite3.connect(db_path) as connection:
        connection.executescript(cmd)
        connection.commit()
    insert_cmd = 'INSERT INTO set_samples (set_id, sample_id, in_bag) VALUES (?,?,?);'

    oob_rates = [0]
    for i, (set_id, ss) in enumerate(df_sets.iterrows()):
        format_tuple = i + 1, n_sets, float(i) / n_sets * 100, (time.time() - t1) / 60, np.mean(oob_rates)
        sys.stdout.write('\rTraining %s/%s DTs (%.1f%%) || %.1f minutes || Avg OOB: %d' % format_tuple)
        sys.stdout.flush()

        # Get all samples within support set
        sample_inds = df_train.index[(df_train['x'] > ss[['ul_x', 'lr_x']].min()) &
                                     (df_train['x'] < ss[['ul_x', 'lr_x']].max()) &
                                     (df_train['y'] > ss[['ul_y', 'lr_y']].min()) &
                                     (df_train['y'] < ss[['ul_y', 'lr_y']].max())]
        n_samples = int(len(sample_inds) * .63)
        if n_samples < min_obs:
            df_sets.drop(set_id, inplace=True)
            continue

        this_x = x_train.ix[sample_inds]
        this_y = y_train.ix[sample_inds]
        support_set = df_sets.ix[set_id]
        dt_path = dt_path_template % set_id
        dt_model, train_inds, oob_inds, importance, oob_metrics = stem.train_estimator(
            support_set, n_samples, this_x, this_y, model_func, model_type, max_features, dt_path)
        oob_rates.append(oob_metrics['oob_rate'])
        df_sets.ix[set_id, importance_cols] = importance
        df_sets.ix[set_id, 'dt_model'] = dt_model
        df_sets.ix[set_id, 'dt_file'] = dt_path
        df_sets.ix[set_id, 'n_samples'] = n_samples
        for metric in oob_metrics:
            df_sets.ix[set_id, metric] = oob_metrics[metric]

        # Save oob and train inds
        n_train = len(train_inds)
        n_oob = len(oob_inds)
        train_records = zip(np.full(n_train, set_id, dtype=int), train_inds, np.ones(n_train, dtype=int))
        oob_records = zip(np.full(n_oob, set_id, dtype=int), oob_inds, np.zeros(n_oob, dtype=int))
        with sqlite3.connect(db_path) as connection:
            connection.executemany(insert_cmd, train_records + oob_records)
            connection.commit()

    print '\n%.1f minutes\n' % ((time.time() - t1) / 60)

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates...'
    t1 = time.time()
    df_sets, low_oob = stem.get_oob_rates(df_sets, df_train, db_path, target_col, predict_cols, min_oob)
    if len(low_oob) > 0:
        #df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'low_oob_sets.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    set_shp = os.path.join(out_dir, 'support_sets.shp')
    try:
        stem.coords_to_shp(df_sets, gsrd_shp, set_shp)
    except Exception as e:
        print e.message
    print '%s sets dropped because OOB rate < %s' % (len(low_oob), min_oob)
    print 'Min OOB rate after dropping: ', df_sets.oob_rate.min()
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decison tree to disk
    print 'Saving support set info...'
    df_sets['set_id'] = df_sets.index
    #df_sets = df_sets.drop('dt_model', axis=1)#.to_csv(set_txt, sep='\t', index=False)
    df_sets.drop('dt_model', axis=1).to_sql('support_sets', engine)
    t1 = time.time()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    '''stamp = os.path.basename(out_dir)
    db_path = os.path.join(out_dir, stamp + '.db')
    engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
    with engine.connect() as con, con.begin():
        df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_brightness','delta_greenness',
                    'delta_nbr','delta_wetness','elevation','greenness','mse','nbr','slope',
                    'time_since','wetness']#'''

    if make_oob_map or oob_map_metric in inputs:
        # Check if oob_map params were specified. If not, set to defaults
        if 'n_tiles' not in inputs:
            n_tiles = 40, 90
            print 'n_tiles not specified. Using default: %s x %s ...\n' % n_tiles
        else:
            n_tiles = int(n_tiles[0]), int(n_tiles[1])

        print 'Calculating OOB score and making OOB score map...'
        try:
            ds = gdal.Open(mosaic_path)
            ar = ds.ReadAsArray()
            mask = ar != 0
            del ar
            xsize = ds.RasterXSize
            ysize = ds.RasterYSize
            tx = ds.GetGeoTransform()
            prj = ds.GetProjection()
            driver = ds.GetDriver()
            ds = None
        except:
            mosaic_ds = ogr.Open(mosaic_path)
            if 'resolution' not in inputs:
                warnings.warn('Resolution not specified. Assuming default of 30...\n')
            mask = mosaic_ds.GetLayer()
            min_x, max_x, min_y, max_y = mask.GetExtent()
            ul_x = min_x - ((min_x - snap_coord[0]) % resolution)
            ul_y = max_y - ((max_y - snap_coord[1]) % resolution)
            xsize = int((max_x - ul_x) / resolution)
            ysize = int((ul_y - min_y) / resolution)
            prj = mask.GetSpatialRef().ExportToWkt()
            driver = gdal.GetDriverByName('gtiff')
            x_res = resolution
            y_res = -resolution
            tx = ul_x, x_res, 0, ul_y, 0, y_res

        avg_dict, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles, tx, support_size,
                                         db_path, df_sets, df_train, target_col, predict_cols,
                                         out_dir, stamp, prj, driver, oob_map_metric)
        df_sets.to_csv(set_txt, sep='\t')

        avg_oob = round(avg_dict[oob_map_metric], 1)
        avg_cnt = int(round(avg_dict['count'], 0))

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt
        print 'Time to make OOB score map: %.1f hours\n' % ((time.time() - t1) / 3600)

    # Record params in inventory text file
    if 'inventory_txt' in inputs:
        t1 = time.time()
        print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        n_sets = len(df_sets)
        '''if 'sample' in sample_txt:
            n_samples = int(sample_txt.split('_')[1].replace('sample',''))
        if 'n_sets' in inv_columns: df_inv.ix[stamp, 'n_sets'] = n_sets
        if 'n_samples' in inv_columns: df_inv.ix[stamp, 'n_samples'] = n_samples
        if 'support_size' in inv_columns: df_inv.ix[stamp, 'support_size'] = str(support_size)
        if 'sets_per_cell' in inv_columns: df_inv.ix[stamp, 'sets_per_cell'] = sets_per_cell
        if 'max_features' in inv_columns: df_inv.ix[stamp, 'max_features'] = max_features
        info_dir = os.path.dirname(inventory_txt)
        existing_models = fnmatch.filter(os.listdir(info_dir), '%s*' % target_col)
        if len(existing_models) > 0:
            df_inv = df_inv[df_inv.index.isin(existing_models)]#'''
        inv_columns = df_inv.columns
        if 'avg_oob' in inv_columns and make_oob_map:
            df_inv.ix[stamp, 'avg_oob'] = avg_oob
        if 'avg_count' in inv_columns and make_oob_map:
            df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n'

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)

def main(params, pct_train=None, min_oob=0, err_threshold=10):
    t0 = time.time()
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    try:
        if 'max_features' not in locals():
            max_features = None
        if 'err_threshold' in inputs:
            err_threshold = float(err_threshold)
        if 'min_oob' in inputs:
            min_oob = int(min_oob)
        num_vars = stem.vars_to_numbers(cell_size, support_size, sets_per_cell, min_obs,
                                        max_features, pct_train)
        cell_size, support_size, sets_per_cell, min_obs, max_features, pct_train = num_vars
        str_check = sample_txt, target_col, mosaic_path, tsa_txt, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    if not 'out_dirname' in locals():
        out_dirname = target_col
    stamp = '{0}_{1}_{2}'.format(out_dirname, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir)  # With a timestamp in dir, no need to check if it already exists
    #stamp = os.path.dirname(out_dir)
    shutil.copy2(params, out_dir)  # Copy the params for reference

    df_train = pd.read_csv(sample_txt, sep='\t')
    n_samples = len(df_train)

    # Check that df_train has exactly the same columns as variables specified in df_vars
    unmatched_vars = [v for v in df_var.index if v not in df_train.columns]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)
    predict_cols = sorted(np.unique([c for c in df_train.columns for v in df_var.index if v in c]))
    df_var = df_var.reindex(df_var.index.sort_values())  # Make sure predict_cols and df_var are in the same order
    if 'constant_vars' in locals():
        constant_vars = sorted([i.strip() for i in constant_vars.split(',')])
        predict_cols += constant_vars

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals():
        gsrd_shp = None
    out_txt = os.path.join(out_dir, stamp + '.txt')
    df_train, df_sets, df_oob = stem.get_gsrd(mosaic_path, cell_size, support_size,
                                              sets_per_cell, df_train, min_obs, target_col,
                                              predict_cols, out_txt, gsrd_shp, pct_train)

    # Train a tree for each support set
    print 'Training models...'
    t1 = time.time()
    x_train = df_train.reindex(columns=predict_cols + ['set_id'])
    y_train = df_train[[target_col, 'set_id']]
    df_sets['dt_model'] = [stem.fit_tree_regressor(x_train.ix[x_train.set_id == s, predict_cols],
                                                   y_train.ix[y_train.set_id == s, target_col],
                                                   max_features)
                           for s in df_sets.index]
    del df_train
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Calculate OOB rates and drop sets with too low OOB
    print 'Calculating OOB rates...'
    t1 = time.time()
    df_sets, low_oob = stem.get_oob_rates(df_sets, df_oob, err_threshold, target_col,
                                          predict_cols, min_oob)
    if len(low_oob) > 0:
        df_sets.drop(low_oob.index, inplace=True)
        low_oob_shp = os.path.join(out_dir, 'gsrd_low_oob.shp')
        low_oob.drop('dt_model', axis=1, inplace=True)
        stem.coords_to_shp(low_oob, gsrd_shp, low_oob_shp)
    print '%s sets dropped because OOB rate < %s' % (len(low_oob), min_oob)
    print 'Min OOB rate after dropping: ', df_sets.oob_rate.min()
    print 'Estimated average OOB score: ', int(df_sets.oob_rate.mean())
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Write df_sets and each decison tree to disk
    print 'Saving models...'
    t1 = time.time()
    df_sets, set_txt = stem.write_model(out_dir, df_sets)
    print '%.1f minutes\n' % ((time.time() - t1) / 60)
    #stamp = os.path.basename(out_dir)
    #set_txt = '/vol/v2/stem/{0}/models/{1}/decisiontree_models/{1}_support_sets.txt'.format(target_col, stamp)
    #predict_cols = ['aspectNESW','aspectNWSE','brightness','delta_bright','delta_green','delta_nbr',
    #                'delta_wet','elevation','greenness','mse','nbr','slope','time_since','wetness']

    # Record params in inventory text file
    if 'inventory_txt' in locals():
        t1 = time.time()
        '''print 'Getting model info...\n'
        df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
        if 'regressor' in params:
            model_type = 'Regressor'
        else:
            model_type = 'Classifier'
        n_sets = len(df_sets)
        if 'sample' in sample_txt:
            n_samples = int(sample_txt.split('_')[1].replace('sample',''))
        info = [model_type, None, None, None, None, None, None, None, None,
                n_sets, n_samples, str(support_size), sets_per_cell, max_features]
        df_inv.ix[stamp] = info
        info_dir = os.path.dirname(inventory_txt)
        existing_models = fnmatch.filter(os.listdir(os.path.dirname(info_dir)), '%s*' % target_col)
        if len(existing_models) > 0:
            df_inv = df_inv[df_inv.index.isin(existing_models)]'''

        # Check if oob_map params were specified. If not, set to defaults
        if 'err_threshold' not in locals():
            print 'err_threshold not specified. Using default: 10 ...\n'
            err_threshold = 10
        else:
            err_threshold = int(err_threshold)
        if 'n_tiles' not in locals():
            print 'n_tiles not specified. Using default: 25 x 15 ...\n'
            n_tiles = 25, 15
        else:
            n_tiles = int(n_tiles[0]), int(n_tiles[1])

        print 'Calculating OOB score and making OOB score map...'
        ds = gdal.Open(mosaic_path)
        ar = ds.ReadAsArray()
        mask = ar != 0
        del ar
        xsize = ds.RasterXSize
        ysize = ds.RasterYSize
        tx = ds.GetGeoTransform()
        prj = ds.GetProjection()
        driver = ds.GetDriver()
        ds = None

        ar_oob, ar_cnt, df_sets = stem.oob_map(ysize, xsize, 0, mask, n_tiles, tx, support_size,
                                               df_oob, df_sets, target_col, predict_cols,
                                               err_threshold, out_dir, stamp, prj, driver)
        df_sets.to_csv(set_txt, sep='\t')

        avg_oob = round(np.mean(ar_oob[mask]), 1)
        avg_cnt = int(round(np.mean(ar_cnt[mask]), 0))
        '''df_inv.ix[stamp, 'avg_oob'] = avg_oob
        #df_inv.ix[stamp, 'avg_count'] = avg_cnt
        if len(df_inv) > 1:
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print 'WARNING: Model info not written to inventory_txt...\n' '''

        print '\nAverage OOB score: .................... %.1f' % avg_oob
        print '\nAverage number of overlapping sets: ... %s\n' % avg_cnt
        print 'Time to make OOB score map: %.1f hours\n' % ((time.time() - t1) / 3600)
        #except Exception as e:
        #    print 'Problem getting oob map: ', e

    print 'Total training time: %.1f minutes' % ((time.time() - t0) / 60)

def main(n_tiles, tile_path=None, add_field=True, out_path=None, snap=True, clip=True):
    try:
        if add_field.lower() == 'false':
            add_field = False
    except:
        pass
    try:
        if snap.lower() == 'false':
            snap = False
    except:
        pass

    if tile_path is None:
        tile_path = TILE_PATH
    if not os.path.exists(tile_path):
        raise RuntimeError('tile_path does not exist: %s' % tile_path)

    try:
        n_tiles = tuple([int(i) for i in n_tiles.split(',')])
    except:
        raise ValueError('Could not parse n_tiles %s. It must be given as "n_tiles, n_x_tiles"' % n_tiles)

    # Get processing tiles
    tx, (xmin, xmax, ymin, ymax) = tx_from_shp(tile_path, XRES, YRES)
    xsize = abs(int(xmax - xmin) / XRES)
    ysize = abs(int(ymax - ymin) / YRES)
    tiles, _, _ = get_tiles(n_tiles, xsize, ysize, tx=tx)
    tile_id_field = 'eetile%sx%s' % n_tiles
    tiles[tile_id_field] = tiles.index

    if snap:
        coords, _ = get_coords(tile_path, multipart='split')
        coords = np.array(coords)  # shape is (nfeatures, ncoords, 2)
        xcoords = np.unique(coords[:, :, 0])
        ycoords = np.unique(coords[:, :, 1])
        for i, processing_coords in tiles.iterrows():
            tiles.loc[i, 'ul_x'] = xcoords[np.argmin(np.abs(xcoords - processing_coords.ul_x))]
            tiles.loc[i, 'lr_x'] = xcoords[np.argmin(np.abs(xcoords - processing_coords.lr_x))]
            tiles.loc[i, 'ul_y'] = ycoords[np.argmin(np.abs(ycoords - processing_coords.ul_y))]
            tiles.loc[i, 'lr_y'] = ycoords[np.argmin(np.abs(ycoords - processing_coords.lr_y))]

    if not out_path:
        out_path = os.path.join(OUT_DIR, 'ee_processing_tiles_%sx%s.shp' % n_tiles)
    coords_to_shp(tiles, tile_path, out_path)
    descr = ('Tiles for processing data on Google Earth Engine. The tiles '
             'have %s row(s) and %s col(s) and are bounded by the extent of %s') % \
            (n_tiles[0], n_tiles[1], tile_path)

    '''if clip:
        ds = ogr.Open(tile_path)
        lyr = ds.GetLayer()
        geoms = ogr.Geometry(ogr.wkbMultiPolygon)
        for feature in lyr:
            g = feature.GetGeometryRef()
            geoms.AddGeometry(g)
        union = geoms.UnionCascaded()
        base_path, ext = os.path.splitext(tile_path)
        temp_file = tile_path.replace(ext, '_uniontemp' + ext)
        feature'''

    createMetadata(sys.argv, out_path, description=descr)
    print '\nNew processing tiles written to', out_path

    # Find which processing tile touches each CONUS storage tile
    # (use get_overlapping_sets() to find which)
    # Read in the CONUS storage tiles
    if add_field:
        conus_tiles = attributes_to_df(tile_path)

        # Make a temporary copy of it
        base_path, ext = os.path.splitext(tile_path)
        temp_file = tile_path.replace(ext, '_temp' + ext)
        df_to_shp(conus_tiles, tile_path, temp_file, copy_fields=False)

        # Loop through each processing tile and find all overlapping
        conus_tiles[tile_id_field] = -1
        ds = ogr.Open(tile_path)
        lyr = ds.GetLayer()
        for p_fid, processing_coords in tiles.iterrows():
            wkt = 'POLYGON (({0} {1}, {2} {1}, {2} {3}, {0} {3}, {0} {1}))'.format(
                processing_coords.ul_x, processing_coords.ul_y,
                processing_coords.lr_x, processing_coords.lr_y)
            p_geom = ogr.CreateGeometryFromWkt(wkt)
            p_geom.CloseRings()
            for c_fid in conus_tiles.index:
                feature = lyr.GetFeature(c_fid)
                geom = feature.GetGeometryRef()
                if geom.Intersection(p_geom).GetArea() > 0:
                    conus_tiles.loc[c_fid, tile_id_field] = p_fid
        lyr, feature = None, None

        # Re-write the CONUS tiles shapefile with the new field
        df_to_shp(conus_tiles, tile_path, tile_path, copy_fields=False)

        # Delete temporary file
        driver = ds.GetDriver()
        driver.DeleteDataSource(temp_file)
        ds = None
        print '\nField with processing tile ID added to', tile_path

        # If the metadata text file exists, add a line about appending the field.
        # Otherwise, make a new metadata file.
        meta_file = tile_path.replace(ext, '_meta.txt')
        if os.path.exists(meta_file):
            with open(meta_file, 'a') as f:
                f.write('\n\nAppended field %s with IDs from the overlapping feature of %s'
                        % (tile_id_field, out_path))
        else:
            descr = ('Tile system with appended field %s with IDs from the overlapping feature of %s'
                     % (tile_id_field, out_path))
            createMetadata(sys.argv, tile_path, description=descr)

def main(region_path, tile_path, reference_path, out_dir, id_field='region_id', ref_basename='nlcd'):
    df = attributes_to_df(region_path)
    tile_info = attributes_to_df(tile_path)
    tile_info['ul_x'] = tile_info.xmin
    tile_info['lr_x'] = tile_info.xmax
    tile_info['ul_y'] = tile_info.ymax
    tile_info['lr_y'] = tile_info.ymin
    _, vector_ext = os.path.splitext(region_path)

    region_ids = df[id_field].unique()
    n_regions = len(region_ids)
    region_ds = ogr.Open(region_path)
    region_lyr = region_ds.GetLayer()

    for i, r_id in enumerate(region_ids):
        print 'Making region dir for %s (%s of %s)' % (r_id, i, n_regions)
        df_r = df[df.region_id == r_id]
        id_str = ('0' + str(r_id))[-2:]
        fid = df_r.index[0]
        region_feature = region_lyr.GetFeature(fid)
        xmin, xmax, ymin, ymax = region_feature.GetGeometryRef().GetEnvelope()
        region_feature.Destroy()
        df_r['ul_x'] = xmin
        df_r['lr_x'] = xmax
        df_r['ul_y'] = ymax
        df_r['lr_y'] = ymin
        clip_coords = df_r.loc[fid, ['ul_x', 'lr_x', 'ul_y', 'lr_y']]

        region_dir = os.path.join(out_dir, 'region_%s' % id_str)
        if not os.path.exists(region_dir):
            os.mkdir(region_dir)

        # Make a shapefile of the tiles
        out_vector = os.path.join(region_dir, 'tile_{0}{1}'.format(id_str, vector_ext))
        if not os.path.exists(out_vector):
            # switch to selection by min/max of coords
            region_tiles = tile_info[tile_info[id_field] == r_id]
            coords_to_shp(region_tiles, region_path, out_vector)

        # Make a map of reference NLCD
        ds = gdal.Open(out_vector.replace(vector_ext, '.tif'))
        mask = ds.ReadAsArray() == 255
        ds = None
        nlcd_year = re.search('\d\d\d\d', reference_path).group()  # finds the first one (potentially buggy)
        out_ref_map = os.path.join(region_dir, '%s_%s_%s.tif' % (ref_basename, nlcd_year, id_str))
        if not False:  #os.path.exists(out_ref_map):
            ref_ds = gdal.Open(reference_path)
            ref_tx = ref_ds.GetGeoTransform()
            ref_shape = ref_ds.RasterYSize, ref_ds.RasterXSize
            col_off = (ref_tx[0] - clip_coords.ul_x) / ref_tx[1]
            row_off = (ref_tx[3] - clip_coords.ul_y) / ref_tx[5]
            n_cols = int(abs((clip_coords.ul_x - clip_coords.lr_x) / ref_tx[1]))
            n_rows = int(abs((clip_coords.ul_y - clip_coords.lr_y) / ref_tx[1]))
            ar_inds, ref_inds = get_offset_array_indices((n_rows, n_cols), ref_shape, (row_off, col_off))
            ref_n_cols = ref_inds[1] - ref_inds[0]
            ref_n_rows = ref_inds[3] - ref_inds[2]
            ar_ref = ref_ds.ReadAsArray(ref_inds[2], ref_inds[0], ref_n_cols, ref_n_rows)
            ar = np.full((n_rows, n_cols), 255)
            ar[ar_inds[0]:ar_inds[1], ar_inds[2]:ar_inds[3]] = ar_ref
            ar[mask] = 255
            tx = clip_coords.ul_x, 30, 0, clip_coords.ul_y, 0, -30
            prj = ref_ds.GetProjection()
            driver = gdal.GetDriverByName('gtiff')
            array_to_raster(ar, tx, prj, driver, out_ref_map, nodata=255)

        # Make a clipped raster of the tiles
        out_raster = out_vector.replace(vector_ext, '.tif')
        if not os.path.exists(out_raster):
            tiles = ogr.Open(tile_path)
            tile_lyr = tiles.GetLayer()
            tx = clip_coords.ul_x, 30, 0, clip_coords.ul_y, 0, -30
            tile_array, _ = kernel_from_shp(tile_lyr, clip_coords, tx, 255, val_field='name')
            tile_array[ar == 255] = 255
            driver = gdal.GetDriverByName('gtiff')
            prj = tile_lyr.GetSpatialRef().ExportToWkt()
            array_to_raster(tile_array, tx, prj, driver, out_raster, nodata=255)
            tiles.Destroy()