def make_tiles(n_tiles, ds_snap):
    try:
        n_tiles = int(n_tiles)
    except ValueError:
        if ',' in n_tiles:
            try:
                n_tiles = [int(i) for i in n_tiles.split(',')]
            except ValueError:
                raise ValueError('format of n_tiles not understood: %s' % n_tiles)
        else:
            try:
                n_tiles = [int(i) for i in n_tiles.split()]
            except ValueError:
                raise ValueError('format of n_tiles not understood: %s' % n_tiles)

    ysize = ds_snap.RasterYSize
    xsize = ds_snap.RasterXSize
    tx = ds_snap.GetGeoTransform()

    # Figure out how many tiles belong in each row if necessary
    if isinstance(n_tiles, int):
        # If n_tiles = nx * ny and nx = ny * ratio, then ny = (n_tiles/ratio) ** .5
        ratio = xsize / float(ysize)
        ny = int((n_tiles / ratio) ** .5)
        nx = int(n_tiles / ny)
        n_tiles = ny, nx

    _, tiles, __ = stem.get_tiles(n_tiles, xsize, ysize, tx)
    #stem.coords_to_shp(_, '/vol/v2/stem/extent_shp/CAORWA.shp', '/home/server/pi/homes/shooper/delete/tiles.shp')

    return tiles

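# A usage sketch (illustrative only; the snap-raster path is hypothetical):
# make_tiles() accepts either a single tile count, from which a rows x cols
# grid is inferred from the raster's aspect ratio, or an explicit "ny, nx"
# string.
def _example_make_tiles():
    ds_snap = gdal.Open('/vol/v2/example/snap_raster.tif')  # hypothetical path
    tiles_auto = make_tiles('10', ds_snap)    # ~10 tiles, grid shape inferred
    tiles_grid = make_tiles('5, 2', ds_snap)  # exactly 5 rows x 2 cols
    return tiles_auto, tiles_grid
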
def buffered_tile_inds(n_tiles, xsize, ysize, tx, tile_buffer, mask):
    # Find empty tiles
    print 'Finding empty tiles...'
    t1 = time.time()
    df_tiles, df_tiles_rc, _ = get_tiles(n_tiles, xsize, ysize, tx)
    total_tiles = len(df_tiles)
    empty_tiles = find_empty_tiles(df_tiles, mask, tx)
    df_tiles = df_tiles_rc.select(lambda x: x not in empty_tiles)
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)

    # Add a buffer around each tile, clipped at the raster edges
    df_buf = df_tiles.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - tile_buffer
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + tile_buffer
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, ysize)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, xsize)

    return df_tiles, df_buf

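# A self-contained sketch of the buffer-and-clip arithmetic used above: a
# 10-pixel buffer is added on every side of a tile, then clipped so the
# indices stay inside a hypothetical 100 x 100 raster. The values are made up.
def _example_buffer_clip():
    import pandas as pd
    df = pd.DataFrame({'ul_r': [0], 'lr_r': [50], 'ul_c': [40], 'lr_c': [100]})
    df_buf = df.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - 10
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + 10
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, 100)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, 100)
    return df_buf  # ul_r=0, lr_r=60, ul_c=30, lr_c=100
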
def get_stratified_sample_by_tile(raster_path, col_name, data_band, n_samples,
                                  bins, min_sample=None, max_sample=None,
                                  pct_train=1, nodata=None,
                                  sampling_scheme='equal', zero_inflation=None,
                                  data_type='continuous', kernel=False,
                                  n_tiles=(1, 1), boundary_shp=None,
                                  bin_scale=1, n_per_tile=None):
    ''' Return a dataframe of stratified randomly sampled pixels from raster_path '''
    print 'Reading the raster_path... %s\n' % datetime.now()
    ds = gdal.Open(raster_path)
    tx = ds.GetGeoTransform()
    band = ds.GetRasterBand(data_band)
    if nodata is None:
        nodata = band.GetNoDataValue()
        if nodata is None:
            sys.exit('Could not obtain nodata value from dataset and' +\
            ' none specified in parameters file. Try re-running with' +\
            ' nodata specified.')

    # Split up the raster into tiles and calculate the sample size per tile
    print 'Calculating sample size per tile...'
    t1 = time.time()
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, tx)
    total_tiles = len(df_tiles)

    # If a boundary shapefile is given, calculate the proportional area within
    #   each tile
    if boundary_shp:
        boundary_ds = ogr.Open(boundary_shp)
        boundary_lyr = boundary_ds.GetLayer()
        empty_tiles = stem.find_empty_tiles(df_tiles, boundary_lyr, tx, nodata=0)
        df_tiles.drop(empty_tiles, inplace=True)
        df_tiles_rc.drop(empty_tiles, inplace=True)
        total_tiles = len(df_tiles)
        calc_proportional_area(df_tiles, boundary_shp)  # Calcs area in place
        if n_per_tile:
            pct_area = df_tiles.pct_area
            df_tiles['pct_max_sample'] = pct_area / (pct_area.max() - pct_area.min())
            df_tiles_rc['n_samples'] = (n_per_tile * df_tiles.pct_max_sample).astype(int)
        else:
            df_tiles_rc['n_samples'] = n_samples * df_tiles.pct_area
    else:
        if n_per_tile:
            df_tiles_rc['n_samples'] = n_per_tile
        else:
            df_tiles_rc['n_samples'] = float(n_samples) / total_tiles
    df_tiles['n_samples'] = df_tiles_rc.n_samples
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # For each tile, get a random sample for each bin
    train_rows = []
    train_cols = []
    test_rows = []
    test_cols = []
    empty_tiles = []
    classes = ['_%s' % b[1] for b in bins]
    df_tiles = df_tiles.reindex(columns=df_tiles.columns.tolist() + classes, fill_value=0)
    for c, (i, tile_coords) in enumerate(df_tiles_rc.iterrows()):
        t1 = time.time()
        print 'Sampling for %d pixels for tile %s of %s...' % (tile_coords.n_samples, c + 1, total_tiles)
        if tile_coords.n_samples == 0:
            print '\tSkipping this tile because n_samples == 0...\n'
            empty_tiles.append(i)
            continue

        ul_r, lr_r, ul_c, lr_c = tile_coords[['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        tile_ysize = lr_r - ul_r
        tile_xsize = lr_c - ul_c

        ar = band.ReadAsArray(ul_c, ul_r, tile_xsize, tile_ysize)
        if not isinstance(ar, np.ndarray):
            raise IOError('Could not read array for tile %s' % i)
        nodata_mask = ar != nodata
        if not nodata_mask.any():
            print '\tSkipping this tile because all pixels == nodata...\n'
            empty_tiles.append(i)
            continue
        ar_rows, ar_cols = np.indices(ar.shape)
        ar_rows = ar_rows + ul_r
        ar_cols = ar_cols + ul_c

        n_per_bin, scaled_pcts = calc_strata_sizes(ar, nodata_mask,
                                                   tile_coords.n_samples, bins,
                                                   sampling_scheme, bin_scale,
                                                   zero_inflation)
        df_tiles.ix[i, classes] = n_per_bin  # Record sample size per bin

        for b, (this_min, this_max) in enumerate(bins):
            this_sample_size = n_per_bin[b]
            if min_sample:
                this_sample_size = int(max(n_per_bin[b], min_sample))
            if max_sample:
                if max_sample < this_sample_size:
                    this_sample_size = max_sample
            print 'Sampling between %s and %s: %s pixels (%.1f%% of sample for this tile)' % (this_min, this_max, this_sample_size, scaled_pcts[b] * 100)
            mask = (ar > this_min) & (ar <= this_max) & nodata_mask
            these_rows = ar_rows[mask]
            these_cols = ar_cols[mask]

            # If there aren't enough pixels to generate samples for this bin,
            #   return all of them
            if these_rows.size < this_sample_size:
                print ('Not enough pixels between {0} and {1} to generate {2}' +\
                ' random samples. Returning all {3} pixels for this bin.')\
                .format(this_min, this_max, this_sample_size, these_rows.size)
                tr_rows = these_rows
                tr_cols = these_cols
            else:
                samples = random.sample(xrange(len(these_rows)), this_sample_size)
                tr_rows = these_rows[samples]
                tr_cols = these_cols[samples]

            # If pct_train is specified, split the sample indices into train/test sets
            te_rows = []
            te_cols = []
            if pct_train:
                split_ind = int(len(tr_rows) * pct_train)
                te_rows = tr_rows[split_ind:]
                te_cols = tr_cols[split_ind:]
                tr_rows = tr_rows[:split_ind]
                tr_cols = tr_cols[:split_ind]
            train_rows.extend(tr_rows)
            train_cols.extend(tr_cols)
            test_rows.extend(te_rows)
            test_cols.extend(te_cols)
        print 'Time for this tile: %.1f minutes\n' % ((time.time() - t1) / 60)
        del tr_rows, tr_cols, te_rows, te_cols, ar

    # Read the whole raster in to extract values
    ar = band.ReadAsArray()

    # If True, extract with 3x3 kernel. Otherwise, just get the vals at (row, col)
    if kernel:
        train_vals = extract_by_kernel(ar, train_rows, train_cols, data_type, col_name, nodata)
    else:
        train_vals = ar[train_rows, train_cols]

    # Calculate x and y for later extractions
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx
    train_x = [int(ul_x + c * x_res) for c in train_cols]
    train_y = [int(ul_y + r * y_res) for r in train_rows]
    df_train = pd.DataFrame({'x': train_x,
                             'y': train_y,
                             'row': train_rows,
                             'col': train_cols,
                             col_name: train_vals})

    # If training and testing samples were split, get test vals
    df_test = None
    if pct_train < 1:
        if kernel:
            test_vals = extract_by_kernel(ar, test_rows, test_cols, data_type, col_name, nodata)
        else:
            test_vals = ar[test_rows, test_cols]
        test_x = [int(ul_x + c * x_res) for c in test_cols]
        test_y = [int(ul_y + r * y_res) for r in test_rows]
        df_test = pd.DataFrame({'x': test_x,
                                'y': test_y,
                                'row': test_rows,
                                'col': test_cols,
                                col_name: test_vals})
    ds = None

    # In case empty tiles weren't filtered out already
    df_tiles = df_tiles.ix[~df_tiles.index.isin(empty_tiles)]

    return df_train, df_test, df_tiles

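# A hypothetical call (the path, column name, and bin edges are made up):
# draw ~10,000 samples of a continuous 0-100 raster in four strata, keep 80%
# for training and 20% for testing, and spread the work over a 5 x 5 grid of
# tiles.
def _example_stratified_sample():
    bins = [(0, 25), (25, 50), (50, 75), (75, 100)]
    df_train, df_test, df_tiles = get_stratified_sample_by_tile(
        '/vol/v2/example/biomass.tif', 'biomass', 1, 10000, bins,
        pct_train=0.8, n_tiles=(5, 5))
    return df_train, df_test, df_tiles
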
def main(params, n_pieces=False, ydims=None, constant_vars=None, year='', agg_method=None):
    t0 = time.time()
    print 'Predicting Random Forest... %s\n' % time.ctime(t0)

    # Set optional params to default:
    split_predictors = False

    # Read params and make variables from text
    inputs = forest.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    # Check that variables were specified in params
    try:
        nodata = int(nodata)
        str_check = train_params, rf_path, mask_path, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Raise an error if the var_txt path doesn't exist. Otherwise, just read it in
    train_dict = forest.read_params(train_params)
    train_txt_bn = os.path.basename(train_dict['var_txt'][:-1])
    if 'var_txt' not in locals():
        var_txt = os.path.join(os.path.dirname(rf_path), train_txt_bn)
    if not os.path.exists(var_txt):
        msg = 'Could not find var_txt:\n%s\n' % var_txt
        raise IOError(msg)

    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')
    # Make sure vars are sorted alphabetically since they were for training
    pred_vars = sorted(df_var.index)
    df_var = df_var.reindex(pred_vars)

    pred_constants = []
    if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        #year = constant_vars['YEAR']
        year = 2012
        pred_constants = sorted(constant_vars.keys())

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files ' +\
        'will be overwritten...\n') % out_dir
    new_params = os.path.join(out_dir, os.path.basename(params))
    shutil.copy2(params, new_params.replace('.txt', '_%s.txt' % year))

    # Load the Random Forest model
    print 'Loading the RandomForest model from \n%s... \n%s\n' % (rf_path, time.ctime(time.time()))
    if not os.path.exists(rf_path):
        raise IOError('%s does not exist' % rf_path)
    with open(rf_path, 'rb') as f:
        rf_model = pickle.load(f)
    n_features = rf_model.n_features_
    n_vars = len(df_var.index.tolist())
    if 'constant_vars' in inputs:
        n_vars += len(pred_constants)
    if n_features != n_vars:
        print df_var.index.tolist() + pred_constants
        sys.exit(('\nKeyError: Number of features of the random forest model does not match the number of variables in df_var.' +\
        '\nNumber of features of the model: {0} \nNumber of variables in var_txt: {1}' +\
        '\nCheck that all predictors used in the var_txt to train the model are in this var_txt.' +\
        '\nPath of Random Forest model: {2}\nPath of var_txt: {3}').format(n_features, n_vars, rf_path, var_txt))

    if 'agg_method' in inputs:
        agg_method = inputs['agg_method']

    # Get mask and raster info
    ds = gdal.Open(mask_path)
    ar = ds.ReadAsArray()
    nodata_mask = ar != 0
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = gdal.GetDriverByName('gtiff')
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx

    # Predict
    t1 = time.time()
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]

    if 'n_tiles' in inputs:
        df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, tx)
        empty_tiles = []
        ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        tile_dir = os.path.join(out_dir, 'predict_tiles')
        if not os.path.isdir(tile_dir):
            os.mkdir(tile_dir)
        for i, (ind, tile_coords) in enumerate(df_tiles.iterrows()):
            print 'Predicting for tile %s of %s...' % (i + 1, len(df_tiles))
            t1 = time.time()
            coords = tile_coords[['ul_x', 'ul_y', 'lr_x', 'lr_y']].tolist()
            tsa_ar, tsa_off = mosaic.extract_kernel(ds, 1, coords, tx, xsize, ysize, nodata=nodata)
            tsa_mask = tsa_ar == 0
            if tsa_mask.all():
                print 'Tile %s empty. Skipping...' % ind
                continue
            tsa_ar[tsa_mask] = nodata

            # Get the ids of TSAs this kernel covers
            tsa_ids = np.unique(tsa_ar)
            tsa_strs = [str(tsa) for tsa in tsa_ids if tsa != nodata]
            array_shape = tsa_ar.shape

            # Get an array of predictors where each column is a flattened 2D array
            #   of a single predictor variable
            temp_nodata = -9999
            ar_predictors = stem.get_predictors(df_var, tx, tsa_strs, tsa_ar, coords, tsa_mask, temp_nodata, 1)
            nodata_mask = ~np.any(ar_predictors == temp_nodata, axis=1)
            predictors = ar_predictors[nodata_mask]

            t2 = time.time()
            if agg_method == 'mode':
                # Predict with each tree separately, then take the majority
                #   vote per pixel
                args = [[dt, predictors] for dt in rf_model.estimators_]
                pool = Pool(rf_model.n_jobs)
                t3 = time.time()
                dt_predictions = np.vstack(pool.map(forest.par_predict_from_dt, args, 1))
                print 'Prediction time: %.1f minutes' % ((time.time() - t3) / 60)
                t3 = time.time()
                predictions = stem.mode(dt_predictions, axis=0)
                print 'Aggregation time: %.1f minutes' % ((time.time() - t3) / 60)
                del dt_predictions
                t3 = time.time()
                pool.close()
                pool.join()
                print 'Closing time: %.1f minutes' % ((time.time() - t3) / 60)
            else:
                predictions = rf_model.predict(ar_predictors[nodata_mask])
                print 'Prediction time: %.1f minutes' % ((time.time() - t2) / 60)

            ar_tile = np.full(ar_predictors.shape[0], nodata, dtype=np.uint8)
            ar_tile[nodata_mask] = predictions.astype(np.uint8)
            ul_r, lr_r, ul_c, lr_c = df_tiles_rc.ix[ind]
            ar_out[ul_r:lr_r, ul_c:lr_c] = ar_tile.reshape(array_shape)
            tx_tile = tile_coords.ul_x, x_res, x_rot, tile_coords.ul_y, y_rot, y_res
            mosaic.array_to_raster(ar_tile.reshape(array_shape), tx_tile, prj, driver,
                                   os.path.join(tile_dir, 'tile_%s.tif' % ind),
                                   dtype=gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this piece: %.1f minutes\n' % ((time.time() - t1) / 60)
    else:
        ar_predictors, nodata_mask = forest.get_predictors(df_var, nodata)

        # If the predictions are too large (i.e. cause memory errors), split the
        #   predictor array into pieces and predict separately, then stack them
        #   back together
        if split_predictors:
            split_predictors = int(split_predictors)
            predictions = []
            for i, p in enumerate(np.array_split(ar_predictors, split_predictors)):
                t1 = time.time()
                print '\nPredicting for %s of %s pieces of the final array...' % (i + 1, split_predictors)
                predictions.append(rf_model.predict(p))
                print '%.1f minutes' % ((time.time() - t1) / 60)
            predictions = np.concatenate(predictions)
            print ''
        else:
            print 'Predicting in one chunk...'
            predictions = rf_model.predict(ar_predictors)
        ar_prediction = np.full(nodata_mask.shape[0], nodata, dtype=np.float32)
        ar_prediction[nodata_mask] = predictions
        del ar_predictors, predictions

    # Save the prediction array to disk
    stamp = os.path.basename(out_dir)
    out_path = os.path.join(out_dir, '%s_rf_vote.tif' % stamp)
    if constant_vars:
        out_path = out_path.replace('.tif', '_yr%s.tif' % year)
    if 'n_tiles' in inputs:
        ar_prediction = ar_out  # The evaluation code below expects ar_prediction
        forest.array_to_raster(ar_out, tx, prj, driver, out_path, gdal.GDT_Byte, nodata)
        # Delete the tiles
        shutil.rmtree(tile_dir)
    else:
        ar_prediction = ar_prediction.reshape(ysize, xsize)
        forest.array_to_raster(ar_prediction, tx, prj, driver, out_path, gdal.GDT_Byte, nodata)
    ds = None

    if 'test_params' in inputs:
        print '\nEvaluating the model...'
        t1 = time.time()
        test_dict = forest.read_params(test_params)
        for i in test_dict:
            exec("{0} = str({1})".format(i, test_dict[i]))
        if 'n_trials' in test_dict:
            n_trials = int(n_trials)
        else:
            print 'n_trials not specified. Setting default to 50...\n'
            n_trials = 50
        if 'year' in test_dict:
            year = int(year)
        else:
            year = None
        cell_size = [int(i) for i in cell_size.split(',')]
        n_per_cell = int(n_per_cell)
        param_bn = os.path.basename(test_params)
        shutil.copy2(test_params, os.path.join(out_dir, param_bn.replace('.txt', '_%s.txt' % year)))

        df, samples, roc_curves = evaluate_ebird(sample_txt, ar_prediction, tx,
                                                 cell_size, target_col, n_per_cell,
                                                 n_trials, year)
        if len(roc_curves) > 0:
            for fpr, tpr, thresholds in roc_curves:
                plt.plot(fpr, tpr, 'k', alpha=.1)
            out_png = os.path.join(out_dir, '{0}_roc_curve_{1}.png'.format(stamp, year))
            plt.savefig(out_png)

        if 'lc_path' in test_dict:
            df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join(out_dir, '{0}_eval_{1}_land_cover_all_samples.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')

        if 'inventory_txt' in test_dict:
            score_cols = sorted(df.columns)
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            for col in score_cols:
                score_mean = df[col].mean()
                df_inv.ix[stamp, col] = score_mean
                print 'Average %s: %2.3f' % (col.upper(), score_mean)
            df_inv.to_csv(inventory_txt, sep='\t')

        out_txt = os.path.join(out_dir, '{0}_eval_{1}.txt'.format(stamp, year))
        df.to_csv(out_txt, sep='\t', index=False)
        samples.to_csv(out_txt.replace('.txt', '_samples.txt'), sep='\t')
        print '\nTotal eval time: %.1f minutes\n' % ((time.time() - t1) / 60)
    else:
        print '\n"test_params" was not specified. This model will not be evaluated...'

    print '\nTotal runtime: %.1f minutes' % ((time.time() - t0) / 60)

    return out_path

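# A self-contained sketch of what the agg_method == 'mode' branch computes:
# each tree votes per pixel and the majority class wins. stem.mode is assumed
# to behave like scipy.stats.mode along axis 0; the votes below are made up.
def _example_mode_aggregation():
    import numpy as np
    from scipy.stats import mode as scipy_mode
    dt_predictions = np.array([[1, 2, 2],   # tree 1's class votes for 3 pixels
                               [1, 2, 3],   # tree 2
                               [1, 3, 2]])  # tree 3
    return scipy_mode(dt_predictions, axis=0)[0].ravel()  # array([1, 2, 2])
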
def main(params, n_tiles=(25, 15), n_jobs=20, kernel_type='circle', filter_value=None):
    t0 = time.time()

    # Read params and make variables from text
    inputs = read_params(params)

    # Check params
    try:
        path = inputs['path']
        function = inputs['function']
        out_path = inputs['out_path']
        kernel_size = int(inputs['kernel_size'])
        databand = int(inputs['databand'])
    except KeyError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    if 'n_jobs' in inputs:
        n_jobs = int(inputs['n_jobs'])
    if 'n_tiles' in inputs:
        n_tiles = [int(n) for n in inputs['n_tiles'].split(',')]
    if 'nodata' in inputs:
        nodata = int(inputs['nodata'])

    extra_args = ()  # The default for ndi.generic_filter 'extra_args' is an empty tuple
    if 'average' in function.lower():
        func = np.nanmean
    elif 'mode' in function.lower():
        func = mode
    elif 'area' in function.lower():
        func = pct_nonzero
        if not filter_value and 'filter_value' not in inputs:
            sys.exit('Cannot calculate percent area without filter_value. ' +\
            'Try specifying filter_value in parameters file.')
        elif 'filter_value' in inputs:
            filter_value = int(inputs['filter_value'])
    elif 'equal' in function.lower():
        func = is_equal_to
        center_idx = kernel_size**2 / 2
        extra_args = (center_idx,)
    else:
        sys.exit('Could not find filtering function for alias: %s' % function)

    out_dir = os.path.dirname(out_path)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    shutil.copy2(params, out_dir)

    print '\nReading input raster...\n'
    t1 = time.time()
    ds = gdal.Open(path)
    band = ds.GetRasterBand(databand)
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = ds.GetDriver()
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize

    # Get the nodata value from the dataset if it wasn't specified
    if 'nodata' not in inputs:
        print 'nodata not specified in params. Getting nodata value from input dataset...\n'
        nodata = band.GetNoDataValue()
    # The full-array path below was superseded by the per-tile reads in
    #   par_filter; kept for reference
    '''ar = band.ReadAsArray()
    ds = None
    array_dtype = ar.dtype
    ar = ar.astype(np.float16)
    mask = (ar != nodata)
    ar[~mask] = np.nan
    if 'area' in function.lower():
        ar[(ar != filter_value) & mask] = 0'''
    print '%.1f minutes\n' % ((time.time() - t1)/60)

    if kernel_type.lower() == 'circle':
        kernel = circle_mask(kernel_size)
    else:
        kernel = np.ones((kernel_size, kernel_size))
    tile_buffer = kernel.shape[0] / 2

    # Tile up the array to filter in parallel
    # Find empty tiles
    print 'Finding empty tiles...'
    t1 = time.time()
    df_tiles, df_tiles_rc, _ = get_tiles(n_tiles, xsize, ysize, tx)
    total_tiles = len(df_tiles)
    '''empty_tiles = find_empty_tiles(df_tiles, mask, tx)
    df_tiles = df_tiles_rc.select(lambda x: x not in empty_tiles)
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)'''

    # Add a buffer around each tile, clipped at the raster edges
    df_buf = df_tiles_rc.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - tile_buffer
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + tile_buffer
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, ysize)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, xsize)

    # Build the args for filtering each buffered tile in parallel
    print 'Getting buffered arrays...'
    t1 = time.time()
    n_full_tiles = len(df_tiles)
    args = []
    temp_dir = os.path.join(out_dir, 'tiles')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)
    for i, (ind, r) in enumerate(df_buf.iterrows()):
        args.append([ind, path, databand, nodata, r, df_tiles.ix[ind], temp_dir,
                     func, kernel, extra_args, i + 1, n_full_tiles])
    print '%.1f minutes\n' % ((time.time() - t1)/60)

    print 'Filtering chunks in parallel with %s jobs...' % n_jobs
    p = Pool(n_jobs)
    tiles = p.map(par_filter, args, 1)
    print '\nTotal time for filtering: %.1f minutes\n' % ((time.time() - t1)/60)

    print 'Tiling pieces back together...'
    t1 = time.time()
    gdal_dtype = band.DataType
    array_dtype = gdalnumeric.GDALTypeCodeToNumericTypeCode(gdal_dtype)
    filtered = np.full((ysize, xsize), nodata, dtype=array_dtype)
    for i, tile_path in tiles:
        if not tile_path:
            continue
        ds_t = gdal.Open(tile_path)
        buffered_tile = ds_t.ReadAsArray()
        b_inds = df_buf.ix[i, ['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        t_inds = df_tiles_rc.ix[i, ['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        t_ulr, t_lrr, t_ulc, t_lrc = t_inds
        # Offset of the unbuffered tile within the buffered array
        d_ulr = t_ulr - b_inds['ul_r']
        d_ulc = t_ulc - b_inds['ul_c']
        tile = buffered_tile[d_ulr : d_ulr + (t_lrr - t_ulr),
                             d_ulc : d_ulc + (t_lrc - t_ulc)]
        tile[np.isnan(tile)] = nodata
        tile = tile.astype(array_dtype)
        filtered[t_ulr : t_lrr, t_ulc : t_lrc] = tile
    print '%.1f minutes\n' % ((time.time() - t1)/60)

    if 'out_nodata' in inputs:
        filtered[filtered == nodata] = int(inputs['out_nodata'])
        nodata = int(inputs['out_nodata'])

    try:
        array_to_raster(filtered, tx, prj, driver, out_path, dtype=gdal_dtype, nodata=nodata)
    except Exception:
        array_to_raster(filtered, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)

    desc = ('Raster filtered by kernel of shape {kernel_type} and size ' +\
            '{kernel_size} and function {func}').format(kernel_type=kernel_type,
            kernel_size=kernel_size, func=function)
    meta_path = createMetadata(sys.argv, out_path, description=desc)
    write_params_to_meta(meta_path, params)

    del filtered, tiles, args, p
    ds = None
    shutil.rmtree(temp_dir)

    print 'Total time: %.1f minutes' % ((time.time() - t0)/60)

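# A self-contained sketch of the index math used when tiling pieces back
# together: given the buffered bounds and the unbuffered tile bounds (made-up
# numbers), recover the slice of the buffered array that corresponds to the
# unbuffered tile.
def _example_unbuffer_indices():
    b_ulr, b_lrr, b_ulc, b_lrc = 90, 210, 40, 160   # buffered tile bounds
    t_ulr, t_lrr, t_ulc, t_lrc = 100, 200, 50, 150  # unbuffered tile bounds
    d_ulr = t_ulr - b_ulr  # rows of buffer above the tile
    d_ulc = t_ulc - b_ulc  # cols of buffer left of the tile
    n_rows = t_lrr - t_ulr
    n_cols = t_lrc - t_ulc
    return (d_ulr, d_ulr + n_rows), (d_ulc, d_ulc + n_cols)  # ((10, 110), (10, 110))
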
def main(model_dir, n_tiles, **kwargs):
    t0 = time.time()

    n_tiles = [int(n) for n in n_tiles.split(',')]

    if not os.path.isdir(model_dir):
        message = 'model directory given does not exist or is not a directory: %s' % model_dir
        raise IOError(message)

    model = os.path.basename(model_dir)
    dt_dir = os.path.join(model_dir, 'decisiontree_models')
    set_txt = os.path.join(dt_dir, '%s_support_sets.txt' % model)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    pred_param_path = glob(os.path.join(model_dir, 'predict_stem_*params.txt'))[0]
    predict_params, df_var = stem.read_params(pred_param_path)
    train_param_path = glob(os.path.join(model_dir, 'train_stem_*params.txt'))[0]
    train_params, _ = stem.read_params(train_param_path)
    df_var.sort_index(inplace=True)

    nodata = int(predict_params['nodata'].replace('"', ''))
    if len(kwargs) == 0:
        var_ids = df_sets.max_importance.unique()
        var_names = df_var.ix[var_ids].index
        variables = zip(var_ids, var_names)
    else:
        variables = [(v_id, v_name) for v_name, v_id in kwargs.iteritems()]

    mask_path = os.path.join(model_dir, '%s_vote.bsq' % model)
    if not os.path.exists(mask_path):
        mask_path = mask_path.replace('.bsq', '.tif')
    mask_ds = gdal.Open(mask_path)
    mask_tx = mask_ds.GetGeoTransform()
    xsize = mask_ds.RasterXSize
    ysize = mask_ds.RasterYSize
    prj = mask_ds.GetProjection()
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mask_tx)
    total_tiles = len(df_tiles)
    df_tiles['tile'] = df_tiles.index

    # Find the tiles that have only nodata values
    t1 = time.time()
    print '\nFinding empty tiles...'
    mask = mask_ds.ReadAsArray() == nodata
    empty_tiles = stem.find_empty_tiles(df_tiles, ~mask, mask_tx)
    mask_ds = None
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)

    # Select only tiles that are not empty
    df_tiles = df_tiles.select(lambda x: x not in empty_tiles)
    total_tiles = len(df_tiles)

    support_size = [int(s) for s in
                    train_params['support_size'].replace('"', '').split(',')]
    set_size = [int(abs(s / mask_tx[1])) for s in support_size]

    out_dir = os.path.join(model_dir, 'importance_maps')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    print variables
    for vi, (v_id, v_name) in enumerate(variables):
        t1 = time.time()
        print 'Making map for %s: %s of %s variables\n' % (v_name, vi + 1, len(variables))
        ar = np.full((ysize, xsize), nodata, dtype=np.uint8)
        for i, (t_ind, t_row) in enumerate(df_tiles.iterrows()):
            t2 = time.time()
            print 'Aggregating for %s of %s tiles' % (i + 1, total_tiles)

            # Calculate the size of this tile in case it's at the edge where the
            #   tile size will be slightly different
            this_size = abs(t_row.lr_y - t_row.ul_y), abs(t_row.lr_x - t_row.ul_x)
            df_these_sets = stem.get_overlapping_sets(df_sets, t_row, this_size, support_size)

            rc = df_tiles_rc.ix[t_ind]
            this_size = rc.lr_r - rc.ul_r, rc.lr_c - rc.ul_c
            n_sets = len(df_these_sets)

            # Load overlapping predictions from disk and read them as arrays
            tile_ul = t_row[['ul_x', 'ul_y']]

            print '%s overlapping sets' % n_sets
            importance_bands = []
            importance_values = []
            for s_ind, s_row in df_these_sets.iterrows():
                # Calculate offset and array/tile indices
                offset = stem.calc_offset(tile_ul, (s_row.ul_x, s_row.ul_y), mask_tx)
                tile_inds, a_inds = mosaic.get_offset_array_indices(tile_size, set_size, offset)

                # Get feature with maximum importance and fill tile with that val
                try:
                    with open(s_row.dt_file, 'rb') as f:
                        dt_model = pickle.load(f)
                    importance_value = int(dt_model.feature_importances_[v_id] * 100)
                    importance_values.append(importance_value)
                    import_band = np.full(this_size, np.nan, dtype=np.float16)
                    import_band[tile_inds[0]:tile_inds[1], tile_inds[2]:tile_inds[3]] = importance_value
                    importance_bands.append(import_band)
                except Exception as e:
                    print e
                    continue

            print 'Average importance for this tile: %.1f' % np.mean(importance_values)

            # Aggregate: per-pixel mean importance of all overlapping sets
            importance_stack = np.dstack(importance_bands)
            importance_tile = np.nanmean(importance_stack, axis=2)
            tile_mask = mask[rc.ul_r:rc.lr_r, rc.ul_c:rc.lr_c] | np.isnan(importance_tile)
            importance_tile[tile_mask] = nodata
            ar[rc.ul_r:rc.lr_r, rc.ul_c:rc.lr_c] = np.round(importance_tile).astype(np.uint8)
            print 'Aggregation time for this tile: %.1f minutes\n' % ((time.time() - t2) / 60)

        out_path = os.path.join(out_dir, '%s_importance_%s.tif' % (model, v_name))
        array_to_raster(ar, mask_tx, prj, gdal.GetDriverByName('gtiff'), out_path,
                        gdal.GDT_Byte, nodata)
        print 'Time for this variable: %.1f minutes\n' % ((time.time() - t1) / 60)

    print '\nTotal time for %s variables: %.1f hours\n' % (len(variables), (time.time() - t0) / 3600)

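# A sketch of where the per-set importance values above come from: each
# support set's pickled decision tree exposes feature_importances_, and the
# mapped value is that importance scaled to 0-100. The tree here is trained
# on made-up data instead of being unpickled from s_row.dt_file.
def _example_feature_importance():
    import numpy as np
    from sklearn.tree import DecisionTreeClassifier
    X = np.random.RandomState(0).rand(100, 3)
    y = (X[:, 1] > 0.5).astype(int)  # only feature 1 matters
    dt_model = DecisionTreeClassifier(random_state=0).fit(X, y)
    v_id = 1
    return int(dt_model.feature_importances_[v_id] * 100)  # ~100
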
def main(n_tiles, tile_path=None, add_field=True, out_path=None, snap=True, clip=True):
    # Boolean args may come in as strings from the command line
    try:
        if add_field.lower() == 'false':
            add_field = False
    except AttributeError:
        pass
    try:
        if snap.lower() == 'false':
            snap = False
    except AttributeError:
        pass

    if tile_path is None:
        tile_path = TILE_PATH
    if not os.path.exists(tile_path):
        raise RuntimeError('tile_path does not exist: %s' % tile_path)

    try:
        n_tiles = tuple([int(i) for i in n_tiles.split(',')])
    except ValueError:
        raise ValueError(('Could not parse n_tiles %s. It must be given as two'
                          ' comma-separated integers: "n_y_tiles, n_x_tiles"') % n_tiles)

    # Get processing tiles
    tx, (xmin, xmax, ymin, ymax) = tx_from_shp(tile_path, XRES, YRES)
    xsize = abs(int(xmax - xmin) / XRES)
    ysize = abs(int(ymax - ymin) / YRES)
    tiles, _, _ = get_tiles(n_tiles, xsize, ysize, tx=tx)
    tile_id_field = 'eetile%sx%s' % n_tiles
    tiles[tile_id_field] = tiles.index

    # Snap each processing-tile corner to the nearest storage-tile corner
    if snap:
        coords, _ = get_coords(tile_path, multipart='split')
        coords = np.array(coords)  # shape is (nfeatures, ncoords, 2)
        xcoords = np.unique(coords[:, :, 0])
        ycoords = np.unique(coords[:, :, 1])
        for i, processing_coords in tiles.iterrows():
            tiles.loc[i, 'ul_x'] = xcoords[np.argmin(np.abs(xcoords - processing_coords.ul_x))]
            tiles.loc[i, 'lr_x'] = xcoords[np.argmin(np.abs(xcoords - processing_coords.lr_x))]
            tiles.loc[i, 'ul_y'] = ycoords[np.argmin(np.abs(ycoords - processing_coords.ul_y))]
            tiles.loc[i, 'lr_y'] = ycoords[np.argmin(np.abs(ycoords - processing_coords.lr_y))]

    if not out_path:
        out_path = os.path.join(OUT_DIR, 'ee_processing_tiles_%sx%s.shp' % n_tiles)
    coords_to_shp(tiles, tile_path, out_path)
    descr = ('Tiles for processing data on Google Earth Engine. The tiles ' +
             'have %s row(s) and %s col(s) and are bounded by the extent of %s') %\
             (n_tiles[0], n_tiles[1], tile_path)
    '''if clip:
        ds = ogr.Open(tile_path)
        lyr = ds.GetLayer()
        geoms = ogr.Geometry(ogr.wkbMultiPolygon)
        for feature in lyr:
            g = feature.GetGeometryRef()
            geoms.AddGeometry(g)
        union = geoms.UnionCascaded()
        base_path, ext = os.path.splitext(tile_path)
        temp_file = tile_path.replace(ext, '_uniontemp' + ext)'''
    createMetadata(sys.argv, out_path, description=descr)
    print '\nNew processing tiles written to', out_path

    # Find which processing tile touches each CONUS storage tile
    #   (get_overlapping_sets() could be used for this instead)
    if add_field:
        # Read in the CONUS storage tiles and make a temporary copy
        conus_tiles = attributes_to_df(tile_path)
        base_path, ext = os.path.splitext(tile_path)
        temp_file = tile_path.replace(ext, '_temp' + ext)
        df_to_shp(conus_tiles, tile_path, temp_file, copy_fields=False)

        # Loop through each processing tile and find all overlapping storage tiles
        conus_tiles[tile_id_field] = -1
        ds = ogr.Open(tile_path)
        lyr = ds.GetLayer()
        for p_fid, processing_coords in tiles.iterrows():
            wkt = 'POLYGON (({0} {1}, {2} {1}, {2} {3}, {0} {3}, {0} {1}))'.format(
                processing_coords.ul_x, processing_coords.ul_y,
                processing_coords.lr_x, processing_coords.lr_y)
            p_geom = ogr.CreateGeometryFromWkt(wkt)
            p_geom.CloseRings()
            for c_fid in conus_tiles.index:
                feature = lyr.GetFeature(c_fid)
                geom = feature.GetGeometryRef()
                if geom.Intersection(p_geom).GetArea() > 0:
                    conus_tiles.loc[c_fid, tile_id_field] = p_fid
        lyr, feature = None, None

        # Re-write the CONUS tiles shapefile with the new field
        df_to_shp(conus_tiles, tile_path, tile_path, copy_fields=False)

        # Delete the temporary file
        driver = ds.GetDriver()
        driver.DeleteDataSource(temp_file)
        ds = None
        print '\nField with processing tile ID added to', tile_path

        # If the metadata text file exists, add a line about appending the field.
        #   Otherwise, make a new metadata file.
        meta_file = tile_path.replace(ext, '_meta.txt')
        if os.path.exists(meta_file):
            with open(meta_file, 'a') as f:
                f.write('\n\nAppended field %s with IDs from the overlapping feature of %s'
                        % (tile_id_field, out_path))
        else:
            descr = ('Tile system with appended field %s with IDs from the overlapping '
                     'feature of %s') % (tile_id_field, out_path)
            createMetadata(sys.argv, tile_path, description=descr)

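# A sketch of the WKT rectangle built in main() for each processing tile,
# with made-up corner coordinates (ul = upper left, lr = lower right).
def _example_tile_wkt():
    ul_x, ul_y, lr_x, lr_y = 0, 100, 100, 0
    return 'POLYGON (({0} {1}, {2} {1}, {2} {3}, {0} {3}, {0} {1}))'.format(
        ul_x, ul_y, lr_x, lr_y)  # 'POLYGON ((0 100, 100 100, 100 0, 0 0, 0 100))'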