def par_filter(args):
    """Filter one buffered tile; designed to be mapped over a multiprocessing Pool."""
    t0 = time.time()
    ind, path, databand, nodata, r, tile_coords, out_dir, func, kernel, extra_args, i, n_tiles = args

    # Read just this tile's (buffered) window from the raster
    ds = gdal.Open(path)
    nrows = r.lr_r - r.ul_r
    ncols = r.lr_c - r.ul_c
    ar = ds.GetRasterBand(databand).ReadAsArray(r.ul_c, r.ul_r, ncols, nrows)
    mask = ar == nodata
    if np.all(mask):
        return ind, None

    ar = ndi.generic_filter(ar, func, footprint=kernel, extra_arguments=extra_args)
    ar[mask] = nodata

    # Write the filtered tile to disk so the parent process can stitch it later
    _, x_res, _, _, _, y_res = ds.GetGeoTransform()
    driver = gdal.GetDriverByName('gtiff')
    prj = ds.GetProjection()
    tx = tile_coords.ul_x, x_res, 0, tile_coords.ul_y, 0, y_res
    out_path = os.path.join(out_dir, 'tile_%s.tif' % ind)
    array_to_raster(ar, tx, prj, driver, out_path, nodata=nodata)

    print 'Time for tile %s of %s: %.1f minutes' % (i, n_tiles, (time.time() - t0)/60)
    ds = None

    return ind, out_path
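# NOTE: array_to_raster() is imported from elsewhere in the package and not
# defined in this file. Below is a minimal sketch of the assumed behavior,
# inferred from the calls above (write a 2D array to out_path with the given
# geotransform, projection, and nodata value); the *_sketch name marks it as
# illustrative, not the package's actual implementation:
def array_to_raster_sketch(ar, tx, prj, driver, out_path, dtype=gdal.GDT_Int16, nodata=None):
    rows, cols = ar.shape
    out_ds = driver.Create(out_path, cols, rows, 1, dtype)
    out_ds.SetGeoTransform(tx)
    out_ds.SetProjection(prj)
    band = out_ds.GetRasterBand(1)
    if nodata is not None:
        band.SetNoDataValue(nodata)
    band.WriteArray(ar)
    out_ds = None  # dereference to flush the write to disk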
def main(pred_path, targ_path, lc_path, mask_path, nodata_p, nodata_t, nodata_lc,
         search_dir, search_str, eval_scales, out_dir, clip_shp=None):

    pxl_scale_dir = os.path.join(out_dir, 'pixel_scale')
    if not os.path.exists(pxl_scale_dir):
        os.makedirs(pxl_scale_dir)

    ds_m = gdal.Open(mask_path)
    tx_m = ds_m.GetGeoTransform()
    ar_m = ds_m.ReadAsArray().astype(np.int32)
    nonforest = ar_m == 1
    ar_m = None

    print('\nReading in raster data...\n')
    ds_p = gdal.Open(pred_path)
    ar_p = ds_p.ReadAsArray()
    tx = ds_p.GetGeoTransform()
    prj = ds_p.GetProjection()
    driver = ds_p.GetDriver()

    ds_t = gdal.Open(targ_path)
    ar_t = ds_t.ReadAsArray()
    ar_t[ar_t == 0] = nodata_t
    ar_t[nonforest] = nodata_t
    ar_p[nonforest] = nodata_p

    stdv_path = pred_path.replace('vote', 'stdv')
    ds_stdv = gdal.Open(stdv_path)
    ar_stdv = ds_stdv.ReadAsArray()

    print('Getting difference map...')
    t0 = time.time()
    ar_diff, nans = get_dif_map(ar_p, ar_t, nodata_p, nodata_t)
    ras_ext = pred_path.split('.')[-1]
    dif_path = os.path.join(pxl_scale_dir, 'prediction_minus_target.' + ras_ext)
    mosaic.array_to_raster(ar_diff, tx, prj, driver, dif_path, GDT_Int32, nodata_p)
    print('%.1f seconds\n' % (time.time() - t0))

    shps = find_files(search_dir, search_str, eval_scales)
    print('Calculating stats and plotting for all evaluation scales...')
    for eval_scale, zone_shp in shps:
        # If clip_shp is specified, assume that the zone shape is unclipped and clip it
        if clip_shp:
            print(('clip_shp given so... getting only features from %s that '
                   'overlap %s') % (zone_shp, clip_shp))
            out_shp = zone_shp.replace('.shp', '_%s.shp' % os.path.basename(clip_shp)[:-4])
            get_overlapping_polys(zone_shp, clip_shp, out_shp)
            zone_shp = out_shp

        scale_dir = os.path.join(out_dir, 'scale_%s_m' % eval_scale)
        if not os.path.exists(scale_dir):
            os.mkdir(scale_dir)

        print('Getting zonal stats for %s scale...' % eval_scale)
        t0 = time.time()
        df_stats = zonal_stats(ar_p, ar_t, ar_diff, ar_stdv, zone_shp, tx, nodata_p, nodata_t)
        out_txt = os.path.join(scale_dir, 'zonal_stats_%s.txt' % eval_scale)
        df_stats.to_csv(out_txt, sep='\t', index=False)
        print('%.1f seconds\n' % (time.time() - t0))

        print('Writing stats to shp...')
        t0 = time.time()
        out_shp = os.path.join(scale_dir, 'zonal_stats_%s.shp' % eval_scale)
        df_to_shp(df_stats, zone_shp, out_shp, copy_fields=False)
        print('%.1f seconds\n' % (time.time() - t0))

        print('Making scatter plot for %s scale...' % eval_scale)
        t0 = time.time()
        plt.scatter(df_stats.targ_mean, df_stats.pred_mean, alpha=.05)
        plt.xlabel('Target')
        plt.ylabel('Prediction')
        scatter_path = os.path.join(scale_dir, 'scatter_%s.png' % eval_scale)
        plt.savefig(scatter_path)
        print('%.1f seconds\n' % (time.time() - t0))

    ar_stdv = None
    ds_stdv = None
    ar_diff = None

    ar_t_data = ar_t[~nans]
    ar_p_data = ar_p[~nans]

    print('Plotting scatter of the 2 maps...')
    t0 = time.time()
    inds = random.sample(xrange(len(ar_t_data)), 100000)
    x = ar_t_data[inds]
    y = ar_p_data[inds]
    plt.scatter(x, y, alpha=.01)
    plt.xlabel(os.path.basename(targ_path))
    plt.ylabel(os.path.basename(pred_path))
    fig_path = os.path.join(pxl_scale_dir, 'prediction_vs_target_scatter_no0.png')
    plt.savefig(fig_path)
    plt.clf()
    print('%.1f seconds\n' % (time.time() - t0))

    # Create 2D histograms
    print('Plotting 2D histogram...')
    t0 = time.time()
    plt.hist2d(ar_t_data, ar_p_data, bins=50, norm=LogNorm())
    plt.xlabel(os.path.basename(targ_path))
    plt.ylabel(os.path.basename(pred_path))
    plt.colorbar()
    fig_path = os.path.join(pxl_scale_dir, 'prediction_vs_target_2Dhistogram_no0.png')
    plt.savefig(fig_path)
    plt.clf()
    print('%.1f seconds\n' % (time.time() - t0))

    print('Evaluating by land cover class...')
    t0 = time.time()
    ds_lc = gdal.Open(lc_path)
    ar_lc = ds_lc.ReadAsArray()
    df_lc = evaluate_by_lc(ar_p, ar_t, ar_lc, ~nans, nodata_lc, pxl_scale_dir)
    print('%.1f seconds\n' % (time.time() - t0))

    print('Plotting bin stats...')
    t0 = time.time()
    plot_bin_agreement(ar_p_data, ar_t_data, nodata_t, pxl_scale_dir)
    print('%.1f seconds\n' % (time.time() - t0))

    print('Calculating confusion matrix...')
    t0 = time.time()
    out_txt = os.path.join(pxl_scale_dir, 'confusion_matrix.txt')
    ar_t_samples = ar_t_data[inds]
    ar_p_samples = ar_p_data[inds]
    confusion_matrix(ar_p_samples, ar_t_samples, out_txt=out_txt)
    print('%.1f seconds\n' % (time.time() - t0))

    ds_p = None
    ds_t = None
    ds_lc = None
    ar_p = None
    ar_t = None
    ar_lc = None

    print('Outputs written to %s' % out_dir)
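# get_dif_map() is defined elsewhere in the package; a minimal sketch of the
# assumed behavior based on how its two outputs are used above (a prediction-
# minus-target difference array plus a boolean mask of pixels that are nodata
# in either input), assuming numpy is imported as np as elsewhere in this module:
def get_dif_map_sketch(ar_p, ar_t, nodata_p, nodata_t):
    nans = (ar_p == nodata_p) | (ar_t == nodata_t)
    ar_diff = ar_p.astype(np.int32) - ar_t.astype(np.int32)
    ar_diff[nans] = nodata_p  # flag nodata pixels in the difference map
    return ar_diff, nans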
def main(params, n_pieces=False, ydims=None, constant_vars=None, year='', agg_method=None):
    t0 = time.time()
    print 'Predicting Random Forest... %s\n' % time.ctime(t0)

    # Set optional params to default:
    split_predictors = False

    # Read params and make variables from text
    inputs = forest.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    # Check that variables were specified in params
    try:
        nodata = int(nodata)
        str_check = train_params, rf_path, mask_path, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Raise an error if the var_txt path doesn't exist. Otherwise, just read it in
    train_dict = forest.read_params(train_params)
    train_txt_bn = os.path.basename(train_dict['var_txt'][:-1])
    if 'var_txt' not in locals():
        var_txt = os.path.join(os.path.dirname(rf_path), train_txt_bn)
    if not os.path.exists(var_txt):
        print ''
        msg = 'Could not find var_txt:\n%s\n' % var_txt
        raise IOError(msg)

    df_var = pd.read_csv(var_txt, sep='\t', index_col='var_name')
    # Make sure vars are sorted alphabetically since they were for training
    pred_vars = sorted(df_var.index)
    df_var = df_var.reindex(pred_vars)

    '''if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        #year = constant_vars['YEAR']
        year = 2012
        pred_constants = sorted(constant_vars.keys())
    else:
        df_var.search_str = [s.format(2007) for s in df_var.search_str]'''

    #out_dir = os.path.dirname(out_raster)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    new_params = os.path.join(out_dir, os.path.basename(params))
    shutil.copy2(params, new_params.replace('.txt', '_%s.txt' % year))

    # Load the Random Forest model
    print 'Loading the RandomForest model from \n%s... \n%s\n' % (rf_path, time.ctime(time.time()))
    if not os.path.exists(rf_path):
        raise IOError('%s does not exist' % rf_path)
    with open(rf_path, 'rb') as f:
        rf_model = pickle.load(f)
    n_features = rf_model.n_features_
    n_vars = len(df_var.index.tolist())
    if 'constant_vars' in inputs:
        n_vars += len(pred_constants)
    if n_features != n_vars:
        print df_var.index.tolist() + pred_constants
        sys.exit(('\nKeyError: Number of features of the random forest model does not '
                  'match the number of variables in df_var.'
                  '\nNumber of features of the model: {0}'
                  '\nNumber of variables in var_txt: {1}'
                  '\nCheck that all predictors used in var_txt to train the model are '
                  'in this var_txt.'
                  '\nPath of Random Forest model: {2}'
                  '\nPath of var_txt: {3}').format(n_features, n_vars, rf_path, var_txt))

    if 'agg_method' in inputs:
        agg_method = inputs['agg_method']

    # Get mask and raster info
    ds = gdal.Open(mask_path)
    ar = ds.ReadAsArray()
    nodata_mask = ar != 0
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = gdal.GetDriverByName('gtiff')
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx

    # Predict
    #print 'Predicting with %s processors... %s' % (rf_model.n_jobs, time.ctime(time.time()))
    t1 = time.time()
    predict_pieces = []

    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]

    if 'n_tiles' in inputs:
        df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, tx)
        empty_tiles = []
        ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        tile_dir = os.path.join(out_dir, 'predict_tiles')
        if not os.path.isdir(tile_dir):
            os.mkdir(tile_dir)
        for i, (ind, tile_coords) in enumerate(df_tiles.iterrows()):
            print 'Predicting for tile %s of %s...' % (i + 1, len(df_tiles))
            t1 = time.time()
            coords = tile_coords[['ul_x', 'ul_y', 'lr_x', 'lr_y']].tolist()
            tsa_ar, tsa_off = mosaic.extract_kernel(ds, 1, coords, tx, xsize, ysize,
                                                    nodata=nodata)
            tsa_mask = tsa_ar == 0
            if tsa_mask.all():
                print 'Tile %s empty. Skipping...' % ind
                continue
            tsa_ar[tsa_mask] = nodata
            # Get the ids of TSAs this kernel covers
            tsa_ids = np.unique(tsa_ar)
            #tsa_strs = ['0' + str(tsa) for tsa in tsa_ids if tsa != nodata]
            tsa_strs = [str(tsa) for tsa in tsa_ids if tsa != nodata]
            array_shape = tsa_ar.shape

            # Get an array of predictors where each column is a flattened 2D array of a
            # single predictor variable
            temp_nodata = -9999
            ar_predictors = stem.get_predictors(df_var, tx, tsa_strs, tsa_ar, coords,
                                                tsa_mask, temp_nodata, 1)
            nodata_mask = ~np.any(ar_predictors == temp_nodata, axis=1)
            predictors = ar_predictors[nodata_mask]

            t2 = time.time()
            if agg_method == 'mode':
                # Predict with each tree separately, then take the per-pixel mode
                args = []
                for dt in rf_model.estimators_:
                    args.append([dt, predictors])
                pool = Pool(rf_model.n_jobs)
                t3 = time.time()
                dt_predictions = np.vstack(pool.map(forest.par_predict_from_dt, args, 1))
                print 'Prediction time: %.1f minutes' % ((time.time() - t3) / 60)
                t3 = time.time()
                predictions = stem.mode(dt_predictions, axis=0)
                print 'Aggregation time: %.1f minutes' % ((time.time() - t3) / 60)
                del dt_predictions
                t3 = time.time()
                pool.close()
                pool.join()
                print 'Closing time: %.1f minutes' % ((time.time() - t3) / 60)
            else:
                predictions = rf_model.predict(ar_predictors[nodata_mask])
                print 'Prediction time: %.1f minutes' % ((time.time() - t2) / 60)

            ar_tile = np.full(ar_predictors.shape[0], nodata, dtype=np.uint8)
            ar_tile[nodata_mask] = predictions.astype(np.uint8)
            ul_r, lr_r, ul_c, lr_c = df_tiles_rc.ix[ind]
            ar_out[ul_r:lr_r, ul_c:lr_c] = ar_tile.reshape(array_shape)
            tx_tile = tile_coords.ul_x, x_res, x_rot, tile_coords.ul_y, y_rot, y_res
            mosaic.array_to_raster(ar_tile.reshape(array_shape), tx_tile, prj, driver,
                                   os.path.join(tile_dir, 'tile_%s.tif' % ind),
                                   dtype=gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this piece: %.1f minutes\n' % ((time.time() - t1) / 60)
            #del ar_predictors, nodata_mask, ar_prediction

        #ar_prediction = np.concatenate(predict_pieces)
        #del predict_pieces
        '''ar_out = np.full((ysize, xsize), nodata, dtype=np.uint8)
        for ind, tile_coords in df_tiles_rc.iterrows():
            if ind in empty_tiles:
                continue
            ul_r, lr_r, ul_c, lr_c = tile_coords
            tile_file = os.path.join(tile_dir, 'tile_%s.tif' % ind)
            if not os.path.exists(tile_file):
                continue
            ds_t = gdal.Open(tile_file)
            ar_tile = ds_t.ReadAsArray()
            t_ulx = df_tiles.ix[ind, ['ul_x', 'ul_y']]
            ar_out[ul_r : lr_r, ul_c : lr_c] = ar_tile'''
    else:
        ar_predictors, nodata_mask = forest.get_predictors(df_var, nodata)
        # If the predictions are too large (i.e., cause memory errors), split the
        # predictor array into pieces, predict separately, then stack them back together
        if split_predictors:
            split_predictors = int(split_predictors)
            predictions = []
            for i, p in enumerate(np.array_split(ar_predictors, split_predictors)):
                t1 = time.time()
                print '\nPredicting for %s of %s pieces of the final array...' % (i + 1, split_predictors)
                predictions.append(rf_model.predict(p))
                print '%.1f minutes' % ((time.time() - t1) / 60)
            predictions = np.concatenate(predictions)
            print ''
        else:
            print 'Predicting in one chunk...'
            predictions = rf_model.predict(ar_predictors)
        ar_prediction = np.full(nodata_mask.shape[0], nodata, dtype=np.float32)
        ar_prediction[nodata_mask] = predictions
        del ar_predictors, predictions

    # Save the prediction array to disk
    stamp = os.path.basename(out_dir)
    out_path = os.path.join(out_dir, '%s_rf_vote.tif' % stamp)
    #ar_prediction = ar_prediction.reshape(ysize, xsize)
    if constant_vars:
        out_path = out_path.replace('.tif', '_yr%s.tif' % year)
    forest.array_to_raster(ar_out, tx, prj, driver, out_path, gdal.GDT_Byte, nodata)

    # Delete the tiles
    shutil.rmtree(tile_dir)
    ds = None

    '''stamp = os.path.basename(out_dir)
    path = os.path.join(out_dir, 'final_%s_yr2011.tif' % stamp)
    stamp = os.path.basename(os.path.dirname(path))
    ds = gdal.Open(path)
    ar_prediction = ds.ReadAsArray()
    ds = None#'''

    if 'test_params' in inputs:
        #df_test = pd.read_csv(test_samples, sep='\t', index_col='obs_id')
        print '\nEvaluating the model...'
        t1 = time.time()
        test_dict = forest.read_params(test_params)
        for i in test_dict:
            exec("{0} = str({1})".format(i, test_dict[i]))
        if 'n_trials' in test_dict:
            n_trials = int(n_trials)
        else:
            print 'n_trials not specified. Setting default to 50...\n'
            n_trials = 50
        if 'year' in test_dict:
            year = int(year)
        else:
            year = None
        cell_size = [int(i) for i in cell_size.split(',')]
        n_per_cell = int(n_per_cell)
        param_bn = os.path.basename(test_params)
        shutil.copy2(test_params,
                     os.path.join(out_dir, param_bn.replace('.txt', '_%s.txt' % year)))

        df, samples, roc_curves = evaluate_ebird(sample_txt, ar_prediction, tx,
                                                 cell_size, target_col, n_per_cell,
                                                 n_trials, year)
        if len(roc_curves) > 0:
            for fpr, tpr, thresholds in roc_curves:
                plt.plot(fpr, tpr, 'k', alpha=.1)
            out_png = os.path.join(out_dir, '{0}_roc_curve_{1}.png'.format(stamp, year))
            plt.savefig(out_png)

        if 'lc_path' in test_dict:
            '''df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join('/vol/v2/stem/ebird/results/performance_by_lc',
                                   '{0}_eval_{1}_land_cover.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')'''
            #df_samples = pd.read_csv(sample_txt, sep='\t', index_col='obs_id')
            df_lc = evaluate_by_lc(samples, ar_prediction, lc_path, target_col)
            out_txt = os.path.join(out_dir,
                                   '{0}_eval_{1}_land_cover_all_samples.txt'.format(stamp, year))
            df_lc.to_csv(out_txt, sep='\t')

        if 'inventory_txt' in test_dict:
            score_cols = sorted(df.columns)
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            for col in score_cols:
                score_mean = df[col].mean()
                df_inv.ix[stamp, col] = score_mean
                print 'Average %s: %2.3f' % (col.upper(), score_mean)
            df_inv.to_csv(inventory_txt, sep='\t')
        out_txt = os.path.join(out_dir, '{0}_eval_{1}.txt'.format(stamp, year))
        df.to_csv(out_txt, sep='\t', index=False)
        samples.to_csv(out_txt.replace('.txt', '_samples.txt'), sep='\t')
        print '\nTotal eval time: %.1f minutes\n' % ((time.time() - t1) / 60)
    else:
        print ('\nEither "test_samples" or "inventory_txt" was not specified. '
               'This model will not be evaluated...')

    print '\nTotal runtime: %.1f minutes' % ((time.time() - t0) / 60)

    return out_path
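# stem.mode() aggregates the per-tree class predictions stacked with np.vstack
# above into a single plurality vote per pixel. A minimal sketch of the assumed
# behavior using scipy.stats.mode (the real implementation may be a faster
# bincount-based version):
def mode_sketch(dt_predictions, axis=0):
    from scipy import stats
    modes, counts = stats.mode(dt_predictions, axis=axis)  # most frequent label per column
    return np.squeeze(modes)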
def main(params, n_tiles=(25, 15), n_jobs=20, kernel_type='circle', filter_value=None):
    t0 = time.time()

    # Read params and make variables from text
    inputs = read_params(params)

    # Check params
    try:
        path = inputs['path']
        function = inputs['function']
        out_path = inputs['out_path']
        kernel_size = int(inputs['kernel_size'])
        databand = int(inputs['databand'])
    except KeyError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    if 'n_jobs' in inputs:
        n_jobs = int(inputs['n_jobs'])
    if 'n_tiles' in inputs:
        n_tiles = [int(n) for n in inputs['n_tiles'].split(',')]
    if 'nodata' in inputs:
        nodata = int(inputs['nodata'])

    extra_args = ()  # The default for ndi.generic_filter 'extra_arguments' is an empty tuple
    if 'average' in function.lower():
        func = np.nanmean
    elif 'mode' in function.lower():
        func = mode
    elif 'area' in function.lower():
        func = pct_nonzero
        if not filter_value and 'filter_value' not in inputs:
            sys.exit('Cannot calculate percent area without filter_value. '
                     'Try specifying filter_value in parameters file.')
        elif 'filter_value' in inputs:
            filter_value = int(inputs['filter_value'])
    elif 'equal' in function.lower():
        func = is_equal_to
        center_idx = kernel_size**2 / 2
        extra_args = tuple([center_idx])
    else:
        sys.exit('Could not find filtering function for alias: %s' % function)

    out_dir = os.path.dirname(out_path)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    shutil.copy2(params, out_dir)

    print '\nReading input raster...\n'
    t1 = time.time()
    ds = gdal.Open(path)
    band = ds.GetRasterBand(databand)
    tx = ds.GetGeoTransform()
    prj = ds.GetProjection()
    driver = ds.GetDriver()
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize

    # Get the nodata value from the input dataset if not given in params
    if 'nodata' not in inputs:
        print 'nodata not specified in params. Getting nodata value from input dataset...\n'
        nodata = band.GetNoDataValue()
    # The full array is no longer read and masked here; each tile is read and
    # masked separately in par_filter()
    '''ar = band.ReadAsArray()
    ds = None
    array_dtype = ar.dtype
    ar = ar.astype(np.float16)
    mask = (ar != nodata) #& (ar != 255)
    ar[~mask] = np.nan
    if 'area' in function.lower():
        ar[(ar != filter_value) & mask] = 0'''
    print '%.1f minutes\n' % ((time.time() - t1)/60)

    if kernel_type.lower() == 'circle':
        #kernel_size /= 2
        kernel = circle_mask(kernel_size)
    else:
        kernel = np.ones((kernel_size, kernel_size))
    tile_buffer = kernel.shape[0] / 2

    # Tile up the array to filter in parallel
    # Find empty tiles
    print 'Finding empty tiles...'
    t1 = time.time()
    df_tiles, df_tiles_rc, _ = get_tiles(n_tiles, xsize, ysize, tx)
    total_tiles = len(df_tiles)
    '''empty_tiles = find_empty_tiles(df_tiles, mask, tx)
    df_tiles = df_tiles_rc.select(lambda x: x not in empty_tiles)
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\
    (len(empty_tiles), total_tiles, (time.time() - t1)/60)'''

    # Add buffer around each tile so the kernel has context at tile edges
    df_buf = df_tiles_rc.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - tile_buffer
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + tile_buffer
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, ysize)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, xsize)

    # Get arrays
    print 'Getting buffered arrays...'
    t1 = time.time()
    n_full_tiles = len(df_tiles)
    args = []
    temp_dir = os.path.join(out_dir, 'tiles')
    if not os.path.exists(temp_dir):
        os.mkdir(temp_dir)
    for i, (ind, r) in enumerate(df_buf.iterrows()):
        args.append([ind, path, databand, nodata, r, df_tiles.ix[ind], temp_dir,
                     func, kernel, extra_args, i + 1, n_full_tiles])
    print '%.1f minutes\n' % ((time.time() - t1)/60)

    print 'Filtering chunks in parallel with %s jobs...' % n_jobs
    p = Pool(n_jobs)
    tiles = p.map(par_filter, args, 1)
    print '\nTotal time for filtering: %.1f minutes\n' % ((time.time() - t1)/60)

    print 'Tiling pieces back together...'
    t1 = time.time()
    gdal_dtype = band.DataType
    array_dtype = gdalnumeric.GDALTypeCodeToNumericTypeCode(gdal_dtype)
    filtered = np.full((ysize, xsize), nodata, dtype=array_dtype)
    for i, tile_path in tiles:
        if not tile_path:
            continue
        ds_t = gdal.Open(tile_path)
        buffered_tile = ds_t.ReadAsArray()
        # Trim the buffer off each filtered tile before placing it in the mosaic
        b_inds = df_buf.ix[i, ['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        t_inds = df_tiles_rc.ix[i, ['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        d_ulr, d_lrr, d_ulc, d_lrc = t_inds - b_inds
        tile = buffered_tile[d_ulr : d_lrr, d_ulc : d_lrc]
        tile[np.isnan(tile)] = nodata
        tile = tile.astype(array_dtype)
        t_ulr, t_lrr, t_ulc, t_lrc = t_inds
        filtered[t_ulr : t_lrr, t_ulc : t_lrc] = tile
    print '%.1f minutes\n' % ((time.time() - t1)/60)

    if 'out_nodata' in inputs:
        filtered[filtered == nodata] = int(inputs['out_nodata'])
        nodata = int(inputs['out_nodata'])

    try:
        array_to_raster(filtered, tx, prj, driver, out_path, dtype=gdal_dtype, nodata=nodata)
    except:
        array_to_raster(filtered, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata)

    desc = ('Raster filtered by kernel of shape {kernel_type} and size '
            '{kernel_size} and function {func}').format(kernel_type=kernel_type,
                                                        kernel_size=kernel_size,
                                                        func=function)
    meta_path = createMetadata(sys.argv, out_path, description=desc)
    write_params_to_meta(meta_path, params)

    del filtered, tiles, args, p
    ds = None
    shutil.rmtree(temp_dir)

    print 'Total time: %.1f minutes' % ((time.time() - t0)/60)
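# circle_mask() is imported from elsewhere in the package; a minimal sketch of
# the assumed behavior (a boolean circular footprint for ndi.generic_filter,
# assuming kernel_size is the footprint width), with numpy imported as np:
def circle_mask_sketch(kernel_size):
    radius = kernel_size // 2
    y, x = np.ogrid[-radius:radius + 1, -radius:radius + 1]
    return x**2 + y**2 <= radius**2  # True inside the circle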
def main(params):
    # Read params and make variables from text
    inputs, df_var = read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))

    try:
        num_vars = vars_to_numbers(cell_size, support_size, sets_per_cell,
                                   min_obs, pct_train, n_tiles, nodata)
        cell_size, support_size, sets_per_cell, min_obs, pct_train, n_tiles, nodata = num_vars
        str_check = sample_txt, target_col, mosaic_path, tsa_txt, dep_var_name, out_dir
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    now = datetime.now()
    date_str = str(now.date()).replace('-', '')
    time_str = str(now.time()).replace(':', '')[:4]
    stamp = '{0}_{1}_{2}'.format(dep_var_name, date_str, time_str)
    out_dir = os.path.join(out_dir, stamp)
    os.makedirs(out_dir)  # With a timestamp in dir, no need to check if it exists
    shutil.copy2(params, out_dir)  # Copy the params for reference

    # Get samples and support set bounds
    if 'gsrd_shp' not in locals():
        gsrd_shp = None
    out_txt = os.path.join(out_dir, stamp + '.txt')
    dfs = gsrd.get_gsrd(mosaic_path, cell_size, support_size, sets_per_cell,
                        sample_txt, min_obs, pct_train, dep_var_name, out_txt, gsrd_shp)
    df_train, df_test, df_sets = dfs
    support_sets = df_train.set_id.unique()

    # Check that df_train has exactly the same columns as variables specified in df_var
    # Last four characters in each column of df_train should be year
    unmatched_vars = [v for v in df_var.index if v not in [c for c in df_train]]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in sample_txt but specified in params:\n' + unmatched_str
        raise NameError(msg)
    predict_cols = sorted(np.unique([c for c in df_train.columns
                                     for v in df_var.index if v in c]))
    # Make sure predict_cols and df_var are in the same order
    df_var = df_var.reindex(df_var.index.sort_values())

    # Train a tree for each support set
    x_train = df_train.reindex(columns=predict_cols + ['set_id'])
    y_train = df_train[[target_col, 'set_id']]
    df_sets['dt_model'] = [fit_tree(x_train.ix[x_train.set_id == s, predict_cols],
                                    y_train.ix[y_train.set_id == s, target_col])
                           for s in support_sets]

    # Write df_sets and each decision tree to disk
    write_model(out_dir, df_sets)

    mosaic_ds = gdal.Open(mosaic_path, GA_ReadOnly)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()

    t0 = time.time()
    predict_dir = os.path.join(out_dir, 'predictions')
    os.mkdir(predict_dir)

    # Loop through each set and generate predictions
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    c = 1
    total_sets = len(support_sets)
    predictions = {}
    for set_id, row in df_sets.iterrows():
        print 'Predicting for set %s of %s' % (c, total_sets)
        ar_coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
        ar_predict = predict_set(set_id, df_var, mosaic_ds, ar_coords, mosaic_tx,
                                 xsize, ysize, row.dt_model, nodata)
        #predictions[set_id] = ar_predict
        tx = ar_coords['ul_x'], x_res, x_rot, ar_coords['ul_y'], y_rot, y_res
        out_path = predict_dir + '/prediction_%s.bsq' % set_id
        mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, GDT_Int32,
                               nodata=nodata)
        c += 1
    mosaic_ds = None
    print '\nTotal time for predictions: %.1f minutes' % ((time.time() - t0)/60)

    # Aggregate predictions by tile and stitch them back together
    aggr.aggregate_predictions(ysize, xsize, nodata, n_tiles, mosaic_tx, support_size,
                               predict_dir, df_sets, out_dir, stamp, prj, driver)
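# fit_tree() is defined elsewhere in the package; a minimal sketch under the
# assumption that each support set simply gets an ordinary scikit-learn
# decision tree fit on that set's training samples:
def fit_tree_sketch(x, y):
    from sklearn import tree
    dt = tree.DecisionTreeRegressor()  # assumed; DecisionTreeClassifier for categorical targets
    dt.fit(x, y)
    return dt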
def main(params, inventory_txt=None, constant_vars=None):
    inputs, df_var = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        n_tiles = [int(i) for i in n_tiles.split(',')]
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    # If constants were given, make a dict and make sure they match the training
    # constants
    if 'constant_vars' in inputs:
        constant_vars = parse_constant_vars(constant_vars)
        pred_constants = sorted(constant_vars.keys())
        train_constants = [i.replace(' ', '') for i in
                           train_inputs['constant_vars'].strip('"').split(',')]
        train_constants = sorted(train_constants)
    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if 'constant_vars' in inputs:
        unmatched_vars += [v for v in pred_constants if v not in train_constants]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str
        raise NameError(msg)
    unmatched_vars = [v for v in train_vars if v not in pred_vars]
    if 'constant_vars' in inputs:
        unmatched_vars += [v for v in train_constants if v not in pred_constants]
        pred_vars += pred_constants  # Add here because it would screw with stuff upstream
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  # Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)
    mosaic_ds = gdal.Open(mosaic_path)
    mosaic_tx = mosaic_ds.GetGeoTransform()
    xsize = mosaic_ds.RasterXSize
    ysize = mosaic_ds.RasterYSize
    prj = mosaic_ds.GetProjection()
    driver = mosaic_ds.GetDriver()
    m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)

    t0 = time.time()
    if 'n_jobs' in inputs:
        # Predict in parallel
        n_jobs = int(n_jobs)
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of TSA arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            # Save rasters of tsa arrays ahead of time to avoid needing to pickle or
            # fork mosaic_ds
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx,
                                                    xsize, ysize, nodata=nodata)
            tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            dtype_code = mosaic_ds.GetRasterBand(1).DataType
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, tsa_raster,
                                   stem.get_gdal_dtype(dtype_code), silent=True)

            # Build list of args to pass to the Pool
            ds = gdal.Open(tsa_raster)
            tsa_tx = ds.GetGeoTransform()
            ds = None
            tsa_off = stem.calc_offset((mosaic_tx[0], mosaic_tx[3]),
                                       (tsa_tx[0], tsa_tx[3]), tsa_tx)
            args.append([c, total_sets, set_id, df_var, tsa_raster, tsa_off, coords,
                         mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8,
                         constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1)/60)
        p = Pool(n_jobs)
        p.map(stem.par_predict, args, 1)
    else:
        # Loop through each set and generate predictions (resumes at set 1043,
        # apparently from a restarted run)
        for c, (set_id, row) in enumerate(df_sets.ix[1043:].iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem.predict_set(set_id, df_var, mosaic_ds, coords, mosaic_tx,
                                          xsize, ysize, dt_model, nodata, np.int16,
                                          constant_vars)
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id)
            mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path,
                                   gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)
    #mosaic_ds = None
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)

    # Aggregate predictions by tile and stitch them back together
    if not 'file_stamp' in inputs:
        file_stamp = os.path.basename(model_dir)
    ar_vote, pct_importance, df_sets = stem.aggregate_predictions(
        ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir,
        df_sets, out_dir, file_stamp, prj, driver, 0)
    #df_sets.to_csv(set_txt, sep='\t')
    mosaic_ds = None

    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))})
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in
                          importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')

    '''ds = gdal.Open(os.path.join(model_dir, '%s_vote.bsq' % file_stamp))
    ar_vote = ds.ReadAsArray()
    ds = None
    ds = gdal.Open(os.path.join(model_dir, '%s_mean.bsq' % file_stamp))
    ar_mean = ds.ReadAsArray()
    ds = None#'''

    if 'confusion_params' in locals():
        import confusion_matrix as confusion
        vote_dir = os.path.join(model_dir, 'evaluation_vote')
        mean_dir = os.path.join(model_dir, 'evaluation_mean')

        print '\nComputing confusion matrix for vote...'
        out_txt = os.path.join(vote_dir, 'confusion.txt')
        print confusion_params
        df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
        try:
            out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
            df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
        except Exception as e:
            print e

        '''print '\nGetting confusion matrix for mean...'
        out_txt = os.path.join(mean_dir, 'confusion.txt')
        df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
        try:
            out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
            df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
        except Exception as e:
            print e#'''

        vote_acc = df_v.ix['producer', 'user']
        vote_kap = df_v.ix['producer', 'kappa']
        #mean_acc = df_m.ix['user', 'producer']
        #mean_kap = df_m.ix['user', 'kappa']

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print ('\n"inventory_txt" was not specified. '
                   'Model evaluation scores will not be recorded...')

        print ''
        print 'Vote accuracy .............. ', vote_acc
        print 'Vote kappa ................. ', vote_kap
        #print 'Mean accuracy .............. ', mean_acc
        #print 'Mean kappa ................. ', mean_kap
    else:
        print ('\n"confusion_params" was not specified. '
               'This model will not be evaluated...')

    print '\nTotal prediction runtime: %.1f minutes\n' % ((time.time() - t0)/60)
for c, (set_id, row) in enumerate(df_sets.iterrows()):
    coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
    # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic
    if mosaic_predictors:
        if mosaic_path.endswith('.shp'):
            tsa_ar, tsa_off = mosaic.kernel_from_shp(mosaic_ds, coords, mosaic_tx, nodata)
        else:
            tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx,
                                                    xsize, ysize, nodata=nodata)
        set_mosaic_path = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id)
        tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
        np_dtype = get_min_numpy_dtype(tsa_ar)
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype)
        mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, set_mosaic_path,
                               gdal_dtype, silent=True)
        tsa_off = stem_conus.calc_offset((mosaic_tx[0], mosaic_tx[3]),
                                         (tx_out[0], tx_out[3]), tx_out)
    else:
        set_mosaic_path = None
        tsa_ar = None
        tsa_off = None

    # Build list of args to pass to the Pool
    args.append([c, total_sets, set_id, df_var, set_mosaic_path, tsa_off, coords,
                 mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8,
                 constant_vars, predict_dir])

print '%.1f minutes\n' % ((time.time() - t1)/60)
p = Pool(n_jobs)
p.map(stem_conus.par_predict, args, 1)
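# calc_offset() converts the difference between a master upper-left coordinate
# and a tile upper-left coordinate into a (row, col) pixel offset. A minimal
# sketch of the assumed behavior, given an affine geotransform tx with no
# rotation terms:
def calc_offset_sketch(master_ul, tile_ul, tx):
    col_off = int(round((tile_ul[0] - master_ul[0]) / tx[1]))  # x offset / x resolution
    row_off = int(round((tile_ul[1] - master_ul[1]) / tx[5]))  # y offset / y resolution (negative)
    return row_off, col_off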
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30,
         n_jobs=0, n_jobs_agg=0, mosaic_nodata=0, snap_coord=None, overwrite_tiles=False,
         tile_id_field='name'):
    inputs = stem.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    df_var = pd.read_csv(var_info, sep='\t', index_col='var_name')
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs = stem.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = pd.read_csv(train_inputs['var_info'].replace('"', ''),
                             sep='\t', index_col='var_name')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  # Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    # overwrite_tiles arrives as a string when set in the param file
    if isinstance(overwrite_tiles, str) and overwrite_tiles.lower() == 'false':
        overwrite_tiles = False

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    if not 'file_stamp' in inputs:
        file_stamp = os.path.basename(model_dir)
    db_path = os.path.join(model_dir, os.path.basename(model_dir) + '.db')
    if os.path.exists(db_path):
        engine = sqlalchemy.create_engine('sqlite:///%s' % db_path)
        with engine.connect() as con, con.begin():
            df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')
    else:
        set_txt = stem.find_file(model_dir, '*support_sets.txt')
        if not os.path.isfile(set_txt):
            raise IOError('No database or support set txt file found')
        df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        # If subset specified, clip the mosaic and set mosaic path to the clipped shp
        if 'subset_shp' in inputs:
            out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp')
            out_shp = os.path.join(out_dir, out_shp_bn)
            cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(
                clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path)
            subprocess.call(cmd, shell=True)
            mosaic_path = out_shp
        mosaic_dataset = ogr.Open(mosaic_path, 1)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')

        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            for feature in mosaic_ds:
                g = feature.GetGeometryRef()
                # Check that the feature is valid. Clipping can produce a feature
                # with an area of 0
                if g.GetArea() > 1:
                    mosaic_geom.AddGeometry(g)
                else:
                    fid = feature.GetFID()
                    feature.Destroy()
                    mosaic_ds.DeleteFeature(fid)
            df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom.UnionCascaded())
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"', '')
            snap_coord = [float(c) for c in snap_coord.split(',')]
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res,
                                             snap_coord=snap_coord)
        tiles = stem.attributes_to_df(mosaic_path)  # Change to accept arbitrary geometry
    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)

    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)
    tile_dir = os.path.join(out_dir, '_temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')
    n_tiles = len(tiles)

    if not overwrite_tiles:
        # Record which tiles already have a raster for each stat so they can be skipped
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field])
        for stat in agg_stats:
            pattern = re.compile('tile_\d+_%s.tif' % stat)
            stat_match = [f.split('_')[1] for f in files if pattern.match(f)]
            try:
                tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match)
            except:
                pass
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)
        tiles.set_index(index_field, inplace=True)

    tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res)
                     for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows()]
    tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res)
                     for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows()]
    tiles['lr_x'] = [xmax if ulx == xmin else xmin
                     for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin', 'xmin']].iterrows()]
    tiles['lr_y'] = [ymax if uly == ymin else ymin
                     for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin', 'ymin']].iterrows()]

    support_nrows = int(support_size[0] / abs(y_res))
    support_ncols = int(support_size[1] / abs(x_res))
    t1 = time.time()

    # Patch for unknown Landcover screwup: only predict a hand-picked list of tiles
    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var,
             (support_nrows, support_ncols), agg_stats, tile_path_template, prj,
             nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(
                ['1931', '2810', '0705', '0954', '2814', '1986', '2552', '2019',
                 '2355', '3354', '2278', '2559'])].iterrows())]
    # Note: this second patch overrides the list above with a single tile
    args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var,
             (support_nrows, support_ncols), agg_stats, tile_path_template, prj,
             nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(
                ['0705'])].iterrows())]
    # Patch for the GEE subset 2 outside-of-buffer 'slice'
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(['0639','0718','0797','0876','0955','1034'])].iterrows())]
    # Original line
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tile_files.isnull().any(axis=1).values].iterrows())]

    limits = []
    for arg in args:
        print tile_info[tile_id_field]
        limits.append(stem.par_predict_tile(arg))

    print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % ((time.time() - t1) / 3600)
    try:
        limits = pd.concat(limits)
    except:
        # They're all None
        pass

    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        #dtype = mosaic.get_min_numpy_dtype(limits[stat])
        dtype = np.int16
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16)
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=dtype)

        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(tile_dir,
                                     'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            try:
                ds = gdal.Open(tile_file)
            except:
                print 'Tile not found'
                continue
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off:row_off + tile_rows, col_off:col_off + tile_cols] = ar_tile
            except Exception as e:
                pass

        out_path = os.path.join(out_dir, '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype,
                               nodata=this_nodata)

    # Clean up the tiles
    #shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1) / 60)

    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Loop through and get importance
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))})
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in
                          importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')

    print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0) / 3600)
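# stem.get_max_importance() is not defined in this file; a minimal sketch of
# the assumed behavior based on its use above (the index of the most important
# predictor plus the full importance vector of a fitted scikit-learn tree):
def get_max_importance_sketch(dt_model):
    this_importance = dt_model.feature_importances_
    return int(np.argmax(this_importance)), this_importance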
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30,
         n_jobs_pred=0, n_jobs_agg=0, mosaic_nodata=0):
    inputs, df_var = stem_conus.read_params(params)
    for i in inputs:
        exec("{0} = str({1})".format(i, inputs[i]))
    df_var.data_band = [int(b) for b in df_var.data_band]  # sometimes read as float

    try:
        support_size = [int(i) for i in support_size.split(',')]
        nodata = int(nodata)
        str_check = model_dir, mosaic_path, out_dir, train_params
    except NameError as e:
        missing_var = str(e).split("'")[1]
        msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params)
        raise NameError(msg)

    # Check that all the variables given were used in training and vice versa
    try:
        train_inputs, train_vars = stem_conus.read_params(train_params)
    except:
        raise NameError('train_params not specified or does not exist')
    train_vars = sorted(train_vars.index)
    pred_vars = sorted(df_var.index)
    # Make sure vars are sorted alphabetically since they were for training
    df_var = df_var.reindex(pred_vars)

    unmatched_vars = [v for v in pred_vars if v not in train_vars]
    if len(unmatched_vars) != 0:
        unmatched_str = '\n'.join(unmatched_vars)
        msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str
        raise NameError(msg)

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    else:
        print ('WARNING: out_dir already exists:\n%s\nAny existing files '
               'will be overwritten...\n') % out_dir
    if not os.path.exists(os.path.join(out_dir, os.path.basename(params))):
        shutil.copy2(params, out_dir)  # Copy the params for reference

    if 'confusion_params' in inputs:
        conf_bn = os.path.basename(confusion_params)
        new_conf_path = os.path.join(out_dir, conf_bn)
        if not os.path.exists(new_conf_path):
            shutil.copy2(confusion_params, out_dir)
        confusion_params = new_conf_path

    if not os.path.exists(model_dir):
        sys.exit('model_dir does not exist:\n%s' % model_dir)
    if not os.path.exists(mosaic_path):
        sys.exit('mosaic_path does not exist:\n%s' % mosaic_path)

    if mosaic_path.endswith('.shp'):
        mosaic_type = 'vector'
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')
        mosaic_dataset = ogr.Open(mosaic_path)
        mosaic_ds = mosaic_dataset.GetLayer()
        min_x, max_x, min_y, max_y = mosaic_ds.GetExtent()
        xsize = int((max_x - min_x) / resolution)
        ysize = int((max_y - min_y) / resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        mosaic_tx, extent = stem_conus.tx_from_shp(mosaic_path, x_res, y_res)
        #df_tiles = attributes_to_df(mosaic_path)
    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 25 x 15 ...\n'
        n_tiles = 25, 15
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    df_tiles, df_tiles_rc, tile_size = stem_conus.get_tiles(n_tiles, xsize, ysize, mosaic_tx)

    predict_dir = os.path.join(out_dir, 'decisiontree_predictions')
    if not os.path.exists(predict_dir):
        os.mkdir(predict_dir)

    set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0]
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')
    total_sets = len(df_sets)

    t0 = time.time()
    if 'n_jobs_pred' in inputs:
        n_jobs = int(n_jobs_pred)
        # Predict in parallel
        args = []
        t1 = time.time()
        print 'Predicting in parallel with %s jobs...' % n_jobs
        print 'Building args and making rasters of tile arrays...'
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            # Save rasters of tsa arrays ahead of time to avoid needing to pickle or
            # fork mosaic
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            '''if mosaic_type == 'vector':
                tsa_ar, tsa_off = mosaic.kernel_from_shp(mosaic_ds, coords, mosaic_tx, nodata=0)
            else:
                tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx,
                                                        xsize, ysize, nodata=nodata)
            set_mosaic_path = os.path.join(predict_dir, 'tsa_%s.tif' % set_id)
            tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5]
            np_dtype = get_min_numpy_dtype(tsa_ar)
            gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype)
            mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, set_mosaic_path,
                                   gdal_dtype, silent=True)
            pct_progress = float(c + 1)/total_sets * 100
            sys.stdout.write('\rRetrieved points for feature %s of %s (%%%.1f)' % (c + 1, total_sets, pct_progress))
            sys.stdout.flush()'''

            # Build list of args to pass to the Pool
            #tsa_off = stem_conus.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tx_out[0], tx_out[3]), tx_out)
            args.append([coords, mosaic_type, mosaic_path, mosaic_tx, prj, nodata, c,
                         total_sets, set_id, df_var, xsize, ysize, row.dt_file, nodata,
                         np.uint8, constant_vars, predict_dir])
            #args.append([c, total_sets, set_id, df_var, set_mosaic_path, tsa_off, coords,
            #             mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8,
            #             constant_vars, predict_dir])
        print '%.1f minutes\n' % ((time.time() - t1)/60)
        p = Pool(n_jobs)
        p.map(stem_conus.par_predict, args, 1)
    else:
        # Loop through each set and generate predictions
        for c, (set_id, row) in enumerate(df_sets.iterrows()):
            t1 = time.time()
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            print '\nPredicting for set %s of %s' % (c + 1, total_sets)
            coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']]
            ar_predict = stem_conus.predict_set(set_id, df_var, mosaic_ds, coords,
                                                mosaic_tx, xsize, ysize, dt_model,
                                                nodata, np.int16, constant_vars)
            tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res
            out_path = os.path.join(predict_dir, 'prediction_%s.tif' % set_id)
            np_dtype = get_min_numpy_dtype(ar_predict)
            gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype)
            mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path,
                                   gdal.GDT_Byte, nodata=nodata)
            print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60)
    #mosaic = None
    print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)

    # Aggregate predictions by tile and stitch them back together
    if not 'file_stamp' in inputs:
        file_stamp = os.path.basename(model_dir)
    t1 = time.time()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    if 'n_jobs_agg' in inputs:
        n_jobs_agg = int(n_jobs_agg)

    if mosaic_type == 'vector':
        nodata_mask = mosaic_ds
    else:
        if 'mosaic_nodata' in inputs:
            mosaic_nodata = int(mosaic_nodata)
        nodata_mask = mosaic_ds.ReadAsArray() != mosaic_nodata

    # jdb 6/22/17: check for sets that errored. If there are any, remove them from
    # df_sets so that the aggregation step doesn't expect them
    setErrorLog = os.path.dirname(predict_dir) + '/predication_errors.txt'
    if os.path.isfile(setErrorLog):
        with open(setErrorLog) as f:
            lines = f.readlines()
        badSets = [int(line.split(':')[1].rstrip().strip())
                   for line in lines if 'set_id' in line]
        for thisSet in badSets:
            df_sets = df_sets[df_sets.index != thisSet]

    pct_importance, df_sets = stem_conus.aggregate_predictions(
        n_tiles, ysize, xsize, nodata, nodata_mask, mosaic_tx, support_size,
        agg_stats, predict_dir, df_sets, out_dir, file_stamp, prj, driver, n_jobs_agg)
    #print 'Total aggregation time: %.1f hours\n' % ((time.time() - t0)/3600)
    mosaic_ds = None
    mosaic_dataset = None

    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))})
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in
                          importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')

    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        # Read the mean or vote back in
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']
            '''try:
                out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
                df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
            except Exception as e:
                print e'''

        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
            mean_acc = df_m.ix['user', 'producer']
            mean_kap = df_m.ix['user', 'kappa']
            '''try:
                out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
                df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
            except Exception as e:
                print e#'''

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print ('\n"inventory_txt" was not specified. '
                   'Model evaluation scores will not be recorded...')

        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap
    else:
        print ('\n"confusion_params" was not specified. '
               'This model will not be evaluated...')

    print '\nTotal prediction runtime: %.1f minutes\n' % ((time.time() - t0) / 60)
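# get_min_numpy_dtype() is imported from elsewhere in the package; a minimal
# sketch of the assumed behavior (the smallest numpy dtype that can represent
# the array's value range), assuming numpy is imported as np:
def get_min_numpy_dtype_sketch(ar):
    return np.promote_types(np.min_scalar_type(int(ar.min())),
                            np.min_scalar_type(int(ar.max())))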
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0, snap_coord=None, overwrite_tiles=False, tile_id_field='name'): inputs, df_var = stem.read_params(params) for i in inputs: exec ("{0} = str({1})").format(i, inputs[i]) df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float try: support_size = [int(i) for i in support_size.split(',')] nodata = int(nodata) str_check = model_dir, mosaic_path, out_dir, train_params except NameError as e: missing_var = str(e).split("'")[1] msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) # Check that all the variables given were used in training and vice versa try: train_inputs, train_vars = stem.read_params(train_params) except: raise NameError('train_params not specified or does not exist') train_vars = sorted(train_vars.index) pred_vars = sorted(df_var.index) # Make sure vars are sorted alphabetically since they were for training df_var = df_var.reindex(pred_vars) unmatched_vars = [v for v in pred_vars if v not in train_vars] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str raise NameError(msg) if not os.path.exists(out_dir): os.mkdir(out_dir) else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \ 'will be overwritten...\n') % out_dir if not os.path.exists(os.path.join(out_dir, os.path.basename(params))): shutil.copy2(params, out_dir) #Copy the params for reference if 'confusion_params' in inputs: conf_bn = os.path.basename(confusion_params) new_conf_path = os.path.join(out_dir, conf_bn) if not os.path.exists(new_conf_path): shutil.copy2(confusion_params, out_dir) confusion_params = new_conf_path if not os.path.exists(model_dir): sys.exit('model_dir does not exist:\n%s' % model_dir) if not os.path.exists(mosaic_path): sys.exit('mosaic_path does not exist:\n%s' % mosaic_path) predict_dir = os.path.join(out_dir, 'decisiontree_predictions') if not os.path.exists(predict_dir): os.mkdir(predict_dir) if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir) db_path = os.path.join(model_dir, file_stamp + '.db') try: engine = sqlalchemy.create_engine('sqlite:///%s' % db_path) with engine.connect() as con, con.begin(): df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')#''' except: set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0] if not os.path.isfile(set_txt): raise IOError('No database or support set txt file found') df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') if mosaic_path.endswith('.shp'): mosaic_type = 'vector' # if subset specified, clip the mosaic and set mosaic path to clipped shp if 'subset_shp' in inputs: out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp') out_shp = os.path.join(out_dir, out_shp_bn) cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path) subprocess.call(cmd, shell=True)#''' mosaic_path = out_shp mosaic_dataset = ogr.Open(mosaic_path) mosaic_ds = mosaic_dataset.GetLayer() min_x, max_x, min_y, max_y = mosaic_ds.GetExtent() if 'resolution' not in inputs: warnings.warn('Resolution not specified. 
        if 'resolution' not in inputs:
            warnings.warn('Resolution not specified. Using default of 30...\n')

        # If subset specified, just get sets that overlap the subset
        if 'subset_shp' in inputs:
            mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon)
            for feature in mosaic_ds:
                mosaic_geom.AddGeometry(feature.GetGeometryRef())
            df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom)

        xsize = int((max_x - min_x)/resolution)
        ysize = int((max_y - min_y)/resolution)
        prj = mosaic_ds.GetSpatialRef().ExportToWkt()
        x_res = resolution
        y_res = -resolution
        x_rot = 0
        y_rot = 0
        if 'snap_coord' in train_inputs:
            snap_coord = train_inputs['snap_coord'].replace('"', '')
            snap_coord = [float(c) for c in snap_coord.split(',')]#'''
        mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res,
                                             snap_coord=snap_coord)
        tiles = stem.attributes_to_df(mosaic_path)  # Change to accept arbitrary geometry

    else:
        mosaic_type = 'raster'
        mosaic_ds = gdal.Open(mosaic_path)
        mosaic_tx = mosaic_ds.GetGeoTransform()
        xsize = mosaic_ds.RasterXSize
        ysize = mosaic_ds.RasterYSize
        prj = mosaic_ds.GetProjection()
        driver = mosaic_ds.GetDriver()
        m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx
    #driver = gdal.GetDriverByName('gtiff')

    # If number of tiles not given, need to set it
    if 'n_tiles' not in inputs:
        print 'n_tiles not specified. Using default: 90 x 40 ...\n'
        n_tiles = 90, 40
    else:
        n_tiles = [int(i) for i in n_tiles.split(',')]
    #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx)

    total_sets = len(df_sets)
    t0 = time.time()
    last_dts = pd.Series()
    agg_stats = [s.strip().lower() for s in agg_stats.split(',')]
    n_jobs = int(n_jobs)

    tile_dir = os.path.join(model_dir, 'temp_tiles')
    #tile_dir = '/home/server/pi/homes/shooper/delete_test'
    if not os.path.isdir(tile_dir):
        os.mkdir(tile_dir)
    # {tile_id} is filled in per tile; %(stat)s is filled in per agg stat
    tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif')

    n_tiles = len(tiles)  # n_tiles is reused here as the tile count
    # Unless overwriting, drop tiles that already have a file for every stat
    if not overwrite_tiles:
        files = os.listdir(tile_dir)
        tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field])
        for stat in agg_stats:
            stat_match = [f.split('_')[1] for f in fnmatch.filter(files, 'tile*%s.tif' % stat)]
            tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match)
        index_field = tiles.index.name
        tiles[index_field] = tiles.index
        tiles = tiles.set_index(tile_id_field, drop=False)[tile_files.isnull().any(axis=1)]
        tiles.set_index(index_field, inplace=True)

    tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res)
                     for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows()]
    tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res)
                     for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows()]
    tiles['lr_x'] = [xmax if ulx == xmin else xmin
                     for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin', 'xmax']].iterrows()]
    tiles['lr_y'] = [ymax if uly == ymin else ymin
                     for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin', 'ymax']].iterrows()]

    # Support-set size is given in map units; convert it to pixels
    support_nrows = int(support_size[0]/abs(y_res))
    support_ncols = int(support_size[1]/abs(x_res))

    t1 = time.time()
    # NOTE: tiles are filtered to a hard-coded subset of tile names here; use
    # the commented-out args line below to predict all tiles
    args = [(tile_info, mosaic_path, mosaic_tx, df_sets, df_var,
             (support_nrows, support_ncols), agg_stats, tile_path_template,
             prj, nodata, snap_coord)
            for i, (t_ind, tile_info) in enumerate(
                tiles[tiles['name'].isin(['1771', '3224', '0333', '0558'])].iterrows())]
    #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.iterrows())]
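    # (Editor's note) stem.predict_tile is mapped over these per-tile argument
    # tuples below. pool.map(..., 1) uses a chunksize of 1 so each worker is
    # handed one tile at a time, keeping long-running tiles from queuing
    # behind short ones in a single worker's chunk.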
    if n_jobs > 1:
        print 'Predicting with %s jobs...\n' % n_jobs
        pool = Pool(n_jobs)
        pool.map(stem.predict_tile, args, 1)
        pool.close()
        pool.join()
    else:
        print 'Predicting with 1 job...\n'
        for arg in args:
            # predict_tile takes the packed arg tuple, matching the
            # pool.map() call above
            stem.predict_tile(arg)#'''
    print '\n\nFinished predicting in %.1f hours.\n\nStitching tiles...' % ((time.time() - t1)/3600)

    # Stitch the per-tile rasters into one mosaic per agg stat
    t1 = time.time()
    mosaic_ul = mosaic_tx[0], mosaic_tx[3]
    driver = gdal.GetDriverByName('gtiff')
    for stat in agg_stats:
        if stat == 'stdv':
            this_nodata = -9999
            ar = np.full((ysize, xsize), this_nodata, dtype=np.int16)
        else:
            this_nodata = nodata
            ar = np.full((ysize, xsize), this_nodata, dtype=np.uint8)

        for tile_id, tile_coords in tiles.iterrows():
            tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat))
            ds = gdal.Open(tile_file)
            tile_tx = ds.GetGeoTransform()
            tile_ul = tile_tx[0], tile_tx[3]
            row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx)
            # Make sure the tile doesn't exceed the size of ar
            tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off
            tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off
            ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows)
            try:
                ar[row_off:row_off + tile_rows, col_off:col_off + tile_cols] = ar_tile
            except Exception as e:
                import pdb; pdb.set_trace()  # debugging hook for misplaced tiles

        out_path = os.path.join(model_dir, '%s_%s.tif' % (file_stamp, stat))
        #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat))
        gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype)
        mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype,
                               nodata=this_nodata)

    # Clean up the tiles
    shutil.rmtree(tile_dir)
    print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1)/60)

    # Get feature importances and max importance per set
    t1 = time.time()
    print 'Getting importance values...'
    importance_cols = sorted([c for c in df_sets.columns if 'importance' in c])
    df_sets['max_importance'] = nodata
    if len(importance_cols) == 0:
        # Importances weren't recorded per set, so unpickle each decision tree
        importance_per_var = []
        for s, row in df_sets.iterrows():
            with open(row.dt_file, 'rb') as f:
                dt_model = pickle.load(f)
            max_importance, this_importance = stem.get_max_importance(dt_model)
            df_sets.ix[s, 'max_importance'] = max_importance
            importance_per_var.append(this_importance)
        importance = np.array(importance_per_var).mean(axis=0)
    else:
        df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1)
        importance = df_sets[importance_cols].mean(axis=0).values
    pct_importance = importance / importance.sum()
    print '%.1f minutes\n' % ((time.time() - t1)/60)

    # Save the importance values
    importance = pd.DataFrame({'variable': pred_vars,
                               'pct_importance': pct_importance,
                               'index': range(len(pred_vars))})
    importance.set_index('index', inplace=True)
    importance['rank'] = [int(r) for r in
                          importance.pct_importance.rank(method='first', ascending=False)]
    out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp)
    importance.to_csv(out_txt, sep='\t')#'''
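    # (Editor's note) In the branch above, np.argmax(...) stores the
    # *positional* index of the winning column within the alphabetically
    # sorted importance_cols, not the variable name itself; map it back
    # through importance_cols if the name is needed.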
    if 'confusion_params' in locals():
        import confusion_matrix as confusion

        # Read the mean or vote back in
        if 'vote' in agg_stats:
            vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp)
            ar_vote = gdal.Open(vote_path)
            print '\nComputing confusion matrix for vote...'
            vote_dir = os.path.join(model_dir, 'evaluation_vote')
            out_txt = os.path.join(vote_dir, 'confusion.txt')
            df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True)
            vote_acc = df_v.ix['producer', 'user']
            vote_kap = df_v.ix['producer', 'kappa']
            '''try:
                out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt')
                df_v_off = confusion.main(confusion_params, ar_vote, out_txt)
            except Exception as e:
                print e'''

        if 'mean' in agg_stats:
            mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp)
            ar_mean = gdal.Open(mean_path)
            print '\nGetting confusion matrix for mean...'
            mean_dir = os.path.join(model_dir, 'evaluation_mean')
            out_txt = os.path.join(mean_dir, 'confusion.txt')
            df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True)
            mean_acc = df_m.ix['user', 'producer']
            mean_kap = df_m.ix['user', 'kappa']
            '''try:
                out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt')
                df_m_off = confusion.main(confusion_params, ar_mean, out_txt)
            except Exception as e:
                print e#'''

        if 'inventory_txt' in inputs:
            df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp')
            cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask']
            df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False
            df_inv.to_csv(inventory_txt, sep='\t')
        else:
            print ('\n"inventory_txt" was not specified. '
                   'Model evaluation scores will not be recorded...')

        print ''
        if 'vote' in agg_stats:
            print 'Vote accuracy .............. ', vote_acc
            print 'Vote kappa ................. ', vote_kap
        if 'mean' in agg_stats:
            print 'Mean accuracy .............. ', mean_acc
            print 'Mean kappa ................. ', mean_kap
    else:
        print ('\n"confusion_params" was not specified. '
               'This model will not be evaluated...')  #'''

    print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0)/3600)
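

# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not the pipeline's implementation.
# The stitching loop in main() relies on stem.calc_offset to locate each tile
# within the full mosaic array. Assuming north-up rasters with no rotation,
# that offset math reduces to the following, where each *_ul is an (x, y)
# upper-left map coordinate and tx is a GDAL geotransform:
def _calc_offset_sketch(mosaic_ul, tile_ul, tx):
    """Return the (row, col) of tile_ul within the mosaic grid."""
    col_off = int(round((tile_ul[0] - mosaic_ul[0]) / tx[1]))  # tx[1] = x res
    row_off = int(round((tile_ul[1] - mosaic_ul[1]) / tx[5]))  # tx[5] = y res (negative)
    return row_off, col_off
# Example: a tile whose UL corner is 3000 m east and 1500 m south of the
# mosaic UL at 30 m resolution lands at row 50, column 100:
# _calc_offset_sketch((0, 0), (3000, -1500), (0, 30, 0, 0, 0, -30)) -> (50, 100)
# ---------------------------------------------------------------------------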


def main(in_raster, snap_raster, in_nodata, out_nodata, out_path=None,
         mask_val=None, overwrite=False):

    t0 = time.time()
    in_nodata = int(in_nodata)
    out_nodata = int(out_nodata)

    print '\nOpening datasets...'
    t1 = time.time()
    ds_in = gdal.Open(in_raster)
    ar_in = ds_in.ReadAsArray()
    tx_in = ds_in.GetGeoTransform()
    driver = ds_in.GetDriver()
    ds_in = None

    ds_snap = gdal.Open(snap_raster)
    ar_snap = ds_snap.ReadAsArray()
    tx_snap = ds_snap.GetGeoTransform()
    prj = ds_snap.GetProjection()
    ds_snap = None
    print '%.1f seconds\n' % (time.time() - t1)

    print 'Snapping input raster...'
    t1 = time.time()
    # Build an output array on the snap grid and paste the overlapping window
    # of the input into it
    offset = calc_offset((tx_snap[0], tx_snap[3]), tx_in)
    snap_inds, in_inds = get_offset_array_indices(ar_snap.shape, ar_in.shape, offset)
    np_dtype = ar_in.dtype
    ar = np.full(ar_snap.shape, out_nodata, dtype=np_dtype)
    ar_in[ar_in == in_nodata] = out_nodata
    ar[snap_inds[0]:snap_inds[1], snap_inds[2]:snap_inds[3]] = \
        ar_in[in_inds[0]:in_inds[1], in_inds[2]:in_inds[3]]
    if mask_val:
        mask_val = int(mask_val)
        ar[ar_snap == mask_val] = out_nodata
    print '%.1f seconds\n' % (time.time() - t1)

    if out_path:
        # Use the smallest GDAL dtype that can hold the data
        if ar.max() <= 255 and ar.min() >= 0:
            gdal_dtype = gdal.GDT_Byte
        else:
            gdal_dtype = gdal.GDT_Int16
        if os.path.exists(out_path) and not overwrite:
            sys.exit('out_path already exists')
        array_to_raster(ar, tx_snap, prj, driver, out_path, gdal_dtype, out_nodata)

        # Write metadata
        desc = 'Input raster %s snapped to the extent of %s.' % (in_raster, snap_raster)
        if mask_val:
            desc += ' Data were masked from snap raster with value %s.' % mask_val
        createMetadata(sys.argv, out_path, description=desc)
    else:
        return ar

    print '\nTotal time to snap raster: %.1f seconds\n' % (time.time() - t0)
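

# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not the pipeline's implementation.
# main() above depends on get_offset_array_indices to find the overlapping
# window between the snap grid and the input grid. A minimal version of that
# logic, assuming offset = (row_off, col_off) of the input array's upper-left
# corner within the snap array:
def _offset_indices_sketch(snap_shape, in_shape, offset):
    row_off, col_off = offset
    # Clip the input's footprint to the snap array's bounds
    snap_r0, snap_c0 = max(0, row_off), max(0, col_off)
    snap_r1 = min(snap_shape[0], row_off + in_shape[0])
    snap_c1 = min(snap_shape[1], col_off + in_shape[1])
    # Translate the clipped window back into input-array coordinates
    in_r0, in_c0 = snap_r0 - row_off, snap_c0 - col_off
    in_r1 = in_r0 + (snap_r1 - snap_r0)
    in_c1 = in_c0 + (snap_c1 - snap_c0)
    return (snap_r0, snap_r1, snap_c0, snap_c1), (in_r0, in_r1, in_c0, in_c1)
# Example: a 4x4 input placed at offset (1, 2) in a 5x5 snap grid overlaps
# snap rows 1:5, cols 2:5 and input rows 0:4, cols 0:3.
# ---------------------------------------------------------------------------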