import os
import sys
import time
import random
import pickle
from glob import glob
from datetime import datetime

import numpy as np
import pandas as pd
from osgeo import gdal, ogr

import stem
import mosaic
# get_tiles, find_empty_tiles, calc_proportional_area, calc_strata_sizes,
# extract_by_kernel, and array_to_raster are assumed to be defined elsewhere
# in this module or imported from the project's helper modules.


def buffered_tile_inds(n_tiles, xsize, ysize, tx, tile_buffer, mask):
    ''' Return row/col indices of non-empty tiles and a buffered copy of them '''
    # Find empty tiles
    print 'Finding empty tiles...'
    t1 = time.time()
    df_tiles, df_tiles_rc, _ = get_tiles(n_tiles, xsize, ysize, tx)
    total_tiles = len(df_tiles)
    empty_tiles = find_empty_tiles(df_tiles, mask, tx)
    df_tiles = df_tiles_rc.select(lambda x: x not in empty_tiles)
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' % \
        (len(empty_tiles), total_tiles, (time.time() - t1) / 60)

    # Add a buffer around each tile, clipped so no tile indexes outside
    # the raster
    df_buf = df_tiles.copy()
    df_buf[['ul_r', 'ul_c']] = df_buf[['ul_r', 'ul_c']] - tile_buffer
    df_buf[['lr_r', 'lr_c']] = df_buf[['lr_r', 'lr_c']] + tile_buffer
    df_buf[['ul_r', 'lr_r']] = df_buf[['ul_r', 'lr_r']].clip(0, ysize)
    df_buf[['ul_c', 'lr_c']] = df_buf[['ul_c', 'lr_c']].clip(0, xsize)

    return df_tiles, df_buf
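
# A minimal usage sketch (not part of the original module): how the buffered
# and unbuffered tile frames returned above are typically consumed. The
# dataset, nodata value, and tile counts here are hypothetical stand-ins.
def _example_buffered_tiles(ds, nodata, n_tiles=(10, 10), tile_buffer=50):
    # Boolean mask of valid pixels (assumes a single-band dataset)
    mask = ds.ReadAsArray() != nodata
    tx = ds.GetGeoTransform()
    df_tiles, df_buf = buffered_tile_inds(n_tiles, ds.RasterXSize,
                                          ds.RasterYSize, tx, tile_buffer,
                                          mask)
    # Read a buffered window for the first tile; the clip() in
    # buffered_tile_inds guarantees these offsets stay within the raster.
    coords = df_buf.iloc[0]
    window = ds.GetRasterBand(1).ReadAsArray(
        int(coords.ul_c), int(coords.ul_r),
        int(coords.lr_c - coords.ul_c), int(coords.lr_r - coords.ul_r))
    return df_tiles, df_buf, window
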
def get_stratified_sample_by_tile(raster_path, col_name, data_band, n_samples,
                                  bins, min_sample=None, max_sample=None,
                                  pct_train=1, nodata=None,
                                  sampling_scheme='equal', zero_inflation=None,
                                  data_type='continuous', kernel=False,
                                  n_tiles=(1, 1), boundary_shp=None,
                                  bin_scale=1, n_per_tile=None):
    ''' Return a dataframe of stratified randomly sampled pixels from
    raster_path '''
    print 'Reading the raster_path... %s\n' % datetime.now()
    ds = gdal.Open(raster_path)
    tx = ds.GetGeoTransform()
    band = ds.GetRasterBand(data_band)

    if nodata is None:
        nodata = band.GetNoDataValue()
        if nodata is None:
            sys.exit('Could not obtain nodata value from dataset and none '
                     'specified in parameters file. Try re-running with '
                     'nodata specified.')

    # Split the raster into tiles and figure out how many samples to draw
    # from each tile
    print 'Calculating sample size per tile...'
    t1 = time.time()
    xsize = ds.RasterXSize
    ysize = ds.RasterYSize
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, tx)
    total_tiles = len(df_tiles)

    # If a boundary shapefile is given, calculate the proportional area
    # within each tile
    if boundary_shp:
        boundary_ds = ogr.Open(boundary_shp)
        boundary_lyr = boundary_ds.GetLayer()
        empty_tiles = stem.find_empty_tiles(df_tiles, boundary_lyr, tx,
                                            nodata=0)
        df_tiles.drop(empty_tiles, inplace=True)
        df_tiles_rc.drop(empty_tiles, inplace=True)
        total_tiles = len(df_tiles)
        calc_proportional_area(df_tiles, boundary_shp)  # Calcs area in place
        if n_per_tile:
            pct_area = df_tiles.pct_area
            df_tiles['pct_max_sample'] = pct_area / (pct_area.max() -
                                                     pct_area.min())
            df_tiles_rc['n_samples'] = (n_per_tile *
                                        df_tiles.pct_max_sample).astype(int)
        else:
            df_tiles_rc['n_samples'] = n_samples * df_tiles.pct_area
    else:
        if n_per_tile:
            df_tiles_rc['n_samples'] = n_per_tile
        else:
            df_tiles_rc['n_samples'] = float(n_samples) / total_tiles
    df_tiles['n_samples'] = df_tiles_rc.n_samples
    print '%.1f minutes\n' % ((time.time() - t1) / 60)

    # For each tile, draw a random sample for each bin
    train_rows = []
    train_cols = []
    test_rows = []
    test_cols = []
    empty_tiles = []
    classes = ['_%s' % b[1] for b in bins]
    df_tiles = df_tiles.reindex(columns=df_tiles.columns.tolist() + classes,
                                fill_value=0)
    for c, (i, tile_coords) in enumerate(df_tiles_rc.iterrows()):
        t1 = time.time()
        print 'Sampling %d pixels for tile %s of %s...' % (
            tile_coords.n_samples, c + 1, total_tiles)
        if tile_coords.n_samples == 0:
            print '\tSkipping this tile because its sample size == 0...\n'
            empty_tiles.append(i)
            continue

        ul_r, lr_r, ul_c, lr_c = tile_coords[['ul_r', 'lr_r', 'ul_c', 'lr_c']]
        tile_ysize = lr_r - ul_r
        tile_xsize = lr_c - ul_c
        ar = band.ReadAsArray(ul_c, ul_r, tile_xsize, tile_ysize)
        if not isinstance(ar, np.ndarray):
            import pdb; pdb.set_trace()
        nodata_mask = ar != nodata
        if not nodata_mask.any():
            print '\tSkipping this tile because all pixels == nodata...\n'
            empty_tiles.append(i)
            continue
        ar_rows, ar_cols = np.indices(ar.shape)
        ar_rows = ar_rows + ul_r
        ar_cols = ar_cols + ul_c

        n_per_bin, scaled_pcts = calc_strata_sizes(ar, nodata_mask,
                                                   tile_coords.n_samples,
                                                   bins, sampling_scheme,
                                                   bin_scale, zero_inflation)
        df_tiles.ix[i, classes] = n_per_bin  # Record sample size per bin

        # b_i rather than i so the tile index above isn't shadowed
        for b_i, (this_min, this_max) in enumerate(bins):
            try:
                this_sample_size = int(n_per_bin[b_i])
                if min_sample:
                    this_sample_size = int(max(n_per_bin[b_i], min_sample))
                if max_sample:
                    if max_sample < this_sample_size:
                        this_sample_size = max_sample
            except:
                import pdb; pdb.set_trace()
            print 'Sampling between %s and %s: %s pixels (%.1f%% of sample ' \
                  'for this tile)' % (this_min, this_max, this_sample_size,
                                      scaled_pcts[b_i] * 100)
            mask = (ar > this_min) & (ar <= this_max) & nodata_mask
            these_rows = ar_rows[mask]
            these_cols = ar_cols[mask]

            # If there aren't enough pixels to generate samples for this bin,
            # return everything available
            if these_rows.size < this_sample_size:
                print ('Not enough pixels between {0} and {1} to generate '
                       '{2} random samples. Returning all {3} pixels for '
                       'this bin.').format(this_min, this_max,
                                           this_sample_size, these_rows.size)
                tr_rows = these_rows
                tr_cols = these_cols
            else:
                samples = random.sample(xrange(len(these_rows)),
                                        this_sample_size)
                tr_rows = these_rows[samples]
                tr_cols = these_cols[samples]

            # If pct_train is specified, split the sample indices into
            # train/test sets
            te_rows = []
            te_cols = []
            if pct_train:
                split_ind = int(len(tr_rows) * pct_train)
                te_rows = tr_rows[split_ind:]
                te_cols = tr_cols[split_ind:]
                tr_rows = tr_rows[:split_ind]
                tr_cols = tr_cols[:split_ind]
            train_rows.extend(tr_rows)
            train_cols.extend(tr_cols)
            test_rows.extend(te_rows)
            test_cols.extend(te_cols)
        print 'Time for this tile: %.1f minutes\n' % ((time.time() - t1) / 60)
        del tr_rows, tr_cols, te_rows, te_cols, ar

    # Read the whole raster in to extract values
    ar = band.ReadAsArray()

    # If kernel is True, extract each value with a 3x3 kernel. Otherwise,
    # just get the values at (row, col)
    if kernel:
        train_vals = extract_by_kernel(ar, train_rows, train_cols, data_type,
                                       col_name, nodata)
    else:
        train_vals = ar[train_rows, train_cols]

    # Calculate x and y for later extractions
    ul_x, x_res, x_rot, ul_y, y_rot, y_res = tx
    train_x = [int(ul_x + c * x_res) for c in train_cols]
    train_y = [int(ul_y + r * y_res) for r in train_rows]
    df_train = pd.DataFrame({'x': train_x,
                             'y': train_y,
                             'row': train_rows,
                             'col': train_cols,
                             col_name: train_vals})

    # If training and testing samples were split, get test vals
    df_test = None
    if pct_train < 1:
        if kernel:
            test_vals = extract_by_kernel(ar, test_rows, test_cols,
                                          data_type, col_name, nodata)
        else:
            test_vals = ar[test_rows, test_cols]
        test_x = [int(ul_x + c * x_res) for c in test_cols]
        test_y = [int(ul_y + r * y_res) for r in test_rows]
        df_test = pd.DataFrame({'x': test_x,
                                'y': test_y,
                                'row': test_rows,
                                'col': test_cols,
                                col_name: test_vals})
    ds = None

    # In case empty tiles weren't filtered out already
    df_tiles = df_tiles.ix[~df_tiles.index.isin(empty_tiles)]

    return df_train, df_test, df_tiles
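
# Usage sketch (illustrative only; the raster path and bin edges below are
# hypothetical): draw ~10,000 samples stratified into three value bins,
# holding out 20% of the sample for testing.
def _example_stratified_sample():
    bins = [(-1, 0), (0, 50), (50, 100)]  # (lower, upper] value ranges
    df_train, df_test, df_tiles = get_stratified_sample_by_tile(
        '/vol/data/biomass.tif',  # hypothetical raster
        'biomass', data_band=1, n_samples=10000, bins=bins,
        pct_train=0.8, n_tiles=(5, 5))
    # df_train/df_test carry x, y, row, col, and the sampled value, ready to
    # join against predictor rasters for model training.
    return df_train, df_test, df_tiles
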
def main(model_dir, n_tiles, **kwargs):
    t0 = time.time()

    n_tiles = [int(n) for n in n_tiles.split(',')]

    if not os.path.isdir(model_dir):
        message = 'model directory given does not exist or is not a ' \
                  'directory: %s' % model_dir
        raise IOError(message)

    model = os.path.basename(model_dir)
    dt_dir = os.path.join(model_dir, 'decisiontree_models')
    set_txt = os.path.join(dt_dir, '%s_support_sets.txt' % model)
    df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id')

    pred_param_path = glob(os.path.join(model_dir,
                                        'predict_stem_*params.txt'))[0]
    predict_params, df_var = stem.read_params(pred_param_path)
    train_param_path = glob(os.path.join(model_dir,
                                         'train_stem_*params.txt'))[0]
    train_params, _ = stem.read_params(train_param_path)
    df_var.sort_index(inplace=True)

    nodata = int(predict_params['nodata'].replace('"', ''))
    if len(kwargs) == 0:
        var_ids = df_sets.max_importance.unique()
        var_names = df_var.ix[var_ids].index
        variables = zip(var_ids, var_names)
    else:
        variables = [(var_id, var_name) for var_name, var_id
                     in kwargs.iteritems()]

    mask_path = os.path.join(model_dir, '%s_vote.bsq' % model)
    if not os.path.exists(mask_path):
        mask_path = mask_path.replace('.bsq', '.tif')
    mask_ds = gdal.Open(mask_path)
    mask_tx = mask_ds.GetGeoTransform()
    xsize = mask_ds.RasterXSize
    ysize = mask_ds.RasterYSize
    prj = mask_ds.GetProjection()
    df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize,
                                                      mask_tx)
    total_tiles = len(df_tiles)
    df_tiles['tile'] = df_tiles.index

    # Find the tiles that have only nodata values
    t1 = time.time()
    print '\nFinding empty tiles...'
    mask = mask_ds.ReadAsArray() == nodata
    empty_tiles = stem.find_empty_tiles(df_tiles, ~mask, mask_tx)
    mask_ds = None
    print '%s empty tiles found of %s total tiles\n%.1f minutes\n' % \
        (len(empty_tiles), total_tiles, (time.time() - t1) / 60)

    # Select only tiles that are not empty
    df_tiles = df_tiles.select(lambda x: x not in empty_tiles)
    total_tiles = len(df_tiles)

    support_size = [int(s) for s in
                    train_params['support_size'].replace('"', '').split(',')]
    set_size = [int(abs(s / mask_tx[1])) for s in support_size]

    out_dir = os.path.join(model_dir, 'importance_maps')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    print variables
    for vi, (v_id, v_name) in enumerate(variables):
        t1 = time.time()
        print 'Making map for %s: %s of %s variables\n' % (v_name, vi + 1,
                                                           len(variables))
        ar = np.full((ysize, xsize), nodata, dtype=np.uint8)
        for i, (t_ind, t_row) in enumerate(df_tiles.iterrows()):
            t2 = time.time()
            print 'Aggregating for %s of %s tiles' % (i + 1, total_tiles)

            # Calculate the size of this tile in case it's at the edge,
            # where the tile size will be slightly different
            this_size = (abs(t_row.lr_y - t_row.ul_y),
                         abs(t_row.lr_x - t_row.ul_x))
            df_these_sets = stem.get_overlapping_sets(df_sets, t_row,
                                                      this_size, support_size)
            rc = df_tiles_rc.ix[t_ind]
            this_size = rc.lr_r - rc.ul_r, rc.lr_c - rc.ul_c
            n_sets = len(df_these_sets)

            # Load overlapping predictions from disk and read them as arrays
            tile_ul = t_row[['ul_x', 'ul_y']]
            print n_sets, 'overlapping sets'
            importance_bands = []
            importance_values = []
            for s_ind, s_row in df_these_sets.iterrows():
                # Calculate offset and array/tile indices
                offset = stem.calc_offset(tile_ul, (s_row.ul_x, s_row.ul_y),
                                          mask_tx)
                tile_inds, a_inds = mosaic.get_offset_array_indices(
                    tile_size, set_size, offset)

                # Get the feature with maximum importance and fill the tile
                # with that value
                try:
                    with open(s_row.dt_file, 'rb') as f:
                        dt_model = pickle.load(f)
                    importance_value = int(
                        dt_model.feature_importances_[v_id] * 100)
                    importance_values.append(importance_value)
                    import_band = np.full(this_size, np.nan, dtype=np.float16)
                    import_band[tile_inds[0]:tile_inds[1],
                                tile_inds[2]:tile_inds[3]] = importance_value
                    importance_bands.append(import_band)
                except Exception as e:
                    print e
                    continue

            print 'Average importance for this tile: %.1f' % \
                np.mean(importance_values)

            # Aggregate by averaging importance across all overlapping sets
            importance_stack = np.dstack(importance_bands)
            importance_tile = np.nanmean(importance_stack, axis=2)
            tile_mask = mask[rc.ul_r:rc.lr_r, rc.ul_c:rc.lr_c] | \
                np.isnan(importance_tile)
            importance_tile[tile_mask] = nodata
            ar[rc.ul_r:rc.lr_r, rc.ul_c:rc.lr_c] = \
                np.round(importance_tile).astype(np.uint8)
            print 'Aggregation time for this tile: %.1f minutes\n' % \
                ((time.time() - t2) / 60)

        out_path = os.path.join(out_dir,
                                '%s_importance_%s.tif' % (model, v_name))
        try:
            array_to_raster(ar, mask_tx, prj, gdal.GetDriverByName('gtiff'),
                            out_path, gdal.GDT_Byte, nodata)
        except Exception as e:
            print e
            import pdb; pdb.set_trace()

        print 'Time for this variable: %.1f minutes\n' % \
            ((time.time() - t1) / 60)

    print '\nTotal time for %s variables: %.1f hours\n' % \
        (len(variables), (time.time() - t0) / 3600)
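
# Command-line entry point (a sketch; this section doesn't show the original
# script's argument handling, so this wiring is an assumption):
if __name__ == '__main__':
    # e.g. python importance_maps.py /path/to/model_dir 25,15
    sys.exit(main(sys.argv[1], sys.argv[2]))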