def get_zone_inds(ar_size, zone_size, tx, feat):
    '''
    Return the index sets needed to line up an array with the pixels that
    overlap the bounding envelope of a vector feature.

    Parameters
    ----------
    ar_size : passed unchanged to mosaic.get_offset_array_indices as the
        size of the array being indexed.
    zone_size : passed unchanged to mosaic.get_offset_array_indices as the
        size of the zone (feature) window.
    tx : 6-item sequence unpacked as
        (ul_x, x_res, x_rot, ul_y, y_rot, y_res).
    feat : object exposing GetGeometryRef(); the geometry's GetEnvelope()
        must return four values (two x values, then two y values).

    Returns
    -------
    (a_inds, m_inds) : the two values returned by
        mosaic.get_offset_array_indices. Per the original author's note
        each is presumably ordered (upper_row, lower_row, left_col,
        right_col) so it can index an array as
        [upper_row:lower_row, left_col:right_col] -- TODO confirm against
        mosaic.get_offset_array_indices.

    Raises ZeroDivisionError if either resolution in tx is 0.
    '''
    env_x_a, env_x_b, env_y_a, env_y_b = feat.GetGeometryRef().GetEnvelope()

    ar_ulx, x_res, _, ar_uly, _, y_res = tx

    # +1 or -1 depending on the direction in which coordinates increase
    # along each axis.
    sign_x = x_res / abs(x_res)
    sign_y = y_res / abs(y_res)

    # Dividing by the sign before and after min() picks the smaller
    # coordinate when the resolution is positive and the larger one when it
    # is negative, i.e. the feature's upper-left corner in pixel direction.
    feat_ulx = min(env_x_a / sign_x, env_x_b / sign_x) / sign_x
    feat_uly = min(env_y_a / sign_y, env_y_b / sign_y) / sign_y

    pixel_offset = stem.calc_offset((ar_ulx, ar_uly), (feat_ulx, feat_uly), tx)

    # Indices for the overlapping portions of each array
    array_inds, zone_inds = mosaic.get_offset_array_indices(
        ar_size, zone_size, pixel_offset)
    return array_inds, zone_inds
def main(params, inventory_txt=None, constant_vars=None): inputs, df_var = stem.read_params(params) for i in inputs: exec ("{0} = str({1})").format(i, inputs[i]) df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float try: n_tiles = [int(i) for i in n_tiles.split(',')] support_size = [int(i) for i in support_size.split(',')] nodata = int(nodata) str_check = model_dir, mosaic_path, out_dir, train_params except NameError as e: missing_var = str(e).split("'")[1] msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) # Check that all the variables given were used in training and vice versa try: train_inputs, train_vars = stem.read_params(train_params) except: raise NameError('train_params not specified or does not exist') train_vars = sorted(train_vars.index) pred_vars = sorted(df_var.index) # Make sure vars are sorted alphabetically since they were for training df_var = df_var.reindex(pred_vars) # If constants were given, make a dict and make sure they match the training # constants if 'constant_vars' in inputs: constant_vars = parse_constant_vars(constant_vars) pred_constants = sorted(constant_vars.keys()) train_constants = [i.replace(' ', '') for i in train_inputs['constant_vars'].strip('"').split(',')] train_constants = sorted(train_constants) unmatched_vars = [v for v in pred_vars if v not in train_vars] if 'constant_vars' in inputs: unmatched_vars += [v for v in pred_constants if v not in train_constants] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in train params but specified in predict params:\n' + unmatched_str raise NameError(msg) unmatched_vars = [v for v in train_vars if v not in pred_vars] if 'constant_vars' in inputs: unmatched_vars += [v for v in train_constants if v not in pred_constants] pred_vars += pred_constants # Add here because it would screw with stuff upstream if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) 
msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str raise NameError(msg) if not os.path.exists(out_dir): os.mkdir(out_dir) else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \ 'will be overwritten...\n') % out_dir if not os.path.exists(os.path.join(out_dir, os.path.basename(params))): shutil.copy2(params, out_dir) #Copy the params for reference if 'confusion_params' in inputs: conf_bn = os.path.basename(confusion_params) new_conf_path = os.path.join(out_dir, conf_bn) if not os.path.exists(new_conf_path): shutil.copy2(confusion_params, out_dir) confusion_params = new_conf_path if not os.path.exists(model_dir): sys.exit('model_dir does not exist:\n%s' % model_dir) if not os.path.exists(mosaic_path): sys.exit('mosaic_path does not exist:\n%s' % mosaic_path) mosaic_ds = gdal.Open(mosaic_path) mosaic_tx = mosaic_ds.GetGeoTransform() xsize = mosaic_ds.RasterXSize ysize = mosaic_ds.RasterYSize prj = mosaic_ds.GetProjection() driver = mosaic_ds.GetDriver() m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx predict_dir = os.path.join(out_dir, 'decisiontree_predictions') if not os.path.exists(predict_dir): os.mkdir(predict_dir) set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0] df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') total_sets = len(df_sets) t0 = time.time() if 'n_jobs' in inputs: # Predict in parallel n_jobs = int(n_jobs) args = [] t1 = time.time() print 'Predicting in parallel with %s jobs...' % n_jobs print 'Building args and making rasters of TSA arrays...' 
for c, (set_id, row) in enumerate(df_sets.iterrows()): # Save rasters of tsa arrays ahead of time to avoid needing to pickle or fork mosaic_ds coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']] tsa_ar, tsa_off = mosaic.extract_kernel(mosaic_ds, 1, coords, mosaic_tx, xsize, ysize, nodata=nodata) tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id) tx_out = row.ul_x, mosaic_tx[1], mosaic_tx[2], row.ul_y, mosaic_tx[4], mosaic_tx[5] dtype_code = mosaic_ds.GetRasterBand(1).DataType mosaic.array_to_raster(tsa_ar, tx_out, prj, driver, tsa_raster, stem.get_gdal_dtype(dtype_code), silent=True) # Build list of args to pass to the Pool tsa_raster = os.path.join(predict_dir, 'tsa_%s.bsq' % set_id) ds = gdal.Open(tsa_raster) tsa_tx = ds.GetGeoTransform() ds = None tsa_off = stem.calc_offset((mosaic_tx[0], mosaic_tx[3]), (tsa_tx[0], tsa_tx[3]), tsa_tx) args.append([c, total_sets, set_id, df_var, tsa_raster, tsa_off, coords, mosaic_tx, xsize, ysize, row.dt_file, nodata, np.uint8, constant_vars, predict_dir]) print '%.1f minutes\n' % ((time.time() - t1)/60) p = Pool(n_jobs) p.map(stem.par_predict, args, 1) else: # Loop through each set and generate predictions for c, (set_id, row) in enumerate(df_sets.ix[1043:].iterrows()): t1 = time.time() with open(row.dt_file, 'rb') as f: dt_model = pickle.load(f) print '\nPredicting for set %s of %s' % (c + 1, total_sets) coords = row[['ul_x', 'ul_y', 'lr_x', 'lr_y']] ar_predict = stem.predict_set(set_id, df_var, mosaic_ds, coords, mosaic_tx, xsize, ysize, dt_model, nodata, np.int16, constant_vars) tx = coords.ul_x, x_res, x_rot, coords.ul_y, y_rot, y_res out_path = os.path.join(predict_dir, 'prediction_%s.bsq' % set_id) mosaic.array_to_raster(ar_predict, tx, prj, driver, out_path, gdal.GDT_Byte, nodata=nodata) print 'Total time for this set: %.1f minutes' % ((time.time() - t1)/60) #mosaic_ds = None print '\nTotal time for predicting: %.1f hours\n' % ((time.time() - t0)/3600)#''' #Aggregate predictions by tile and stitch them back 
together if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir) ar_vote, pct_importance, df_sets = stem.aggregate_predictions(ysize, xsize, nodata, n_tiles, mosaic_ds, support_size, predict_dir, df_sets, out_dir, file_stamp, prj, driver, 0) #df_sets.to_csv(set_txt, sep='\t')''' mosaic_ds = None # Save the importance values importance = pd.DataFrame({'variable': pred_vars, 'pct_importance': pct_importance, 'index': range(len(pred_vars)) }) importance.set_index('index', inplace=True) importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)] out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp) importance.to_csv(out_txt, sep='\t')#''' '''ds = gdal.Open(os.path.join(model_dir, '%s_vote.bsq' % file_stamp)) ar_vote = ds.ReadAsArray() ds = None ds = gdal.Open(os.path.join(model_dir, '%s_mean.bsq' % file_stamp)) ar_mean = ds.ReadAsArray() ds = None#''' if 'confusion_params' in locals(): import confusion_matrix as confusion vote_dir = os.path.join(model_dir, 'evaluation_vote') mean_dir = os.path.join(model_dir, 'evaluation_mean') print '\nComputing confusion matrix for vote...' out_txt = os.path.join(vote_dir, 'confusion.txt') print confusion_params df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True) try: out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt') df_v_off = confusion.main(confusion_params, ar_vote, out_txt) except Exception as e: print e '''print '\nGetting confusion matrix for mean...' 
out_txt = os.path.join(mean_dir, 'confusion.txt') df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True) try: out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt') df_m_off = confusion.main(confusion_params, ar_mean, out_txt) except Exception as e: print e#''' vote_acc = df_v.ix['producer', 'user'] vote_kap = df_v.ix['producer', 'kappa'] #mean_acc = df_m.ix['user','producer'] #mean_kap = df_m.ix['user', 'kappa'] if 'inventory_txt' in inputs: df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp') cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask'] df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False df_inv.to_csv(inventory_txt, sep='\t') else: print '\n"inventory_txt" was not specified.' +\ ' Model evaluation scores will not be recorded...' print '' print 'Vote accuracy .............. ', vote_acc print 'Vote kappa ................. ', vote_kap #print 'Mean accuracy .............. ', mean_acc #print 'Mean kappa ................. ', mean_kap else: print '\n"confusion_params" was not specified.' +\ ' This model will not be evaluated...' #''' print '\nTotal prediction runtime: %.1f\n' % ((time.time() - t0)/60)
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0, snap_coord=None, overwrite_tiles=False, tile_id_field='name'): inputs = stem.read_params(params) for i in inputs: exec("{0} = str({1})").format(i, inputs[i]) df_var = pd.read_csv(var_info, sep='\t', index_col='var_name') df_var.data_band = [int(b) for b in df_var.data_band] #sometimes read as float try: support_size = [int(i) for i in support_size.split(',')] nodata = int(nodata) str_check = model_dir, mosaic_path, out_dir, train_params except NameError as e: missing_var = str(e).split("'")[1] msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) # Check that all the variables given were used in training and vice versa try: train_inputs = stem.read_params(train_params) except: raise NameError('train_params not specified or does not exist') train_vars = pd.read_csv(train_inputs['var_info'].replace('"', ''), sep='\t', index_col='var_name') train_vars = sorted(train_vars.index) pred_vars = sorted(df_var.index) # Make sure vars are sorted alphabetically since they were for training df_var = df_var.reindex(pred_vars) unmatched_vars = [v for v in pred_vars if v not in train_vars] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str raise NameError(msg) if not os.path.exists(out_dir): os.mkdir(out_dir) else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \ 'will be overwritten...\n') % out_dir if not os.path.exists(os.path.join(out_dir, os.path.basename(params))): shutil.copy2(params, out_dir) #Copy the params for reference if 'confusion_params' in inputs: conf_bn = os.path.basename(confusion_params) new_conf_path = os.path.join(out_dir, conf_bn) if not os.path.exists(new_conf_path): shutil.copy2(confusion_params, out_dir) confusion_params = new_conf_path if 
overwrite_tiles.lower() == 'false': overwrite_tiles = False if not os.path.exists(model_dir): sys.exit('model_dir does not exist:\n%s' % model_dir) if not os.path.exists(mosaic_path): sys.exit('mosaic_path does not exist:\n%s' % mosaic_path) if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir) db_path = os.path.join(model_dir, os.path.basename(model_dir) + '.db') if os.path.exists(db_path): engine = sqlalchemy.create_engine('sqlite:///%s' % db_path) with engine.connect() as con, con.begin(): df_sets = pd.read_sql_table('support_sets', con, index_col='set_id') #''' else: set_txt = stem.find_file(model_dir, '*support_sets.txt') if not os.path.isfile(set_txt): raise IOError('No database or support set txt file found') df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') if mosaic_path.endswith('.shp'): mosaic_type = 'vector' # if subset specified, clip the mosaic and set mosaic path to clipped shp if 'subset_shp' in inputs: out_shp_bn = os.path.basename(mosaic_path).replace( '.shp', '_clipped.shp') out_shp = os.path.join(out_dir, out_shp_bn) cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format( clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path) subprocess.call(cmd, shell=True) #''' mosaic_path = out_shp mosaic_dataset = ogr.Open(mosaic_path, 1) mosaic_ds = mosaic_dataset.GetLayer() min_x, max_x, min_y, max_y = mosaic_ds.GetExtent() if 'resolution' not in inputs: warnings.warn('Resolution not specified. Using default of 30...\n') # If subset specified, just get sets that overlap the subset if 'subset_shp' in inputs: mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon) i = 0 for feature in mosaic_ds: g = feature.GetGeometryRef() # Check that the feature is valid. 
Clipping can produce a feautre # w/ an area of 0 if g.GetArea() > 1: mosaic_geom.AddGeometry(g) else: fid = feature.GetFID() feature.Destroy() mosaic_ds.DeleteFeature(fid) #import pdb; pdb.set_trace() df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom.UnionCascaded()) xsize = int((max_x - min_x) / resolution) ysize = int((max_y - min_y) / resolution) prj = mosaic_ds.GetSpatialRef().ExportToWkt() x_res = resolution y_res = -resolution x_rot = 0 y_rot = 0 if 'snap_coord' in train_inputs: snap_coord = train_inputs['snap_coord'].replace('"', '') snap_coord = [float(c) for c in snap_coord.split(',')] #''' mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord) tiles = stem.attributes_to_df( mosaic_path) # Change to accept arbittary geometry else: mosaic_type = 'raster' mosaic_ds = gdal.Open(mosaic_path) mosaic_tx = mosaic_ds.GetGeoTransform() xsize = mosaic_ds.RasterXSize ysize = mosaic_ds.RasterYSize prj = mosaic_ds.GetProjection() driver = mosaic_ds.GetDriver() m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx #driver = gdal.GetDriverByName('gtiff') # If number of tiles not given, need to set it if 'n_tiles' not in inputs: print 'n_tiles not specified. 
Using default: 90 x 40 ...\n' n_tiles = 90, 40 else: n_tiles = [int(i) for i in n_tiles.split(',')] #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx) total_sets = len(df_sets) t0 = time.time() last_dts = pd.Series() agg_stats = [s.strip().lower() for s in agg_stats.split(',')] n_jobs = int(n_jobs) tile_dir = os.path.join(out_dir, '_temp_tiles') #tile_dir = '/home/server/pi/homes/shooper/delete_test' if not os.path.isdir(tile_dir): os.mkdir(tile_dir) tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif') n_tiles = len(tiles) if not overwrite_tiles: files = os.listdir(tile_dir) tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field]) for stat in agg_stats: pattern = re.compile('tile_\d+_%s.tif' % stat) stat_match = [f.split('_')[1] for f in files if pattern.match(f)] try: tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match) except: pass #import pdb; pdb.set_trace() index_field = tiles.index.name tiles[index_field] = tiles.index tiles = tiles.set_index(tile_id_field, drop=False) tiles.set_index(index_field, inplace=True) #''' tiles['ul_x'] = [ stem.get_ul_coord(xmin, xmax, x_res) for i, (xmin, xmax) in tiles[['xmin', 'xmax']].iterrows() ] tiles['ul_y'] = [ stem.get_ul_coord(ymin, ymax, y_res) for i, (ymin, ymax) in tiles[['ymin', 'ymax']].iterrows() ] tiles['lr_x'] = [ xmax if ulx == xmin else xmin for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin', 'xmin']].iterrows() ] tiles['lr_y'] = [ ymax if uly == ymin else ymin for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin', 'ymin']].iterrows() ] support_nrows = int(support_size[0] / abs(y_res)) support_ncols = int(support_size[1] / abs(x_res)) t1 = time.time() # Patch for unknown Landcover screwup args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin([ 
'1931', '2810', '0705', '0954', '2814', '1986', '2552', '2019', '2355', '3354', '2278', '2559' ])].iterrows())] args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[ tiles['name'].isin(['0705'])].iterrows())] # Patch for the GEE subset 2 outside-of-buffer 'slice' #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tiles['name'].isin(['0639','0718','0797','0876','0955','1034'])].iterrows())] # Original line #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.loc[tile_files.isnull().any(axis=1).values].iterrows())] limits = [] for arg in args: print tile_info[tile_id_field] limits.append(stem.par_predict_tile(arg)) #''' ### return print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' 
% ( (time.time() - t1) / 3600) try: limits = pd.concat(limits) except: # They're all None pass t1 = time.time() mosaic_ul = mosaic_tx[0], mosaic_tx[3] driver = gdal.GetDriverByName('gtiff') for stat in agg_stats: #dtype = mosaic.get_min_numpy_dtype(limits[stat]) dtype = np.int16 if stat == 'stdv': this_nodata = -9999 ar = np.full((ysize, xsize), this_nodata, dtype=np.int16) #dtype) else: this_nodata = nodata ar = np.full((ysize, xsize), this_nodata, dtype=dtype) for tile_id, tile_coords in tiles.iterrows(): tile_file = os.path.join( tile_dir, 'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat)) try: ds = gdal.Open(tile_file) except: print 'Tile not found' continue tile_tx = ds.GetGeoTransform() tile_ul = tile_tx[0], tile_tx[3] row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx) # Make sure the tile doesn't exceed the size of ar tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows) try: ar[row_off:row_off + tile_rows, col_off:col_off + tile_cols] = ar_tile except Exception as e: pass #import pdb; pdb.set_trace() out_path = os.path.join(out_dir, '%s_%s.tif' % (file_stamp, stat)) #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat)) gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype) mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype, nodata=this_nodata) # Clean up the tiles #shutil.rmtree(tile_dir) print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1) / 60) # Get feature importances and max importance per set t1 = time.time() print 'Getting importance values...' 
importance_cols = sorted([c for c in df_sets.columns if 'importance' in c]) df_sets['max_importance'] = nodata if len(importance_cols) == 0: # Loop through and get importance importance_per_var = [] for s, row in df_sets.iterrows(): with open(row.dt_file, 'rb') as f: dt_model = pickle.load(f) max_importance, this_importance = stem.get_max_importance(dt_model) df_sets.ix[s, 'max_importance'] = max_importance importance_per_var.append(this_importance) importance = np.array(importance_per_var).mean(axis=0) else: df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1) importance = df_sets[importance_cols].mean(axis=0).values pct_importance = importance / importance.sum() print '%.1f minutes\n' % ((time.time() - t1) / 60) # Save the importance values importance = pd.DataFrame({ 'variable': pred_vars, 'pct_importance': pct_importance, 'index': range(len(pred_vars)) }) importance.set_index('index', inplace=True) importance['rank'] = [ int(r) for r in importance.pct_importance.rank(method='first', ascending=False) ] out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp) importance.to_csv(out_txt, sep='\t') #''' print '\nTotal prediction runtime: %.1f hours\n' % ( (time.time() - t0) / 3600)
def main(txt, n_sample, out_txt, bins, train_params, by_psu=True, extract_predictors=True): n_sample = int(n_sample) bins = parse_bins(bins) df = pd.read_csv(txt, sep='\t', dtype={'tile_id': object}) sample = pd.DataFrame(columns=df.columns) n_bins = len(bins) psu_ids = df.tile_id.unique() train_params = stem.read_params(train_params) for var in train_params: exec ("{0} = str({1})").format(var, train_params[var]) tiles = attributes_to_df(MOSAIC_SHP) if extract_predictors: var_info = pd.read_csv(var_info, sep='\t', index_col='var_name') for i, tile in enumerate(psu_ids): print("extracting %s of %s" % (i, len(psu_ids))) sample_mask = df.tile_id == tile this_sample = df.loc[sample_mask] tile_ul = tiles.loc[tiles['name'] == tile, ['xmin', 'ymax']].values[0] #point_dict = get_point_dict(df, psu_ids) mosaic_tx, extent = stem.tx_from_shp(MOSAIC_SHP, 30, -30) row_off, col_off = stem.calc_offset([mosaic_tx[0], mosaic_tx[3]], tile_ul, mosaic_tx) this_sample['local_row'] = this_sample.row - row_off this_sample['local_col'] = this_sample.col - col_off for var_name, var_row in var_info.iterrows(): #tiles = pd.DataFrame({'tile_id': psu_ids, 'tile_str': psu_ids}) file_path = stem.find_file(var_row.basepath, var_row.search_str, tile) ds = gdal.Open(file_path) ar = ds.GetRasterBand(var_row.data_band).ReadAsArray() try: if len(this_sample) == ar.size: df.loc[sample_mask, var_name] = ar.ravel() else: df.loc[sample_mask, var_name] = ar[this_sample.local_row, this_sample.local_col] except Exception as e: print(e) import pdb; pdb.set_trace() ds = None df.to_csv(txt.replace('.txt', '_predictors.txt')) #df[var_name], _ = extract.extract_var('', var_name, var_row.by_tile, var_row.data_band, var_row.data_type, tiles, df, point_dict, var_row.basepath, var_row.search_str, var_row.path_filter, mosaic_tx, 0, 0, silent=True) if by_psu: n_per_psu = n_sample/len(psu_ids) n_per_bin = n_per_psu/n_bins for i, pid in enumerate(psu_ids): psu_pixels = df.loc[df.tile_id == pid] print("Sampling for %s of 
%s PSUs" % (i + 1, len(psu_ids))) for l, u in bins: this_bin = psu_pixels.loc[(l < psu_pixels.value) & (psu_pixels.value <= u)] if len(this_bin) > 0: bin_sample_size = min(n_per_bin, len(this_bin)) sample = pd.concat([sample, this_bin.sample(bin_sample_size)]) print("Sampled %s for bin %s-%s" % (n_per_bin, l, u)) else: print("No pixels between %s and %s found" % (l, u)) print("") else: n_per_bin = n_sample/n_bins for l, u in bins: sample = pd.concat([sample, df.sample(n_per_bin)]) sample.to_csv(out_txt, index=False) print 'Sample written to ', out_txt
def main(raster, nodata, psu_shp, out_dir): nodata = int(nodata) psus = attributes_to_df(psu_shp) ds = gdal.Open(raster) ar = ds.GetVirtualMemArray() #ReadAsArray() tx = ds.GetGeoTransform() prj = ds.GetProjection() driver = gdal.GetDriverByName('gtiff') # Just extract the test sample first test_sample_dfs = [] print '\nGetting test samples for PSUs...' for i, psu in psus.iterrows(): # Calc offsets row_off, col_off = calc_offset((tx[0], tx[3]), psu[['ul_x', 'ul_y']], tx) n_rows = abs(int((psu.ymax - psu.ymin) / tx[5])) n_cols = abs(int((psu.xmax - psu.xmin) / tx[1])) # Get values test_data = ar[row_off:row_off + n_rows, col_off:col_off + n_cols].ravel() mask = test_data != nodata # Get row/col and x/y vals test_data = test_data[mask] row_inds, col_inds = np.indices((n_rows, n_cols), dtype=np.uint32) row_inds = row_inds.ravel()[mask] col_inds = col_inds.ravel()[mask] these_row_inds = row_inds + row_off these_col_inds = col_inds + col_off y_coords = row_inds * tx[5] + psu.ul_y x_coords = col_inds * tx[1] + psu.ul_x df = pd.DataFrame({ 'row': these_row_inds, 'col': these_col_inds, 'y': y_coords, 'x': x_coords, 'value': test_data, 'tile_id': psu['name'] }) #import pdb; pdb.set_trace() test_sample_dfs.append(df) test_sample = pd.concat(test_sample_dfs, ignore_index=True) basename = os.path.basename(raster) out_txt = os.path.join(out_dir, basename.replace(basename[-4:], '_test.txt')) test_sample.to_csv(out_txt, sep='\t', index=False) # Read the raster as a write-able array and set all test samples to nodata print '\nAssigning nodata val to PSUs in training raster...\n' ar = ds.ReadAsArray() ar[test_sample.row, test_sample.col] = nodata out_raster = out_txt.replace('_test.txt', '_train.tif') array_to_raster(ar, tx, prj, driver, out_raster, nodata=nodata) desc = 'Training raster and test sample (text file with the same name but "_test" at the end) for making and evaluating STEM CONUS maps. Primary sampling units (PSUs) reserved for testing are assigned nodata.' 
desc += '\n\tInput raster: %s' % os.path.abspath(raster) desc += '\n\tNodata value: %s' % nodata desc += '\n\tPSU shapefile: %s' % os.path.abspath(psu_shp) desc += '\n\tOutput directory: %s\n' % os.path.abspath(out_dir) createMetadata(sys.argv, out_raster, description=desc) #''' ds = None
def main(model_dir, n_tiles, **kwargs): t0 = time.time() n_tiles = [int(n) for n in n_tiles.split(',')] if not os.path.isdir(model_dir): message = 'model directory given does not exist or is not a directory: ', model_dir raise IOError(message) model = os.path.basename(model_dir) dt_dir = os.path.join(model_dir, 'decisiontree_models') set_txt = os.path.join(dt_dir, '%s_support_sets.txt' % model) df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') pred_param_path = glob(os.path.join(model_dir, 'predict_stem_*params.txt'))[0] predict_params, df_var = stem.read_params(pred_param_path) train_param_path = glob(os.path.join(model_dir, 'train_stem_*params.txt'))[0] train_params, _ = stem.read_params(train_param_path) df_var.sort_index(inplace=True) nodata = int(predict_params['nodata'].replace('"', '')) if len(kwargs) == 0: var_ids = df_sets.max_importance.unique() var_names = df_var.ix[var_ids].index variables = zip(var_ids, var_names) else: variables = [(variable_id, variable_name) for variable_name, variable_id in kwargs] mask_path = os.path.join(model_dir, '%s_vote.bsq' % model) if not os.path.exists(mask_path): mask_path = mask_path.replace('.bsq', '.tif') mask_ds = gdal.Open(mask_path) mask_tx = mask_ds.GetGeoTransform() xsize = mask_ds.RasterXSize ysize = mask_ds.RasterYSize prj = mask_ds.GetProjection() df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mask_tx) total_tiles = len(df_tiles) df_tiles['tile'] = df_tiles.index # Find the tiles that have only nodata values t1 = time.time() print '\nFinding empty tiles...' 
mask = mask_ds.ReadAsArray() == nodata empty_tiles = stem.find_empty_tiles(df_tiles, ~mask, mask_tx) mask_ds = None print '%s empty tiles found of %s total tiles\n%.1f minutes\n' %\ (len(empty_tiles), total_tiles, (time.time() - t1)/60) # Select only tiles that are not empty df_tiles = df_tiles.select(lambda x: x not in empty_tiles) total_tiles = len(df_tiles) #some_set = df_sets.iloc[0] support_size = [ int(s) for s in train_params['support_size'].replace('"', '').split(',') ] set_size = [int(abs(s / mask_tx[1])) for s in support_size] out_dir = os.path.join(model_dir, 'importance_maps') if not os.path.exists(out_dir): os.mkdir(out_dir) print variables for vi, (v_id, v_name) in enumerate(variables): t1 = time.time() print 'Making map for %s: %s of %s variables\n' % (v_name, vi + 1, len(variables)) ar = np.full((ysize, xsize), nodata, dtype=np.uint8) for i, (t_ind, t_row) in enumerate(df_tiles.iterrows()): t2 = time.time() print 'Aggregating for %s of %s tiles' % (i + 1, total_tiles) # Calculate the size of this tile in case it's at the edge where the # tile size will be slightly different this_size = abs(t_row.lr_y - t_row.ul_y), abs(t_row.lr_x - t_row.ul_x) df_these_sets = stem.get_overlapping_sets(df_sets, t_row, this_size, support_size) rc = df_tiles_rc.ix[t_ind] this_size = rc.lr_r - rc.ul_r, rc.lr_c - rc.ul_c n_sets = len(df_these_sets) # Load overlapping predictions from disk and read them as arrays tile_ul = t_row[['ul_x', 'ul_y']] print n_sets, ' Overlapping sets' importance_bands = [] importance_values = [] for s_ind, s_row in df_these_sets.iterrows(): # Calculate offset and array/tile indices offset = stem.calc_offset(tile_ul, (s_row.ul_x, s_row.ul_y), mask_tx) #if abs(offset[0]) > this_size[0] or abs(offset[1] > this_size[1]): tile_inds, a_inds = mosaic.get_offset_array_indices( tile_size, set_size, offset) # Get feature with maximum importance and fill tile with that val try: with open(s_row.dt_file, 'rb') as f: dt_model = pickle.load(f) 
importance_value = int( dt_model.feature_importances_[v_id] * 100) importance_values.append(importance_value) #filled = np.full((nrows, ncols), importance_value, dtype=np.uint8) #import_band = stem.fill_tile_band(this_size, filled, tile_inds, nodata) import_band = np.full(this_size, np.nan, dtype=np.float16) import_band[tile_inds[0]:tile_inds[1], tile_inds[2]:tile_inds[3]] = importance_value importance_bands.append(import_band) except Exception as e: print e continue #''' print 'Average importance for this tile: %.1f' % np.mean( importance_values) #Aggregate importance_stack = np.dstack(importance_bands) importance_tile = np.nanmean(importance_stack, axis=2) tile_mask = mask[rc.ul_r:rc.lr_r, rc.ul_c:rc.lr_c] | np.isnan(importance_tile) importance_tile[tile_mask] = nodata ar[rc.ul_r:rc.lr_r, rc.ul_c:rc.lr_c] = np.round(importance_tile).astype(np.uint8) print 'Aggregation time for this tile: %.1f minutes\n' % ( (time.time() - t2) / 60) '''temp_dir = os.path.join(out_dir, 'delete') if not os.path.isdir(temp_dir): os.mkdir(temp_dir) t_tx = tile_ul[0], 30, 0, tile_ul[1], 0, -30 array_to_raster(np.round(importance_tile).astype(np.uint8), t_tx, prj, gdal.GetDriverByName('gtiff'), os.path.join(temp_dir, 'delete_%s.tif' % t_ind), gdal.GDT_Byte, 255, True)''' out_path = os.path.join(out_dir, '%s_importance_%s.tif' % (model, v_name)) try: array_to_raster(ar, mask_tx, prj, gdal.GetDriverByName('gtiff'), out_path, gdal.GDT_Byte, nodata) except Exception as e: print e import pdb pdb.set_trace() print 'Time for this variable: %.1f minutes\n' % ( (time.time() - t1) / 60) print '\nTotal time for %s variables: %.1f hours\n' % (len(variables), ( (time.time() - t0) / 3600))
def main(params, pct_train=None, aggregate_presence=False): t0 = time.time() # Read params. Make variables from each line of the 1-line variables inputs, df_vars = stem.read_params(params) for var in inputs: exec("{0} = str({1})").format(var, inputs[var]) try: if 'years' in inputs: years = np.array([int(yr) for yr in years.split(',')]) else: year_start = int(year_start) year_end = int(year_end) years = np.arange(year_start, year_end + 1) '''tsa_mosaic = inputs['tsa_mosaic'] search_dir = inputs['search_dir'] search_str = inputs['search_str'] obs_txt = inputs['obs_txt'] index_col = inputs['index_col'] year_col = inputs['year_col'] target_col = inputs['target_col'] out_txt = inputs['out_txt']''' add_file_tag = int(add_file_tag) #count_type = inputs['count_type'] except KeyError as e: missing_var = str(e).split("'")[1] if missing_var in ['year_start', 'year_end', 'years']: msg = ('No list of years or year_start/year_end specified in' +\ ' param file:\n%s\n. Re-run script with either of these' +\ ' parameters given.') % params msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) out_dir, original_bn = os.path.split(out_txt) # Add informative tags to output dir and basename if add_file_tag: res = years[1] - years[0] #out_dir = os.path.basename(out_dir) now = datetime.datetime.now() date_str = str(now.date()).replace('-', '') time_str = str(now.time()).replace(':', '')[:4] out_dirname = '{0}_res{1}yr_{2}_{3}'.format(target_col, res, date_str, time_str) out_dir = os.path.join(out_dir, out_dirname) if not os.path.exists(out_dir): os.mkdir(out_dir) out_bn = '{0}_{1}'.format( os.path.basename(obs_txt).replace('.txt', ''), original_bn) out_txt = os.path.join(out_dir, out_bn) if params != os.path.exists(os.path.join(out_dir, os.path.basename(params))): print 'Copying params to output dir: %s\n' % out_dir shutil.copy2(params, out_dir) print 'Getting predictors... 
' t1 = time.time() df_obs = pd.read_csv(obs_txt, sep='\t', index_col=index_col) original_columns = df_obs.columns df = get_predictors(years, search_dir, search_str, df_obs, index_col, year_col, df_vars) print '%.1f seconds\n' % (time.time() - t1) # Select count type and date range if 'count_type' in inputs: count_type = [t.strip() for t in count_type.split(',')] df = df[df.COUNT_TYPE.isin(count_type)] #df.drop(['COUNT_TYPE'], axis=1, inplace=True) if 'P21' in count_type: df = df[df.EFFORT_DISTANCE_KM < .1] if 'day_minmax' in inputs: day_min, day_max = [int(d) for d in day_minmax.split(',')] df = df[(df.DAY >= day_min) & (df.DAY <= day_max)] if 'time_minmax' in inputs: time_min, time_max = [int(t) for t in time_minmax.split(',')] df = df[(df.TIME >= time_min) & (df.TIME <= time_max)] if 'max_effort_time' in inputs: max_effort_time = int(max_effort_time) df = df[df.EFFORT_HRS < max_effort_time] if 'max_effort_dist' in inputs: max_effort_dist = int(max_effort_dist) df = df[df.EFFORT_DISTANCE_KM < max_effort_time] #df = df[(df.YEAR >= min(years)) & (df.YEAR <= max(years))] #df[target_col] *= 100 # To be able to keep stuff as 8 bit ints # Calc row and col from x, y ds = gdal.Open(tsa_mosaic) tx = ds.GetGeoTransform() ds = None ul_xy = tx[0], tx[3] df['row'], df['col'] = zip(*[ stem.calc_offset(ul_xy, xy, tx) for i, xy in df[['x', 'y']].iterrows() ]) if 'kernel_dist' in inputs: t1 = time.time() print 'Calculating kernel density...' kernel_dist = int(kernel_dist) for yr in df.YEAR.unique(): yr_mask = df.YEAR == yr df_w = gaussain_weights(df[yr_mask], target_col, kernel_dist) df.ix[yr_mask, target_col] = df_w.weighted ''' df_w = gaussain_weights(df, target_col, kernel_dist) df[target_col] = df_w.weighted #df = df.drop_duplicates(subset=[target_col, 'row', 'col'])''' print '%.1f seconds\n' % (time.time() - t1) #""" if aggregate_presence: t1 = time.time() print 'Aggregating presence records...' 
df.ix[df[target_col] > 0, target_col] = 1 for yr in df.YEAR.unique(): yr_mask = df.YEAR == yr df_yr = df[yr_mask] # Get unique locations for this year unique = df_yr[['row', 'col']].drop_duplicates().values for row, col in unique: this_loc = df_yr[(df_yr.row == row) & (df_yr.col == col)] #If there are ones and 0s, drop the 0s if this_loc[target_col].min( ) == 0 and this_loc[target_col].max() == 1: df.drop(this_loc[this_loc[target_col] == 0].index, inplace=True) print '%.1f seconds\n' % (time.time() - t1) if pct_train: print 'Splitting training and test sets...' pct_train = float(pct_train) #n_test = int(len(df) * (1 - pct_train)) unique = df[['row', 'col']].drop_duplicates().values n_test = int(len(unique) * (1 - pct_train)) random_idx = random.sample(xrange(len(unique)), n_test) random_row, random_col = zip(*unique[random_idx]) df_test = df[df.row.isin(random_row) & df.col.isin(random_col)] test_idx = df_test.index test_txt = out_txt.replace('.txt', '_test.txt') df = df[~df.index.isin(test_idx)] df_test.to_csv(test_txt, sep='\t') df.to_csv(out_txt, sep='\t') obs_out_txt = out_txt.replace('_' + original_bn[:-4], '') df[original_columns].to_csv(obs_out_txt, sep='\t') print '\nLength of output df:', len(df) print 'Text file written to: ', out_txt print '\nTotal time: %.1f minutes' % ((time.time() - t0) / 60)
def main(params, inventory_txt=None, constant_vars=None, mosaic_shp=None, resolution=30, n_jobs=0, n_jobs_agg=0, mosaic_nodata=0, snap_coord=None, overwrite_tiles=False, tile_id_field='name'): inputs, df_var = stem.read_params(params) for i in inputs: exec ("{0} = str({1})").format(i, inputs[i]) df_var.data_band = [int(b) for b in df_var.data_band]#sometimes read as float try: support_size = [int(i) for i in support_size.split(',')] nodata = int(nodata) str_check = model_dir, mosaic_path, out_dir, train_params except NameError as e: missing_var = str(e).split("'")[1] msg = "Variable '%s' not specified in param file:\n%s" % (missing_var, params) raise NameError(msg) # Check that all the variables given were used in training and vice versa try: train_inputs, train_vars = stem.read_params(train_params) except: raise NameError('train_params not specified or does not exist') train_vars = sorted(train_vars.index) pred_vars = sorted(df_var.index) # Make sure vars are sorted alphabetically since they were for training df_var = df_var.reindex(pred_vars) unmatched_vars = [v for v in pred_vars if v not in train_vars] if len(unmatched_vars) != 0: unmatched_str = '\n'.join(unmatched_vars) msg = 'Columns not in predict params but specified in train params:\n' + unmatched_str raise NameError(msg) if not os.path.exists(out_dir): os.mkdir(out_dir) else: print ('WARNING: out_dir already exists:\n%s\nAny existing files ' + \ 'will be overwritten...\n') % out_dir if not os.path.exists(os.path.join(out_dir, os.path.basename(params))): shutil.copy2(params, out_dir) #Copy the params for reference if 'confusion_params' in inputs: conf_bn = os.path.basename(confusion_params) new_conf_path = os.path.join(out_dir, conf_bn) if not os.path.exists(new_conf_path): shutil.copy2(confusion_params, out_dir) confusion_params = new_conf_path if not os.path.exists(model_dir): sys.exit('model_dir does not exist:\n%s' % model_dir) if not os.path.exists(mosaic_path): sys.exit('mosaic_path does not 
exist:\n%s' % mosaic_path) predict_dir = os.path.join(out_dir, 'decisiontree_predictions') if not os.path.exists(predict_dir): os.mkdir(predict_dir) if not 'file_stamp' in inputs: file_stamp = os.path.basename(model_dir) db_path = os.path.join(model_dir, file_stamp + '.db') try: engine = sqlalchemy.create_engine('sqlite:///%s' % db_path) with engine.connect() as con, con.begin(): df_sets = pd.read_sql_table('support_sets', con, index_col='set_id')#''' except: set_txt = glob.glob(os.path.join(model_dir, 'decisiontree_models/*support_sets.txt'))[0] if not os.path.isfile(set_txt): raise IOError('No database or support set txt file found') df_sets = pd.read_csv(set_txt, sep='\t', index_col='set_id') if mosaic_path.endswith('.shp'): mosaic_type = 'vector' # if subset specified, clip the mosaic and set mosaic path to clipped shp if 'subset_shp' in inputs: out_shp_bn = os.path.basename(mosaic_path).replace('.shp', '_clipped.shp') out_shp = os.path.join(out_dir, out_shp_bn) cmd = 'ogr2ogr -clipsrc {clip_shp} {out_shp} {in_shp}'.format(clip_shp=subset_shp, out_shp=out_shp, in_shp=mosaic_path) subprocess.call(cmd, shell=True)#''' mosaic_path = out_shp mosaic_dataset = ogr.Open(mosaic_path) mosaic_ds = mosaic_dataset.GetLayer() min_x, max_x, min_y, max_y = mosaic_ds.GetExtent() if 'resolution' not in inputs: warnings.warn('Resolution not specified. 
Using default of 30...\n') # If subset specified, just get sets that overlap the subset if 'subset_shp' in inputs: mosaic_geom = ogr.Geometry(ogr.wkbMultiPolygon) for feature in mosaic_ds: mosaic_geom.AddGeometry(feature.GetGeometryRef()) df_sets = stem.get_overlapping_sets(df_sets, mosaic_geom) xsize = int((max_x - min_x)/resolution) ysize = int((max_y - min_y)/resolution) prj = mosaic_ds.GetSpatialRef().ExportToWkt() x_res = resolution y_res = -resolution x_rot = 0 y_rot = 0 if 'snap_coord' in train_inputs: snap_coord = train_inputs['snap_coord'].replace('"','') snap_coord = [float(c) for c in snap_coord.split(',')]#''' mosaic_tx, extent = stem.tx_from_shp(mosaic_path, x_res, y_res, snap_coord=snap_coord) tiles = stem.attributes_to_df(mosaic_path) # Change to accept arbittary geometry else: mosaic_type = 'raster' mosaic_ds = gdal.Open(mosaic_path) mosaic_tx = mosaic_ds.GetGeoTransform() xsize = mosaic_ds.RasterXSize ysize = mosaic_ds.RasterYSize prj = mosaic_ds.GetProjection() driver = mosaic_ds.GetDriver() m_ulx, x_res, x_rot, m_uly, y_rot, y_res = mosaic_tx #driver = gdal.GetDriverByName('gtiff') # If number of tiles not given, need to set it if 'n_tiles' not in inputs: print 'n_tiles not specified. 
Using default: 25 x 15 ...\n' n_tiles = 90, 40 else: n_tiles = [int(i) for i in n_tiles.split(',')] #df_tiles, df_tiles_rc, tile_size = stem.get_tiles(n_tiles, xsize, ysize, mosaic_tx) total_sets = len(df_sets) t0 = time.time() last_dts = pd.Series() agg_stats = [s.strip().lower() for s in agg_stats.split(',')] n_jobs = int(n_jobs) tile_dir = os.path.join(model_dir, 'temp_tiles') #tile_dir = '/home/server/pi/homes/shooper/delete_test' if not os.path.isdir(tile_dir): os.mkdir(tile_dir) tile_path_template = os.path.join(tile_dir, 'tile_{tile_id}_%(stat)s.tif') n_tiles = len(tiles) if not overwrite_tiles: files = os.listdir(tile_dir) tile_files = pd.DataFrame(columns=agg_stats, index=tiles[tile_id_field]) for stat in agg_stats: stat_match = [f.split('_')[1] for f in fnmatch.filter(files, 'tile*%s.tif' % stat)] tile_files[stat] = pd.Series(np.ones(len(stat_match)), index=stat_match) index_field = tiles.index.name tiles[index_field] = tiles.index tiles = tiles.set_index(tile_id_field, drop=False)[tile_files.isnull().any(axis=1)] tiles.set_index(index_field, inplace=True) tiles['ul_x'] = [stem.get_ul_coord(xmin, xmax, x_res) for i, (xmin, xmax) in tiles[['xmin','xmax']].iterrows()] tiles['ul_y'] = [stem.get_ul_coord(ymin, ymax, y_res) for i, (ymin, ymax) in tiles[['ymin','ymax']].iterrows()] tiles['lr_x'] = [xmax if ulx == xmin else xmin for i, (ulx, xmin, xmax) in tiles[['ul_x', 'xmin','xmin']].iterrows()] tiles['lr_y'] = [ymax if uly == ymin else ymin for i, (uly, ymin, ymax) in tiles[['ul_y', 'ymin','ymin']].iterrows()] support_nrows = int(support_size[0]/abs(y_res)) support_ncols = int(support_size[1]/abs(x_res)) t1 = time.time() args = [(tile_info, mosaic_path, mosaic_tx, df_sets, df_var, (support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles[tiles['name'].isin(['1771', '3224', '0333', '0558'])].iterrows())] #args = [(i + 1, n_tiles, t1, tile_info, mosaic_path, mosaic_tx, df_sets, df_var, 
(support_nrows, support_ncols), agg_stats, tile_path_template, prj, nodata, snap_coord) for i, (t_ind, tile_info) in enumerate(tiles.iterrows())] if n_jobs > 1: print 'Predicting with %s jobs...\n' % n_jobs pool = Pool(n_jobs) pool.map(stem.predict_tile, args, 1) pool.close() pool.join() else: for arg in args: print 'Predicting with 1 job ...\n' stem.predict_tile(*arg)#''' print '\n\nFinished predicting in %.1f hours. \n\nStitching tiles...' % ((time.time() - t1)/3600) t1 = time.time() mosaic_ul = mosaic_tx[0], mosaic_tx[3] driver = gdal.GetDriverByName('gtiff') for stat in agg_stats: if stat == 'stdv': this_nodata = -9999 ar = np.full((ysize, xsize), this_nodata, dtype=np.int16) else: this_nodata = nodata ar = np.full((ysize, xsize), this_nodata, dtype=np.uint8) for tile_id, tile_coords in tiles.iterrows(): tile_file = os.path.join(tile_dir, 'tile_%s_%s.tif' % (tile_coords[tile_id_field], stat)) ds = gdal.Open(tile_file) tile_tx = ds.GetGeoTransform() tile_ul = tile_tx[0], tile_tx[3] row_off, col_off = stem.calc_offset(mosaic_ul, tile_ul, mosaic_tx) # Make sure the tile doesn't exceed the size of ar tile_rows = min(ds.RasterYSize + row_off, ysize) - row_off tile_cols = min(ds.RasterXSize + col_off, xsize) - col_off ar_tile = ds.ReadAsArray(0, 0, tile_cols, tile_rows) try: ar[row_off : row_off + tile_rows, col_off : col_off + tile_cols] = ar_tile except Exception as e: import pdb; pdb.set_trace() out_path = os.path.join(model_dir, '%s_%s.tif' % (file_stamp, stat)) #out_path = os.path.join('/home/server/pi/homes/shooper/delete_test', '%s_%s.tif' % (file_stamp, stat)) gdal_dtype = gdal_array.NumericTypeCodeToGDALTypeCode(ar.dtype) mosaic.array_to_raster(ar, mosaic_tx, prj, driver, out_path, gdal_dtype, nodata=this_nodata) # Clean up the tiles shutil.rmtree(tile_dir) print 'Time for stitching: %.1f minutes\n' % ((time.time() - t1)/60) # Get feature importances and max importance per set t1 = time.time() print 'Getting importance values...' 
importance_cols = sorted([c for c in df_sets.columns if 'importance' in c]) df_sets['max_importance'] = nodata if len(importance_cols) == 0: # Loop through and get importance importance_per_var = [] for s, row in df_sets.iterrows(): with open(row.dt_file, 'rb') as f: dt_model = pickle.load(f) max_importance, this_importance = stem.get_max_importance(dt_model) df_sets.ix[s, 'max_importance'] = max_importance importance_per_var.append(this_importance) importance = np.array(importance_per_var).mean(axis=0) else: df_sets['max_importance'] = np.argmax(df_sets[importance_cols].values, axis=1) importance = df_sets[importance_cols].mean(axis=0).values pct_importance = importance / importance.sum() print '%.1f minutes\n' % ((time.time() - t1)/60) # Save the importance values importance = pd.DataFrame({'variable': pred_vars, 'pct_importance': pct_importance, 'index': range(len(pred_vars)) }) importance.set_index('index', inplace=True) importance['rank'] = [int(r) for r in importance.pct_importance.rank(method='first', ascending=False)] out_txt = os.path.join(out_dir, '%s_importance.txt' % file_stamp) importance.to_csv(out_txt, sep='\t')#''' if 'confusion_params' in locals(): import confusion_matrix as confusion ''' Read the mean or vote back in ''' if 'vote' in agg_stats: vote_path = os.path.join(out_dir, '%s_vote.tif' % file_stamp) ar_vote = gdal.Open(vote_path) print '\nComputing confusion matrix for vote...' 
vote_dir = os.path.join(model_dir, 'evaluation_vote') out_txt = os.path.join(vote_dir, 'confusion.txt') df_v = confusion.main(confusion_params, ar_vote, out_txt, match=True) vote_acc = df_v.ix['producer', 'user'] vote_kap = df_v.ix['producer', 'kappa'] '''try: out_txt = os.path.join(vote_dir, 'confusion_avg_kernel.txt') df_v_off = confusion.main(confusion_params, ar_vote, out_txt) except Exception as e: print e''' if 'mean' in agg_stats: mean_path = os.path.join(out_dir, '%s_mean.tif' % file_stamp) ar_mean = gdal.Open(mean_path) print '\nGetting confusion matrix for mean...' mean_dir = os.path.join(model_dir, 'evaluation_mean') out_txt = os.path.join(mean_dir, 'confusion.txt') df_m = confusion.main(confusion_params, ar_mean, out_txt, match=True) mean_acc = df_m.ix['user','producer'] mean_kap = df_m.ix['user', 'kappa'] '''try: out_txt = os.path.join(mean_dir, 'confusion_avg_kernel.txt') df_m_off = confusion.main(confusion_params, ar_mean, out_txt) except Exception as e: print e#''' if 'inventory_txt' in inputs: df_inv = pd.read_csv(inventory_txt, sep='\t', index_col='stamp') cols = ['vote_accuracy', 'vote_kappa']#, 'vote_mask', 'mean_accuracy', 'mean_kappa', 'vote_mask'] df_inv.ix[file_stamp, cols] = vote_acc, vote_kap#, False, mean_acc, mean_kap, False df_inv.to_csv(inventory_txt, sep='\t') else: print '\n"inventory_txt" was not specified.' +\ ' Model evaluation scores will not be recorded...' print '' if 'vote' in agg_stats: print 'Vote accuracy .............. ', vote_acc print 'Vote kappa ................. ', vote_kap if 'mean' in agg_stats: print 'Mean accuracy .............. ', mean_acc print 'Mean kappa ................. ', mean_kap else: print '\n"confusion_params" was not specified.' +\ ' This model will not be evaluated...' #''' print '\nTotal prediction runtime: %.1f hours\n' % ((time.time() - t0)/3600)