def tune(training_raster, training_fit_raster, start=10, step=10, stop=100):
    """
    Run a randomized, 5 fold cross validated search for the best
    ``n_estimators`` and ``min_samples_leaf`` of an Extra Trees classifier.

    Parameters
    ----------
    training_raster : str, filename
        The rasterized training data.
    training_fit_raster : str, filename
        Raster which the training data is drawn over.
    start : int, default=10
        Smallest candidate value for both searched parameters.
    step : int, default=10
        Number of evenly spaced candidate values generated between ``start``
        and ``stop``. It is handed to ``np.linspace`` as ``num``, so it is a
        count of candidates, not an increment.
    stop : int, default=100
        Largest candidate value for both searched parameters.

    Returns
    -------
    dict
        The ``cv_results_`` attribute of the fitted search. The best
        parameter combination is also printed.
    """
    # split_data() loads the rasters itself, so no separate load_data() call
    # is needed here. Only the training portion is used for the search.
    X_train, _, y_train, _ = split_data(training_raster, training_fit_raster)

    candidates = [int(x) for x in np.linspace(start=start, stop=stop,
                                              num=step)]
    random_grid = {
        'n_estimators': candidates,
        'min_samples_leaf': list(candidates),
    }

    # cv is spelled out so the fold count does not depend on the installed
    # scikit-learn version's default.
    search = RandomizedSearchCV(ExtraTreesClassifier(), random_grid, cv=5,
                                random_state=0, verbose=3)
    search.fit(X_train, y_train)
    print(search.best_params_)
    return search.cv_results_
def split_data(training_raster, training_fit_raster):
    """
    Load the training samples and divide them into train and test subsets.

    Args:
        training_raster: Rasterized training data
        training_fit_raster: Raster which data is drawn over

    Returns:
        The tuple (X_train, X_test, y_train, y_test), with a third of the
        samples (test_size=0.33) held out for testing.
    """
    # NOTE(review): a second definition of split_data follows later in this
    # module and replaces this one when the module is imported.
    features, labels = load_data(training_raster, training_fit_raster)
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=0.33)
    return train_X, test_X, train_y, test_y
def split_data(training_raster, training_fit_raster, test_size=0.33,
               random_state=None):
    """
    Split data into training and testing data

    Parameters
    ----------
    training_raster : str, filename
        The rasterized training data.
    training_fit_raster : str, filename
        The vegetation index raster that the rasterized training data will
        be fit with.
    test_size : float or int, default=0.33
        Passed to ``train_test_split``; the share (or absolute number) of
        samples held out for testing.
    random_state : int, RandomState instance or None, default=None
        Passed to ``train_test_split``. Supply an int to make the split
        reproducible between runs; ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    X_train, X_test, y_train, y_test: array
        Split training and test datasets
    """
    X, y = load_data(training_raster, training_fit_raster)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test
def rf_class(training_raster, training_fit_raster, in_raster, out_tiff,
             smoothing=True, class_parameters=None):
    """
    This function enables canopy classification of remotely sensed imagery
    using Scikit-learns Random Forests supervised classification algorithm.

    Parameters
    ----------
    training_raster : str, filename
        The rasterized training data.
    training_fit_raster : str, filename
        The vegetation index raster that the rasterized training data will
        be fit with.
    in_raster : str, filepath
        Raster the trained classifier will be applied to. Only band 1 is
        read.
    out_tiff : str, filepath
        Final output classified raster (single band Byte GeoTIFF carrying
        the geotransform and projection of ``in_raster``).
    smoothing : bool, default=True
        Applies a 3x3 median filter to output classified raster.
    class_parameters : dict, optional
        Keyword arguments for Scikit-learns ``RandomForestClassifier``.
        When omitted, ``{"n_estimators": 100, "bootstrap": True}`` is used
        and every other setting is left at the library default.

    Raises
    ------
    IOError
        If GDAL returns no dataset for ``in_raster``.
    """
    X, y = load_data(training_raster, training_fit_raster)

    if class_parameters is None:
        # Older revisions passed every constructor argument explicitly,
        # including min_impurity_split and max_features='auto'. Recent
        # scikit-learn releases no longer accept those, so only the values
        # that matter are pinned here.
        class_parameters = {"n_estimators": 100, "bootstrap": True}
    clf = RandomForestClassifier(**class_parameters)
    clf.fit(X, y)

    src_ds = gdal.Open(in_raster)
    if src_ds is None:
        raise IOError('Unable to open input raster: %s' % in_raster)

    class_raster = src_ds.GetRasterBand(1).ReadAsArray().astype(np.float64)
    class_raster[np.isnan(class_raster)] = 0
    # NOTE(review): zero cells are masked, but the mask is presumably dropped
    # when scikit-learn converts the input, so they still get a predicted
    # class - confirm whether they should be excluded instead.
    class_mask = np.ma.MaskedArray(class_raster, mask=(class_raster == 0))

    # One sample per pixel, one feature per sample.
    predicted = clf.predict(class_mask.reshape(-1, 1))
    ras_byte = predicted.reshape(class_raster.shape).astype(dtype=np.byte)
    if smoothing:
        ras_byte = ndimage.median_filter(ras_byte, size=3)

    rows, cols = class_raster.shape
    driver = gdal.GetDriverByName('GTiff')
    dst_ds = driver.Create(out_tiff, xsize=cols, ysize=rows, bands=1,
                           eType=gdal.GDT_Byte)
    dst_ds.SetGeoTransform(src_ds.GetGeoTransform())
    dst_ds.SetProjection(src_ds.GetProjection())
    dst_ds.GetRasterBand(1).WriteArray(ras_byte)
    dst_ds.FlushCache()
    # Dropping the references lets GDAL close both datasets.
    dst_ds = None
    src_ds = None

    print(out_tiff)
def batch_et_class(pid, smoothing=True, class_parameters=None):
    """
    This function enables batch classification of NAIP imagery using a
    sklearn Extra Trees supervised classification algorithm.

    The shapefile, results folder, training rasters and id field name are
    read from ``config``. Inputs are looked up in
    ``<results>/<pid>/Inputs`` and classified tiles are written to
    ``<results>/<pid>/Outputs`` with a ``c_`` prefix. Tiles whose output
    already exists are skipped, and tiles whose input is missing are
    reported and skipped.

    Parameters
    ----------
    pid : int
        Id of the region to be processed. It is formatted with ``%d`` and
        compared against the ``config.procid_field`` attribute of every
        feature in the NAIP QQ shapefile.
    smoothing : bool, default=True
        Applies a 5x5 median filter to each output classified raster.
    class_parameters : dict, optional
        Keyword arguments for Scikit-learns ``ExtraTreesClassifier``.
        When omitted, ``{"n_estimators": 100, "bootstrap": False}`` is used
        and every other setting is left at the library default.

    Raises
    ------
    IOError
        If the input directory does not exist, or if the shapefile or an
        existing input tile cannot be opened.
    """
    shp = config.naipqq_shp
    results_dir = config.results
    training_raster = config.training_raster
    training_fit_raster = config.training_fit_raster
    id_field = config.procid_field

    # Create input and output folder paths for the region
    region_dir = '%s/%s' % (results_dir, pid)
    in_dir = '%s/Inputs' % region_dir
    out_dir = '%s/Outputs' % region_dir
    if not os.path.exists(in_dir):
        raise IOError('Input directory does not exist.')
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # Read training & fit raster file and shape to be trained
    X, y = load_data(training_raster, training_fit_raster)

    # Train Extra Trees Classifier
    if class_parameters is None:
        # Older revisions passed every constructor argument explicitly,
        # including min_impurity_split and max_features='auto'. Recent
        # scikit-learn releases no longer accept those, so only the values
        # that matter are pinned here.
        class_parameters = {"n_estimators": 100, "bootstrap": False}
    clf = ExtraTreesClassifier(**class_parameters)
    clf.fit(X, y)

    # Open naip_qq shapefile and collect the raw file names of the tiles
    # that belong to the requested region.
    src = ogr.Open(shp)
    if src is None:
        raise IOError('Unable to open shapefile: %s' % shp)
    lyr = src.GetLayer()
    query = '%d' % pid
    tile_names = [feature.GetField('FileName') for feature in lyr
                  if str(feature.GetField(id_field)) == query]

    driver = gdal.GetDriverByName('GTiff')
    for tile_name in tile_names:
        # Edit filenames to get true file names: prefix with 'arvi_' and
        # replace the last 13 characters with '.tif'.
        # NOTE(review): the 13 character suffix is presumably a date stamp
        # plus extension - confirm against the shapefile contents.
        base_name = ('arvi_%s' % tile_name)[:-13]
        filename = '%s.tif' % base_name
        in_path = '%s/%s' % (in_dir, filename)
        out_file = '%s/%s%s' % (out_dir, 'c_', filename)

        # Already classified on a previous run
        if os.path.exists(out_file):
            continue
        # Check if input file exists
        if not os.path.exists(in_path):
            print('Missing file: ', in_path)
            continue

        # Open with gdal and convert band 1 to a NumPy array.
        r = gdal.Open(in_path)
        if r is None:
            raise IOError('Unable to open input raster: %s' % in_path)
        class_raster = r.GetRasterBand(1).ReadAsArray().astype(np.float32)
        class_raster[np.isnan(class_raster)] = 0
        # NOTE(review): zero cells are masked, but the mask is presumably
        # dropped when scikit-learn converts the input - confirm.
        class_mask = np.ma.MaskedArray(class_raster,
                                       mask=(class_raster == 0))

        # One sample per pixel, one feature per sample.
        predicted = clf.predict(class_mask.reshape(-1, 1))
        # Convert back to original shape and make data type Byte
        ras_byte = predicted.reshape(class_raster.shape).astype(
            dtype=np.byte)
        if smoothing:
            ras_byte = ndimage.median_filter(ras_byte, size=5)

        rows, cols = class_raster.shape
        dst_ds = driver.Create(out_file, cols, rows, 1, gdal.GDT_Byte,
                               ['NBITS=2'])
        dst_ds.SetGeoTransform(r.GetGeoTransform())
        dst_ds.SetProjection(r.GetProjection())
        dst_ds.GetRasterBand(1).WriteArray(ras_byte)
        dst_ds.FlushCache()
        # Dropping the references lets GDAL close both datasets.
        dst_ds = None
        r = None