def test_construct_grids():
    """Check ``construct_grids`` against axes rebuilt from the batch metadata."""
    batch = fetch_species_distributions(data_home=None,
                                        download_if_missing=True)
    grids = construct_grids(batch)

    step = batch.grid_size
    # Each expected axis starts one cell past the lower-left corner and
    # advances by ``grid_size`` for the number of cells along that axis.
    expected = []
    for corner, n_cells in ((batch.x_left_lower_corner, batch.Nx),
                            (batch.y_left_lower_corner, batch.Ny)):
        start = corner + step
        stop = start + (n_cells * step)
        expected.append(np.arange(start, stop, step))

    assert_array_equal(grids[0], expected[0])
    assert_array_equal(grids[1], expected[1])
latlon = np.vstack([data.train['dd lat'], data.train['dd long']]).T species = np.array( [d.decode('ascii').startswith('micro') for d in data.train['species']], dtype='int') import os import conda conda_file_dir = conda.__file__ conda_dir = conda_file_dir.split('lib')[0] proj_lib = os.path.join(os.path.join(conda_dir, 'share'), 'proj') os.environ["PROJ_LIB"] = proj_lib from mpl_toolkits.basemap import Basemap from sklearn.datasets.species_distributions import construct_grids xgrid, ygrid = construct_grids(data) # plot coastlines with Basemap m = Basemap(projection='cyl', resolution='c', llcrnrlat=ygrid.min(), urcrnrlat=ygrid.max(), llcrnrlon=xgrid.min(), urcrnrlon=xgrid.max()) m.drawmapboundary(fill_color='#DDEEFF') m.fillcontinents(color='#FFEEDD') m.drawcoastlines(color='gray', zorder=2) m.drawcountries(color='gray', zorder=2) # plot locations m.scatter(latlon[:, 1], latlon[:, 0], zorder=3,
def plot_species_distribution(species=("bradypus_variegatus_0",
                                       "microryzomys_minutus_0")):
    """
    Model and plot the distribution of two species.

    For each of the first two names in ``species`` a OneClassSVM is fitted
    on the standardized training coverages, its decision function is drawn
    as a filled contour map together with the train/test points, and the
    AUC against random background points is printed and written on the plot.
    """
    if len(species) > 2:
        print("Note: when more than two species are provided,"
              " only the first two will be used")

    started = time()

    # Load the compressed data and derive the x/y grid from it
    data = fetch_species_distributions()
    xgrid, ygrid = construct_grids(data)
    X, Y = np.meshgrid(xgrid, ygrid[::-1])

    # One bunch per modelled species (only the first two names are used)
    bunches = [
        create_species_bunch(species[k], data.train, data.test,
                             data.coverages, xgrid, ygrid)
        for k in (0, 1)
    ]

    # Background points (row / column grid indices) for the evaluation
    np.random.seed(13)
    bg_rows = np.random.randint(low=0, high=data.Ny, size=10000)
    bg_cols = np.random.randint(low=0, high=data.Nx, size=10000)

    # coverages[6] has measurements at all land points, which lets us
    # decide between land and water.
    land_reference = data.coverages[6]

    # Fit, predict, and plot for each species.
    for panel, bunch in enumerate(bunches, start=1):
        print("_" * 80)
        print("Modeling distribution of species '%s'" % bunch.name)

        # Standardize features
        mean = bunch.cov_train.mean(axis=0)
        std = bunch.cov_train.std(axis=0)
        train_cover_std = (bunch.cov_train - mean) / std

        # Fit OneClassSVM
        print(" - fit OneClassSVM ... ", end='')
        clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5)
        clf.fit(train_cover_std)
        print("done.")

        # Plot map of South America
        plt.subplot(1, 2, panel)
        if not basemap:
            print(" - plot coastlines from coverage")
            plt.contour(X, Y, land_reference,
                        levels=[-9998], colors="k",
                        linestyles="solid")
            plt.xticks([])
            plt.yticks([])
        else:
            print(" - plot coastlines using basemap")
            m = Basemap(projection='cyl', llcrnrlat=Y.min(),
                        urcrnrlat=Y.max(), llcrnrlon=X.min(),
                        urcrnrlon=X.max(), resolution='c')
            m.drawcoastlines()
            m.drawcountries()

        print(" - predict species distribution")

        # Only the land cells are scored; every other cell starts out at
        # the lowest land score so it does not widen the contour range.
        land_rows, land_cols = np.where(land_reference > -9999)
        coverages_land = data.coverages[:, land_rows, land_cols].T
        pred = clf.decision_function((coverages_land - mean) / std)

        Z = np.full((data.Ny, data.Nx), pred.min(), dtype=np.float64)
        Z[land_rows, land_cols] = pred

        # Contour levels are taken before the -9999 marker is written back
        levels = np.linspace(Z.min(), Z.max(), 25)
        Z[land_reference == -9999] = -9999

        # plot contours of the prediction
        plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
        plt.colorbar(format='%.2f')

        # scatter training/testing points
        train_pts = bunch.pts_train
        plt.scatter(train_pts['dd long'], train_pts['dd lat'],
                    s=4, c='black', marker='^', label='train')
        test_pts = bunch.pts_test
        plt.scatter(test_pts['dd long'], test_pts['dd lat'],
                    s=4, c='black', marker='x', label='test')
        plt.legend()
        plt.title(bunch.name)
        plt.axis('equal')

        # Compute AUC with regards to background points
        pred_background = Z[bg_rows, bg_cols]
        pred_test = clf.decision_function((bunch.cov_test - mean) / std)
        scores = np.concatenate((pred_test, pred_background))
        y = np.concatenate((np.ones(pred_test.shape),
                            np.zeros(pred_background.shape)))
        fpr, tpr, _ = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)
        plt.text(-35, -70, "AUC: %.3f" % roc_auc, ha="right")
        print("\n Area under the ROC curve : %f" % roc_auc)

    print("\ntime elapsed: %.2fs" % (time() - started))
def test2():
    """Load the species data, build the data grid and return ``(ygrid, xgrid)``.

    The function loads the species distribution data, prepares the training
    locations (converted to radians) and the 0/1 species labels, constructs
    the data grid and returns it with the y axis first.

    NOTE(review): the ``return`` directly after ``construct_grids`` makes the
    KDE / plotting code that follows unreachable. The return value is kept
    because callers may rely on it; confirm whether the plotting part is
    still wanted and either remove the ``return`` or drop the dead code.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.datasets import fetch_species_distributions
    from sklearn.datasets.species_distributions import construct_grids
    from sklearn.neighbors import KernelDensity

    # if basemap is available, we'll use it.
    # otherwise, we'll improvise later...
    try:
        from mpl_toolkits.basemap import Basemap
        basemap = True
    except ImportError:
        basemap = False

    # Get matrices/arrays of species IDs and locations
    data = fetch_species_distributions()
    species_names = ['Bradypus Variegatus', 'Microryzomys Minutus']

    Xtrain = np.vstack([data['train']['dd lat'],
                        data['train']['dd long']]).T
    # Species labels may be byte strings; decode them so ``startswith`` with
    # a text prefix works on Python 3 as well as Python 2.
    species_labels = [d.decode('ascii') if isinstance(d, bytes) else d
                      for d in data['train']['species']]
    ytrain = np.array([label.startswith('micro') for label in species_labels],
                      dtype='int')
    Xtrain *= np.pi / 180.  # Convert lat/long to radians

    # Set up the data grid for the contour plot
    xgrid, ygrid = construct_grids(data)
    return ygrid, xgrid

    # ---- unreachable below this line (see NOTE in the docstring) ----
    X, Y = np.meshgrid(xgrid[::5], ygrid[::5][::-1])
    land_reference = data.coverages[6][::5, ::5]
    land_mask = (land_reference > -9999).ravel()

    xy = np.vstack([Y.ravel(), X.ravel()]).T
    xy = xy[land_mask]
    xy *= np.pi / 180.

    # Plot map of South America with distributions of each species
    fig = plt.figure()
    fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05)

    for i in range(2):
        plt.subplot(1, 2, i + 1)

        # construct a kernel density estimate of the distribution
        print(" - computing KDE in spherical coordinates")
        kde = KernelDensity(bandwidth=0.04, metric='haversine',
                            kernel='gaussian', algorithm='ball_tree')
        print(Xtrain[ytrain == i].shape)
        kde.fit(Xtrain[ytrain == i])

        # evaluate only on the land: -9999 indicates ocean
        Z = -9999 + np.zeros(land_mask.shape[0])
        Z[land_mask] = np.exp(kde.score_samples(xy))
        Z = Z.reshape(X.shape)

        # plot contours of the density
        levels = np.linspace(0, Z.max(), 25)
        print([arr.shape for arr in (X, Y, Z)])
        plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)

        if basemap:
            print(" - plot coastlines using basemap")
            m = Basemap(projection='cyl', llcrnrlat=Y.min(),
                        urcrnrlat=Y.max(), llcrnrlon=X.min(),
                        urcrnrlon=X.max(), resolution='c')
            m.drawcoastlines()
            m.drawcountries()
        else:
            print(" - plot coastlines from coverage")
            plt.contour(X, Y, land_reference,
                        levels=[-9999], colors="k",
                        linestyles="solid")
            plt.xticks([])
            plt.yticks([])

        plt.title(species_names[i])

    plt.show()
basemap = True except ImportError: basemap = False # Get matrices/arrays of species IDs and locations data = fetch_species_distributions() species_names = ['Bradypus Variegatus', 'Microryzomys Minutus'] Xtrain = np.vstack([data['train']['dd lat'], data['train']['dd long']]).T ytrain = np.array([d.decode('ascii').startswith('micro') for d in data['train']['species']], dtype='int') Xtrain *= np.pi / 180. # Convert lat/long to radians # Set up the data grid for the contour plot xgrid, ygrid = construct_grids(data) X, Y = np.meshgrid(xgrid[::5], ygrid[::5][::-1]) land_reference = data.coverages[6][::5, ::5] land_mask = (land_reference > -9999).ravel() xy = np.vstack([Y.ravel(), X.ravel()]).T xy = xy[land_mask] xy *= np.pi / 180. # Plot map of South America with distributions of each species fig = plt.figure() fig.subplots_adjust(left=0.05, right=0.95, wspace=0.05) for i in range(2): plt.subplot(1, 2, i + 1)
def plot_species_distribution(species=("bradypus_variegatus_0",
                                       "microryzomys_minutus_0")):
    """
    Plot the species distribution.

    A OneClassSVM is fitted per species on the standardized training
    coverages; its decision function over the land cells is drawn as a
    filled contour map with the train/test points on top, and the AUC
    against random background points is printed and written on the plot.

    Parameters
    ----------
    species : sequence of str
        Species names handed to ``create_species_bunch``; only the first
        two entries are used.
    """
    if len(species) > 2:
        print("Note: when more than two species are provided,"
              " only the first two will be used")

    t0 = time()

    # Load the compressed data
    data = fetch_species_distributions()

    # Set up the data grid
    xgrid, ygrid = construct_grids(data)

    # The grid in x,y coordinates
    X, Y = np.meshgrid(xgrid, ygrid[::-1])

    # create a bunch for each species
    BV_bunch = create_species_bunch(species[0],
                                    data.train, data.test,
                                    data.coverages, xgrid, ygrid)
    MM_bunch = create_species_bunch(species[1],
                                    data.train, data.test,
                                    data.coverages, xgrid, ygrid)

    # background points (grid coordinates) for evaluation
    np.random.seed(13)
    background_points = np.c_[np.random.randint(low=0, high=data.Ny,
                                                size=10000),
                              np.random.randint(low=0, high=data.Nx,
                                                size=10000)].T

    # We'll make use of the fact that coverages[6] has measurements at all
    # land points.  This will help us decide between land and water.
    land_reference = data.coverages[6]

    # Fit, predict, and plot for each species.  The loop variable gets its
    # own name so it does not shadow the ``species`` parameter.
    for i, bunch in enumerate([BV_bunch, MM_bunch]):
        print("_" * 80)
        print("Modeling distribution of species '%s'" % bunch.name)

        # Standardize features
        mean = bunch.cov_train.mean(axis=0)
        std = bunch.cov_train.std(axis=0)
        train_cover_std = (bunch.cov_train - mean) / std

        # Fit OneClassSVM
        print(" - fit OneClassSVM ... ", end='')
        print(train_cover_std.shape)
        clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5)
        clf.fit(train_cover_std)
        print("done.")

        # Plot map of South America
        plt.subplot(1, 2, i + 1)
        if basemap:
            print(" - plot coastlines using basemap")
            m = Basemap(projection='cyl', llcrnrlat=Y.min(),
                        urcrnrlat=Y.max(), llcrnrlon=X.min(),
                        urcrnrlon=X.max(), resolution='c')
            m.drawcoastlines()
            m.drawcountries()
        else:
            print(" - plot coastlines from coverage")
            plt.contour(X, Y, land_reference,
                        levels=[-9999], colors="k",
                        linestyles="solid")
            plt.xticks([])
            plt.yticks([])

        print(" - predict species distribution")

        # Predict species distribution using the training data
        Z = np.ones((data.Ny, data.Nx), dtype=np.float64)

        # We'll predict only for the land points.
        idx = np.where(land_reference > -9999)
        coverages_land = data.coverages[:, idx[0], idx[1]].T

        # ravel() instead of [:, 0]: depending on the scikit-learn release
        # decision_function returns shape (n, 1) or (n,); indexing a 1-D
        # result with [:, 0] raises IndexError, ravel() handles both.
        pred = np.ravel(clf.decision_function((coverages_land - mean) / std))
        Z *= pred.min()
        Z[idx[0], idx[1]] = pred

        # Levels are computed before the -9999 marker is written back so
        # the marker does not stretch the contour range.
        levels = np.linspace(Z.min(), Z.max(), 25)
        Z[land_reference == -9999] = -9999

        # plot contours of the prediction
        plt.contourf(X, Y, Z, levels=levels, cmap=plt.cm.Reds)
        plt.colorbar(format='%.2f')

        # scatter training/testing points
        plt.scatter(bunch.pts_train['dd long'], bunch.pts_train['dd lat'],
                    s=2 ** 2, c='black',
                    marker='^', label='train')
        plt.scatter(bunch.pts_test['dd long'], bunch.pts_test['dd lat'],
                    s=2 ** 2, c='black',
                    marker='x', label='test')
        plt.legend()
        plt.title(bunch.name)
        plt.axis('equal')

        # Compute AUC with regards to background points
        pred_background = Z[background_points[0], background_points[1]]
        pred_test = np.ravel(
            clf.decision_function((bunch.cov_test - mean) / std))
        scores = np.r_[pred_test, pred_background]
        y = np.r_[np.ones(pred_test.shape), np.zeros(pred_background.shape)]
        fpr, tpr, thresholds = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)
        plt.text(-35, -70, "AUC: %.3f" % roc_auc, ha="right")
        print("\n Area under the ROC curve : %f" % roc_auc)

    print("\ntime elapsed: %.2fs" % (time() - t0))
def processAlgorithm(self, progress):
    """Fit a one-class SVM on the species points and create the outputs.

    Steps performed, in order:

    1. Check that every environmental layer shares the CRS of the species
       layer and has (rounded) square cells.
    2. Read band 1 of every environmental layer, mapping its no-data value
       to -9999, and stack the arrays as ``data["coverages"]``.
    3. Sample the coverages at the species points, standardize them and fit
       a ``OneClassSVM``.
    4. Score every cell whose first coverage is not -9999, compute an AUC
       against 10000 random background cells and save it via
       ``func.saveToCSV``.
    5. Create the output raster dataset with GDAL.

    Parameters
    ----------
    progress
        Object used for console feedback through ``setConsoleInfo``.

    Raises
    ------
    GeoAlgorithmExecutionException
        If the layers need unification, differ in projection, have
        non-square cells, lack a no-data value, or if the species layer
        yields no points.
    """
    # Set up the data as sklearn bunch (basically just a dictionary with
    # specific attributes)
    data = Bunch()

    # Vector layer
    vector = self.getParameterValue(self.SPECIES)
    v = Processing.getObject(vector)
    v_crs = v.crs()

    # Environmental layers
    envlayers = self.getParameterValue(self.ENV)
    layer_names = envlayers.split(";")
    if func.unificationNecessary(layer_names):
        raise GeoAlgorithmExecutionException(
            "All input environmental layers need to have the same resolution and extent. Use the Unify tool beforehand"
        )
        # TODO: Enable option to do this automatically

    progress.setConsoleInfo("Loading Coverage Data")

    # Check Projection and Cellsize
    for lay in layer_names:
        r = Processing.getObject(lay)  # QgsRasterLayer object
        if r.crs() != v_crs:
            raise GeoAlgorithmExecutionException(
                "All input layers need to have the same projection")
        if round(r.rasterUnitsPerPixelX()) != round(r.rasterUnitsPerPixelY()):
            raise GeoAlgorithmExecutionException(
                "Grid Cell size values are not equal. Please be sure that grid cells are squares."
            )

    # Set coverage parameters from the first layer
    r = Processing.getObject(layer_names[0])  # QgsRasterLayer object
    ex = r.extent()
    data["grid_size"] = r.rasterUnitsPerPixelX()
    data["Nx"] = r.width()
    data["Ny"] = r.height()
    data["x_left_lower_corner"] = ex.xMinimum()
    data["y_left_lower_corner"] = ex.yMinimum()

    # Load in Coverage values
    coverage = []
    for lay in layer_names:
        raster = gdal.Open(str(lay))
        if raster.RasterCount > 1:
            progress.setConsoleInfo(
                "Warning: Multiple bands for layer detected. Using only first band."
            )
        band = raster.GetRasterBand(1)
        array = band.ReadAsArray()
        NA = band.GetNoDataValue()
        if NA is None:
            raise GeoAlgorithmExecutionException(
                "Warning: Raster layer has no no-data value. Please specify a no-data value for this dataset."
            )
        array[array == NA] = -9999  # Replace nodata-values of array with -9999
        coverage.append(array)
    # Load all the coverage values into the bunch
    data["coverages"] = numpy.array(coverage)

    # Setup parameters for output prediction
    a = gdal.Open(layer_names[0])
    columns = a.RasterXSize
    rows = a.RasterYSize
    driver = a.GetDriver()
    # NOTE(review): NA, gt and proj are not used by the code visible in this
    # method - presumably meant for writing the prediction raster; confirm.
    NA = -9999
    gt = a.GetGeoTransform()
    proj = a.GetProjection()
    # Kept under its own name: the CSV path fetched further down must not
    # replace the raster path handed to driver.Create().
    output_raster = self.getOutputValue(self.OUT_PRED)

    # Set up the data grid
    xgrid, ygrid = construct_grids(data)

    # background points (grid coordinates) for evaluation
    numpy.random.seed(100)
    background_points = numpy.c_[
        numpy.random.randint(low=0, high=data.Ny, size=10000),
        numpy.random.randint(low=0, high=data.Nx, size=10000)].T

    # The first coverage is used as the land/water reference.
    # FIXME: Assuming that all predictors have a similar distribution.
    # Might be violated
    land_reference = data.coverages[0]

    progress.setConsoleInfo("Loading Occurence Data and coverage")
    # Creating response
    train = []
    for feature in v.getFeatures():
        geom = feature.geometry().asPoint()
        train.append((geom.x(), geom.y()))
    if not train:
        # Without points there is nothing to sample or fit.
        raise GeoAlgorithmExecutionException(
            "The species layer does not contain any point features.")
    data["train"] = numpy.array(train)  # Add to bunch as training dataset

    # create species bunch
    sp_Bunch = Bunch(name="Species")
    points = dict(train=data.train)
    for label, pts in points.items():
        # determine coverage values for each of the training points;
        # pts holds one (x, y) row per point, so index by column
        ix = numpy.searchsorted(xgrid, pts[:, 0])
        iy = numpy.searchsorted(ygrid, pts[:, 1])
        sp_Bunch['cov_%s' % label] = data.coverages[:, -iy, ix].T

    progress.setConsoleInfo(
        "Finished loading coverage data of environmental layers")

    # Starting modelling
    progress.setConsoleInfo("Finished preparing the data for the analysis")
    progress.setConsoleInfo("----")
    progress.setConsoleInfo("Starting Modelling with support of sklearn")

    # Standardize features
    # TODO: Enable different or no Standardization methods
    cov_train = sp_Bunch.cov_train
    mean = cov_train.mean(axis=0)
    std = cov_train.std(axis=0)
    train_cover_std = (cov_train - mean) / std

    # Fit OneClassSVM
    progress.setConsoleInfo("Fitting Support Vector Machine")
    # TODO: Allow the user to vary the input
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5)
    clf.fit(train_cover_std)
    progress.setConsoleInfo("Fitting done")

    # Predict species distribution using the training data
    Z = numpy.ones((data.Ny, data.Nx), dtype=numpy.float64)

    # We'll predict only for the land points.
    idx = numpy.where(land_reference > -9999)
    coverages_land = data.coverages[:, idx[0], idx[1]].T

    # ravel() copes with both the (n, 1) and the (n,) result shape that
    # decision_function has across scikit-learn releases.
    pred = numpy.ravel(clf.decision_function((coverages_land - mean) / std))
    Z *= pred.min()
    Z[idx[0], idx[1]] = pred
    Z[land_reference == -9999] = -9999
    result = Z  # save the final results scores

    # Compute AUC w.r.t. background points.  Only training points are
    # loaded in this method, so they serve as the presence sample.
    pred_background = Z[background_points[0], background_points[1]]
    pred_presence = numpy.ravel(clf.decision_function(train_cover_std))
    scores = numpy.r_[pred_presence, pred_background]
    y = numpy.r_[numpy.ones(pred_presence.shape),
                 numpy.zeros(pred_background.shape)]
    fpr, tpr, thresholds = metrics.roc_curve(y, scores)
    roc_auc = metrics.auc(fpr, tpr)  # Area under the ROC curve
    # TODO: Evaluate the availability of other metrics to compute on
    # (average mean error, etc.. )

    # Create Output Prediction File
    output_csv = self.getOutputValue(self.OUT_PRED_RES)
    titles = ['AUC']
    res_pred = [roc_auc]
    # Save Output
    func.saveToCSV(res_pred, titles, output_csv)

    # Create Output for resulting prediction
    metadata = driver.GetMetadata()
    if not (gdal.DCAP_CREATE in metadata
            and metadata[gdal.DCAP_CREATE] == "YES"):
        progress.setConsoleInfo(
            "Output creation of input Fileformat is not supported by gdal. Create GTiff by default."
        )
        driver = gdal.GetDriverByName("GTiff")

    # GDAL expects one of its GDT_* codes, not a numpy dtype; the result
    # array is float64.
    data_type = gdal.GDT_Float64
    try:
        # NOTE(review): the created dataset is not filled with ``result`` in
        # the code visible here - confirm where the band is written.
        outData = driver.Create(output_raster, columns, rows, 1, data_type)
    except Exception:
        ProcessingLog.addToLog(ProcessingLog.LOG_ERROR,
                               "Output file could not be created!")
def processAlgorithm(self, progress):
    """Fit a one-class SVM on the species points and create the outputs.

    Steps performed, in order:

    1. Check that every environmental layer shares the CRS of the species
       layer and has (rounded) square cells.
    2. Read band 1 of every environmental layer, mapping its no-data value
       to -9999, and stack the arrays as ``data["coverages"]``.
    3. Sample the coverages at the species points, standardize them and fit
       a ``OneClassSVM``.
    4. Score every cell whose first coverage is not -9999, compute an AUC
       against 10000 random background cells and save it via
       ``func.saveToCSV``.
    5. Create the output raster dataset with GDAL.

    Parameters
    ----------
    progress
        Object used for console feedback through ``setConsoleInfo``.

    Raises
    ------
    GeoAlgorithmExecutionException
        If the layers need unification, differ in projection, have
        non-square cells, lack a no-data value, or if the species layer
        yields no points.
    """
    # Set up the data as sklearn bunch (basically just a dictionary with
    # specific attributes)
    data = Bunch()

    # Vector layer
    vector = self.getParameterValue(self.SPECIES)
    v = Processing.getObject(vector)
    v_crs = v.crs()

    # Environmental layers
    envlayers = self.getParameterValue(self.ENV)
    layer_names = envlayers.split(";")
    if func.unificationNecessary(layer_names):
        raise GeoAlgorithmExecutionException(
            "All input environmental layers need to have the same resolution and extent. Use the Unify tool beforehand"
        )
        # TODO: Enable option to do this automatically

    progress.setConsoleInfo("Loading Coverage Data")

    # Check Projection and Cellsize
    for lay in layer_names:
        r = Processing.getObject(lay)  # QgsRasterLayer object
        if r.crs() != v_crs:
            raise GeoAlgorithmExecutionException(
                "All input layers need to have the same projection")
        if round(r.rasterUnitsPerPixelX()) != round(r.rasterUnitsPerPixelY()):
            raise GeoAlgorithmExecutionException(
                "Grid Cell size values are not equal. Please be sure that grid cells are squares."
            )

    # Set coverage parameters from the first layer
    r = Processing.getObject(layer_names[0])  # QgsRasterLayer object
    ex = r.extent()
    data["grid_size"] = r.rasterUnitsPerPixelX()
    data["Nx"] = r.width()
    data["Ny"] = r.height()
    data["x_left_lower_corner"] = ex.xMinimum()
    data["y_left_lower_corner"] = ex.yMinimum()

    # Load in Coverage values
    coverage = []
    for lay in layer_names:
        raster = gdal.Open(str(lay))
        if raster.RasterCount > 1:
            progress.setConsoleInfo(
                "Warning: Multiple bands for layer detected. Using only first band."
            )
        band = raster.GetRasterBand(1)
        array = band.ReadAsArray()
        NA = band.GetNoDataValue()
        if NA is None:
            raise GeoAlgorithmExecutionException(
                "Warning: Raster layer has no no-data value. Please specify a no-data value for this dataset."
            )
        array[array == NA] = -9999  # Replace nodata-values of array with -9999
        coverage.append(array)
    # Load all the coverage values into the bunch
    data["coverages"] = numpy.array(coverage)

    # Setup parameters for output prediction
    a = gdal.Open(layer_names[0])
    columns = a.RasterXSize
    rows = a.RasterYSize
    driver = a.GetDriver()
    # NOTE(review): NA, gt and proj are not used by the code visible in this
    # method - presumably meant for writing the prediction raster; confirm.
    NA = -9999
    gt = a.GetGeoTransform()
    proj = a.GetProjection()
    # Kept under its own name: the CSV path fetched further down must not
    # replace the raster path handed to driver.Create().
    output_raster = self.getOutputValue(self.OUT_PRED)

    # Set up the data grid
    xgrid, ygrid = construct_grids(data)

    # background points (grid coordinates) for evaluation
    numpy.random.seed(100)
    background_points = numpy.c_[
        numpy.random.randint(low=0, high=data.Ny, size=10000),
        numpy.random.randint(low=0, high=data.Nx, size=10000)].T

    # The first coverage is used as the land/water reference.
    # FIXME: Assuming that all predictors have a similar distribution.
    # Might be violated
    land_reference = data.coverages[0]

    progress.setConsoleInfo("Loading Occurence Data and coverage")
    # Creating response
    train = []
    for feature in v.getFeatures():
        geom = feature.geometry().asPoint()
        train.append((geom.x(), geom.y()))
    if not train:
        # Without points there is nothing to sample or fit.
        raise GeoAlgorithmExecutionException(
            "The species layer does not contain any point features.")
    data["train"] = numpy.array(train)  # Add to bunch as training dataset

    # create species bunch
    sp_Bunch = Bunch(name="Species")
    points = dict(train=data.train)
    for label, pts in points.items():
        # determine coverage values for each of the training points;
        # pts holds one (x, y) row per point, so index by column
        ix = numpy.searchsorted(xgrid, pts[:, 0])
        iy = numpy.searchsorted(ygrid, pts[:, 1])
        sp_Bunch['cov_%s' % label] = data.coverages[:, -iy, ix].T

    progress.setConsoleInfo(
        "Finished loading coverage data of environmental layers")

    # Starting modelling
    progress.setConsoleInfo("Finished preparing the data for the analysis")
    progress.setConsoleInfo("----")
    progress.setConsoleInfo("Starting Modelling with support of sklearn")

    # Standardize features
    # TODO: Enable different or no Standardization methods
    cov_train = sp_Bunch.cov_train
    mean = cov_train.mean(axis=0)
    std = cov_train.std(axis=0)
    train_cover_std = (cov_train - mean) / std

    # Fit OneClassSVM
    progress.setConsoleInfo("Fitting Support Vector Machine")
    # TODO: Allow the user to vary the input
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5)
    clf.fit(train_cover_std)
    progress.setConsoleInfo("Fitting done")

    # Predict species distribution using the training data
    Z = numpy.ones((data.Ny, data.Nx), dtype=numpy.float64)

    # We'll predict only for the land points.
    idx = numpy.where(land_reference > -9999)
    coverages_land = data.coverages[:, idx[0], idx[1]].T

    # ravel() copes with both the (n, 1) and the (n,) result shape that
    # decision_function has across scikit-learn releases.
    pred = numpy.ravel(clf.decision_function((coverages_land - mean) / std))
    Z *= pred.min()
    Z[idx[0], idx[1]] = pred
    Z[land_reference == -9999] = -9999
    result = Z  # save the final results scores

    # Compute AUC w.r.t. background points.  Only training points are
    # loaded in this method, so they serve as the presence sample.
    pred_background = Z[background_points[0], background_points[1]]
    pred_presence = numpy.ravel(clf.decision_function(train_cover_std))
    scores = numpy.r_[pred_presence, pred_background]
    y = numpy.r_[numpy.ones(pred_presence.shape),
                 numpy.zeros(pred_background.shape)]
    fpr, tpr, thresholds = metrics.roc_curve(y, scores)
    roc_auc = metrics.auc(fpr, tpr)  # Area under the ROC curve
    # TODO: Evaluate the availability of other metrics to compute on
    # (average mean error, etc.. )

    # Create Output Prediction File
    output_csv = self.getOutputValue(self.OUT_PRED_RES)
    titles = ['AUC']
    res_pred = [roc_auc]
    # Save Output
    func.saveToCSV(res_pred, titles, output_csv)

    # Create Output for resulting prediction
    metadata = driver.GetMetadata()
    if not (gdal.DCAP_CREATE in metadata
            and metadata[gdal.DCAP_CREATE] == "YES"):
        progress.setConsoleInfo(
            "Output creation of input Fileformat is not supported by gdal. Create GTiff by default."
        )
        driver = gdal.GetDriverByName("GTiff")

    # GDAL expects one of its GDT_* codes, not a numpy dtype; the result
    # array is float64.
    data_type = gdal.GDT_Float64
    try:
        # NOTE(review): the created dataset is not filled with ``result`` in
        # the code visible here - confirm where the band is written.
        outData = driver.Create(output_raster, columns, rows, 1, data_type)
    except Exception:
        ProcessingLog.addToLog(ProcessingLog.LOG_ERROR,
                               "Output file could not be created!")