def __init__(self, inRaster, inVector, inField='Class', outModel=None,
             inSplit=1, inSeed=0, outMatrix=None, inClassifier='GMM'):
    """Learn a classification model from a raster and a training vector.

    The vector layer is burned into a temporary one-band GeoTIFF that has
    the size, geotransform and projection of ``inRaster``; the labelled
    pixels are then sampled, scaled with ``self.scale`` and used to train
    the requested classifier.

    Every failure is reported through ``QgsMessageLog`` and then swallowed,
    so the constructor never raises on ordinary errors (best-effort
    behaviour kept from the original implementation).

    Parameters
    ----------
    inRaster : str
        Image to learn from (opened with ``gdal.Open``).
    inVector : str
        Training vector data set (opened with ``ogr.Open``).
    inField : str
        Attribute burned into the temporary raster; it holds the class
        number of each feature.
    outModel : str or None
        When given, ``[model, M, m]`` is pickled to this path.
    inSplit : float
        Values below 1 are the share of samples of each class used for
        training, the remainder being used for validation. Any other value
        trains on every sample and skips validation.
    inSeed : int
        Seed of the random generator used for the train/validation split.
    outMatrix : str or None
        When given (and ``inSplit < 1``), the confusion matrix is written
        to this path as CSV.
    inClassifier : str
        ``'GMM'``, ``'RF'``, ``'SVM'`` or ``'KNN'``.
    """
    # Imported locally: only the temporary-folder cleanup below needs it.
    import shutil

    learningProgress = progressBar('Learning model...', 6)

    try:
        temp_folder = None
        try:
            # Convert vector to raster
            try:
                temp_folder = tempfile.mkdtemp()
                filename = os.path.join(temp_folder, 'temp.tif')
                data = gdal.Open(inRaster, gdal.GA_ReadOnly)
                shp = ogr.Open(inVector)
                lyr = shp.GetLayer()
            except Exception:
                QgsMessageLog.logMessage(
                    "Problem with making tempfile or opening raster or vector")
                raise

            # Create temporary data set
            try:
                driver = gdal.GetDriverByName('GTiff')
                dst_ds = driver.Create(
                    filename, data.RasterXSize, data.RasterYSize, 1,
                    gdal.GDT_Byte)
                dst_ds.SetGeoTransform(data.GetGeoTransform())
                dst_ds.SetProjection(data.GetProjection())
                OPTIONS = 'ATTRIBUTE=' + inField
                gdal.RasterizeLayer(dst_ds, [1], lyr, None, options=[OPTIONS])
                # Release the GDAL/OGR handles before the file is read back.
                data, dst_ds, shp, lyr = None, None, None, None
            except Exception:
                QgsMessageLog.logMessage("Cannot create temporary data set")
                raise

            # Load training set
            try:
                X, Y = dataraster.get_samples_from_roi(inRaster, filename)
            except Exception:
                QgsMessageLog.logMessage(
                    "Problem while getting samples from ROI with" + inRaster)
                QgsMessageLog.logMessage(
                    "Are you sure to have only integer values in your " +
                    str(inField) + " column ?")
                raise
        finally:
            # The temporary raster is removed even when a step above failed
            # (the original leaked it in that case).
            if temp_folder is not None:
                shutil.rmtree(temp_folder, ignore_errors=True)

        _, d = X.shape
        C = int(Y.max())
        SPLIT = inSplit

        # Scale the data
        X, M, m = self.scale(X)

        learningProgress.addStep()  # Add Step to ProgressBar

        # Learning process: take SPLIT of the ground-truth pixels of each
        # class for training and the remaining ones for testing.
        try:
            if SPLIT < 1:
                x = sp.array([]).reshape(0, d)
                y = sp.array([]).reshape(0, 1)
                xt = sp.array([]).reshape(0, d)
                yt = sp.array([]).reshape(0, 1)

                sp.random.seed(inSeed)  # Set the random generator state
                for label in range(1, C + 1):
                    t = sp.where(label == Y)[0]
                    nc = t.size
                    ns = int(nc * SPLIT)
                    rp = sp.random.permutation(nc)
                    train_idx, test_idx = t[rp[0:ns]], t[rp[ns:]]
                    x = sp.concatenate((X[train_idx, :], x))
                    xt = sp.concatenate((X[test_idx, :], xt))
                    y = sp.concatenate((Y[train_idx], y))
                    yt = sp.concatenate((Y[test_idx], yt))
            else:
                x, y = X, Y
        except Exception:
            QgsMessageLog.logMessage("Problem while learning if SPLIT <1")
            raise

        learningProgress.addStep()  # Add Step to ProgressBar

        # Train classifier
        if inClassifier == 'GMM':
            try:
                model = gmmr.GMMR()
                model.learn(x, y)
            except Exception:
                QgsMessageLog.logMessage("Cannot train with GMMM")
                raise
        else:
            try:
                from sklearn import neighbors
                from sklearn.svm import SVC
                from sklearn.ensemble import RandomForestClassifier
                try:
                    from sklearn.model_selection import StratifiedKFold
                    from sklearn.model_selection import GridSearchCV
                    model_selection = True
                except ImportError:
                    # Older scikit-learn releases expose the same tools
                    # from these modules, with a different fold API.
                    from sklearn.cross_validation import StratifiedKFold
                    from sklearn.grid_search import GridSearchCV
                    model_selection = False
            except Exception:
                QgsMessageLog.logMessage(
                    "You must have sklearn dependencies on your computer. Please consult the documentation for installation."
                )
                raise

            try:
                # As QGIS on Windows does not manage multiprocessing, only
                # posix systems are allowed to use every core.
                n_jobs = -1 if os.name == 'posix' else 1

                # classifier name -> (estimator class, grid, number of folds)
                search_space = {
                    'RF': (
                        RandomForestClassifier,
                        dict(n_estimators=3**sp.arange(1, 5),
                             max_features=sp.arange(1, 4)),
                        3),
                    'SVM': (
                        SVC,
                        dict(gamma=2.0**sp.arange(-4, 4),
                             C=10.0**sp.arange(-2, 5)),
                        5),
                    'KNN': (
                        neighbors.KNeighborsClassifier,
                        dict(n_neighbors=sp.arange(1, 20, 4)),
                        3),
                }
                if inClassifier not in search_space:
                    raise ValueError(
                        'Unknown classifier: ' + str(inClassifier))
                estimator_class, param_grid, n_folds = search_space[
                    inClassifier]

                y.shape = (y.size, )
                if model_selection:
                    cv = StratifiedKFold(n_splits=n_folds).split(x, y)
                else:
                    cv = StratifiedKFold(y, n_folds=n_folds)
                grid = GridSearchCV(
                    estimator_class(), param_grid=param_grid, cv=cv,
                    n_jobs=n_jobs)
                grid.fit(x, y)
                model = grid.best_estimator_
                # Refit the selected estimator on the whole training set.
                model.fit(x, y)
            except Exception:
                QgsMessageLog.logMessage(
                    "Cannot train with classifier " + inClassifier)
                raise

        learningProgress.prgBar.setValue(5)  # Add Step to ProgressBar

        # Assess the quality of the model
        if SPLIT < 1:
            yp = model.predict(xt)
            CONF = ai.CONFUSION_MATRIX()
            CONF.compute_confusion_matrix(yp, yt)
            # outMatrix defaults to None: writing unconditionally used to
            # raise here and prevented the model from being saved.
            if outMatrix is not None:
                sp.savetxt(outMatrix, CONF.confusion_matrix,
                           delimiter=',', fmt='%1.4d')

        # Save model together with the scaling parameters
        if outModel is not None:
            with open(outModel, 'wb') as output:
                pickle.dump([model, M, m], output)

        learningProgress.addStep()  # Add Step to ProgressBar
    except Exception as error:
        # Best effort: report the failure instead of hiding it silently.
        QgsMessageLog.logMessage("Learning model failed : " + str(error))
    finally:
        # Close progressBar
        learningProgress.reset()
        learningProgress = None
def __init__(self, inRaster, inVector, inField='Class', outModel=None,
             inSplit=100, inSeed=0, outMatrix=None, inClassifier='GMM',
             extraParam=False, feedback=None):
    """Learn a model from a training vector and a raster image.

    Parameters
    ----------
    inRaster : str or numpy.ndarray
        Image to sample, or directly the array of samples. When an array is
        given, ``inVector`` must be the array of labels.
    inVector : str or numpy.ndarray
        Training vector file, or the array of labels.
    inField : str
        Name of the column storing the class number.
    outModel : str or None
        When given, ``[model, M, m, inClassifier]`` is pickled to this
        path.
    inSplit : int, float or str
        * number below 100: percentage of the samples of each class used
          for training, the others being kept for validation;
        * any other number: train with every sample, no validation;
        * path ending with ``.shp`` or ``.sqlite``: vector file used as
          validation set;
        * ``'SLOO'``: cross-validation folds built by ``distanceCV``;
          ``extraParam['distance']`` is required. Defaults:
          ``minTrain=-1``, ``SLOO=True``, ``maxIter=False``,
          ``otherLevel=False``;
        * ``'STAND'``: cross-validation folds built by ``standCV``.
          Defaults: ``SLOO=False``, ``maxIter=5``, ``inStand='stand'``.
    inSeed : int
        Seed used for the random split and given to ``standCV``.
    outMatrix : str or None
        When given, the validation confusion matrix is written to this CSV
        file.
    inClassifier : str
        ``'GMM'``, ``'KNN'``, ``'SVM'`` or ``'RF'``.
    extraParam : dict or False
        Optional settings. Keys read here: ``readROIFromVector``,
        ``coords``, ``saveDir`` (folder receiving the per-fold confusion
        matrices of ``'SLOO'``/``'STAND'``), ``inStand``, ``param_algo``
        (keyword arguments of the classifier), ``param_grid`` (grid given
        to ``GridSearchCV``), ``distance``, ``minTrain``, ``SLOO``,
        ``maxIter`` and ``otherLevel``.
    feedback : object, str or None
        Forwarded to ``pushFeedback``. ``'gui'`` additionally drives a
        progress bar.

    Notes
    -----
    On success the instance exposes ``model``, ``M`` and ``m``; ``x`` and
    ``y`` are also set when every sample is used for training.
    """
    # ``extraParam`` defaults to False: use an empty mapping so that every
    # optional key can be read uniformly.
    params = extraParam if extraParam else {}

    needXY = True
    pushFeedback('Learning model...', feedback=feedback)
    pushFeedback(0, feedback=feedback)
    total = 100 / 10

    SPLIT = inSplit
    # Only these two modes provide their own cross-validation folds.
    useCustomCV = isinstance(SPLIT, str) and SPLIT in ('SLOO', 'STAND')

    if feedback == 'gui':
        progress = pB.progressBar('Loading...', 6)

    saveDir = None
    inVectorTest = False
    ROI = None
    # Rasters produced by ``rasterize``; removed once sampling is done.
    # NOTE(review): assumes ``rasterize`` returns the path of a temporary
    # file, as the original ``os.remove(ROI)`` implies -- confirm.
    tempRasters = []

    try:
        if isinstance(inRaster, np.ndarray):
            needXY = False
            X = inRaster
            if isinstance(inVector, np.ndarray):
                Y = inVector
            else:
                msg = 'You have to give an array for label when using array for raster'
                pushFeedback(msg, feedback=feedback)

        readROI = params.get('readROIFromVector', False)
        if readROI is not False:
            try:
                try:
                    from function_vector import readROIFromVector
                except ImportError:
                    from .function_vector import readROIFromVector
                X, Y = readROIFromVector(inVector, readROI, inField)
                needXY = False
            except Exception:
                msg = 'Problem when importing readFieldVector from functions in dzetsaka'
                pushFeedback(msg, feedback=feedback)

        if 'saveDir' in params:
            saveDir = params['saveDir']
            if not os.path.exists(saveDir):
                os.makedirs(saveDir)
            if not os.path.exists(os.path.join(saveDir, 'matrix/')):
                os.makedirs(os.path.join(saveDir, 'matrix/'))

        if isinstance(SPLIT, str) and SPLIT.endswith(('.shp', '.sqlite')):
            inVectorTest = SPLIT

        if needXY:
            ROI = rasterize(inRaster, inVector, inField)
            tempRasters.append(ROI)

        if inVectorTest:
            ROIt = rasterize(inRaster, inVectorTest, inField)
            tempRasters.append(ROIt)
            Xt, yt = dataraster.get_samples_from_roi(inRaster, ROIt)
            # NOTE(review): the validation samples are scaled on their own
            # here, not with the M/m computed on the training samples --
            # confirm this is intended.
            xt, _, _ = self.scale(Xt)

        if SPLIT == 'SLOO':
            try:
                from function_vector import distanceCV, distMatrix
            except Exception:
                from .function_vector import distanceCV, distMatrix

            sampledCoords = None
            if needXY:
                X, Y, sampledCoords = dataraster.get_samples_from_roi(
                    inRaster, ROI, getCoords=True)
            # Coordinates given by the caller win over the sampled ones.
            coords = params.get('coords', sampledCoords)
            if coords is None:
                pushFeedback('Can\'t read coords array', feedback=feedback)
                raise ValueError('No coordinates available for SLOO')
            distanceArray = distMatrix(coords)
        elif SPLIT == 'STAND':
            try:
                from .function_vector import standCV
            except Exception:
                from function_vector import standCV

            inStand = params.get('inStand', 'stand')
            STAND = rasterize(inRaster, inVector, inStand)
            tempRasters.append(STAND)
            X, Y, STDs = dataraster.get_samples_from_roi(
                inRaster, ROI, STAND)
        elif needXY:
            X, Y = dataraster.get_samples_from_roi(inRaster, ROI)
    except Exception:
        msg = "Problem with getting samples from ROI \n Are you sure to have only integer values in your " + \
            str(inField) + " field ?\n "
        pushFeedback(msg, feedback=feedback)

    _, d = X.shape
    C = int(Y.max())

    for tempRaster in tempRasters:
        try:
            os.remove(tempRaster)
        except (OSError, TypeError):
            # Best effort: a leftover temporary file must not stop learning.
            pass

    # Scale the data
    X, M, m = self.scale(X)

    pushFeedback(int(1 * total), feedback=feedback)
    if feedback == 'gui':
        progress.addStep()  # Add Step to ProgressBar

    # Learning process: take SPLIT percent of the ground-truth pixels of
    # each class for training and the remaining ones for testing.
    try:
        if isinstance(SPLIT, (int, float)) and SPLIT < 100:
            x = np.array([]).reshape(0, d)
            y = np.array([]).reshape(0, 1)
            xt = np.array([]).reshape(0, d)
            yt = np.array([]).reshape(0, 1)

            np.random.seed(inSeed)  # Set the random generator state
            for label in range(1, C + 1):
                t = np.where(label == Y)[0]
                nc = t.size
                ns = int(nc * (SPLIT / float(100)))
                rp = np.random.permutation(nc)
                trainIdx, testIdx = t[rp[0:ns]], t[rp[ns:]]
                x = np.concatenate((X[trainIdx, :], x))
                xt = np.concatenate((X[testIdx, :], xt))
                y = np.concatenate((Y[trainIdx], y))
                yt = np.concatenate((Y[testIdx], yt))
        else:
            x, y = X, Y
            self.x = x
            self.y = y
    except Exception:
        pushFeedback(
            "Problem while learning if SPLIT <1", feedback=feedback)

    pushFeedback(int(2 * total), feedback=feedback)
    if feedback == 'gui':
        progress.addStep()

    pushFeedback('Learning process...', feedback=feedback)
    pushFeedback(
        'This step could take a lot of time... So be patient, even if the progress bar stucks at 20% :)',
        feedback=feedback)

    if feedback == 'gui':
        progress.addStep()  # Add Step to ProgressBar

    # Train classifier
    if inClassifier == 'GMM':
        try:
            from . import gmm_ridge as gmmr
        except Exception:
            import gmm_ridge as gmmr

        try:
            model = gmmr.GMMR()
            model.learn(x, y)
        except Exception:
            pushFeedback("Cannot train with GMM", feedback=feedback)
    else:
        from sklearn.model_selection import StratifiedKFold
        from sklearn.model_selection import GridSearchCV

        # Keyword arguments of the classifier, when the caller gave some.
        param_algo = params.get('param_algo')

        try:
            if SPLIT == 'STAND':
                label = np.copy(Y)
                SLOO = params.get('SLOO', False)
                maxIter = params.get('maxIter', 5)

                rawCV = standCV(label, STDs, maxIter, SLOO, seed=inSeed)
                cvDistance = [(tr, vl) for tr, vl in rawCV]
            elif SPLIT == 'SLOO':
                # Compute the folds used for learning later
                label = np.copy(Y)
                if 'distance' not in params:
                    pushFeedback(
                        'You need distance in extraParam',
                        feedback=feedback)
                    raise ValueError('extraParam has no distance')
                distance = params['distance']
                if 'minTrain' in params:
                    minTrain = float(params['minTrain'])
                else:
                    minTrain = -1
                SLOO = params.get('SLOO', True)
                maxIter = params.get('maxIter', False)
                otherLevel = params.get('otherLevel', False)

                pushFeedback('label is ' + str(label.shape),
                             feedback=feedback)
                pushFeedback('distance array shape is ' +
                             str(distanceArray.shape), feedback=feedback)
                pushFeedback(
                    'minTrain is ' + str(minTrain), feedback=feedback)
                pushFeedback('SLOO is ' + str(SLOO), feedback=feedback)
                pushFeedback(
                    'maxIter is ' + str(maxIter), feedback=feedback)

                rawCV = distanceCV(
                    distanceArray,
                    label,
                    distanceThresold=distance,
                    minTrain=minTrain,
                    SLOO=SLOO,
                    maxIter=maxIter,
                    stats=False)

                pushFeedback(
                    'Computing SLOO Cross Validation', feedback=feedback)

                cvDistance = []
                for tr, vl in rawCV:
                    pushFeedback('Training size is ' +
                                 str(tr.shape), feedback=feedback)
                    pushFeedback('Validation size is ' +
                                 str(vl.shape), feedback=feedback)
                    cvDistance.append((tr, vl))

            if inClassifier == 'RF':
                from sklearn.ensemble import RandomForestClassifier
                nFeatures = x.shape[1]
                # Same grid as before for 3 features or more; the guards
                # avoid a zero step (or an empty grid) with fewer features.
                param_grid = dict(
                    n_estimators=3**np.arange(1, 5),
                    max_features=range(
                        1, max(nFeatures, 2), max(1, nFeatures // 3)))
                if param_algo is not None:
                    classifier = RandomForestClassifier(**param_algo)
                else:
                    classifier = RandomForestClassifier()
                n_splits = 5
            elif inClassifier == 'SVM':
                from sklearn.svm import SVC
                param_grid = dict(
                    gamma=2.0**np.arange(-4, 4), C=10.0**np.arange(-2, 5))
                if param_algo is not None:
                    classifier = SVC(probability=True, **param_algo)
                    pushFeedback('Found param algo : ' + str(param_algo),
                                 feedback=feedback)
                else:
                    classifier = SVC(probability=True, kernel="rbf")
                n_splits = 5
            elif inClassifier == 'KNN':
                from sklearn import neighbors
                param_grid = dict(n_neighbors=np.arange(1, 20, 4))
                if param_algo is not None:
                    classifier = neighbors.KNeighborsClassifier(
                        **param_algo)
                else:
                    classifier = neighbors.KNeighborsClassifier()
                n_splits = 3
        except Exception:
            pushFeedback(
                "Cannot train with classifier " + inClassifier,
                feedback=feedback)

        if feedback == 'gui':
            progress.prgBar.setValue(5)  # Add Step to ProgressBar

        # Custom folds only exist for 'SLOO'/'STAND'. Every other inSplit
        # (int, float percentage or validation vector) uses stratified
        # k-fold; the original raised a NameError for the last two.
        if useCustomCV:
            cv = cvDistance
        else:
            cv = StratifiedKFold(n_splits=n_splits)

        y.shape = (y.size,)

        if 'param_grid' in params:
            param_grid = params['param_grid']
            pushFeedback(
                'Custom param for Grid Search CV has been found : ' +
                str(param_grid),
                feedback=feedback)

        grid = GridSearchCV(
            classifier,
            param_grid=param_grid,
            cv=cv,
            n_jobs=1)
        grid.fit(x, y)
        model = grid.best_estimator_
        # Refit the selected estimator on the whole training set.
        model.fit(x, y)

        if useCustomCV:
            from sklearn.base import clone
            from sklearn.metrics import confusion_matrix

            CM = []
            testIndex = []
            for train_index, test_index in cv:
                # Evaluate each fold on a copy, so that the model kept and
                # saved below stays the one fitted on every sample.
                foldModel = clone(model)
                foldModel.fit(x[train_index], y[train_index])
                foldPrediction = foldModel.predict(x[test_index])
                CM.append(confusion_matrix(y[test_index], foldPrediction))
                testIndex.append(test_index)

            if saveDir is None:
                pushFeedback(
                    'No saveDir in extraParam, confusion matrices are not saved',
                    feedback=feedback)
            else:
                for i, foldCM in enumerate(CM):
                    if SPLIT == 'SLOO':
                        csvName = str(distance) + '_' + str(inField) + \
                            '_' + str(minTrain) + '_' + str(i) + '.csv'
                        np.savetxt(
                            os.path.join(saveDir, 'matrix/' + csvName),
                            foldCM,
                            delimiter=',',
                            fmt='%.d')
                        if otherLevel is not False:
                            otherLevelFolder = os.path.join(
                                saveDir, 'matrix/level3/')
                            if not os.path.exists(otherLevelFolder):
                                os.makedirs(otherLevelFolder)
                            # Wide integers: np.byte overflowed as soon as
                            # a cell counted more than 127 samples.
                            bigCM = np.zeros([14, 14], dtype=np.int64)
                            curLevel = otherLevel[testIndex[i]]
                            curLevel = np.sort(curLevel, axis=0)
                            for lvl in range(curLevel.shape[0]):
                                bigCM[curLevel.astype(int) - 1,
                                      curLevel[lvl].astype(int) - 1] = \
                                    foldCM[:, lvl].reshape(-1, 1)
                            np.savetxt(
                                os.path.join(otherLevelFolder, csvName),
                                bigCM,
                                delimiter=',',
                                fmt='%.d')
                    elif SPLIT == 'STAND':
                        np.savetxt(
                            os.path.join(
                                saveDir,
                                'matrix/stand_' + str(inField) + '_' +
                                str(i) + '.csv'),
                            foldCM,
                            delimiter=',',
                            fmt='%.d')

    pushFeedback(int(9 * total), feedback=feedback)

    # Assess the quality of the model
    if feedback == 'gui':
        progress.prgBar.setValue(90)

    # Same condition as the one creating xt/yt: a validation vector, or a
    # numeric split below 100.
    hasValidationSet = bool(inVectorTest) or (
        isinstance(SPLIT, (int, float)) and SPLIT < 100)
    if hasValidationSet:
        yp = model.predict(xt)
        CONF = ai.CONFUSION_MATRIX()
        CONF.compute_confusion_matrix(yp, yt)

        if outMatrix is not None:
            outMatrixDir = os.path.dirname(outMatrix)
            # A bare file name has an empty dirname, which makedirs rejects.
            if outMatrixDir and not os.path.exists(outMatrixDir):
                os.makedirs(outMatrixDir)
            np.savetxt(
                outMatrix,
                CONF.confusion_matrix,
                delimiter=',',
                header='Columns=prediction,Lines=reference.',
                fmt='%1.4d')

        if inClassifier != 'GMM':
            for key in param_grid.keys():
                message = 'best ' + key + ' : ' + \
                    str(grid.best_params_[key])
                if feedback == 'gui':
                    QgsMessageLog.logMessage(message)
                elif feedback:
                    feedback.setProgressText(message)
                else:
                    print(message)

        res = {
            'Overall Accuracy': CONF.OA,
            'Kappa': CONF.Kappa,
            'f1': CONF.F1mean}

        for estim in res:
            pushFeedback(estim + ' : ' + str(res[estim]),
                         feedback=feedback)

    # Keep and save the model together with the scaling parameters
    self.model = model
    self.M = M
    self.m = m
    if outModel is not None:
        with open(outModel, 'wb') as output:
            pickle.dump([model, M, m, inClassifier], output)

    pushFeedback(int(10 * total), feedback=feedback)
    if feedback == 'gui':
        progress.reset()
        progress = None