def trainMulticlassBaggingModel(DataSet, VegeTypes, varnames, params, multiclassmethod, baggingmetric='kappa',
                                baggingweightindex=1, baggingmetricthres=0.7, varlabelweights=[-1], colsamplerate=0.7,
                                train_percent=0.75, runtimes=300, bool_autolabel=True, varlabels=[], n_varlabels=5,
                                bool_weight=False, bool_strclass=False, labelHeaderName="",
                                bool_save=False, savedirbase=""):
    # Cluster variables (optional) and draw a stratified random column subset for each bagging runtime
    if bool_autolabel:
        varlabels = vc.KMeansLabel(DataSet, varnames, n_varlabels=n_varlabels)
    selectruntimesvarnames = _stratifiedRandomChoice_column(
        varnames, varlabels, varlabelweights, colsamplerate, runtimes)
    evalValues = np.zeros(runtimes)
    weights = np.zeros(runtimes)
    for runtime in range(runtimes):
        savedir = savedirbase + os.sep + "runtime_" + str(runtime)
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        RuntimeDataSet = xgbf.trainingDataSet(DataSet, VegeTypes, selectruntimesvarnames[runtime],
                                              bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
        [train_x, test_x, train_y, test_y] = xgbf.splitTrainTestData(RuntimeDataSet, train_percent, bool_stratify=True)
        try:
            if multiclassmethod == 'softmax':
                ModelList = mlc.trainMulticlassSoftmaxModel([train_y, train_x], VegeTypes, varnames, params,
                                                            runtime=runtime, bool_weight=bool_weight,
                                                            bool_pandas=False, bool_save=bool_save, savedir=savedir)
                [pred_Y, pred_pY, test_Y] = mlc.testMulticlassSoftmaxModel(ModelList, [test_y, test_x], VegeTypes,
                                                                           varnames, params, runtime=runtime,
                                                                           bool_pandas=False, bool_save=bool_save,
                                                                           savedir=savedir)
            elif multiclassmethod == 'category':
                ModelList = mlc.trainMulticlassCategoryModel([train_y, train_x], VegeTypes, varnames, params,
                                                             runtime=runtime, bool_weight=bool_weight,
                                                             bool_pandas=False, bool_save=bool_save, savedir=savedir)
                [pred_Y, pred_pY, test_Y] = mlc.testMulticlassCategoryModel(ModelList, [test_y, test_x], VegeTypes,
                                                                            varnames, params, runtime=runtime,
                                                                            bool_pandas=False, bool_save=bool_save,
                                                                            savedir=savedir)
            else:
                print("Invalid Multiclass Method Input!")
            evalValues[runtime] = xgbf.Evaluate(test_Y, pred_Y, pred_pY, baggingmetric)
            print("Runtime: %d model done. Evaluation Value = %f" % (runtime, evalValues[runtime]))
        except Exception:
            print("Model not established!")
            evalValues[runtime] = 0.0
    weights = _calWeight(evalValues, baggingweightindex, baggingmetricthres)
    evalFiledirto = savedirbase + os.sep + "Runtime_Model_Evaluation_Weights.csv"
    init.writeArrayListToCSV([evalValues, weights], [baggingmetric, 'weight'], evalFiledirto)
    # Write each runtime model's selected variable names
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    save = pd.DataFrame({})
    for runtime in range(runtimes):
        pdtmp = pd.DataFrame({
            "SelectVarName_run" + str(runtime): selectruntimesvarnames[runtime]
        })
        save = pd.concat([save, pdtmp], axis=1)
    save.to_csv(selectvarnamesfiledir, index=False, header=True)
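
# Illustrative usage sketch (not part of the original module): one way trainMulticlassBaggingModel
# might be called. The XGBoost parameters, runtime count, and output folder below are hypothetical
# placeholders, not values taken from the project.
def _example_trainMulticlassBagging(DataSet, VegeTypes, varnames):
    params = {
        'objective': 'multi:softprob',   # assumed multiclass objective
        'eta': 0.1,
        'max_depth': 6,
        'num_class': len(VegeTypes),
    }
    trainMulticlassBaggingModel(DataSet, VegeTypes, varnames, params,
                                multiclassmethod='softmax',
                                baggingmetric='kappa',
                                runtimes=50,                            # fewer runtimes than the default, for a quick test
                                bool_save=True,
                                savedirbase="./bagging_softmax_runs")   # hypothetical output folder
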
def _trainSingleclassBaggingModel(CPIDs, DataSet, vtname, params, baggingmetric, bool_gpu, n_gpus, n_parallels,
                                  selectruntimesvarnames, runtime, train_percent, single_thres, bool_balance,
                                  bool_strclass, labelHeaderName, bool_save, savedirbase):
    # Assign task to worker
    print("Training #%d model..." % runtime)
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus
    # Execute model training process
    RuntimeDataSet = xgbf.trainingDataSet(DataSet, [vtname], selectruntimesvarnames[runtime],
                                          bool_strclass=bool_strclass, labelHeaderName=labelHeaderName,
                                          bool_binary=True)
    [train_x, test_x, train_y, test_y] = xgbf.splitTrainTestData(RuntimeDataSet, train_percent, bool_stratify=1)
    if bool_balance:
        # Balance positive/negative samples via XGBoost's scale_pos_weight
        if len(train_y.shape) > 1:
            ratio = float(np.sum(train_y[:, 0] == 0)) / np.sum(train_y[:, 0] == 1)
        else:
            ratio = float(np.sum(train_y[:] == 0)) / np.sum(train_y[:] == 1)
        params_parallel['scale_pos_weight'] = ratio
    model = xgbf.TrainModel(train_x, train_y, params_parallel)
    savedir = savedirbase + os.sep + "runtime_" + str(runtime)
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    modelName = vtname + '_xgboost_singleclass_run' + str(runtime) + ".model"
    modeldir = savedir + os.sep + modelName
    model.save_model(modeldir)
    [pred_Y, pred_pY] = xgbf.Predict(model, test_x, bool_binary=1, threshold=single_thres)
    evalValue = xgbf.Evaluate(test_y, pred_Y, pred_pY, baggingmetric)
    print("Runtime: %d model training finished. Evaluation Value = %f\n" % (runtime, evalValue))
    return evalValue
def _trainMulticlassBaggingModel(CPIDs, DataSet, VegeTypes, varnames, params, multiclassmethod, bool_gpu, n_gpus,
                                 n_parallels, selectruntimesvarnames, runtime, train_percent, baggingmetric,
                                 bool_weight, bool_strclass, labelHeaderName, bool_save, savedirbase):
    # Assign task to worker
    print("Training #%d model..." % runtime)
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus
    # Execute model training process
    savedir = savedirbase + os.sep + "runtime_" + str(runtime)
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    RuntimeDataSet = xgbf.trainingDataSet(DataSet, VegeTypes, selectruntimesvarnames[runtime],
                                          bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
    [train_x, test_x, train_y, test_y] = xgbf.splitTrainTestData(RuntimeDataSet, train_percent, bool_stratify=1)
    if multiclassmethod == 'softmax':
        ModelList = mlc.trainMulticlassSoftmaxModel([train_y, train_x], VegeTypes, varnames, params_parallel,
                                                    runtime=runtime, bool_weight=bool_weight, bool_pandas=False,
                                                    bool_save=bool_save, savedir=savedir)
        [pred_Y, pred_pY, test_Y] = mlc.testMulticlassSoftmaxModel(ModelList, [test_y, test_x], VegeTypes, varnames,
                                                                   params_parallel, runtime=runtime, bool_pandas=False,
                                                                   bool_save=bool_save, savedir=savedir)
    elif multiclassmethod == 'category':
        ModelList = mlc.trainMulticlassCategoryModel([train_y, train_x], VegeTypes, varnames, params_parallel,
                                                     runtime=runtime, bool_weight=bool_weight, bool_pandas=False,
                                                     bool_save=bool_save, savedir=savedir)
        [pred_Y, pred_pY, test_Y] = mlc.testMulticlassCategoryModel(ModelList, [test_y, test_x], VegeTypes, varnames,
                                                                    params_parallel, runtime=runtime, bool_pandas=False,
                                                                    bool_save=bool_save, savedir=savedir)
    else:
        print("Invalid Multiclass Method Input!")
    evalValue = xgbf.Evaluate(test_Y, pred_Y, pred_pY, baggingmetric)
    # evalValues[runtime] = xgbf.Evaluate(test_Y, pred_Y, pred_pY, access_method)
    print("Runtime: %d model training finished. Evaluation Value = %f\n" % (runtime, evalValue))
    return evalValue
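
# Minimal dispatch sketch (an assumption, not the project's actual driver): the CPIDs argument of
# the worker functions above is expected to be a list shared across processes, e.g. a
# multiprocessing.Manager().list(), so each worker can derive a stable worker index and pick a GPU
# via process_pid_index % n_gpus. Parameter values below are hypothetical placeholders.
def _example_parallelBagging(DataSet, VegeTypes, varnames, params, selectruntimesvarnames,
                             runtimes=300, n_parallels=4, n_gpus=1, savedirbase="./runs"):
    import multiprocessing as mp
    from functools import partial

    manager = mp.Manager()
    CPIDs = manager.list()  # shared PID registry used by the workers for indexing
    worker = partial(_trainMulticlassBaggingModel, CPIDs, DataSet, VegeTypes, varnames, params,
                     'softmax', True, n_gpus, n_parallels, selectruntimesvarnames)
    with mp.Pool(n_parallels) as pool:
        # remaining positional args: runtime, train_percent, baggingmetric, bool_weight,
        # bool_strclass, labelHeaderName, bool_save, savedirbase
        results = [pool.apply_async(worker, (runtime, 0.75, 'kappa', False, False, "", False, savedirbase))
                   for runtime in range(runtimes)]
        evalValues = np.array([r.get() for r in results])
    return evalValues
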
        bool_strclass=bool_strclass, labelHeaderName=labelHeaderName, bool_save=bool_save, savedir=savedir)
elif multiclassmethod == 'category':
    ModelList = mlc.trainMulticlassCategoryModel(TrainDataSet, VegeTypes, varnames, params, bool_weight=bool_weight,
                                                 bool_strclass=bool_strclass, labelHeaderName=labelHeaderName,
                                                 bool_save=bool_save, savedir=savedir)
    [pred_Y, pred_pY, test_Y] = mlc.testMulticlassCategoryModel(ModelList, TestDataSet, VegeTypes, varnames, params,
                                                                bool_strclass=bool_strclass,
                                                                labelHeaderName=labelHeaderName,
                                                                bool_save=bool_save, savedir=savedir)
else:
    print("Invalid Multiclass Method Input!")

#%%
# Evaluate results
evalueFolder = dirto
xgbf.mlcEvalAndWriteResult(evalueFolder, pred_Y, pred_pY, test_Y)

# Plot confusion matrix
plotfiledirto = evalueFolder + os.sep + "conf_mat.png"
plot.plot_confusion_matrix(xgbf.Evaluate(test_Y, pred_Y, pred_pY, 'confmat'), VegeTypes, title='Confusion Matrix',
                           cmap=None, normalize=False, figsize=(8, 6), fontsize=11, labelsize=11,
                           savedir=plotfiledirto)

#%%
# Predict mapping results
print("Predict region...")
nanDefault = -9999
[TiffList, Total] = init.generateVarialeTiffList(variableFolderdir, varnames, postfix)
[MatX, Driver, GeoTransform, Proj, nrow, ncol] = ptf.readTiffAsNumpy(TiffList)
multiclassFolderName = "Multiclass_XGBoost_" + multiclassmethod + "_Model"
savedir = root + os.sep + modelFolderName + os.sep + multiclassFolderName
if multiclassmethod == 'softmax':
    pred_X = init.fomatMulticlassSoftmaxMatrix(MatX)
    pred_pY = mlc.predictMulticlassSoftmaxModelCvted(ModelList, pred_X,
def trainSingleclassBaggingModel(DataSet, vtname, varnames, params, baggingmetric='auc', baggingweightindex=1,
                                 baggingmetricthres=0.7, single_thres=0.5, varlabelweights=[-1], colsamplerate=0.7,
                                 train_percent=0.75, runtimes=300, bool_autolabel=True, varlabels=[], n_varlabels=5,
                                 bool_balance=True, bool_strclass=False, labelHeaderName="",
                                 bool_save=False, savedirbase=""):
    ModelList = []
    if bool_autolabel:
        varlabels = vc.KMeansLabel(DataSet, varnames, n_varlabels=n_varlabels)
    selectruntimesvarnames = _stratifiedRandomChoice_column(
        varnames, varlabels, varlabelweights, colsamplerate, runtimes)
    evalValues = np.zeros(runtimes)
    for runtime in range(runtimes):
        RuntimeDataSet = xgbf.trainingDataSet(DataSet, [vtname], selectruntimesvarnames[runtime],
                                              bool_strclass=bool_strclass, labelHeaderName=labelHeaderName,
                                              bool_binary=True)
        [train_x, test_x, train_y, test_y] = xgbf.splitTrainTestData(RuntimeDataSet, train_percent, bool_stratify=1)
        if bool_balance:
            if len(train_y.shape) > 1:
                ratio = float(np.sum(train_y[:, 0] == 0)) / np.sum(train_y[:, 0] == 1)
            else:
                ratio = float(np.sum(train_y[:] == 0)) / np.sum(train_y[:] == 1)
            params['scale_pos_weight'] = ratio
        model = xgbf.TrainModel(train_x, train_y, params)
        if bool_save:
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            if not os.path.exists(savedir):
                os.makedirs(savedir)
            modelName = vtname + '_xgboost_singleclass_run' + str(runtime) + ".model"
            modeldir = savedir + os.sep + modelName
            model.save_model(modeldir)
        else:
            ModelList.append(model)
        [pred_Y, pred_pY] = xgbf.Predict(model, test_x, bool_binary=1, threshold=single_thres)
        evalValues[runtime] = xgbf.Evaluate(test_y, pred_Y, pred_pY, baggingmetric)
        print("Runtime: %d model done. Evaluation Value = %f" % (runtime, evalValues[runtime]))
    baggingweights = _calWeight(evalValues, runtimes, baggingweightindex, baggingmetricthres)
    if bool_save:
        # Save per-runtime evaluation values and bagging weights
        evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
        evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
        evalweightsarray = np.zeros([runtimes, 2])
        evalweightsarray[:, 0] = evalValues
        evalweightsarray[:, 1] = baggingweights
        evalweightarrayname = [baggingmetric, 'weight']
        init.writeArrayToCSV(evalweightsarray, evalweightarrayname, evalweightsFiledirto)
        # Save the variable names selected for each runtime model
        selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        save = pd.DataFrame({})
        for runtime in range(runtimes):
            pdtmp = pd.DataFrame({
                "SelectVarName_run" + str(runtime): selectruntimesvarnames[runtime]
            })
            save = pd.concat([save, pdtmp], axis=1)
        save.to_csv(selectvarnamesfiledir, index=False, header=True)
        return []
    else:
        return [ModelList, selectruntimesvarnames, baggingweights]
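
# Both bagging trainers rely on _stratifiedRandomChoice_column, which is defined elsewhere in this
# module. The sketch below only illustrates what such a helper might do (per runtime, sample roughly
# colsamplerate of the columns, stratified over the KMeans variable clusters); it is NOT the
# project's actual implementation, and the varlabelweights handling is omitted here.
def _example_stratifiedRandomChoice_column(varnames, varlabels, varlabelweights, colsamplerate, runtimes):
    varnames = np.asarray(varnames)
    varlabels = np.asarray(varlabels)
    selections = []
    for _ in range(runtimes):
        chosen = []
        for label in np.unique(varlabels):
            group = varnames[varlabels == label]
            n_pick = max(1, int(round(colsamplerate * len(group))))  # at least one variable per cluster
            chosen.extend(np.random.choice(group, size=n_pick, replace=False))
        selections.append(list(chosen))
    return selections
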
    init.getListFromPandas(HierRelationsFiledir, labelHeaderName_L))

#%%
# Produce merged predicted test set
pred_Y = hmap.predictHierUpMapping(baseMapTestResult, VegeTypes1, VegeTypes2, HierRelations)
test_Y = hmap.predictHierUpMapping(realTestY, VegeTypes1, VegeTypes2, HierRelations)

# Evaluate
EvalueFolder = dirto
xgbf.mlcEvalAndWriteResult(EvalueFolder, pred_Y, np.zeros_like(pred_Y), test_Y)

# Plot confusion matrix
plotfiledirto = EvalueFolder + os.sep + "conf_mat.png"
chsize = 5
plot.plot_confusion_matrix(xgbf.Evaluate(test_Y, pred_Y, np.zeros_like(pred_Y), 'confmat'), VegeTypes1,
                           title='Confusion Matrix', cmap=None, normalize=False, figsize=(8, 6),
                           fontsize=chsize, labelsize=chsize, savedir=plotfiledirto)

#%%
# Produce merged map
print("Predict region...")
nanDefault = -9999
[baseMapLayer, Driver, GeoTransform, Proj, nrow, ncol] = ptf.readTiffAsNumpy([baseMapLayerFiledir])
baseMapLayer = baseMapLayer[:, :, 0].astype(np.int32)
pred_Y = hmap.predictHierUpMapping(baseMapLayer, VegeTypes1, VegeTypes2, HierRelations)

# Write mapping results
Filename2 = "Merging_Mapping_result" + postfix
def evalFeature(CPIDs, evaluate_feature, TrainDataSet, ValidDataSet, VegeTypes, feature_names, multiclassmethod,
                params, evalue_method, bool_cv, cv_num, skf_split, bool_gpu, n_gpus, n_parallels, bool_weight,
                bool_strclass, labelHeaderName, bool_save, savedir):
    print("Trying to evaluate feature: %s" % evaluate_feature)
    # Assign task to worker
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus
    if bool_cv == 1:
        # Evaluate the candidate feature set with repeated stratified k-fold cross-validation
        [Y, X] = xgbf.trainingDataSet(TrainDataSet, VegeTypes, feature_names,
                                      bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
        if not bool_strclass:
            class_labels = init.mergeCategories(Y)
        else:
            class_labels = Y
        pred_Y_cv = np.zeros(len(class_labels) * cv_num, dtype=np.int32)
        pred_pY_cv = np.zeros(len(class_labels) * cv_num)
        test_Y_cv = np.zeros(len(class_labels) * cv_num, dtype=np.int32)
        last_cv_idx = 0
        current_cv_idx = 0
        for cv_i in range(cv_num):
            skf = StratifiedKFold(n_splits=skf_split, shuffle=True)
            cv_j = 0
            for train, test in skf.split(X, class_labels):
                train_x = X[train]
                train_y = Y[train]
                test_x = X[test]
                test_y = Y[test]
                if multiclassmethod == 'softmax':
                    ModelList = mtc.trainMulticlassSoftmaxModel([train_y, train_x], VegeTypes, feature_names,
                                                                params_parallel, bool_weight=bool_weight,
                                                                bool_pandas=False, bool_strclass=bool_strclass,
                                                                labelHeaderName=labelHeaderName,
                                                                bool_save=bool_save, savedir=savedir)
                    [pred_Y, pred_pY, test_Y] = mtc.testMulticlassSoftmaxModel(ModelList, [test_y, test_x], VegeTypes,
                                                                               feature_names, params_parallel,
                                                                               bool_pandas=False,
                                                                               bool_strclass=bool_strclass,
                                                                               labelHeaderName=labelHeaderName,
                                                                               bool_save=bool_save, savedir=savedir)
                elif multiclassmethod == 'category':
                    ModelList = mtc.trainMulticlassCategoryModel([train_y, train_x], VegeTypes, feature_names,
                                                                 params_parallel, bool_weight=bool_weight,
                                                                 bool_pandas=False, bool_strclass=bool_strclass,
                                                                 labelHeaderName=labelHeaderName,
                                                                 bool_save=bool_save, savedir=savedir)
                    [pred_Y, pred_pY, test_Y] = mtc.testMulticlassCategoryModel(ModelList, [test_y, test_x], VegeTypes,
                                                                                feature_names, params_parallel,
                                                                                bool_pandas=False,
                                                                                bool_strclass=bool_strclass,
                                                                                labelHeaderName=labelHeaderName,
                                                                                bool_save=bool_save, savedir=savedir)
                else:
                    print("Invalid Multiclass Method Input!")
                # Accumulate out-of-fold predictions so the metric is computed once over all folds
                current_cv_idx = len(test_Y) + last_cv_idx
                pred_Y_cv[last_cv_idx:current_cv_idx] = pred_Y
                # pred_pY_cv[last_cv_idx:current_cv_idx] = pred_pY
                test_Y_cv[last_cv_idx:current_cv_idx] = test_Y
                last_cv_idx = current_cv_idx
                # evalues_runtime[cv_i, cv_j] = xgbf.Evaluate(test_Y, pred_Y, pred_pY, evalue_method)
                cv_j = cv_j + 1
        evalue = xgbf.Evaluate(test_Y_cv, pred_Y_cv, pred_pY_cv, evalue_method)
    else:
        # Single train/validation split
        if multiclassmethod == 'softmax':
            ModelList = mtc.trainMulticlassSoftmaxModel(TrainDataSet, VegeTypes, feature_names, params_parallel,
                                                        bool_weight=bool_weight, bool_pandas=True,
                                                        bool_strclass=bool_strclass, labelHeaderName=labelHeaderName,
                                                        bool_save=bool_save, savedir=savedir)
            [pred_Y, pred_pY, test_Y] = mtc.testMulticlassSoftmaxModel(ModelList, ValidDataSet, VegeTypes,
                                                                       feature_names, params_parallel,
                                                                       bool_pandas=True, bool_strclass=bool_strclass,
                                                                       labelHeaderName=labelHeaderName,
                                                                       bool_save=bool_save, savedir=savedir)
        elif multiclassmethod == 'category':
            ModelList = mtc.trainMulticlassCategoryModel(TrainDataSet, VegeTypes, feature_names, params_parallel,
                                                         bool_weight=bool_weight, bool_pandas=True,
                                                         bool_strclass=bool_strclass, labelHeaderName=labelHeaderName,
                                                         bool_save=bool_save, savedir=savedir)
            [pred_Y, pred_pY, test_Y] = mtc.testMulticlassCategoryModel(ModelList, ValidDataSet, VegeTypes,
                                                                        feature_names, params_parallel,
                                                                        bool_pandas=True, bool_strclass=bool_strclass,
                                                                        labelHeaderName=labelHeaderName,
                                                                        bool_save=bool_save, savedir=savedir)
        else:
            print("Invalid Multiclass Method Input!")
        evalue = xgbf.Evaluate(test_Y, pred_Y, pred_pY, evalue_method)
    print("Feature: %s partial evaluation value = %f\n" % (evaluate_feature, evalue))
    return evalue
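
# Sketch of a forward-selection driver around evalFeature (an assumption; the project's own driver
# is not shown in this excerpt): at each step, every remaining candidate feature is scored in
# parallel and the best-scoring one is added to the selected set. A real driver would also stop
# once the score no longer improves; that criterion is omitted here for brevity.
def _example_forwardSelection(TrainDataSet, ValidDataSet, VegeTypes, all_features, params,
                              multiclassmethod='softmax', evalue_method='kappa',
                              n_parallels=4, n_gpus=1, savedir="./feature_selection"):
    import multiprocessing as mp

    manager = mp.Manager()
    CPIDs = manager.list()
    selected, history = [], []
    remaining = list(all_features)
    while remaining:
        with mp.Pool(n_parallels) as pool:
            # positional args follow evalFeature's signature: bool_cv=1, cv_num=2, skf_split=5,
            # bool_gpu=False, then worker/weight/label/save settings
            jobs = {f: pool.apply_async(evalFeature,
                                        (CPIDs, f, TrainDataSet, ValidDataSet, VegeTypes, selected + [f],
                                         multiclassmethod, params, evalue_method, 1, 2, 5,
                                         False, n_gpus, n_parallels, False, False, "", False, savedir))
                    for f in remaining}
            scores = {f: job.get() for f, job in jobs.items()}
        best = max(scores, key=scores.get)
        selected.append(best)
        remaining.remove(best)
        history.append((best, scores[best]))
    return selected, history
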
def _estabModelAndPred(TrainDataSet, ValidDataSet, VegeTypes, feature_names, multiclassmethod, params, evalue_method,
                       EvalueFolder, variableFolderdir, postfix, bool_predictmap, bool_weight, bool_strclass,
                       labelHeaderName, bool_save, savedir):
    num_class = len(VegeTypes)
    # Establish training model
    if multiclassmethod == 'softmax':
        ModelList = mtc.trainMulticlassSoftmaxModel(TrainDataSet, VegeTypes, feature_names, params,
                                                    bool_weight=bool_weight, bool_strclass=bool_strclass,
                                                    labelHeaderName=labelHeaderName, bool_save=bool_save,
                                                    savedir=savedir)
        [pred_Y, pred_pY, test_Y] = mtc.testMulticlassSoftmaxModel(ModelList, ValidDataSet, VegeTypes, feature_names,
                                                                   params, bool_strclass=bool_strclass,
                                                                   labelHeaderName=labelHeaderName,
                                                                   bool_save=bool_save, savedir=savedir)
    elif multiclassmethod == 'category':
        ModelList = mtc.trainMulticlassCategoryModel(TrainDataSet, VegeTypes, feature_names, params,
                                                     bool_weight=bool_weight, bool_strclass=bool_strclass,
                                                     labelHeaderName=labelHeaderName, bool_save=bool_save,
                                                     savedir=savedir)
        [pred_Y, pred_pY, test_Y] = mtc.testMulticlassCategoryModel(ModelList, ValidDataSet, VegeTypes, feature_names,
                                                                    params, bool_strclass=bool_strclass,
                                                                    labelHeaderName=labelHeaderName,
                                                                    bool_save=bool_save, savedir=savedir)
    else:
        print("Invalid Multiclass Method Input!")
    # Write test results
    YArray = np.zeros([len(test_Y), 2])
    YArray[:, 0] = test_Y
    YArray[:, 1] = pred_Y
    YFiledirto = EvalueFolder + os.sep + "Best_Feature_Real_and_Predicted_Results.csv"
    init.writeArrayToCSV(YArray, ['real', 'predict'], YFiledirto)
    # Evaluate model and write result
    evalArray = np.zeros([1, 2])
    evalArray[0, 0] = xgbf.Evaluate(test_Y, pred_Y, pred_pY, 'accuracy')
    evalArray[0, 1] = xgbf.Evaluate(test_Y, pred_Y, pred_pY, 'kappa')
    evalFiledirto = EvalueFolder + os.sep + "Best_Feature_Model_Evaluation_ValidDataSet.csv"
    init.writeArrayToCSV(evalArray, ['accuracy', 'kappa'], evalFiledirto)
    # Find XGBoost feature scores
    featureScoreFiledirto = EvalueFolder + os.sep + "Feature_Scores.csv"
    model = ModelList[0]
    feature_scores = model.get_fscore()
    [feature_names, fscores] = locateFeatureScores(feature_names, feature_scores)
    init.writeArrayListToCSV([feature_names, fscores], ['VariableName', 'FeatureScore'], featureScoreFiledirto)
    if bool_predictmap:
        # Predict mapping results
        print("Predict region...")
        nanDefault = -9999
        [TiffList, Total] = init.generateVarialeTiffList(variableFolderdir, feature_names, postfix)
        [MatX, Driver, GeoTransform, Proj, nrow, ncol] = ptf.readTiffAsNumpy(TiffList)
        BestFeatureProductFolder = EvalueFolder + os.sep + "Best_Features_Mapping_Results"
        if multiclassmethod == 'softmax':
            pred_X = init.fomatMulticlassSoftmaxMatrix(MatX)
            pred_pY = mtc.predictMulticlassSoftmaxModelCvted(ModelList, pred_X, params, bool_save=bool_save,
                                                             savedir=savedir)
            [pred_Y, pred_pY] = init.reshapeMulticlassMatrix(pred_pY, nrow, ncol, num_class, bool_onearray=False)
        elif multiclassmethod == 'category':
            pred_X = init.formatMulticlassCategoryMatrix(MatX, num_class)
            pred_pY = mtc.predictMulticlassCategoryModelCvted(ModelList, pred_X, params, bool_save=bool_save,
                                                              savedir=savedir)
            [pred_Y, pred_pY] = init.reshapeMulticlassMatrix(pred_pY, nrow, ncol, num_class, bool_onearray=True)
        # Write per-class probability layers
        for i in range(len(VegeTypes)):
            vtname = VegeTypes[i]
            ProductFolder = BestFeatureProductFolder + os.sep + vtname
            if not os.path.exists(ProductFolder):
                os.makedirs(ProductFolder)
            Filename1 = vtname + "_xgboost_" + multiclassmethod + postfix
            ProductFiledirto1 = ProductFolder + os.sep + Filename1
            ptf.writeNumpyToTiff(pred_pY[:, :, i], Driver, GeoTransform, Proj, nrow, ncol, nanDefault,
                                 ProductFiledirto1, datatype='Float32')
        # Write the categorical vegetation map
        Filename2 = "VegeMap_XGBoost_multiclass_" + multiclassmethod + postfix
        ProductFolder = BestFeatureProductFolder
        ProductFiledirto2 = ProductFolder + os.sep + Filename2
        ptf.writeNumpyToTiff(pred_Y, Driver, GeoTransform, Proj, nrow, ncol, nanDefault, ProductFiledirto2,
                             datatype='Int16')
    return fscores
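
# Illustrative call sketch for _estabModelAndPred (folders and settings are hypothetical
# placeholders, not values from the original project): retrain on the selected best features,
# write validation results and feature scores, and optionally predict the full map.
def _example_estabBestFeatureModel(TrainDataSet, ValidDataSet, VegeTypes, best_features, params):
    fscores = _estabModelAndPred(TrainDataSet, ValidDataSet, VegeTypes, best_features,
                                 'softmax', params, 'kappa',
                                 EvalueFolder="./best_feature_evaluation",      # hypothetical output folder
                                 variableFolderdir="./environment_variables",   # hypothetical raster folder
                                 postfix=".tif",
                                 bool_predictmap=False,                         # skip map prediction in this sketch
                                 bool_weight=False, bool_strclass=False, labelHeaderName="",
                                 bool_save=False, savedir="./best_feature_models")
    return fscores
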