def _testMulticlassBaggingModel(CPIDs,RuntimeList,TestDataSet,VegeTypes,params,multiclassmethod,bool_gpu,n_gpus,n_parallels,\ selectruntimesvarnames,baggingweights,bool_strclass,labelHeaderName,bool_save,savedirbase): print("Predicting Multiclass Bagging Ensemble Models...") params_parallel = copy.deepcopy(params) process_pid = os.getpid() if len(CPIDs) < n_parallels: CPIDs.append(process_pid) process_pid_index = CPIDs.index(process_pid) print("Worker #%d: PID = %d" % (process_pid_index, process_pid)) if bool_gpu: params_parallel['gpu_id'] = process_pid_index % n_gpus num_class = len(VegeTypes) pred_pY_ense = np.zeros([len(TestDataSet), num_class]) for runtime in RuntimeList: if baggingweights[runtime] == 0: print("Model not established!") continue print("Predicting runtime = %d" % runtime) savedir = savedirbase + os.sep + "runtime_" + str(runtime) if multiclassmethod == 'softmax': [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\ params_parallel,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\ bool_save=bool_save,savedir=savedir) elif multiclassmethod == 'category': [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\ params_parallel,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\ bool_save=bool_save,savedir=savedir) else: print("Invalid Multiclass Method Input!") pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class) pred_pY_ense = pred_pY_ense + baggingweights[ runtime] * pred_Y_epd.astype(np.float32) return [pred_Y, pred_pY_ense, test_Y]
def trainMulticlassBaggingModel(DataSet, VegeTypes, varnames, params, multiclassmethod, baggingmetric='kappa',
                                baggingweightindex=1, baggingmetricthres=0.7, varlabelweights=[-1],
                                colsamplerate=0.7, train_percent=0.75, runtimes=300, bool_autolabel=True,
                                varlabels=[], n_varlabels=5, bool_weight=False, bool_strclass=False,
                                labelHeaderName="", bool_save=False, savedirbase=""):
    """Train ``runtimes`` bagging members sequentially and write their weights.

    Each runtime gets its own variable subset (from
    ``_stratifiedRandomChoice_column``), its own stratified train/test split
    and its own folder ``<savedirbase>/runtime_<n>``. The evaluation value of
    every member is converted to a bagging weight by ``_calWeight`` and both
    are written to ``Runtime_Model_Evaluation_Weights.csv``; the selected
    variable names are written to ``Runtime_Model_Select_Variables.csv``.

    Args:
        DataSet: Full labelled data set.
        VegeTypes: Class names.
        varnames: All candidate variable names.
        params: Model parameters forwarded to ``mlc``.
        multiclassmethod: ``'softmax'`` or ``'category'``.
        baggingmetric: Metric name passed to ``xgbf.Evaluate``; also used as
            the column header in the weights CSV.
        baggingweightindex, baggingmetricthres: Forwarded to ``_calWeight``.
        varlabelweights, colsamplerate: Forwarded to the column sampler.
        train_percent: Forwarded to ``xgbf.splitTrainTestData``.
        runtimes: Number of ensemble members.
        bool_autolabel: When true, ``varlabels`` is replaced by
            ``vc.KMeansLabel(DataSet, varnames, n_varlabels=n_varlabels)``.
        varlabels, n_varlabels: Variable grouping inputs (see above).
        bool_weight, bool_strclass, labelHeaderName, bool_save: Forwarded.
        savedirbase: Output directory.

    Returns:
        None. Results are written to disk.
    """
    # NOTE: the list defaults are kept for interface compatibility; this
    # function itself only reads them and passes them on.
    if bool_autolabel:
        varlabels = vc.KMeansLabel(DataSet, varnames, n_varlabels=n_varlabels)
    selectruntimesvarnames = _stratifiedRandomChoice_column(
        varnames, varlabels, varlabelweights, colsamplerate, runtimes)

    evalValues = np.zeros(runtimes)
    for runtime in range(runtimes):
        savedir = savedirbase + os.sep + "runtime_" + str(runtime)
        os.makedirs(savedir, exist_ok=True)
        RuntimeDataSet = xgbf.trainingDataSet(
            DataSet, VegeTypes, selectruntimesvarnames[runtime],
            bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
        [train_x, test_x, train_y, test_y] = xgbf.splitTrainTestData(
            RuntimeDataSet, train_percent, bool_stratify=True)
        try:
            if multiclassmethod == 'softmax':
                ModelList = mlc.trainMulticlassSoftmaxModel(
                    [train_y, train_x], VegeTypes, varnames, params, runtime=runtime,
                    bool_weight=bool_weight, bool_pandas=False, bool_save=bool_save, savedir=savedir)
                [pred_Y, pred_pY, test_Y] = mlc.testMulticlassSoftmaxModel(
                    ModelList, [test_y, test_x], VegeTypes, varnames, params, runtime=runtime,
                    bool_pandas=False, bool_save=bool_save, savedir=savedir)
            elif multiclassmethod == 'category':
                ModelList = mlc.trainMulticlassCategoryModel(
                    [train_y, train_x], VegeTypes, varnames, params, runtime=runtime,
                    bool_weight=bool_weight, bool_pandas=False, bool_save=bool_save, savedir=savedir)
                [pred_Y, pred_pY, test_Y] = mlc.testMulticlassCategoryModel(
                    ModelList, [test_y, test_x], VegeTypes, varnames, params, runtime=runtime,
                    bool_pandas=False, bool_save=bool_save, savedir=savedir)
            else:
                print("Invalid Multiclass Method Input!")
                # Explicit error instead of relying on a later NameError; it is
                # handled below like any other failed member (weight 0).
                raise ValueError("unsupported multiclassmethod: %r" % (multiclassmethod,))
            evalValues[runtime] = xgbf.Evaluate(test_Y, pred_Y, pred_pY, baggingmetric)
            print("Runtime: %d model done. Evaluation Value = %f" % (runtime, evalValues[runtime]))
        except Exception as err:
            # Best effort: a failed member gets evaluation 0. ``Exception``
            # (not a bare except) keeps Ctrl-C / SystemExit working.
            print("Model not established!")
            print("Reason: %r" % (err,))
            evalValues[runtime] = 0.0

    weights = _calWeight(evalValues, baggingweightindex, baggingmetricthres)
    evalFiledirto = savedirbase + os.sep + "Runtime_Model_Evaluation_Weights.csv"
    init.writeArrayListToCSV([evalValues, weights], [baggingmetric, 'weight'], evalFiledirto)

    # Write each runtime model's variable names, one column per runtime.
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    frames = [
        pd.DataFrame({"SelectVarName_run" + str(runtime): selectruntimesvarnames[runtime]})
        for runtime in range(runtimes)
    ]
    # Single concat instead of growing the frame inside the loop.
    save = pd.concat(frames, axis=1) if frames else pd.DataFrame({})
    save.to_csv(selectvarnamesfiledir, index=False, header=True)
def testMulticlassBaggingModel(TestDataSet,VegeTypes,params,multiclassmethod,runtimes=300,bool_strclass=False,labelHeaderName="",\ bool_save=True,savedirbase=""): if not bool_save: print("Bagging Method has to save models!") return num_class = len(VegeTypes) evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv" selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv" evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight') selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir) selectruntimesvarnames = [] for runtime in range(runtimes): selectruntimesvarnames.append( init.getListFrompdDataSet(selrunvarspdData, "SelectVarName_run" + str(runtime))) del selrunvarspdData pred_pY_ense = np.zeros([len(TestDataSet), num_class]) for runtime in range(runtimes): if baggingweights[runtime] == 0: print("Model not established!") continue print("Predicting runtime = %d" % runtime) savedir = savedirbase + os.sep + "runtime_" + str(runtime) if multiclassmethod == 'softmax': [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\ params,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\ bool_save=bool_save,savedir=savedir) elif multiclassmethod == 'category': [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\ params,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\ bool_save=bool_save,savedir=savedir) else: print("Invalid Multiclass Method Input!") # pred_pY_ense=pred_pY_ense+pred_pY*baggingweights[runtime] pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class) pred_pY_ense = pred_pY_ense + baggingweights[ runtime] * pred_Y_epd.astype(np.float32) pred_Y = np.argmax(pred_pY_ense, axis=1) return [pred_Y, pred_pY_ense, test_Y]
def _trainMulticlassBaggingModel(CPIDs,DataSet,VegeTypes,varnames,params,multiclassmethod,bool_gpu,n_gpus,n_parallels,\ selectruntimesvarnames,runtime,train_percent,baggingmetric,bool_weight,bool_strclass,labelHeaderName,\ bool_save,savedirbase): #Assign task to worker print("Training #%d model..." % runtime) params_parallel = copy.deepcopy(params) process_pid = os.getpid() if len(CPIDs) < n_parallels: CPIDs.append(process_pid) process_pid_index = CPIDs.index(process_pid) print("Worker #%d: PID = %d" % (process_pid_index, process_pid)) if bool_gpu: params_parallel['gpu_id'] = process_pid_index % n_gpus #Execute model training process savedir = savedirbase + os.sep + "runtime_" + str(runtime) if not os.path.exists(savedir): os.makedirs(savedir) RuntimeDataSet=xgbf.trainingDataSet(DataSet,VegeTypes,selectruntimesvarnames[runtime],\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName) [train_x, test_x, train_y, test_y] = xgbf.splitTrainTestData(RuntimeDataSet, train_percent, bool_stratify=1) if multiclassmethod == 'softmax': ModelList=mlc.trainMulticlassSoftmaxModel([train_y,train_x],VegeTypes,varnames,params_parallel,runtime=runtime,\ bool_weight=bool_weight,bool_pandas=False,bool_save=bool_save,savedir=savedir) [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel(ModelList,[test_y,test_x],VegeTypes,varnames,params_parallel,\ runtime=runtime,bool_pandas=False,bool_save=bool_save,savedir=savedir) elif multiclassmethod == 'category': ModelList=mlc.trainMulticlassCategoryModel([train_y,train_x],VegeTypes,varnames,params_parallel,runtime=runtime,\ bool_weight=bool_weight,bool_pandas=False,bool_save=bool_save,savedir=savedir) [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel(ModelList,[test_y,test_x],VegeTypes,varnames,params_parallel,\ runtime=runtime,bool_pandas=False,bool_save=bool_save,savedir=savedir) else: print("Invalid Multiclass Method Input!") evalValue = xgbf.Evaluate(test_Y, pred_Y, pred_pY, baggingmetric) # 
evalValues[runtime]=xgbf.Evaluate(test_Y,pred_Y,pred_pY,access_method) print("Runtime: %d model training finished. Evaluation Value = %f\n" % (runtime, evalValue)) return evalValue
min_child_weight,scale_pos_weight,eta,nthread,max_delta_step=max_delta_step,gpu_id=0) #%% #SMOTE for balanced dataset #tar_ratio is max(num. of classes)/min(num. of classes). -1 represents full balance, recommended here. if bool_smote: TrainDataSet=smote.createSMOTEDataSet(TrainDataSet,VegeTypes,varnames,method='regular',tar_ratio=-1,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName) #%% #Train model multiclassFolderName = "Multiclass_XGBoost_" + multiclassmethod + "_Model" savedir = root + os.sep + modelFolderName + os.sep + multiclassFolderName print("Start training model... method: %s" % multiclassmethod) if multiclassmethod == 'softmax': ModelList=mlc.trainMulticlassSoftmaxModel(TrainDataSet,VegeTypes,varnames,params,bool_weight=bool_weight,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir) [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel(ModelList,TestDataSet,VegeTypes,varnames,params,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir) elif multiclassmethod == 'category': ModelList=mlc.trainMulticlassCategoryModel(TrainDataSet,VegeTypes,varnames,params,bool_weight=bool_weight,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir) [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel(ModelList,TestDataSet,VegeTypes,varnames,params,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir) else: print("Invalid Multiclass Method Input!") #%% #Evaluate results evalueFolder = dirto xgbf.mlcEvalAndWriteResult(evalueFolder, pred_Y, pred_pY, test_Y) #Plot confusion matrix plotfiledirto = evalueFolder + os.sep + "conf_mat.png" plot.plot_confusion_matrix(xgbf.Evaluate(test_Y,pred_Y,pred_pY,'confmat'),VegeTypes,title='Confusion Matrix',cmap=None,normalize=False,\
def evalFeature(CPIDs, evaluate_feature, TrainDataSet, ValidDataSet, VegeTypes, feature_names,
                multiclassmethod, params, evalue_method, bool_cv, cv_num, skf_split, bool_gpu,
                n_gpus, n_parallels, bool_weight, bool_strclass, labelHeaderName, bool_save, savedir):
    """Evaluate a feature set with a multiclass model (worker-side routine).

    With ``bool_cv == 1`` the training data is evaluated by ``cv_num`` rounds
    of shuffled ``StratifiedKFold`` (``skf_split`` folds each); the predictions
    of all folds are pooled and scored once. Otherwise the model is trained on
    ``TrainDataSet`` and scored on ``ValidDataSet``.

    Args:
        CPIDs: Mutable list of registered worker PIDs (presumably shared
            between worker processes; verify against the caller).
        evaluate_feature: Name of the feature under evaluation (logging only).
        TrainDataSet, ValidDataSet: Training and validation data sets.
        VegeTypes: Class names.
        feature_names: Feature names used to build and train the model.
        multiclassmethod: ``'softmax'`` or ``'category'``.
        params: Model parameters; deep-copied so ``gpu_id`` can be set locally.
        evalue_method: Metric name passed to ``xgbf.Evaluate``.
        bool_cv: ``1`` selects the cross-validation path.
        cv_num: Number of cross-validation rounds.
        skf_split: Number of folds per round.
        bool_gpu: When true, ``gpu_id`` is set to ``worker index % n_gpus``.
        n_gpus: Number of GPUs used for the modulo above.
        n_parallels: Maximum number of PIDs registered in ``CPIDs``.
        bool_weight, bool_strclass, labelHeaderName, bool_save, savedir:
            Forwarded to the ``mtc`` train/test routines.

    Returns:
        The evaluation value for ``evalue_method``.

    Raises:
        ValueError: If ``multiclassmethod`` is not a supported method.
    """
    print("Trying to evalute feature: %s" % evaluate_feature)
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    # Register each PID only once; duplicates of a fast worker would otherwise
    # fill the list and make CPIDs.index() fail for workers that start later.
    if process_pid not in CPIDs and len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus

    # Resolve the method once; an invalid method used to print a message and
    # then crash on unbound prediction variables.
    if multiclassmethod == 'softmax':
        train_model = mtc.trainMulticlassSoftmaxModel
        test_model = mtc.testMulticlassSoftmaxModel
    elif multiclassmethod == 'category':
        train_model = mtc.trainMulticlassCategoryModel
        test_model = mtc.testMulticlassCategoryModel
    else:
        raise ValueError("Invalid Multiclass Method Input! Got %r" % (multiclassmethod,))

    if bool_cv == 1:
        [Y, X] = xgbf.trainingDataSet(
            TrainDataSet, VegeTypes, feature_names,
            bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
        class_labels = Y if bool_strclass else init.mergeCategories(Y)

        # Every round predicts each sample exactly once, hence n * cv_num slots.
        n_pooled = len(class_labels) * cv_num
        pred_Y_cv = np.zeros(n_pooled, dtype=np.int32)
        # NOTE: pred_pY_cv is never filled (stays all zeros); it is only passed
        # through to xgbf.Evaluate.
        pred_pY_cv = np.zeros(n_pooled)
        test_Y_cv = np.zeros(n_pooled, dtype=np.int32)

        last_cv_idx = 0
        for _ in range(cv_num):
            skf = StratifiedKFold(n_splits=skf_split, shuffle=True)
            for train, test in skf.split(X, class_labels):
                train_x = X[train]
                train_y = Y[train]
                test_x = X[test]
                test_y = Y[test]
                ModelList = train_model(
                    [train_y, train_x], VegeTypes, feature_names, params_parallel,
                    bool_weight=bool_weight, bool_pandas=False, bool_strclass=bool_strclass,
                    labelHeaderName=labelHeaderName, bool_save=bool_save, savedir=savedir)
                [pred_Y, pred_pY, test_Y] = test_model(
                    ModelList, [test_y, test_x], VegeTypes, feature_names, params_parallel,
                    bool_pandas=False, bool_strclass=bool_strclass,
                    labelHeaderName=labelHeaderName, bool_save=bool_save, savedir=savedir)
                # Append this fold's results to the pooled arrays.
                current_cv_idx = last_cv_idx + len(test_Y)
                pred_Y_cv[last_cv_idx:current_cv_idx] = pred_Y
                test_Y_cv[last_cv_idx:current_cv_idx] = test_Y
                last_cv_idx = current_cv_idx
        evalue = xgbf.Evaluate(test_Y_cv, pred_Y_cv, pred_pY_cv, evalue_method)
    else:
        ModelList = train_model(
            TrainDataSet, VegeTypes, feature_names, params_parallel,
            bool_weight=bool_weight, bool_pandas=True, bool_strclass=bool_strclass,
            labelHeaderName=labelHeaderName, bool_save=bool_save, savedir=savedir)
        [pred_Y, pred_pY, test_Y] = test_model(
            ModelList, ValidDataSet, VegeTypes, feature_names, params_parallel,
            bool_pandas=True, bool_strclass=bool_strclass,
            labelHeaderName=labelHeaderName, bool_save=bool_save, savedir=savedir)
        evalue = xgbf.Evaluate(test_Y, pred_Y, pred_pY, evalue_method)

    print("Feature: %s partial evalue = %f\n" % (evaluate_feature, evalue))
    return evalue
def _estabModelAndPred(TrainDataSet,ValidDataSet,VegeTypes,feature_names,multiclassmethod,params,evalue_method,EvalueFolder,variableFolderdir,postfix,\ bool_predictmap,bool_weight,bool_strclass,labelHeaderName,bool_save,savedir): num_class=len(VegeTypes) #Establish Training Model if multiclassmethod=='softmax': ModelList=mtc.trainMulticlassSoftmaxModel(TrainDataSet,VegeTypes,feature_names,params,bool_weight=bool_weight,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir) [pred_Y,pred_pY,test_Y]=mtc.testMulticlassSoftmaxModel(ModelList,ValidDataSet,VegeTypes,feature_names,params,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir) elif multiclassmethod=='category': ModelList=mtc.trainMulticlassCategoryModel(TrainDataSet,VegeTypes,feature_names,params,bool_weight=bool_weight,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir) [pred_Y,pred_pY,test_Y]=mtc.testMulticlassCategoryModel(ModelList,ValidDataSet,VegeTypes,feature_names,params,\ bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir) else: print("Invalid Multiclass Method Input!") #Write Test Results YArray=np.zeros([len(test_Y),2]) YArray[:,0]=test_Y YArray[:,1]=pred_Y YFiledirto=EvalueFolder+os.sep+"Best_Feature_Real_and_Predicted_Results.csv" init.writeArrayToCSV(YArray,['real','predict'],YFiledirto) #Evaluate Model and Write Result evalArray=np.zeros([1,2]) evalArray[0,0]=xgbf.Evaluate(test_Y,pred_Y,pred_pY,'accuracy') evalArray[0,1]=xgbf.Evaluate(test_Y,pred_Y,pred_pY,'kappa') evalFiledirto=EvalueFolder+os.sep+"Best_Feature_Model_Evaluation_ValidDataSet.csv" init.writeArrayToCSV(evalArray,['accuracy','kappa'],evalFiledirto) #Find XGBoost Feature Scores featureScoreFiledirto=EvalueFolder+os.sep+"Feature_Scores.csv" model=ModelList[0] feature_scores=model.get_fscore() 
[feature_names,fscores]=locateFeatureScores(feature_names,feature_scores) init.writeArrayListToCSV([feature_names,fscores],['VariableName','FeatureScore'],featureScoreFiledirto) if bool_predictmap: #Predict Mapping Results print("Predict region...") nanDefault=-9999 [TiffList,Total]=init.generateVarialeTiffList(variableFolderdir,feature_names,postfix) [MatX,Driver,GeoTransform,Proj,nrow,ncol]=ptf.readTiffAsNumpy(TiffList) BestFeatureProductFolder=EvalueFolder+os.sep+"Best_Features_Mapping_Results" if multiclassmethod=='softmax': pred_X=init.fomatMulticlassSoftmaxMatrix(MatX) pred_pY=mtc.predictMulticlassSoftmaxModelCvted(ModelList,pred_X,params,bool_save=bool_save,savedir=savedir) [pred_Y,pred_pY]=init.reshapeMulticlassMatrix(pred_pY,nrow,ncol,num_class,bool_onearray=False) elif multiclassmethod=='category': pred_X=init.formatMulticlassCategoryMatrix(MatX,num_class) pred_pY=mtc.predictMulticlassCategoryModelCvted(ModelList,pred_X,params,bool_save=bool_save,savedir=savedir) [pred_Y,pred_pY]=init.reshapeMulticlassMatrix(pred_pY,nrow,ncol,num_class,bool_onearray=True) for i in range(len(VegeTypes)): vtname=VegeTypes[i] ProductFolder=BestFeatureProductFolder+os.sep+vtname if not os.path.exists(ProductFolder): os.makedirs(ProductFolder) Filename1=vtname+"_xgboost_"+multiclassmethod+postfix ProductFiledirto1=ProductFolder+os.sep+Filename1 ptf.writeNumpyToTiff(pred_pY[:,:,i],Driver,GeoTransform,Proj,nrow,ncol,nanDefault,ProductFiledirto1,datatype='Float32') Filename2="VegeMap_XGBoost_multiclass_"+multiclassmethod+postfix ProductFolder=BestFeatureProductFolder ProductFiledirto2=ProductFolder+os.sep+Filename2 ptf.writeNumpyToTiff(pred_Y,Driver,GeoTransform,Proj,nrow,ncol,nanDefault,ProductFiledirto2,datatype='Int16') return fscores