Exemple #1
0
def _testMulticlassBaggingModel(CPIDs,RuntimeList,TestDataSet,VegeTypes,params,multiclassmethod,bool_gpu,n_gpus,n_parallels,\
                                selectruntimesvarnames,baggingweights,bool_strclass,labelHeaderName,bool_save,savedirbase):
    print("Predicting Multiclass Bagging Ensemble Models...")
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus
    num_class = len(VegeTypes)
    pred_pY_ense = np.zeros([len(TestDataSet), num_class])
    for runtime in RuntimeList:
        if baggingweights[runtime] == 0:
            print("Model not established!")
            continue
        print("Predicting runtime = %d" % runtime)
        savedir = savedirbase + os.sep + "runtime_" + str(runtime)
        if multiclassmethod == 'softmax':
            [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                    params_parallel,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\
                                    bool_save=bool_save,savedir=savedir)
        elif multiclassmethod == 'category':
            [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                    params_parallel,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\
                                    bool_save=bool_save,savedir=savedir)
        else:
            print("Invalid Multiclass Method Input!")
        pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class)
        pred_pY_ense = pred_pY_ense + baggingweights[
            runtime] * pred_Y_epd.astype(np.float32)
    return [pred_Y, pred_pY_ense, test_Y]
Exemple #2
0
def trainMulticlassBaggingModel(DataSet,VegeTypes,varnames,params,multiclassmethod,baggingmetric='kappa',baggingweightindex=1,\
                                baggingmetricthres=0.7,varlabelweights=[-1],colsamplerate=0.7,train_percent=0.75,runtimes=300,\
                                bool_autolabel=True,varlabels=[],n_varlabels=5,bool_weight=False,bool_strclass=False,labelHeaderName="",\
                                bool_save=False,savedirbase=""):
    """Train ``runtimes`` models on sampled variable subsets and save their weights.

    For every runtime a variable subset is drawn with
    ``_stratifiedRandomChoice_column``, the data are split into train/test
    parts, a model is trained and evaluated with ``baggingmetric``, and the
    evaluation value is stored. Afterwards the values are converted to
    weights by ``_calWeight`` and two CSV files are written into
    ``savedirbase``: ``Runtime_Model_Evaluation_Weights.csv`` and
    ``Runtime_Model_Select_Variables.csv``.

    Parameters
    ----------
    DataSet, VegeTypes, varnames, params
        Forwarded to the ``vc``/``xgbf``/``mlc`` helpers.
    multiclassmethod : str
        Either ``'softmax'`` or ``'category'``.
    baggingmetric : str
        Metric name handed to ``xgbf.Evaluate``; also the CSV column header.
    baggingweightindex, baggingmetricthres
        Forwarded to ``_calWeight``.
    varlabelweights, colsamplerate
        Forwarded to ``_stratifiedRandomChoice_column``.
    train_percent
        Forwarded to ``xgbf.splitTrainTestData``.
    runtimes : int
        Number of models to train.
    bool_autolabel : bool
        When true, ``varlabels`` is replaced by ``vc.KMeansLabel(...)``.
    varlabels, n_varlabels
        Variable labels, or the label count used when auto-labelling.
    bool_weight, bool_strclass, labelHeaderName, bool_save
        Forwarded to the training/testing helpers.
    savedirbase : str
        Output folder; one ``runtime_<n>`` sub-folder is created per model.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``multiclassmethod`` is not ``'softmax'`` or ``'category'``.
    """
    # A wrong method name used to be swallowed by the bare ``except`` below,
    # silently producing an all-zero weight table after a full run.
    if multiclassmethod not in ('softmax', 'category'):
        raise ValueError("Invalid Multiclass Method Input! Got %r" % (multiclassmethod,))

    if bool_autolabel:
        varlabels = vc.KMeansLabel(DataSet, varnames, n_varlabels=n_varlabels)
    selectruntimesvarnames = _stratifiedRandomChoice_column(
        varnames, varlabels, varlabelweights, colsamplerate, runtimes)
    evalValues = np.zeros(runtimes)
    for runtime in range(runtimes):
        savedir = savedirbase + os.sep + "runtime_" + str(runtime)
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        RuntimeDataSet=xgbf.trainingDataSet(DataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                            bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
        [train_x, test_x, train_y,
         test_y] = xgbf.splitTrainTestData(RuntimeDataSet,
                                           train_percent,
                                           bool_stratify=True)
        if multiclassmethod == 'softmax':
            train_model = mlc.trainMulticlassSoftmaxModel
            test_model = mlc.testMulticlassSoftmaxModel
        else:
            train_model = mlc.trainMulticlassCategoryModel
            test_model = mlc.testMulticlassCategoryModel
        try:
            # NOTE(review): the full ``varnames`` list (not the per-runtime
            # subset) is passed here, as in the original code; confirm that
            # the helpers ignore it when bool_pandas=False.
            ModelList=train_model([train_y,train_x],VegeTypes,varnames,params,runtime=runtime,bool_weight=bool_weight,\
                                  bool_pandas=False,bool_save=bool_save,savedir=savedir)
            [pred_Y,pred_pY,test_Y]=test_model(ModelList,[test_y,test_x],VegeTypes,varnames,params,runtime=runtime,\
                                               bool_pandas=False,bool_save=bool_save,savedir=savedir)
            evalValues[runtime] = xgbf.Evaluate(test_Y, pred_Y, pred_pY,
                                                baggingmetric)
            print("Runtime: %d model done. Evaluation Value = %f" %
                  (runtime, evalValues[runtime]))
        except Exception as err:
            # Best effort: a failed runtime gets evaluation value 0 and the
            # loop continues. ``Exception`` (instead of a bare except) lets
            # KeyboardInterrupt/SystemExit propagate, and the cause is shown.
            print("Model not established!")
            print("Runtime: %d failed with %s: %s" %
                  (runtime, type(err).__name__, err))
            evalValues[runtime] = 0.0
    weights = _calWeight(evalValues, baggingweightindex, baggingmetricthres)
    evalFiledirto = savedirbase + os.sep + "Runtime_Model_Evaluation_Weights.csv"
    init.writeArrayListToCSV([evalValues, weights], [baggingmetric, 'weight'],
                             evalFiledirto)
    # Write each runtime model's variable names, one column per runtime.
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    columns = [
        pd.DataFrame({
            "SelectVarName_run" + str(runtime):
            selectruntimesvarnames[runtime]
        }) for runtime in range(runtimes)
    ]
    # Concatenate once instead of growing the frame inside the loop;
    # pd.concat rejects an empty list, so keep the empty-frame result.
    save = pd.concat(columns, axis=1) if columns else pd.DataFrame({})
    save.to_csv(selectvarnamesfiledir, index=False, header=True)
Exemple #3
0
def testMulticlassBaggingModel(TestDataSet,VegeTypes,params,multiclassmethod,runtimes=300,bool_strclass=False,labelHeaderName="",\
                               bool_save=True,savedirbase=""):
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    num_class = len(VegeTypes)
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData,
                                      "SelectVarName_run" + str(runtime)))
    del selrunvarspdData

    pred_pY_ense = np.zeros([len(TestDataSet), num_class])
    for runtime in range(runtimes):
        if baggingweights[runtime] == 0:
            print("Model not established!")
            continue
        print("Predicting runtime = %d" % runtime)
        savedir = savedirbase + os.sep + "runtime_" + str(runtime)
        if multiclassmethod == 'softmax':
            [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                    params,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\
                                    bool_save=bool_save,savedir=savedir)
        elif multiclassmethod == 'category':
            [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                    params,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\
                                    bool_save=bool_save,savedir=savedir)
        else:
            print("Invalid Multiclass Method Input!")


#        pred_pY_ense=pred_pY_ense+pred_pY*baggingweights[runtime]
        pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class)
        pred_pY_ense = pred_pY_ense + baggingweights[
            runtime] * pred_Y_epd.astype(np.float32)
    pred_Y = np.argmax(pred_pY_ense, axis=1)
    return [pred_Y, pred_pY_ense, test_Y]
Exemple #4
0
def _trainMulticlassBaggingModel(CPIDs,DataSet,VegeTypes,varnames,params,multiclassmethod,bool_gpu,n_gpus,n_parallels,\
                                 selectruntimesvarnames,runtime,train_percent,baggingmetric,bool_weight,bool_strclass,labelHeaderName,\
                                 bool_save,savedirbase):
    #Assign task to worker
    print("Training #%d model..." % runtime)
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus

    #Execute model training process
    savedir = savedirbase + os.sep + "runtime_" + str(runtime)
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    RuntimeDataSet=xgbf.trainingDataSet(DataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                        bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
    [train_x, test_x, train_y,
     test_y] = xgbf.splitTrainTestData(RuntimeDataSet,
                                       train_percent,
                                       bool_stratify=1)
    if multiclassmethod == 'softmax':
        ModelList=mlc.trainMulticlassSoftmaxModel([train_y,train_x],VegeTypes,varnames,params_parallel,runtime=runtime,\
                                                  bool_weight=bool_weight,bool_pandas=False,bool_save=bool_save,savedir=savedir)
        [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel(ModelList,[test_y,test_x],VegeTypes,varnames,params_parallel,\
                                                    runtime=runtime,bool_pandas=False,bool_save=bool_save,savedir=savedir)
    elif multiclassmethod == 'category':
        ModelList=mlc.trainMulticlassCategoryModel([train_y,train_x],VegeTypes,varnames,params_parallel,runtime=runtime,\
                                                   bool_weight=bool_weight,bool_pandas=False,bool_save=bool_save,savedir=savedir)
        [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel(ModelList,[test_y,test_x],VegeTypes,varnames,params_parallel,\
                                                    runtime=runtime,bool_pandas=False,bool_save=bool_save,savedir=savedir)
    else:
        print("Invalid Multiclass Method Input!")
    evalValue = xgbf.Evaluate(test_Y, pred_Y, pred_pY, baggingmetric)
    #    evalValues[runtime]=xgbf.Evaluate(test_Y,pred_Y,pred_pY,access_method)
    print("Runtime: %d model training finished. Evaluation Value = %f\n" %
          (runtime, evalValue))
    return evalValue
                          min_child_weight,scale_pos_weight,eta,nthread,max_delta_step=max_delta_step,gpu_id=0)
    #%%
    #SMOTE for balanced dataset
    #tar_ratio is max(num. of classes)/min(num. of classes). -1 represents full balance, recommended here.
    if bool_smote:
        TrainDataSet=smote.createSMOTEDataSet(TrainDataSet,VegeTypes,varnames,method='regular',tar_ratio=-1,\
                                              bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
#%%
#Train model
    multiclassFolderName = "Multiclass_XGBoost_" + multiclassmethod + "_Model"
    savedir = root + os.sep + modelFolderName + os.sep + multiclassFolderName
    print("Start training model...  method: %s" % multiclassmethod)
    if multiclassmethod == 'softmax':
        ModelList=mlc.trainMulticlassSoftmaxModel(TrainDataSet,VegeTypes,varnames,params,bool_weight=bool_weight,\
                                                  bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
        [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel(ModelList,TestDataSet,VegeTypes,varnames,params,\
                                                bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
    elif multiclassmethod == 'category':
        ModelList=mlc.trainMulticlassCategoryModel(TrainDataSet,VegeTypes,varnames,params,bool_weight=bool_weight,\
                                                   bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
        [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel(ModelList,TestDataSet,VegeTypes,varnames,params,\
                                                bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
    else:
        print("Invalid Multiclass Method Input!")
#%%
#Evaluate results
    evalueFolder = dirto
    xgbf.mlcEvalAndWriteResult(evalueFolder, pred_Y, pred_pY, test_Y)

    #Plot confusion matrix
    plotfiledirto = evalueFolder + os.sep + "conf_mat.png"
    plot.plot_confusion_matrix(xgbf.Evaluate(test_Y,pred_Y,pred_pY,'confmat'),VegeTypes,title='Confusion Matrix',cmap=None,normalize=False,\
def evalFeature(CPIDs,evaluate_feature,TrainDataSet,ValidDataSet,VegeTypes,feature_names,multiclassmethod,params,evalue_method,\
                    bool_cv,cv_num,skf_split,bool_gpu,n_gpus,n_parallels,bool_weight,bool_strclass,labelHeaderName,bool_save,savedir):
    """Train a model on ``feature_names`` and return its evaluation value.

    Written to be called from a worker process: the current PID is recorded
    in ``CPIDs`` and, when ``bool_gpu`` is set, its position in that list
    selects ``params['gpu_id']`` (position modulo ``n_gpus``) on a private
    deep copy of ``params``.

    With ``bool_cv == 1`` the training set is evaluated by ``cv_num``
    repetitions of a shuffled ``StratifiedKFold`` with ``skf_split`` folds;
    the predictions of all folds are pooled and evaluated once. Otherwise a
    single model is trained on ``TrainDataSet`` and evaluated on
    ``ValidDataSet``.

    Parameters
    ----------
    CPIDs : list-like
        Registry of worker PIDs; the current PID is appended if absent.
    evaluate_feature
        Name of the feature under evaluation; only used in log messages.
    TrainDataSet, ValidDataSet, VegeTypes, feature_names
        Forwarded to the ``xgbf``/``mtc`` helpers.
    multiclassmethod : str
        Either ``'softmax'`` or ``'category'``.
    params : dict
        Model parameters; never modified (a deep copy is used).
    evalue_method
        Metric name handed to ``xgbf.Evaluate``.
    bool_cv, cv_num, skf_split
        Cross-validation switch, number of repetitions and folds.
    bool_gpu, n_gpus
        Enable GPU assignment and the number of GPUs to cycle through.
    n_parallels
        No longer used; kept so existing callers keep working.
    bool_weight, bool_strclass, labelHeaderName, bool_save, savedir
        Forwarded to the training/testing helpers.

    Returns
    -------
    The value returned by ``xgbf.Evaluate``.

    Raises
    ------
    ValueError
        If ``multiclassmethod`` is not ``'softmax'`` or ``'category'``.
    """
    # Fail fast: previously an unknown method only printed a message and then
    # crashed with a NameError on the first use of the predictions.
    if multiclassmethod not in ('softmax','category'):
        raise ValueError("Invalid Multiclass Method Input! Got %r"%(multiclassmethod,))

    print("Trying to evalute feature: %s"%evaluate_feature)
    params_parallel=copy.deepcopy(params)
    process_pid=os.getpid()
    # Register each PID exactly once. The former "len(CPIDs)<n_parallels"
    # test could append the same PID twice, fill the list, and leave another
    # worker unregistered so that CPIDs.index() raised ValueError.
    if process_pid not in CPIDs:
        CPIDs.append(process_pid)
    process_pid_index=CPIDs.index(process_pid)
    print("Worker #%d: PID = %d"%(process_pid_index,process_pid))
    if bool_gpu:
        params_parallel['gpu_id']=process_pid_index%n_gpus

    if multiclassmethod=='softmax':
        train_model=mtc.trainMulticlassSoftmaxModel
        test_model=mtc.testMulticlassSoftmaxModel
    else:
        train_model=mtc.trainMulticlassCategoryModel
        test_model=mtc.testMulticlassCategoryModel

    if bool_cv==1:
        [Y,X]=xgbf.trainingDataSet(TrainDataSet,VegeTypes,feature_names,\
                                    bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
        if not bool_strclass:
            class_labels=init.mergeCategories(Y)
        else:
            class_labels=Y
        # Pooled buffers for all repetitions: every repetition's folds
        # together cover each sample once.
        pred_Y_cv=np.zeros(len(class_labels)*cv_num,dtype=np.int32)
        # Never filled below; handed to xgbf.Evaluate as zeros, exactly as in
        # the original code.
        pred_pY_cv=np.zeros(len(class_labels)*cv_num)
        test_Y_cv=np.zeros(len(class_labels)*cv_num,dtype=np.int32)
        last_cv_idx=0
        for cv_i in range(cv_num):
            skf=StratifiedKFold(n_splits=skf_split,shuffle=True)
            for train, test in skf.split(X,class_labels):
                train_x=X[train]
                train_y=Y[train]
                test_x=X[test]
                test_y=Y[test]
                ModelList=train_model([train_y,train_x],VegeTypes,feature_names,params_parallel,bool_weight=bool_weight,bool_pandas=False,\
                                      bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
                [pred_Y,pred_pY,test_Y]=test_model(ModelList,[test_y,test_x],VegeTypes,feature_names,params_parallel,bool_pandas=False,\
                                                   bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
                # Append this fold's results to the pooled buffers.
                current_cv_idx=len(test_Y)+last_cv_idx
                pred_Y_cv[last_cv_idx:current_cv_idx]=pred_Y
                test_Y_cv[last_cv_idx:current_cv_idx]=test_Y
                last_cv_idx=current_cv_idx
        evalue=xgbf.Evaluate(test_Y_cv,pred_Y_cv,pred_pY_cv,evalue_method)
    else:
        ModelList=train_model(TrainDataSet,VegeTypes,feature_names,params_parallel,bool_weight=bool_weight,bool_pandas=True,\
                              bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
        [pred_Y,pred_pY,test_Y]=test_model(ModelList,ValidDataSet,VegeTypes,feature_names,params_parallel,bool_pandas=True,\
                                           bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
        evalue=xgbf.Evaluate(test_Y,pred_Y,pred_pY,evalue_method)
    print("Feature: %s partial evalue = %f\n"%(evaluate_feature,evalue))
    return evalue
def _estabModelAndPred(TrainDataSet,ValidDataSet,VegeTypes,feature_names,multiclassmethod,params,evalue_method,EvalueFolder,variableFolderdir,postfix,\
                           bool_predictmap,bool_weight,bool_strclass,labelHeaderName,bool_save,savedir):
    num_class=len(VegeTypes)
    #Establish Training Model
    if multiclassmethod=='softmax':
        ModelList=mtc.trainMulticlassSoftmaxModel(TrainDataSet,VegeTypes,feature_names,params,bool_weight=bool_weight,\
                                                  bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)            
        [pred_Y,pred_pY,test_Y]=mtc.testMulticlassSoftmaxModel(ModelList,ValidDataSet,VegeTypes,feature_names,params,\
                                bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
    elif multiclassmethod=='category':
        ModelList=mtc.trainMulticlassCategoryModel(TrainDataSet,VegeTypes,feature_names,params,bool_weight=bool_weight,\
                                                   bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
        [pred_Y,pred_pY,test_Y]=mtc.testMulticlassCategoryModel(ModelList,ValidDataSet,VegeTypes,feature_names,params,\
                                bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
    else:
        print("Invalid Multiclass Method Input!")
    
    #Write Test Results
    YArray=np.zeros([len(test_Y),2])
    YArray[:,0]=test_Y
    YArray[:,1]=pred_Y
    YFiledirto=EvalueFolder+os.sep+"Best_Feature_Real_and_Predicted_Results.csv"
    init.writeArrayToCSV(YArray,['real','predict'],YFiledirto)     
    
    #Evaluate Model and Write Result
    evalArray=np.zeros([1,2])
    evalArray[0,0]=xgbf.Evaluate(test_Y,pred_Y,pred_pY,'accuracy')
    evalArray[0,1]=xgbf.Evaluate(test_Y,pred_Y,pred_pY,'kappa')
    evalFiledirto=EvalueFolder+os.sep+"Best_Feature_Model_Evaluation_ValidDataSet.csv"
    init.writeArrayToCSV(evalArray,['accuracy','kappa'],evalFiledirto)
    
    #Find XGBoost Feature Scores
    featureScoreFiledirto=EvalueFolder+os.sep+"Feature_Scores.csv"
    model=ModelList[0]
    feature_scores=model.get_fscore()
    [feature_names,fscores]=locateFeatureScores(feature_names,feature_scores)
    init.writeArrayListToCSV([feature_names,fscores],['VariableName','FeatureScore'],featureScoreFiledirto)

    if bool_predictmap:
        #Predict Mapping Results
        print("Predict region...")
        nanDefault=-9999
        [TiffList,Total]=init.generateVarialeTiffList(variableFolderdir,feature_names,postfix)
        [MatX,Driver,GeoTransform,Proj,nrow,ncol]=ptf.readTiffAsNumpy(TiffList)
        BestFeatureProductFolder=EvalueFolder+os.sep+"Best_Features_Mapping_Results"
        if multiclassmethod=='softmax':
            pred_X=init.fomatMulticlassSoftmaxMatrix(MatX)
            pred_pY=mtc.predictMulticlassSoftmaxModelCvted(ModelList,pred_X,params,bool_save=bool_save,savedir=savedir)
            [pred_Y,pred_pY]=init.reshapeMulticlassMatrix(pred_pY,nrow,ncol,num_class,bool_onearray=False)
        elif multiclassmethod=='category':
            pred_X=init.formatMulticlassCategoryMatrix(MatX,num_class)
            pred_pY=mtc.predictMulticlassCategoryModelCvted(ModelList,pred_X,params,bool_save=bool_save,savedir=savedir)
            [pred_Y,pred_pY]=init.reshapeMulticlassMatrix(pred_pY,nrow,ncol,num_class,bool_onearray=True)
        for i in range(len(VegeTypes)):
            vtname=VegeTypes[i]
            ProductFolder=BestFeatureProductFolder+os.sep+vtname
            if not os.path.exists(ProductFolder):
                os.makedirs(ProductFolder)
            Filename1=vtname+"_xgboost_"+multiclassmethod+postfix
            ProductFiledirto1=ProductFolder+os.sep+Filename1 
            ptf.writeNumpyToTiff(pred_pY[:,:,i],Driver,GeoTransform,Proj,nrow,ncol,nanDefault,ProductFiledirto1,datatype='Float32')
        Filename2="VegeMap_XGBoost_multiclass_"+multiclassmethod+postfix
        ProductFolder=BestFeatureProductFolder
        ProductFiledirto2=ProductFolder+os.sep+Filename2
        ptf.writeNumpyToTiff(pred_Y,Driver,GeoTransform,Proj,nrow,ncol,nanDefault,ProductFiledirto2,datatype='Int16')    
    return fscores