Example #1
0
def predictMulticlassBaggingModel_parallel(MatX,nrow,ncol,varnames,num_class,params,multiclassmethod,\
                                           n_gpus,n_parallels,runtimes=300,bool_save=True,savedirbase=""):
    """Predict with a saved multiclass bagging ensemble using parallel workers.

    The `runtimes` base models saved under `savedirbase` are split round-robin
    across `n_parallels` worker processes; each worker returns a weighted
    class-probability matrix and the weighted sums are combined here.
    Requires bool_save=True because models are loaded from disk.

    Returns [pred_Y, pred_pY] reshaped to the (nrow, ncol) grid via
    init.reshapeMulticlassMatrix, or None when bool_save is False.
    """
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    # Ensemble weights and per-runtime selected feature names are persisted
    # next to the saved models.
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData,
                                      "SelectVarName_run" + str(runtime)))
    del selrunvarspdData  # release the frame before forking workers
    bool_mask = init.getMask(MatX)
    # Round-robin assignment of runtimes to workers.
    RuntimeLists = [[] for i in range(n_parallels)]
    for runtime in range(runtimes):
        RuntimeLists[runtime % n_parallels].append(runtime)
    # Guard against a missing 'tree_method' entry: params.get(...) would be
    # None and `'gpu' in None` raises TypeError.
    bool_gpu = 'gpu' in (params.get('tree_method') or '')

    P = Pool(n_parallels)
    results_parallel = []
    manager = Manager()
    CPIDs = manager.list()  # shared list written by the workers (presumably child PIDs for GPU assignment -- confirm in worker)
    for i in range(n_parallels):
        results_parallel.append(P.apply_async(_predictMulticlassBaggingModel,(CPIDs,RuntimeLists[i],MatX,nrow,ncol,varnames,num_class,params,\
                                        selectruntimesvarnames,baggingweights,multiclassmethod,bool_gpu,n_gpus,n_parallels,bool_save,savedirbase)))
    P.close()
    P.join()
    del CPIDs

    # Sum the weighted per-worker probability matrices into one ensemble map.
    pred_pY_ense = np.zeros([nrow * ncol, num_class], dtype=np.float32)
    for result in results_parallel:
        pred_pY_ense = pred_pY_ense + result.get()

    [pred_Y, pred_pY] = init.reshapeMulticlassMatrix(pred_pY_ense,
                                                     nrow,
                                                     ncol,
                                                     num_class,
                                                     bool_onearray=False,
                                                     mask=bool_mask.flatten())
    return [pred_Y, pred_pY]
Example #2
0
def testSingleclassBaggingModel_parallel(Models,TestDataSet,vtname,params,n_gpus,n_parallels,\
                                         single_thres=0.5,runtimes=300,bool_strclass=False,labelHeaderName="",\
                                         bool_save=False,savedirbase=""):
    """Test a single-class bagging ensemble on TestDataSet with parallel workers.

    Models are either loaded from disk (bool_save=True, read from
    `savedirbase`) or taken in-memory from `Models` = [ModelList,
    per-runtime selected-variable lists, weights].

    Returns [pred_Y, pred_pY_ense, test_Y]: the thresholded ensemble labels,
    the weighted probability sums, and the true labels.
    """
    ModelList = []
    if bool_save:
        evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
        selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
        baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
        selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
        selectruntimesvarnames = []
        for runtime in range(runtimes):
            selectruntimesvarnames.append(
                init.getListFrompdDataSet(selrunvarspdData,
                                          "SelectVarName_run" + str(runtime)))
        del selrunvarspdData
    else:
        # BUG FIX: the weights were unpacked as `ense_weights` but referenced
        # below as `baggingweights`, raising NameError whenever
        # bool_save=False.
        [ModelList, selectruntimesvarnames, baggingweights] = Models
    # Round-robin assignment of runtimes to workers.
    RuntimeLists = [[] for i in range(n_parallels)]
    for runtime in range(runtimes):
        RuntimeLists[runtime % n_parallels].append(runtime)
    # Guard against a missing 'tree_method' entry in params.
    bool_gpu = 'gpu' in (params.get('tree_method') or '')
    # Open multiprocessing parallel pools
    P = Pool(n_parallels)
    results_parallel = []
    manager = Manager()
    CPIDs = manager.list()  # shared list written by the workers
    for i in range(n_parallels):
        # NOTE(review): `runtime` below is the leftover loop variable
        # (== runtimes - 1) and is identical for every worker -- confirm that
        # _testSingleclassBaggingModel actually uses this argument.
        results_parallel.append(P.apply_async(_testSingleclassBaggingModel,(CPIDs,RuntimeLists[i],TestDataSet,vtname,runtime,params,ModelList,\
                                bool_gpu,n_gpus,n_parallels,selectruntimesvarnames,baggingweights,single_thres,bool_strclass,labelHeaderName,bool_save,savedirbase)))
    P.close()
    P.join()
    del CPIDs

    # Sum weighted per-worker probabilities; test_Y is presumably identical
    # across workers, so the last worker's copy is returned.
    pred_pY_ense = np.zeros(len(TestDataSet))
    for i in range(n_parallels):
        [pred_Y, pred_pY_ense_para, test_Y] = results_parallel[i].get()
        pred_pY_ense = pred_pY_ense + pred_pY_ense_para
    pred_Y = (pred_pY_ense >= single_thres) * 1
    return [pred_Y, pred_pY_ense, test_Y]
Example #3
0
def testMulticlassBaggingModel_parallel(TestDataSet,VegeTypes,params,multiclassmethod,n_gpus,n_parallels,runtimes=300,\
                                        bool_strclass=False,labelHeaderName="",bool_save=True,savedirbase=""):
    """Test a saved multiclass bagging ensemble on TestDataSet with parallel workers.

    The `runtimes` base models saved under `savedirbase` are split round-robin
    across `n_parallels` worker processes; each worker returns a weighted
    class-probability matrix.  Requires bool_save=True because models are
    loaded from disk.

    Returns [pred_Y, pred_pY_ense, test_Y] where pred_Y is the argmax over
    the weighted votes, or None when bool_save is False.
    """
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    # Ensemble weights and per-runtime selected feature names are persisted
    # next to the saved models.
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData,
                                      "SelectVarName_run" + str(runtime)))
    del selrunvarspdData
    # Round-robin assignment of runtimes to workers.
    RuntimeLists = [[] for i in range(n_parallels)]
    for runtime in range(runtimes):
        RuntimeLists[runtime % n_parallels].append(runtime)
    # Guard against a missing 'tree_method' entry: params.get(...) would be
    # None and `'gpu' in None` raises TypeError.
    bool_gpu = 'gpu' in (params.get('tree_method') or '')

    P = Pool(n_parallels)
    results_parallel = []
    manager = Manager()
    CPIDs = manager.list()  # shared list written by the workers
    for i in range(n_parallels):
        results_parallel.append(P.apply_async(_testMulticlassBaggingModel,(CPIDs,RuntimeLists[i],TestDataSet,VegeTypes,params,multiclassmethod,bool_gpu,n_gpus,n_parallels,\
                                    selectruntimesvarnames,baggingweights,bool_strclass,labelHeaderName,bool_save,savedirbase)))
    P.close()
    P.join()
    del CPIDs

    # Sum weighted per-worker probability matrices; test_Y is presumably
    # identical across workers, so the last worker's copy is returned.
    pred_pY_ense = np.zeros([len(TestDataSet), len(VegeTypes)])
    for i in range(n_parallels):
        [pred_Y, pred_pY_ense_para, test_Y] = results_parallel[i].get()
        pred_pY_ense = pred_pY_ense + pred_pY_ense_para

    pred_Y = np.argmax(pred_pY_ense, axis=1)
    return [pred_Y, pred_pY_ense, test_Y]
Example #4
0
def testSingleclassBaggingModel(Models,TestDataSet,vtname,params,single_thres=0.5,runtimes=300,\
                                bool_strclass=False,labelHeaderName="",bool_save=False,savedirbase=""):
    """Evaluate a single-class bagging ensemble on TestDataSet (serial version).

    Models are either loaded from disk (bool_save=True, read from
    `savedirbase`) or taken in-memory from `Models` = [ModelList,
    per-runtime selected-variable lists, weights].

    Returns [pred_Y, pred_pY, test_Y]: thresholded ensemble labels, weighted
    probability sums, and true labels.
    """
    ModelList = []
    if bool_save:
        # Load per-runtime ensemble weights and selected feature names.
        weights_filedir = savedirbase + os.sep + vtname + "_Runtime_Evaluation_Weight.csv"
        vars_filedir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        ense_weights = init.getListFromPandas(weights_filedir, 'weight')
        vars_pd = init.readCSVasPandas(vars_filedir)
        selectruntimesvarnames = [
            init.getListFrompdDataSet(vars_pd, "SelectVarName_run" + str(r))
            for r in range(runtimes)
        ]
        del vars_pd
    else:
        [ModelList, selectruntimesvarnames, ense_weights] = Models

    pred_pY_ense = np.zeros(len(TestDataSet))
    for runtime in range(runtimes):
        print("Predicting runtime = %d" % runtime)
        if bool_save:
            modeldir = (savedirbase + os.sep + "runtime_" + str(runtime) +
                        os.sep + vtname + '_xgboost_singleclass_run' +
                        str(runtime) + ".model")
            model = xgbf.loadModel(modeldir, params)
        else:
            model = ModelList[runtime]
        varnames = selectruntimesvarnames[runtime]
        [test_Y,test_X]=xgbf.trainingDataSet(TestDataSet,[vtname],varnames,\
                                bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_binary=True)
        [pred_Y, pred_pY] = xgbf.Predict(model, test_X, bool_binary=1,
                                         threshold=single_thres)
        # Weighted-vote accumulation of the positive-class probabilities.
        pred_pY_ense = pred_pY_ense + pred_pY * ense_weights[runtime]
    pred_Y = (pred_pY_ense >= single_thres) * 1
    pred_pY = pred_pY_ense
    if len(test_Y.shape) > 1:
        test_Y = test_Y[:, 0]
    return [pred_Y, pred_pY, test_Y]
Example #5
0
def testMulticlassBaggingModel(TestDataSet,VegeTypes,params,multiclassmethod,runtimes=300,bool_strclass=False,labelHeaderName="",\
                               bool_save=True,savedirbase=""):
    """Evaluate a saved multiclass bagging ensemble on TestDataSet (serial).

    multiclassmethod must be 'softmax' or 'category'.  Requires
    bool_save=True because per-runtime models are loaded from `savedirbase`.

    Returns [pred_Y, pred_pY_ense, test_Y] where pred_Y is the argmax over
    the weighted one-hot votes, or None on invalid arguments.
    """
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    # BUG FIX: validate the method once up front.  Previously an invalid
    # method only printed a message inside the loop and then crashed with
    # NameError on the undefined pred_Y.
    if multiclassmethod not in ('softmax', 'category'):
        print("Invalid Multiclass Method Input!")
        return
    num_class = len(VegeTypes)
    # Ensemble weights and per-runtime selected feature names are persisted
    # next to the saved models.
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData,
                                      "SelectVarName_run" + str(runtime)))
    del selrunvarspdData

    pred_pY_ense = np.zeros([len(TestDataSet), num_class])
    for runtime in range(runtimes):
        if baggingweights[runtime] == 0:
            # A zero weight marks a runtime whose model was never built.
            print("Model not established!")
            continue
        print("Predicting runtime = %d" % runtime)
        savedir = savedirbase + os.sep + "runtime_" + str(runtime)
        if multiclassmethod == 'softmax':
            [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                    params,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\
                                    bool_save=bool_save,savedir=savedir)
        else:  # 'category' -- validated above
            [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel([],TestDataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                    params,runtime=runtime,bool_pandas=True,bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,\
                                    bool_save=bool_save,savedir=savedir)
        # Hard (one-hot) weighted voting on each model's predicted labels.
        pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class)
        pred_pY_ense = pred_pY_ense + baggingweights[
            runtime] * pred_Y_epd.astype(np.float32)
    pred_Y = np.argmax(pred_pY_ense, axis=1)
    return [pred_Y, pred_pY_ense, test_Y]
    colsample_bytree = 0.75
    min_child_weight = 2
    scale_pos_weight = 1
    max_delta_step = 2
    eta = 0.05
    nthread = 1
    threshold = 0.5
    #%%
    #Read datasets
    trainDataSetFiledir = dirfrom + os.sep + trainDataSetName
    testDataSetFiledir = dirfrom + os.sep + testDataSetName
    selectVariableFiledir = dirfrom + os.sep + selectVariableName
    vegetypeFiledir = dirfrom + os.sep + vegetypeNames
    TrainDataSet = init.readCSVasPandas(trainDataSetFiledir)
    TestDataSet = init.readCSVasPandas(testDataSetFiledir)
    varnames = init.getListFromPandas(selectVariableFiledir, 'VariableName')
    varmeanings = init.getListFromPandas(selectVariableFiledir,
                                         'VariableMeaning')
    VegeTypes = init.getListFromPandas(vegetypeFiledir, 'VegeName')

    num_class = len(VegeTypes)
    #Set XGBoost parameters
    params=xgbf.setParams(bool_gpu,tree_method,num_class,eval_metric,max_depth,lamb,alpha,gamma,subsample,colsample_bytree,\
                          min_child_weight,scale_pos_weight,eta,nthread,max_delta_step=max_delta_step,gpu_id=0)
    #%%
    #SMOTE for balanced dataset
    #tar_ratio is max(num. of classes)/min(num. of classes). -1 represents full balance, recommended here.
    if bool_smote:
        TrainDataSet=smote.createSMOTEDataSet(TrainDataSet,VegeTypes,varnames,method='regular',tar_ratio=-1,\
                                              bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
#%%
Example #7
0
def predictSingleclassBaggingModelMatrix_parallel(Models,MatX,vtname,varnames,params,n_gpus,n_parallels,\
                                                  single_thres=0.5,runtimes=300,filter_percent=0,bool_save=True,savedirbase=""):
    """Predict a single-class bagging ensemble over a raster matrix in parallel.

    MatX is a (rows, cols, bands) feature cube (assumed -- confirm with
    caller); it is flattened to a (pixels, bands) table and the saved base
    models are evaluated by `n_parallels` worker processes.  Requires
    bool_save=True because models are loaded from `savedirbase`.

    Returns [pred_Y_ense, pred_pY_ense] as (rows, cols) grids; when
    filter_percent > 0, probabilities below p_max * filter_percent are
    zeroed.  Returns [] when bool_save is False.
    """
    if not bool_save:
        print("Single Bagging Ensemble Only for bool_save=True!")
        return []
    #Read weights and features file
    evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
    selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData,
                                      "SelectVarName_run" + str(runtime)))
    del selrunvarspdData

    matshape = MatX.shape
    bool_mask = init.getMask(MatX)
    # Flatten the (rows, cols, bands) cube into a (pixels, bands) table.
    pred_X = np.zeros([matshape[0] * matshape[1], matshape[2]],
                      dtype=np.float32)
    for i in range(matshape[2]):
        pred_X[:, i] = MatX[:, :, i].flatten()

    # Round-robin assignment of runtimes to workers.
    RuntimeLists = [[] for i in range(n_parallels)]
    for runtime in range(runtimes):
        RuntimeLists[runtime % n_parallels].append(runtime)
    # Guard against a missing 'tree_method' entry: params.get(...) would be
    # None and `'gpu' in None` raises TypeError.
    bool_gpu = 'gpu' in (params.get('tree_method') or '')
    # Open multiprocessing parallel pools
    P = Pool(n_parallels)
    results_parallel = []
    manager = Manager()
    CPIDs = manager.list()  # shared list written by the workers

    for i in range(n_parallels):
        results_parallel.append(P.apply_async(_predictSingleclassBaggingModelMatrix,(CPIDs,RuntimeLists[i],vtname,pred_X,varnames,\
                                        selectruntimesvarnames,params,matshape,baggingweights,single_thres,bool_gpu,n_gpus,n_parallels,bool_save,savedirbase)))
    P.close()
    P.join()
    del CPIDs
    # Collect the multiprocessing results: sum the weighted probabilities.
    pred_pY_ense = np.zeros(matshape[0] * matshape[1], dtype=np.float32)
    for i in range(n_parallels):
        pred_pY_ense = pred_pY_ense + results_parallel[i].get()

    pred_Y_ense = (pred_pY_ense >= single_thres) * 1
    pred_pY_ense = pred_pY_ense.reshape(matshape[0], matshape[1])
    pred_Y_ense = pred_Y_ense.reshape(matshape[0], matshape[1])
    if filter_percent > 0:
        # Suppress low-confidence pixels relative to the in-mask maximum.
        p_max = np.max(pred_pY_ense[bool_mask])
        pred_pY_ense[pred_pY_ense < p_max * filter_percent] = 0
    return [pred_Y_ense, pred_pY_ense]
Example #8
0
def predictSingleclassBaggingModelMatrix(Models,MatX,vtname,varnames,params,single_thres=0.5,runtimes=300,filter_percent=0,\
                                         bool_save=False,savedirbase=""):
    """Predict a single-class bagging ensemble over a raster matrix (serial).

    MatX is flattened to a (pixels, bands) table and every base model's
    positive-class probabilities are accumulated with its ensemble weight,
    printing a progress/ETA report per model.

    Returns [pred_Y_ense, pred_pY_ense] as (rows, cols) grids; when
    filter_percent > 0, probabilities below p_max * filter_percent are
    zeroed.
    """
    count = 0.0
    if bool_save:
        # Load per-runtime ensemble weights and selected feature names.
        weights_filedir = savedirbase + os.sep + vtname + "_Runtime_Evaluation_Weight.csv"
        vars_filedir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        ense_weights = init.getListFromPandas(weights_filedir, 'weight')
        vars_pd = init.readCSVasPandas(vars_filedir)
        selectruntimesvarnames = [
            init.getListFrompdDataSet(vars_pd, "SelectVarName_run" + str(r))
            for r in range(runtimes)
        ]
        del vars_pd
    else:
        [ModelList, selectruntimesvarnames, ense_weights] = Models
    matshape = MatX.shape
    bool_mask = init.getMask(MatX)
    # Flatten the (rows, cols, bands) cube into a (pixels, bands) table.
    pred_X = np.zeros([matshape[0] * matshape[1], matshape[2]],
                      dtype=np.float32)
    for band in range(matshape[2]):
        pred_X[:, band] = MatX[:, :, band].flatten()
    pred_pY_ense = np.zeros(matshape[0] * matshape[1], dtype=np.float32)
    time_start = time.time()
    for runtime in range(runtimes):
        print("Predicting runtime = %d..." % (runtime))
        if bool_save:
            modeldir = (savedirbase + os.sep + "runtime_" + str(runtime) +
                        os.sep + vtname + '_xgboost_singleclass_run' +
                        str(runtime) + ".model")
            model = xgbf.loadModel(modeldir, params)
        else:
            model = ModelList[runtime]
        # Keep only the features this runtime's model was trained on.
        sel_idx = _findListSubsetIndexes(selectruntimesvarnames[runtime],
                                         varnames)
        [pred_Y, pred_pY] = xgbf.Predict(model,
                                         pred_X[:, sel_idx],
                                         bool_binary=1,
                                         threshold=single_thres)
        pred_pY_ense = pred_pY_ense + ense_weights[runtime] * pred_pY
        # Progress / ETA report.
        count = count + 1
        done = count / runtimes
        remain = (runtimes - count) / runtimes
        num_day, num_hour, num_min = _calDueTime(time_start, time.time(),
                                                 done, 0.0)
        print(
            "Model: %d Calculating Finished!      Done: %.2f%%, Remaining: %.2f%%"
            % (runtime, 100 * done, 100 * remain))
        print("Calculating will finish in %d Days %d Hours %d Minutes\n" %
              (num_day, num_hour, num_min))
    pred_Y_ense = (pred_pY_ense >= single_thres) * 1
    pred_pY_ense = pred_pY_ense.reshape(matshape[0], matshape[1])
    pred_Y_ense = pred_Y_ense.reshape(matshape[0], matshape[1])
    if filter_percent > 0:
        p_max = np.max(np.max(pred_pY_ense[bool_mask]))
        pred_pY_ense[pred_pY_ense < p_max * filter_percent] = 0
    return [pred_Y_ense, pred_pY_ense]
Example #9
0
    postfix = '.tif'

    #Set class headers
    labelHeaderName_H = "type"  #upper system of HVCS
    labelHeaderName_L = "sub"  #lower system of HVCS

    #Set base map layer directories
    baseMapLayerFolderdir = r"XXX"
    baseMapLayerFileName = "VegeMap_XGB_BAG_softmax.tif"
    baseMapLayerFiledir = baseMapLayerFolderdir + os.sep + baseMapLayerFileName
    baseMapTestResultFileName = "Real_and_Predicted_Results.csv"
    baseMapTestResultFiledir = baseMapLayerFolderdir + os.sep + baseMapTestResultFileName

    #Read and format HVCS
    HierRelationsFiledir = dirfrom + os.sep + hierRelationsName
    baseMapTestResult = init.getListFromPandas(baseMapTestResultFiledir,
                                               'predict')
    realTestY = init.getListFromPandas(baseMapTestResultFiledir, 'real')
    [VegeTypes1,VegeTypes2,HierRelations]=hmap.getHierRelation(init.getListFromPandas(HierRelationsFiledir,labelHeaderName_H),\
                                                init.getListFromPandas(HierRelationsFiledir,labelHeaderName_L))
    #%%
    #Produce merged predicted test set
    pred_Y = hmap.predictHierUpMapping(baseMapTestResult, VegeTypes1,
                                       VegeTypes2, HierRelations)
    test_Y = hmap.predictHierUpMapping(realTestY, VegeTypes1, VegeTypes2,
                                       HierRelations)
    #Evaluate
    EvalueFolder = dirto
    xgbf.mlcEvalAndWriteResult(EvalueFolder, pred_Y, np.zeros_like(pred_Y),
                               test_Y)

    #Plot confusion matrix
    varlabelweights = [
        -1
    ]  #weights of the variable clusters. [-1] is default, indicating no difference
    baggingmetric = 'kappa'  #metric for ensembling the built base models (weighted voting)
    baggingweightindex = 1  #weight index for ensembling the built base models (baggingmetric^baggingweightindex as weight of each base model)
    baggingmetricthres = 0.75  #the threshold to filter out model performance < baggingmetricthres

    #%%
    #Read datasets
    trainDataSetFiledir = dirfrom + os.sep + trainDataSetName
    testDataSetFiledir = dirfrom + os.sep + testDataSetName
    selectVariableFiledir = dirfrom + os.sep + selectVariableName
    vegetypeFiledir = dirfrom + os.sep + vegetypeNames
    TrainDataSet = init.readCSVasPandas(trainDataSetFiledir)
    TestDataSet = init.readCSVasPandas(testDataSetFiledir)
    varnames = init.getListFromPandas(selectVariableFiledir, 'VariableName')
    varlabels = init.getListFromPandas(selectVariableFiledir, 'VariableClass')
    #    varmeanings=init.getListFromPandas(selectVariableFiledir,'VariableMeaning')
    VegeTypes = init.getListFromPandas(vegetypeFiledir, 'VegeName')

    num_class = len(VegeTypes)
    #Set XGBoost parameters
    params=xgbf.setParams(bool_gpu,tree_method,num_class,eval_metric,max_depth,lamb,alpha,gamma,subsample,colsample_bytree,\
                          min_child_weight,scale_pos_weight,eta,nthread,max_delta_step=max_delta_step)
    #%%
    #SMOTE for balanced dataset
    #tar_ratio is max(num. of classes)/min(num. of classes). -1 represents full balance, recommended here.
    if bool_smote:
        TrainDataSet=smote.createSMOTEDataSet(TrainDataSet,VegeTypes,varnames,method='regular',tar_ratio=-1,\
                                              bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
#%%
    min_evalue_gain = 0.0  #
    max_backtrack_times = 70  #max times for evalue gain less than 0
    rm_itvl = 5  #remove trival features in the selected set after every rm_itvl runtimes
    #(evaluation calculated by xgboost feature importance)
    cv_num = 1  #repeat times of k-fold cross validation
    skf_split = 10  #k-fold (stratified)
    evalue_method = 'kappa'  #evaluation matric
    #%%
    #Read datasets
    trainDataSetFiledir = dirfrom + os.sep + trainDataSetName
    validDataSetFiledir = dirfrom + os.sep + validDataSetName
    variableIDFiledir = dirfrom + os.sep + variableIDName
    vegetypeFiledir = dirfrom + os.sep + vegetypeNames
    TrainDataSet = init.readCSVasPandas(trainDataSetFiledir)
    ValidDataSet = init.readCSVasPandas(validDataSetFiledir)
    varnames = init.getListFromPandas(variableIDFiledir, 'VariableName')
    VegeTypes = init.getListFromPandas(vegetypeFiledir, 'VegeName')

    num_class = len(VegeTypes)
    #Set XGBoost parameters
    params=xgbf.setParams(bool_gpu,tree_method,num_class,eval_metric,max_depth,lamb,alpha,gamma,subsample,colsample_bytree,\
                  min_child_weight,scale_pos_weight,eta,nthread,max_delta_step=max_delta_step,gpu_id=1)

    #%%
    #Remove Identical Features
    features_included = fs.removeIdenticalFeatures(TrainDataSet,
                                                   varnames,
                                                   rm_thres=rm_eq_thres)
    print("%d features remained.\n" % len(features_included))
    #%%
    #SMOTE for balanced dataset
Example #12
0
def _reportBaggingProgress(runtime, count, runtimes, time_start):
    """Print the per-model progress line and the remaining-time estimate."""
    done = count / runtimes
    remain = (runtimes - count) / runtimes
    num_day, num_hour, num_min = _calDueTime(time_start, time.time(), done,
                                             0.0)
    print(
        "Model: %d Calculating Finished!      Done: %.2f%%, Remaining: %.2f%%"
        % (runtime, 100 * done, 100 * remain))
    print("Calculating will finish in %d Days %d Hours %d Minutes\n" %
          (num_day, num_hour, num_min))


def predictMulticlassBaggingModel(MatX,
                                  nrow,
                                  ncol,
                                  varnames,
                                  num_class,
                                  params,
                                  multiclassmethod,
                                  runtimes=300,
                                  bool_save=True,
                                  savedirbase=""):
    """Predict with a saved multiclass bagging ensemble over a raster matrix.

    multiclassmethod must be 'softmax' or 'category'.  Requires
    bool_save=True because per-runtime models are loaded from `savedirbase`.
    Each model's predicted labels are one-hot expanded and accumulated with
    the persisted evaluation weights (hard weighted voting).

    Returns [pred_Y, pred_pY] reshaped to the (nrow, ncol) grid, or None on
    invalid arguments.
    """
    count = 0.0
    if not bool_save:
        print("Bagging Method has to save models!")
        return
    # BUG FIX: validate the method up front.  Previously an invalid method
    # skipped both branches and crashed at `return [pred_Y, pred_pY]` with
    # NameError.
    if multiclassmethod not in ('softmax', 'category'):
        print("Invalid Multiclass Method Input!")
        return
    # Ensemble weights and per-runtime selected feature names are persisted
    # next to the saved models.
    evalweightsFileName = "Runtime_Model_Evaluation_Weights.csv"
    selectvarnamesfiledir = savedirbase + os.sep + "Runtime_Model_Select_Variables.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
    baggingweights = init.getListFromPandas(evalweightsFiledirto, 'weight')
    selectruntimesvarnames = []
    for runtime in range(runtimes):
        selectruntimesvarnames.append(
            init.getListFrompdDataSet(selrunvarspdData,
                                      "SelectVarName_run" + str(runtime)))
    del selrunvarspdData
    bool_mask = init.getMask(MatX)
    time_start = time.time()

    # Method-specific input formatting (the 'fomat...' typo is in the actual
    # helper name in module `init`).
    if multiclassmethod == 'softmax':
        pred_X = init.fomatMulticlassSoftmaxMatrix(MatX)
    else:  # 'category' -- validated above
        pred_X = init.formatMulticlassCategoryMatrix(MatX, num_class)

    pred_pY_ense = np.zeros([nrow * ncol, num_class], dtype=np.float32)
    for runtime in range(runtimes):
        if baggingweights[runtime] == 0:
            # A zero weight marks a runtime whose model was never built.
            print("Model not established!")
            continue
        # Keep only the features this runtime's model was trained on.
        selruntimevaridx = _findListSubsetIndexes(
            selectruntimesvarnames[runtime], varnames)
        pred_X_runtime = pred_X[:, selruntimevaridx]
        print("Predicting Bagging Model...    runtime = %d" % runtime)
        savedir = savedirbase + os.sep + "runtime_" + str(runtime)
        if multiclassmethod == 'softmax':
            pred_pY=mlc.predictMulticlassSoftmaxModelCvted([],pred_X_runtime,params,\
                                                           runtime=runtime,bool_save=bool_save,savedir=savedir)
            pred_Y = np.argmax(pred_pY, axis=1)
        else:
            pred_Y=mlc.predictMulticlassCategoryModelCvted([],pred_X_runtime,params,runtime=runtime,bool_retlabel=True,num_instance=nrow*ncol,num_class=num_class,\
                                                            bool_save=bool_save,savedir=savedir)
        # Hard (one-hot) weighted voting on this model's predicted labels.
        pred_Y_epd = init.expandCategories(pred_Y, num_class=num_class)
        pred_pY_ense = pred_pY_ense + baggingweights[
            runtime] * pred_Y_epd.astype(np.float32)
        count = count + 1
        _reportBaggingProgress(runtime, count, runtimes, time_start)

    [pred_Y,
     pred_pY] = init.reshapeMulticlassMatrix(pred_pY_ense,
                                             nrow,
                                             ncol,
                                             num_class,
                                             bool_onearray=False,
                                             mask=bool_mask.flatten())
    return [pred_Y, pred_pY]