Esempio n. 1
0
def predictMulticlassCategoryModelMatrix(ModelList,
                                         MatX,
                                         num_class,
                                         params,
                                         bool_save=False,
                                         savedir=""):
    """Predict a class index and per-class scores for every cell of ``MatX``.

    ``MatX`` is indexed as ``[row, col, feature]``. Each feature layer is
    flattened (row-major) into one column of a float32 table, the table is
    passed through ``init.formatMulticlassCategoryInput`` and the result is
    scored with ``xgbf.Predict(..., bool_binary=False)``.

    Args:
        ModelList: Sequence whose first element is the model to use when
            ``bool_save`` is false.
        MatX: 3-D array of input features.
        num_class: Number of classes. The flat prediction vector is read as
            ``num_class`` consecutive values per cell.
        params: Forwarded to ``xgbf.loadModel`` when the model is loaded
            from disk.
        bool_save: When true, the model is loaded from the file
            ``category_multiclass.model`` under ``savedir`` instead of being
            taken from ``ModelList``.
        savedir: Directory prefix used to build the model path.

    Returns:
        ``[pred_Y, pred_pY]`` where ``pred_Y`` has shape ``(rows, cols)`` and
        holds the argmax class index, and ``pred_pY`` is a float32 array of
        shape ``(rows, cols, num_class)`` holding the per-class scores.
    """
    shape = MatX.shape
    n_rows, n_cols, n_features = shape[0], shape[1], shape[2]
    n_cells = n_rows * n_cols

    # One row per cell (row-major order), one column per feature layer.
    features = np.empty((n_cells, n_features), dtype=np.float32)
    for layer in range(n_features):
        features[:, layer] = MatX[:, :, layer].ravel()

    if bool_save:
        model_path = os.sep.join([savedir, 'category_multiclass.model'])
        model = xgbf.loadModel(model_path, params)
    else:
        model = ModelList[0]

    features = init.formatMulticlassCategoryInput([], features, num_class, 0)
    print("Predicting......")
    flat_scores = xgbf.Predict(model, features, bool_binary=False)

    # The flat output stores the num_class scores of a cell contiguously, so
    # a row-major reshape recovers the (cell, class) table.
    scores = np.array(flat_scores[:n_cells * num_class],
                      dtype=np.float64).reshape(n_cells, num_class)

    pred_Y = np.argmax(scores, axis=1).reshape(n_rows, n_cols)
    pred_pY = scores.reshape(n_rows, n_cols, num_class).astype(np.float32)
    return [pred_Y, pred_pY]
Esempio n. 2
0
def predictMulticlassSoftmaxModelCvted(ModelList,
                                       pred_X,
                                       params,
                                       runtime=-1,
                                       bool_save=False,
                                       savedir=""):
    """Run the softmax multiclass model on already-prepared inputs.

    Args:
        ModelList: Sequence whose first element is the model to use when
            ``bool_save`` is false.
        pred_X: Input passed unchanged to ``xgbf.Predict``.
        params: Forwarded to ``xgbf.loadModel`` when loading from disk.
        runtime: Run index used in the model file name; ``-1`` selects the
            un-suffixed file ``softmax_multiclass.model``, any other value
            selects ``softmax_multiclass_run<runtime>.model``.
        bool_save: When true, load the model from ``savedir`` instead of
            taking it from ``ModelList``.
        savedir: Directory prefix used to build the model path.

    Returns:
        Whatever ``xgbf.Predict(model, pred_X, bool_binary=False)`` returns.
    """
    if not bool_save:
        return xgbf.Predict(ModelList[0], pred_X, bool_binary=False)

    run_suffix = '' if runtime == -1 else '_run' + str(runtime)
    model_path = os.sep.join(
        [savedir, 'softmax_multiclass' + run_suffix + '.model'])
    model = xgbf.loadModel(model_path, params)
    return xgbf.Predict(model, pred_X, bool_binary=False)
Esempio n. 3
0
def testSingleclassBaggingModel(Models, TestDataSet, vtname, params, single_thres=0.5, runtimes=300,
                                bool_strclass=False, labelHeaderName="", bool_save=False, savedirbase=""):
    """Evaluate a single-class bagging ensemble on ``TestDataSet``.

    Each run's model predicts on the variables selected for that run; the
    per-run probabilities are multiplied by the run's ensemble weight and
    summed, and the sum is thresholded at ``single_thres``.

    Args:
        Models: ``[ModelList, selectruntimesvarnames, ense_weights]``; only
            unpacked when ``bool_save`` is false.
        TestDataSet: Data passed to ``xgbf.trainingDataSet``; its ``len`` is
            used as the number of samples.
        vtname: Target name; also the prefix of the saved CSV / model files.
        params: Forwarded to ``xgbf.loadModel`` when loading from disk.
        single_thres: Threshold for both the per-run and ensemble labels.
        runtimes: Number of bagging runs to combine. Must be at least 1,
            otherwise ``test_Y`` is never assigned.
        bool_strclass, labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
        bool_save: When true, weights, selected variable names and models
            are read from files under ``savedirbase``.
        savedirbase: Base directory of the saved ensemble.

    Returns:
        ``[pred_Y, pred_pY, test_Y]``: thresholded ensemble labels (0/1),
        weighted probability sum, and the labels from the last run (first
        column only if they are multi-dimensional).
    """
    ModelList = []
    if bool_save:
        weights_path = os.sep.join(
            [savedirbase, vtname + "_Runtime_Evaluation_Weight.csv"])
        varnames_path = os.sep.join(
            [savedirbase, vtname + "_Runtime_Model_Select_Variables.csv"])
        ense_weights = init.getListFromPandas(weights_path, 'weight')
        varnames_table = init.readCSVasPandas(varnames_path)
        selectruntimesvarnames = [
            init.getListFrompdDataSet(varnames_table,
                                      "SelectVarName_run" + str(run_idx))
            for run_idx in range(runtimes)
        ]
        del varnames_table
    else:
        ModelList, selectruntimesvarnames, ense_weights = Models

    ensemble_prob = np.zeros(len(TestDataSet))
    for run_idx in range(runtimes):
        print("Predicting runtime = %d" % run_idx)
        if bool_save:
            run_tag = str(run_idx)
            model_path = os.sep.join([
                savedirbase,
                "runtime_" + run_tag,
                vtname + '_xgboost_singleclass_run' + run_tag + ".model",
            ])
            model = xgbf.loadModel(model_path, params)
        else:
            model = ModelList[run_idx]

        # Each run was trained on its own variable subset, so the test
        # matrix is rebuilt per run.
        test_Y, test_X = xgbf.trainingDataSet(
            TestDataSet, [vtname], selectruntimesvarnames[run_idx],
            bool_strclass=bool_strclass,
            labelHeaderName=labelHeaderName,
            bool_binary=True)
        _, run_prob = xgbf.Predict(model,
                                   test_X,
                                   bool_binary=1,
                                   threshold=single_thres)
        ensemble_prob = ensemble_prob + run_prob * ense_weights[run_idx]

    ensemble_label = (ensemble_prob >= single_thres) * 1
    if len(test_Y.shape) > 1:
        test_Y = test_Y[:, 0]
    return [ensemble_label, ensemble_prob, test_Y]
Esempio n. 4
0
def _trainSingleclassBaggingModel(CPIDs, DataSet, vtname, params, baggingmetric, bool_gpu, n_gpus, n_parallels,
                                  selectruntimesvarnames, runtime, train_percent, single_thres, bool_balance,
                                  bool_strclass, labelHeaderName, bool_save, savedirbase):
    """Train, save and evaluate the model of one bagging run (worker task).

    The function works on a deep copy of ``params`` so the caller's
    dictionary is never modified. The trained model is always written to
    ``<savedirbase>/runtime_<runtime>/``; ``bool_save`` is accepted for
    signature compatibility but is not consulted here.

    Args:
        CPIDs: Shared list of worker PIDs. This process appends its PID
            while the list holds fewer than ``n_parallels`` entries, and the
            PID's position is used as the worker index. ``list.index``
            raises ``ValueError`` if the PID is not present.
            NOTE(review): presumably a multiprocessing-managed list; confirm
            against the caller.
        DataSet: Data passed to ``xgbf.trainingDataSet``.
        vtname: Target name; also the prefix of the saved model file.
        params: Training parameters (copied, then optionally extended with
            ``gpu_id`` and ``scale_pos_weight``).
        baggingmetric: Metric name forwarded to ``xgbf.Evaluate``.
        bool_gpu: When true, ``gpu_id`` is set to worker index modulo
            ``n_gpus``.
        n_gpus: Number of GPUs to spread workers over.
        n_parallels: Maximum number of PIDs registered in ``CPIDs``.
        selectruntimesvarnames: Per-run lists of selected variable names.
        runtime: Index of the run handled by this call.
        train_percent: Forwarded to ``xgbf.splitTrainTestData``.
        single_thres: Classification threshold used for the evaluation.
        bool_balance: When true, ``scale_pos_weight`` is set to the
            negative/positive count ratio of the training labels.
        bool_strclass, labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
        bool_save: Unused (see above).
        savedirbase: Base directory for the per-run model folders.

    Returns:
        The evaluation value returned by ``xgbf.Evaluate`` for the held-out
        split of this run.
    """
    # Assign task to worker
    print("Training #%d model..." % runtime)
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus

    # Execute model training process
    RuntimeDataSet = xgbf.trainingDataSet(DataSet, [vtname],
                                          selectruntimesvarnames[runtime],
                                          bool_strclass=bool_strclass,
                                          labelHeaderName=labelHeaderName,
                                          bool_binary=True)
    train_x, test_x, train_y, test_y = xgbf.splitTrainTestData(
        RuntimeDataSet, train_percent, bool_stratify=1)
    if bool_balance:
        # Only the first label column is used when labels are 2-D.
        labels = train_y[:, 0] if len(train_y.shape) > 1 else train_y
        # Built-in float replaces np.float, which NumPy 1.24 removed.
        ratio = float(np.sum(labels == 0)) / np.sum(labels == 1)
        params_parallel['scale_pos_weight'] = ratio
    model = xgbf.TrainModel(train_x, train_y, params_parallel)

    savedir = savedirbase + os.sep + "runtime_" + str(runtime)
    # exist_ok avoids a check-then-create race between parallel workers
    # creating shared parent directories at the same time.
    os.makedirs(savedir, exist_ok=True)
    modelName = vtname + '_xgboost_singleclass_run' + str(runtime) + ".model"
    modeldir = savedir + os.sep + modelName
    model.save_model(modeldir)

    pred_Y, pred_pY = xgbf.Predict(model,
                                   test_x,
                                   bool_binary=1,
                                   threshold=single_thres)
    evalValue = xgbf.Evaluate(test_y, pred_Y, pred_pY, baggingmetric)
    print("Runtime: %d model training finished. Evaluation Value = %f\n" %
          (runtime, evalValue))
    return evalValue
Esempio n. 5
0
def testMulticlassCategoryModel(ModelList, TestDataSet, VegeTypes, varnames, params, runtime=-1, bool_pandas=True,
                                bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Evaluate the category-style multiclass model on a test set.

    Args:
        ModelList: Sequence of models. When it is empty the model is loaded
            from ``savedir``; otherwise its first element is used.
        TestDataSet: Either data for ``xgbf.trainingDataSet`` (when
            ``bool_pandas`` is true) or an already split ``[test_Y, test_X]``
            pair.
        VegeTypes: Class names; their count is the number of classes.
        varnames: Feature names; their count is the feature width used when
            a single 1-D sample is promoted to a one-row matrix.
        params: Forwarded to ``xgbf.loadModel`` when loading from disk.
        runtime: Run index used in the model file name; ``-1`` selects
            ``category_multiclass.model``, other values select
            ``category_multiclass_run<runtime>.model``.
        bool_pandas: Selects how ``TestDataSet`` is interpreted (see above).
        bool_strclass, labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
            When ``bool_strclass`` is false and the labels are
            multi-dimensional they are collapsed with
            ``init.mergeCategories``.
        bool_save: Not consulted by this function; loading from disk is
            triggered by an empty ``ModelList``.
        savedir: Directory prefix used to build the model path.

    Returns:
        ``[pred_Y, pred_pY, test_Y]``: argmax class index per sample, the
        ``(n_samples, num_class)`` score table, and the reference labels.
    """

    def _single_row(values, width):
        # Promote a 1-D vector to a (1, width) float matrix.
        row = np.zeros([1, width])
        row[0, :] = values
        return row

    num_class = len(VegeTypes)

    if len(ModelList) == 0:
        run_suffix = '' if runtime == -1 else '_run' + str(runtime)
        model_path = os.sep.join(
            [savedir, 'category_multiclass' + run_suffix + '.model'])
        model = xgbf.loadModel(model_path, params)
    else:
        model = ModelList[0]

    if bool_pandas:
        test_Y, test_X = xgbf.trainingDataSet(TestDataSet, VegeTypes, varnames,
                                              bool_strclass=bool_strclass,
                                              labelHeaderName=labelHeaderName)
    else:
        test_Y, test_X = TestDataSet

    # A 1-D feature vector means a single sample was supplied.
    is_single_sample = len(test_X.shape) == 1
    if is_single_sample:
        test_X = _single_row(test_X, len(varnames))
        test_Y = _single_row(test_Y, num_class)

    if not bool_strclass and len(test_Y.shape) > 1:
        test_Y = init.mergeCategories(test_Y)

    num_instance = test_X.shape[0]
    test_X = init.formatMulticlassCategoryInput([], test_X, num_class, 0)
    flat_scores = xgbf.Predict(model, test_X, bool_binary=False)

    if is_single_sample:
        pred_pY = _single_row(flat_scores, num_class)
    else:
        # The flat output stores the num_class scores of a sample
        # contiguously, so a row-major reshape recovers the score table.
        pred_pY = np.array(flat_scores[:num_instance * num_class],
                           dtype=np.float64).reshape(num_instance, num_class)

    pred_Y = np.argmax(pred_pY, axis=1)
    return [pred_Y, pred_pY, test_Y]
Esempio n. 6
0
def _testSingleclassBaggingModel(CPIDs, RuntimeList, TestDataSet, vtname, runtime, params, ModelList, bool_gpu, n_gpus,
                                 n_parallels, selectruntimesvarnames, baggingweights, single_thres, bool_strclass,
                                 labelHeaderName, bool_save, savedirbase):
    """Predict with a subset of the bagging runs (worker task).

    The runs listed in ``RuntimeList`` are evaluated on ``TestDataSet``;
    their probabilities are weighted by ``baggingweights`` and summed, and
    the sum is thresholded at ``single_thres``.

    Args:
        CPIDs: Shared list of worker PIDs. This process appends its PID
            while the list holds fewer than ``n_parallels`` entries, and the
            PID's position is used as the worker index.
        RuntimeList: Run indices handled by this call. Must be non-empty,
            otherwise ``test_Y`` is never assigned.
        TestDataSet: Data passed to ``xgbf.trainingDataSet``; its ``len`` is
            used as the number of samples.
        vtname: Target name; also the prefix of the saved model files.
        runtime: Not read by this function (the run index comes from
            ``RuntimeList``).
        params: Deep-copied, optionally extended with ``gpu_id`` and
            forwarded to ``xgbf.loadModel``.
        ModelList: In-memory models indexed by run, used when ``bool_save``
            is false.
        bool_gpu: When true, ``gpu_id`` is set to worker index modulo
            ``n_gpus``.
        n_gpus: Number of GPUs to spread workers over.
        n_parallels: Maximum number of PIDs registered in ``CPIDs``.
        selectruntimesvarnames: Per-run lists of selected variable names.
        baggingweights: Per-run ensemble weights.
        single_thres: Threshold for both the per-run and combined labels.
        bool_strclass, labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
        bool_save: When true, each model is loaded from
            ``<savedirbase>/runtime_<run>/``.
        savedirbase: Base directory of the saved ensemble.

    Returns:
        ``[pred_Y, pred_pY, test_Y]``: thresholded labels (0/1), the
        weighted probability sum over the handled runs, and the labels from
        the last run (first column only if they are multi-dimensional).
    """
    print("Predicting Singleclass Bagging Ensemble Models...")
    worker_params = copy.deepcopy(params)

    pid = os.getpid()
    if len(CPIDs) < n_parallels:
        CPIDs.append(pid)
    worker_idx = CPIDs.index(pid)
    print("Worker #%d: PID = %d" % (worker_idx, pid))
    if bool_gpu:
        worker_params['gpu_id'] = worker_idx % n_gpus

    ensemble_prob = np.zeros(len(TestDataSet))
    for run_idx in RuntimeList:
        print("Predicting runtime = %d" % run_idx)
        if bool_save:
            run_tag = str(run_idx)
            model_path = os.sep.join([
                savedirbase,
                "runtime_" + run_tag,
                vtname + '_xgboost_singleclass_run' + run_tag + ".model",
            ])
            model = xgbf.loadModel(model_path, worker_params)
        else:
            model = ModelList[run_idx]

        # Each run was trained on its own variable subset, so the test
        # matrix is rebuilt per run.
        test_Y, test_X = xgbf.trainingDataSet(
            TestDataSet, [vtname], selectruntimesvarnames[run_idx],
            bool_strclass=bool_strclass,
            labelHeaderName=labelHeaderName,
            bool_binary=True)
        _, run_prob = xgbf.Predict(model,
                                   test_X,
                                   bool_binary=1,
                                   threshold=single_thres)
        ensemble_prob = ensemble_prob + run_prob * baggingweights[run_idx]

    ensemble_label = (ensemble_prob >= single_thres) * 1
    if len(test_Y.shape) > 1:
        test_Y = test_Y[:, 0]
    return [ensemble_label, ensemble_prob, test_Y]
Esempio n. 7
0
def trainSingleclassBaggingModel(DataSet, vtname, varnames, params, baggingmetric='auc', baggingweightindex=1,
                                 baggingmetricthres=0.7, single_thres=0.5, varlabelweights=[-1], colsamplerate=0.7,
                                 train_percent=0.75, runtimes=300, bool_autolabel=True, varlabels=[], n_varlabels=5,
                                 bool_balance=True, bool_strclass=False, labelHeaderName="", bool_save=False,
                                 savedirbase=""):
    """Train a single-class bagging ensemble sequentially.

    For each of ``runtimes`` runs a column subset is drawn, the data is
    split into train/test parts, one model is trained and its held-out
    score is recorded. The scores are converted to ensemble weights with
    ``_calWeight``.

    Args:
        DataSet: Data passed to ``xgbf.trainingDataSet`` (and to
            ``vc.KMeansLabel`` when ``bool_autolabel`` is true).
        vtname: Target name; also the prefix of the files written to disk.
        varnames: Candidate variable names the column subsets are drawn from.
        params: Training parameters. The caller's dictionary is left
            untouched; ``scale_pos_weight`` is set on a per-run copy.
        baggingmetric: Metric name forwarded to ``xgbf.Evaluate``; also the
            header of the score column in the saved weights CSV.
        baggingweightindex, baggingmetricthres: Forwarded to ``_calWeight``.
        single_thres: Classification threshold used for the evaluation.
        varlabelweights, colsamplerate: Forwarded to
            ``_stratifiedRandomChoice_column``.
        train_percent: Forwarded to ``xgbf.splitTrainTestData``.
        runtimes: Number of bagging runs.
        bool_autolabel: When true, ``varlabels`` is replaced by the result
            of ``vc.KMeansLabel``.
        varlabels: Variable group labels used when ``bool_autolabel`` is
            false.
        n_varlabels: Forwarded to ``vc.KMeansLabel``.
        bool_balance: When true, ``scale_pos_weight`` is set to the
            negative/positive count ratio of each run's training labels.
        bool_strclass, labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
        bool_save: When true, models, weights and selected variable names
            are written under ``savedirbase`` and nothing is kept in memory.
        savedirbase: Base directory for all files written.

    Returns:
        ``[]`` when ``bool_save`` is true, otherwise
        ``[ModelList, selectruntimesvarnames, baggingweights]``.
    """
    ModelList = []
    if bool_autolabel:
        varlabels = vc.KMeansLabel(DataSet, varnames, n_varlabels=n_varlabels)
    selectruntimesvarnames = _stratifiedRandomChoice_column(
        varnames, varlabels, varlabelweights, colsamplerate, runtimes)
    evalValues = np.zeros(runtimes)
    for runtime in range(runtimes):
        RuntimeDataSet = xgbf.trainingDataSet(DataSet, [vtname],
                                              selectruntimesvarnames[runtime],
                                              bool_strclass=bool_strclass,
                                              labelHeaderName=labelHeaderName,
                                              bool_binary=True)
        train_x, test_x, train_y, test_y = xgbf.splitTrainTestData(
            RuntimeDataSet, train_percent, bool_stratify=1)

        run_params = params
        if bool_balance:
            # Only the first label column is used when labels are 2-D.
            labels = train_y[:, 0] if len(train_y.shape) > 1 else train_y
            # Built-in float replaces np.float, which NumPy 1.24 removed.
            ratio = float(np.sum(labels == 0)) / np.sum(labels == 1)
            # Work on a copy so the caller's params are not modified.
            run_params = dict(params)
            run_params['scale_pos_weight'] = ratio
        model = xgbf.TrainModel(train_x, train_y, run_params)

        if bool_save:
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            os.makedirs(savedir, exist_ok=True)
            modelName = vtname + '_xgboost_singleclass_run' + str(
                runtime) + ".model"
            modeldir = savedir + os.sep + modelName
            model.save_model(modeldir)
        else:
            ModelList.append(model)

        pred_Y, pred_pY = xgbf.Predict(model,
                                       test_x,
                                       bool_binary=1,
                                       threshold=single_thres)
        evalValues[runtime] = xgbf.Evaluate(test_y, pred_Y, pred_pY,
                                            baggingmetric)
        print("Runtime: %d model done. Evaluation Value = %f" %
              (runtime, evalValues[runtime]))

    baggingweights = _calWeight(evalValues, runtimes, baggingweightindex,
                                baggingmetricthres)
    if not bool_save:
        return [ModelList, selectruntimesvarnames, baggingweights]

    # Save weights: column 0 is the per-run score, column 1 the weight.
    evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
    evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
    evalweightsarray = np.zeros([runtimes, 2])
    evalweightsarray[:, 0] = evalValues
    evalweightsarray[:, 1] = baggingweights
    evalweightarrayname = [baggingmetric, 'weight']
    init.writeArrayToCSV(evalweightsarray, evalweightarrayname,
                         evalweightsFiledirto)

    # Save the variable names selected for each run, one column per run.
    selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
    save = pd.DataFrame({})
    for runtime in range(runtimes):
        pdtmp = pd.DataFrame({
            "SelectVarName_run" + str(runtime):
            selectruntimesvarnames[runtime]
        })
        save = pd.concat([save, pdtmp], axis=1)
    save.to_csv(selectvarnamesfiledir, index=False, header=True)
    return []
Esempio n. 8
0
def predictSingleclassBaggingModelMatrix(Models, MatX, vtname, varnames, params, single_thres=0.5, runtimes=300,
                                         filter_percent=0, bool_save=False, savedirbase=""):
    """Apply a single-class bagging ensemble to every cell of ``MatX``.

    ``MatX`` is indexed as ``[row, col, feature]`` with features ordered as
    in ``varnames``. Each run predicts on its own column subset; the
    probabilities are weighted, summed and thresholded at ``single_thres``.
    Progress and an estimated remaining time are printed after every run.

    Args:
        Models: ``[ModelList, selectruntimesvarnames, ense_weights]``; only
            unpacked when ``bool_save`` is false.
        MatX: 3-D array of input features.
        vtname: Target name; also the prefix of the saved CSV / model files.
        varnames: Names of the feature layers of ``MatX``, used to locate
            each run's selected variables via ``_findListSubsetIndexes``.
        params: Forwarded to ``xgbf.loadModel`` when loading from disk.
        single_thres: Threshold for both the per-run and ensemble labels.
        runtimes: Number of bagging runs to combine.
        filter_percent: When greater than 0, probabilities below
            ``filter_percent`` times the maximum probability inside the mask
            from ``init.getMask(MatX)`` are set to 0.
        bool_save: When true, weights, selected variable names and models
            are read from files under ``savedirbase``.
        savedirbase: Base directory of the saved ensemble.

    Returns:
        ``[pred_Y_ense, pred_pY_ense]``, both of shape ``(rows, cols)``: the
        thresholded labels (0/1) and the weighted probability sum.
    """
    if bool_save:
        weights_path = os.sep.join(
            [savedirbase, vtname + "_Runtime_Evaluation_Weight.csv"])
        varnames_path = os.sep.join(
            [savedirbase, vtname + "_Runtime_Model_Select_Variables.csv"])
        ense_weights = init.getListFromPandas(weights_path, 'weight')
        varnames_table = init.readCSVasPandas(varnames_path)
        selectruntimesvarnames = [
            init.getListFrompdDataSet(varnames_table,
                                      "SelectVarName_run" + str(run_idx))
            for run_idx in range(runtimes)
        ]
        del varnames_table
    else:
        ModelList, selectruntimesvarnames, ense_weights = Models

    shape = MatX.shape
    valid_mask = init.getMask(MatX)
    n_rows, n_cols, n_features = shape[0], shape[1], shape[2]
    n_cells = n_rows * n_cols

    # One row per cell (row-major order), one column per feature layer.
    features = np.empty((n_cells, n_features), dtype=np.float32)
    for layer in range(n_features):
        features[:, layer] = MatX[:, :, layer].ravel()

    ensemble_prob = np.zeros(n_cells, dtype=np.float32)
    started_at = time.time()
    for run_idx in range(runtimes):
        print("Predicting runtime = %d..." % (run_idx))
        if bool_save:
            run_tag = str(run_idx)
            model_path = os.sep.join([
                savedirbase,
                "runtime_" + run_tag,
                vtname + '_xgboost_singleclass_run' + run_tag + ".model",
            ])
            model = xgbf.loadModel(model_path, params)
        else:
            model = ModelList[run_idx]

        # Select the columns of the variables this run was trained on.
        column_idx = _findListSubsetIndexes(selectruntimesvarnames[run_idx],
                                            varnames)
        _, run_prob = xgbf.Predict(model,
                                   features[:, column_idx],
                                   bool_binary=1,
                                   threshold=single_thres)
        ensemble_prob = ensemble_prob + ense_weights[run_idx] * run_prob

        # Progress report.
        now = time.time()
        finished = run_idx + 1.0
        done = finished / runtimes
        remain = (runtimes - finished) / runtimes
        num_day, num_hour, num_min = _calDueTime(started_at, now, done, 0.0)
        print(
            "Model: %d Calculating Finished!      Done: %.2f%%, Remaining: %.2f%%"
            % (run_idx, 100 * done, 100 * remain))
        print("Calculating will finish in %d Days %d Hours %d Minutes\n" %
              (num_day, num_hour, num_min))

    pred_Y_ense = ((ensemble_prob >= single_thres) * 1).reshape(n_rows, n_cols)
    pred_pY_ense = ensemble_prob.reshape(n_rows, n_cols)
    if filter_percent > 0:
        cutoff = np.max(pred_pY_ense[valid_mask]) * filter_percent
        pred_pY_ense[pred_pY_ense < cutoff] = 0
    return [pred_Y_ense, pred_pY_ense]