Esempio n. 1
0
def KMeansLabel(DataSet, varnames, n_varlabels=5):
    """Group the variables in varnames into n_varlabels clusters with KMeans.

    The feature matrix is standardized and transposed so that each VARIABLE
    (column) becomes one sample to cluster. Prints a summary of the group
    sizes and returns the per-variable cluster labels.
    """
    labels_unused, features = xgbf.trainingDataSet(DataSet, [], varnames)
    scaled = preprocessing.scale(features)
    # Transpose: cluster the variables, not the observations.
    varlabels = KMeans(n_clusters=n_varlabels).fit(scaled.T).labels_
    group_sizes = np.bincount(varlabels)
    # Report the clustering outcome.
    print("\n")
    print("Variables have been divided into %d groups: " % len(group_sizes))
    for group_id, size in enumerate(group_sizes):
        print("Label: %d, num = %d" % (group_id, size))
    print("\n")
    return varlabels
Esempio n. 2
0
def createSMOTEDataSet(DataSet,
                       VegeTypes,
                       varnames,
                       method='regular',
                       tar_ratio=-1,
                       nthread=1,
                       bool_pandas=True,
                       bool_strclass=False,
                       labelHeaderName=""):
    """Oversample the minority classes of DataSet with SMOTE.

    Parameters
    ----------
    DataSet : pandas.DataFrame, or a pre-split [Y, X] pair when
        bool_pandas is False.
    VegeTypes : list of class (label column) names.
    varnames : list of feature column names.
    method : SMOTE variant, passed as the legacy `kind` argument.
    tar_ratio : target class ratio; -1 keeps SMOTE's default balancing,
        otherwise a ratio dict is built by _calc_smoteratio.
    nthread : worker count forwarded to SMOTE as n_jobs.
    bool_strclass : if True, labels are strings under labelHeaderName
        instead of an indicator matrix over VegeTypes.

    Returns
    -------
    The resampled data: a pandas.DataFrame when bool_pandas or
    bool_strclass is True, otherwise a [Y_indicator, X] pair.
    """
    if bool_pandas:
        [Y,X]=xgbf.trainingDataSet(DataSet,VegeTypes,varnames,\
                                bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
    else:
        [Y, X] = DataSet
    if not bool_strclass:
        # Presumably collapses the indicator-matrix labels to a single
        # class-index vector (needed for np.bincount) -- TODO confirm
        # against init.mergeCategories.
        Y = init.mergeCategories(Y)
    class_counts = np.bincount(Y)
    min_class_count = np.min(class_counts)
    # SMOTE requires k_neighbors < smallest class size; cap at the
    # library default of 5. NOTE(review): min_class_count == 1 yields
    # k_neighbors = 0, which SMOTE will reject -- confirm inputs.
    if min_class_count > 5:
        k_neighbors = 5
    else:
        k_neighbors = min_class_count - 1
    if tar_ratio == -1:
        # NOTE(review): `kind`, `ratio` and `fit_sample` belong to the
        # legacy imbalanced-learn (<0.4) API -- confirm pinned version.
        sm = SMOTE(kind=method, k_neighbors=k_neighbors, n_jobs=nthread)
    else:
        smoteratiodict = _calc_smoteratio(Y, class_counts, tar_ratio=tar_ratio)
        sm = SMOTE(ratio=smoteratiodict,
                   kind=method,
                   k_neighbors=k_neighbors,
                   n_jobs=nthread)
    [X_res, Y_res] = sm.fit_sample(X, Y)
    # Re-assemble the resampled data in the caller's expected format.
    if not bool_strclass:
        if bool_pandas:
            X_res_pd = pd.DataFrame(X_res, columns=varnames)
            Y_indi = init.expandCategories(Y_res, num_class=len(VegeTypes))
            Y_res_pd = pd.DataFrame(Y_indi, columns=VegeTypes)
            SMOTEDataSet = pd.concat([Y_res_pd, X_res_pd], axis=1)
        else:
            Y_indi = init.expandCategories(Y_res, num_class=len(VegeTypes))
            SMOTEDataSet = [Y_indi, X_res]
    else:
        X_res_pd = pd.DataFrame(X_res, columns=varnames)
        Y_indi = init.classNumToStr(Y_res, VegeTypes)
        Y_res_pd = pd.DataFrame(Y_indi, columns=[labelHeaderName])
        SMOTEDataSet = pd.concat([Y_res_pd, X_res_pd], axis=1)
    return SMOTEDataSet
Esempio n. 3
0
def testSingleclassBaggingModel(Models,TestDataSet,vtname,params,single_thres=0.5,runtimes=300,\
                                bool_strclass=False,labelHeaderName="",bool_save=False,savedirbase=""):
    """Predict with a single-class bagging ensemble over TestDataSet.

    When bool_save is True, the ensemble weights, per-runtime variable
    subsets, and the models themselves are reloaded from savedirbase;
    otherwise Models must be the [ModelList, selectruntimesvarnames,
    ense_weights] triple returned by training.

    Returns [pred_Y, pred_pY, test_Y]: thresholded ensemble labels, the
    weighted probability sum, and the true labels.
    """
    ModelList = []
    if bool_save:
        # Restore ensemble weights and the per-runtime variable subsets
        # written out at training time.
        evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
        selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
        ense_weights = init.getListFromPandas(evalweightsFiledirto, 'weight')
        selrunvarspdData = init.readCSVasPandas(selectvarnamesfiledir)
        selectruntimesvarnames = []
        for runtime in range(runtimes):
            selectruntimesvarnames.append(
                init.getListFrompdDataSet(selrunvarspdData,
                                          "SelectVarName_run" + str(runtime)))
        del selrunvarspdData
    else:
        [ModelList, selectruntimesvarnames, ense_weights] = Models

    # Weighted sum of per-model probabilities, accumulated across runtimes.
    pred_pY_ense = np.zeros(len(TestDataSet))
    for runtime in range(runtimes):
        print("Predicting runtime = %d" % runtime)
        if bool_save:
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            modelName = vtname + '_xgboost_singleclass_run' + str(
                runtime) + ".model"
            modeldir = savedir + os.sep + modelName
            model = xgbf.loadModel(modeldir, params)
        else:
            model = ModelList[runtime]
        varnames = selectruntimesvarnames[runtime]
        # Each runtime's model predicts on its own feature subset.
        [test_Y,test_X]=xgbf.trainingDataSet(TestDataSet,[vtname],varnames,\
                                bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_binary=True)
        [pred_Y, pred_pY] = xgbf.Predict(model,
                                         test_X,
                                         bool_binary=1,
                                         threshold=single_thres)
        pred_pY_ense = pred_pY_ense + pred_pY * ense_weights[runtime]
    # Threshold the aggregated probabilities into hard 0/1 labels.
    pred_Y_ense = (pred_pY_ense >= single_thres) * 1
    pred_Y = pred_Y_ense
    pred_pY = pred_pY_ense
    # NOTE(review): test_Y is carried out of the last loop iteration; all
    # runtimes share the same labels, so this works, but it raises a
    # NameError if runtimes == 0 -- confirm callers always pass > 0.
    if len(test_Y.shape) > 1:
        test_Y = test_Y[:, 0]
    return [pred_Y, pred_pY, test_Y]
Esempio n. 4
0
def _trainSingleclassBaggingModel(CPIDs,DataSet,vtname,params,baggingmetric,bool_gpu,n_gpus,n_parallels,selectruntimesvarnames,\
                                  runtime,train_percent,single_thres,bool_balance,bool_strclass,labelHeaderName,bool_save,savedirbase):
    """Train and evaluate one member of a single-class bagging ensemble.

    Intended to run inside a worker pool: CPIDs is a shared list used to
    assign each worker a stable index (and a GPU when bool_gpu). The model
    is trained on the runtime's variable subset, saved under
    savedirbase/runtime_<runtime>/, and its evaluation value (baggingmetric
    on the held-out split) is returned.
    """
    #Assign task to worker
    print("Training #%d model..." % runtime)
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    # Register this worker PID once so it keeps a stable worker index.
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        # Round-robin GPU assignment across workers.
        params_parallel['gpu_id'] = process_pid_index % n_gpus

    #Execute model training process
    RuntimeDataSet=xgbf.trainingDataSet(DataSet,[vtname],selectruntimesvarnames[runtime],\
                                    bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_binary=True)
    [train_x, test_x, train_y,
     test_y] = xgbf.splitTrainTestData(RuntimeDataSet,
                                       train_percent,
                                       bool_stratify=1)
    if bool_balance:
        # Counter class imbalance with the negative/positive ratio.
        # FIX: use the builtin float() -- np.float was deprecated in
        # NumPy 1.20 and removed in 1.24, which made this line crash.
        labels = train_y[:, 0] if len(train_y.shape) > 1 else train_y
        ratio = float(np.sum(labels == 0)) / np.sum(labels == 1)
        params_parallel['scale_pos_weight'] = ratio
    model = xgbf.TrainModel(train_x, train_y, params_parallel)
    savedir = savedirbase + os.sep + "runtime_" + str(runtime)
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    modelName = vtname + '_xgboost_singleclass_run' + str(runtime) + ".model"
    modeldir = savedir + os.sep + modelName
    model.save_model(modeldir)
    [pred_Y, pred_pY] = xgbf.Predict(model,
                                     test_x,
                                     bool_binary=1,
                                     threshold=single_thres)
    evalValue = xgbf.Evaluate(test_y, pred_Y, pred_pY, baggingmetric)
    print("Runtime: %d model training finished. Evaluation Value = %f\n" %
          (runtime, evalValue))
    return evalValue
Esempio n. 5
0
def testMulticlassCategoryModel(ModelList,TestDataSet,VegeTypes,varnames,params,runtime=-1,bool_pandas=True,\
                                bool_strclass=False,labelHeaderName="",bool_save=False,savedir=""):
    """Run a category-style multiclass model on TestDataSet.

    Loads the model from savedir when ModelList is empty, expands a single
    1-D sample into a 1-row matrix, converts the input with
    init.formatMulticlassCategoryInput, and reshapes the flat prediction
    vector into an (instances x classes) probability matrix.

    Returns [pred_Y, pred_pY, test_Y].
    """
    num_class = len(VegeTypes)
    if len(ModelList):
        model = ModelList[0]
    else:
        # No in-memory model: restore it from disk.
        if runtime == -1:
            modelName = 'category_multiclass.model'
        else:
            modelName = 'category_multiclass_run' + str(runtime) + '.model'
        model = xgbf.loadModel(savedir + os.sep + modelName, params)
    if bool_pandas:
        [test_Y,test_X]=xgbf.trainingDataSet(TestDataSet,VegeTypes,varnames,\
                                        bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
    else:
        [test_Y, test_X] = TestDataSet
    single_instance = len(test_X.shape) == 1
    if single_instance:
        # Promote the lone sample and its label row to 2-D.
        x_row = np.zeros([1, len(varnames)])
        x_row[0, :] = test_X
        test_X = x_row
        y_row = np.zeros([1, num_class])
        y_row[0, :] = test_Y
        test_Y = y_row
    if not bool_strclass and len(test_Y.shape) > 1:
        test_Y = init.mergeCategories(test_Y)
    num_instance = test_X.shape[0]
    test_X = init.formatMulticlassCategoryInput([], test_X, num_class, 0)
    pred_pY = xgbf.Predict(model, test_X, bool_binary=False)
    if single_instance:
        probs = np.zeros([1, num_class])
        probs[0, :] = pred_pY
        pred_pY = probs
    else:
        # Fold the flat per-(instance, class) prediction vector into rows.
        probs = np.zeros([num_instance, num_class])
        for row in range(num_instance):
            probs[row, :] = pred_pY[row * num_class:(row + 1) * num_class]
        pred_pY = probs
    pred_Y = np.argmax(pred_pY, axis=1)
    return [pred_Y, pred_pY, test_Y]
Esempio n. 6
0
def _trainMulticlassBaggingModel(CPIDs,DataSet,VegeTypes,varnames,params,multiclassmethod,bool_gpu,n_gpus,n_parallels,\
                                 selectruntimesvarnames,runtime,train_percent,baggingmetric,bool_weight,bool_strclass,labelHeaderName,\
                                 bool_save,savedirbase):
    """Train and evaluate one member of a multiclass bagging ensemble.

    Worker-pool entry: CPIDs is a shared list used to assign each worker a
    stable index (and a GPU id when bool_gpu). Trains either a softmax- or
    category-style multiclass model on the runtime's variable subset and
    returns its evaluation value (baggingmetric on the held-out split).

    Raises ValueError if multiclassmethod is neither 'softmax' nor
    'category'.
    """
    #Assign task to worker
    print("Training #%d model..." % runtime)
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    # Register this worker PID once so it keeps a stable worker index.
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        # Round-robin GPU assignment across workers.
        params_parallel['gpu_id'] = process_pid_index % n_gpus

    #Execute model training process
    savedir = savedirbase + os.sep + "runtime_" + str(runtime)
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    RuntimeDataSet=xgbf.trainingDataSet(DataSet,VegeTypes,selectruntimesvarnames[runtime],\
                                        bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
    [train_x, test_x, train_y,
     test_y] = xgbf.splitTrainTestData(RuntimeDataSet,
                                       train_percent,
                                       bool_stratify=1)
    if multiclassmethod == 'softmax':
        ModelList=mlc.trainMulticlassSoftmaxModel([train_y,train_x],VegeTypes,varnames,params_parallel,runtime=runtime,\
                                                  bool_weight=bool_weight,bool_pandas=False,bool_save=bool_save,savedir=savedir)
        [pred_Y,pred_pY,test_Y]=mlc.testMulticlassSoftmaxModel(ModelList,[test_y,test_x],VegeTypes,varnames,params_parallel,\
                                                    runtime=runtime,bool_pandas=False,bool_save=bool_save,savedir=savedir)
    elif multiclassmethod == 'category':
        ModelList=mlc.trainMulticlassCategoryModel([train_y,train_x],VegeTypes,varnames,params_parallel,runtime=runtime,\
                                                   bool_weight=bool_weight,bool_pandas=False,bool_save=bool_save,savedir=savedir)
        [pred_Y,pred_pY,test_Y]=mlc.testMulticlassCategoryModel(ModelList,[test_y,test_x],VegeTypes,varnames,params_parallel,\
                                                    runtime=runtime,bool_pandas=False,bool_save=bool_save,savedir=savedir)
    else:
        # FIX: fail fast. Previously this branch only printed a message and
        # execution fell through to a NameError on undefined pred_Y/test_Y.
        raise ValueError("Invalid Multiclass Method Input!")
    evalValue = xgbf.Evaluate(test_Y, pred_Y, pred_pY, baggingmetric)
    print("Runtime: %d model training finished. Evaluation Value = %f\n" %
          (runtime, evalValue))
    return evalValue
def removeIdenticalFeatures(DataSet, varnames, rm_thres=0.98):
    """Greedily drop features whose correlation with an already-kept
    feature exceeds rm_thres.

    The first variable is always kept; each subsequent one is kept only
    if its Pearson correlation with every kept variable is <= rm_thres.
    Returns the list of retained feature names (original order).
    """
    _, feature_data = xgbf.trainingDataSet(DataSet, [], varnames)
    # Pairwise correlations between variables (columns of the data matrix).
    corr = np.corrcoef(feature_data.T)
    kept = [varnames[0]]
    for idx in range(1, len(varnames)):
        # Keep only if no already-kept feature correlates above threshold.
        if all(corr[idx, varnames.index(name)] <= rm_thres for name in kept):
            kept.append(varnames[idx])
    print("Identical Features Removed!\n")
    return kept
Esempio n. 8
0
def _testSingleclassBaggingModel(CPIDs,RuntimeList,TestDataSet,vtname,runtime,params,ModelList,bool_gpu,n_gpus,n_parallels,\
                                 selectruntimesvarnames,baggingweights,single_thres,bool_strclass,labelHeaderName,\
                                 bool_save,savedirbase):
    """Worker-side prediction over a subset of ensemble runtimes.

    Aggregates the weighted probabilities of the models listed in
    RuntimeList and returns [pred_Y, pred_pY, test_Y] for this worker's
    partial ensemble.

    NOTE(review): the `runtime` parameter is immediately shadowed by the
    loop variable below and never read -- confirm it can be dropped from
    the caller's signature.
    """
    print("Predicting Singleclass Bagging Ensemble Models...")
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    # Register this worker PID once so it keeps a stable worker index.
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d" % (process_pid_index, process_pid))
    if bool_gpu:
        # Round-robin GPU assignment across workers.
        params_parallel['gpu_id'] = process_pid_index % n_gpus

    # Weighted sum of per-model probabilities over this worker's runtimes.
    pred_pY_ense = np.zeros(len(TestDataSet))
    for runtime in RuntimeList:
        print("Predicting runtime = %d" % runtime)
        if bool_save:
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            modelName = vtname + '_xgboost_singleclass_run' + str(
                runtime) + ".model"
            modeldir = savedir + os.sep + modelName
            model = xgbf.loadModel(modeldir, params_parallel)
        else:
            model = ModelList[runtime]
        varnames = selectruntimesvarnames[runtime]
        # Each runtime's model predicts on its own feature subset.
        [test_Y,test_X]=xgbf.trainingDataSet(TestDataSet,[vtname],varnames,\
                        bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_binary=True)
        [pred_Y, pred_pY] = xgbf.Predict(model,
                                         test_X,
                                         bool_binary=1,
                                         threshold=single_thres)
        pred_pY_ense = pred_pY_ense + pred_pY * baggingweights[runtime]
    # Threshold the aggregated probabilities into hard 0/1 labels.
    pred_Y_ense = (pred_pY_ense >= single_thres) * 1
    pred_Y = pred_Y_ense
    pred_pY = pred_pY_ense
    # NOTE(review): test_Y comes from the last loop iteration; raises a
    # NameError if RuntimeList is empty -- confirm callers never pass [].
    if len(test_Y.shape) > 1:
        test_Y = test_Y[:, 0]
    return [pred_Y, pred_pY, test_Y]
Esempio n. 9
0
def trainSingleclassBaggingModel(DataSet,vtname,varnames,params,baggingmetric='auc',baggingweightindex=1,\
                       baggingmetricthres=0.7,single_thres=0.5,varlabelweights=[-1],colsamplerate=0.7,\
                       train_percent=0.75,runtimes=300,bool_autolabel=True,varlabels=[],n_varlabels=5,bool_balance=True,\
                       bool_strclass=False,labelHeaderName="",bool_save=False,savedirbase=""):
    """Train a bagging ensemble of single-class xgboost models.

    Each of `runtimes` members is trained on a stratified random subset of
    the variables (grouped by KMeans labels when bool_autolabel). Models
    and their selected variables/weights are written under savedirbase
    when bool_save; otherwise everything is kept in memory and returned as
    [ModelList, selectruntimesvarnames, baggingweights].

    NOTE(review): varlabelweights=[-1] and varlabels=[] are mutable
    defaults; they are only read here, so behavior is safe, but the
    defaults are kept for interface compatibility.
    """
    ModelList = []
    if bool_autolabel:
        # Group the variables by KMeans so column sampling is stratified
        # across variable groups.
        varlabels = vc.KMeansLabel(DataSet, varnames, n_varlabels=n_varlabels)
    selectruntimesvarnames = _stratifiedRandomChoice_column(
        varnames, varlabels, varlabelweights, colsamplerate, runtimes)
    evalValues = np.zeros(runtimes)
    for runtime in range(runtimes):
        RuntimeDataSet=xgbf.trainingDataSet(DataSet,[vtname],selectruntimesvarnames[runtime],\
                                            bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_binary=True)
        [train_x, test_x, train_y,
         test_y] = xgbf.splitTrainTestData(RuntimeDataSet,
                                           train_percent,
                                           bool_stratify=1)
        if bool_balance:
            # Counter class imbalance with the negative/positive ratio.
            # FIX: use the builtin float() -- np.float was deprecated in
            # NumPy 1.20 and removed in 1.24, which made this line crash.
            labels = train_y[:, 0] if len(train_y.shape) > 1 else train_y
            ratio = float(np.sum(labels == 0)) / np.sum(labels == 1)
            params['scale_pos_weight'] = ratio
        model = xgbf.TrainModel(train_x, train_y, params)
        if bool_save:
            savedir = savedirbase + os.sep + "runtime_" + str(runtime)
            if not os.path.exists(savedir):
                os.makedirs(savedir)
            modelName = vtname + '_xgboost_singleclass_run' + str(
                runtime) + ".model"
            modeldir = savedir + os.sep + modelName
            model.save_model(modeldir)
        else:
            ModelList.append(model)
        [pred_Y, pred_pY] = xgbf.Predict(model,
                                         test_x,
                                         bool_binary=1,
                                         threshold=single_thres)
        evalValues[runtime] = xgbf.Evaluate(test_y, pred_Y, pred_pY,
                                            baggingmetric)
        print("Runtime: %d model done. Evaluation Value = %f" %
              (runtime, evalValues[runtime]))
    # Turn per-model evaluation values into ensemble weights.
    baggingweights = _calWeight(evalValues, runtimes, baggingweightindex,
                                baggingmetricthres)
    if bool_save:
        # Persist per-runtime evaluation values and ensemble weights.
        evalweightsFileName = vtname + "_Runtime_Evaluation_Weight.csv"
        evalweightsFiledirto = savedirbase + os.sep + evalweightsFileName
        evalweightsarray = np.zeros([runtimes, 2])
        evalweightsarray[:, 0] = evalValues
        evalweightsarray[:, 1] = baggingweights
        evalweightarrayname = [baggingmetric, 'weight']
        init.writeArrayToCSV(evalweightsarray, evalweightarrayname,
                             evalweightsFiledirto)
        # Persist the variable subset chosen for each runtime.
        selectvarnamesfiledir = savedirbase + os.sep + vtname + "_Runtime_Model_Select_Variables.csv"
        save = pd.DataFrame({})
        for runtime in range(runtimes):
            pdtmp = pd.DataFrame({
                "SelectVarName_run" + str(runtime):
                selectruntimesvarnames[runtime]
            })
            save = pd.concat([save, pdtmp], axis=1)
        save.to_csv(selectvarnamesfiledir, index=False, header=True)
        return []
    else:
        return [ModelList, selectruntimesvarnames, baggingweights]
def evalFeature(CPIDs,evaluate_feature,TrainDataSet,ValidDataSet,VegeTypes,feature_names,multiclassmethod,params,evalue_method,\
                    bool_cv,cv_num,skf_split,bool_gpu,n_gpus,n_parallels,bool_weight,bool_strclass,labelHeaderName,bool_save,savedir):
    """Evaluate a candidate feature set (worker-pool entry point).

    Trains and tests a multiclass model on feature_names -- either with
    cv_num rounds of stratified K-fold CV (bool_cv) or a single
    train/validation split -- and returns the evalue_method score.
    `evaluate_feature` is only used for log messages.
    """
    print("Trying to evalute feature: %s"%evaluate_feature)
    params_parallel=copy.deepcopy(params)
    process_pid=os.getpid()
    # Register this worker PID once so it keeps a stable worker index.
    if len(CPIDs)<n_parallels:
        CPIDs.append(process_pid)
    process_pid_index=CPIDs.index(process_pid)
    print("Worker #%d: PID = %d"%(process_pid_index,process_pid))
    if bool_gpu:
        # Round-robin GPU assignment across workers.
        params_parallel['gpu_id']=process_pid_index%n_gpus
    if bool_cv==1:
        [Y,X]=xgbf.trainingDataSet(TrainDataSet,VegeTypes,feature_names,\
                                    bool_strclass=bool_strclass,labelHeaderName=labelHeaderName)
        # Stratification needs single class indices, not indicator columns.
        if not bool_strclass:
            class_labels=init.mergeCategories(Y)
        else:
            class_labels=Y
        # Flat accumulators sized for all folds of all CV rounds.
        pred_Y_cv=np.zeros(len(class_labels)*cv_num,dtype=np.int32)
        pred_pY_cv=np.zeros(len(class_labels)*cv_num)
        test_Y_cv=np.zeros(len(class_labels)*cv_num,dtype=np.int32)
        last_cv_idx=0
        current_cv_idx=0
        for cv_i in range(cv_num):
            # Fresh shuffled K-fold split for every CV round.
            skf=StratifiedKFold(n_splits=skf_split,shuffle=True)
            cv_j=0
            for train, test in skf.split(X,class_labels):
                train_x=X[train]
                train_y=Y[train]
                test_x=X[test]
                test_y=Y[test]
                if multiclassmethod=='softmax':
                    ModelList=mtc.trainMulticlassSoftmaxModel([train_y,train_x],VegeTypes,feature_names,params_parallel,bool_weight=bool_weight,bool_pandas=False,\
                                                              bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
                    [pred_Y,pred_pY,test_Y]=mtc.testMulticlassSoftmaxModel(ModelList,[test_y,test_x],VegeTypes,feature_names,params_parallel,bool_pandas=False,\
                                                            bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
                elif multiclassmethod=='category':
                    ModelList=mtc.trainMulticlassCategoryModel([train_y,train_x],VegeTypes,feature_names,params_parallel,bool_weight=bool_weight,bool_pandas=False,\
                                                               bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
                    [pred_Y,pred_pY,test_Y]=mtc.testMulticlassCategoryModel(ModelList,[test_y,test_x],VegeTypes,feature_names,params_parallel,bool_pandas=False,\
                                                            bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
                else:
                    # NOTE(review): this only prints; the lines below then
                    # raise NameError on undefined pred_Y/test_Y.
                    print("Invalid Multiclass Method Input!")
                # Append this fold's predictions to the flat accumulators.
                current_cv_idx=len(test_Y)+last_cv_idx
                pred_Y_cv[last_cv_idx:current_cv_idx]=pred_Y
#                    pred_pY_cv[last_cv_idx:current_cv_idx]=pred_pY
                test_Y_cv[last_cv_idx:current_cv_idx]=test_Y
                last_cv_idx=current_cv_idx
#                    evalues_runtime[cv_i,cv_j]=xgbf.Evaluate(test_Y,pred_Y,pred_pY,evalue_method)
                cv_j=cv_j+1
        # NOTE(review): pred_pY_cv stays all zeros because its fill line is
        # commented out; only label-based metrics are meaningful here --
        # confirm evalue_method does not rely on probabilities.
        evalue=xgbf.Evaluate(test_Y_cv,pred_Y_cv,pred_pY_cv,evalue_method)
    else:
        # Single train/validation evaluation, no cross-validation.
        if multiclassmethod=='softmax':
            ModelList=mtc.trainMulticlassSoftmaxModel(TrainDataSet,VegeTypes,feature_names,params_parallel,bool_weight=bool_weight,bool_pandas=True,\
                                                      bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
            [pred_Y,pred_pY,test_Y]=mtc.testMulticlassSoftmaxModel(ModelList,ValidDataSet,VegeTypes,feature_names,params_parallel,bool_pandas=True,\
                                                    bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
        elif multiclassmethod=='category':
            ModelList=mtc.trainMulticlassCategoryModel(TrainDataSet,VegeTypes,feature_names,params_parallel,bool_weight=bool_weight,bool_pandas=True,\
                                                       bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
            [pred_Y,pred_pY,test_Y]=mtc.testMulticlassCategoryModel(ModelList,ValidDataSet,VegeTypes,feature_names,params_parallel,bool_pandas=True,\
                                                    bool_strclass=bool_strclass,labelHeaderName=labelHeaderName,bool_save=bool_save,savedir=savedir)
        else:
            # NOTE(review): same silent-failure pattern as the CV branch.
            print("Invalid Multiclass Method Input!")
        evalue=xgbf.Evaluate(test_Y,pred_Y,pred_pY,evalue_method)
    print("Feature: %s partial evalue = %f\n"%(evaluate_feature,evalue))
    return evalue