Example #1
0
def testMulticlassSoftmaxModel(ModelList, TestDataSet, VegeTypes, varnames, params, runtime=-1, bool_pandas=True,
                               bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Predict class labels for a test set with a softmax multiclass model.

    Args:
        ModelList: Sequence whose first element is the model to use. When it
            is empty the model is loaded from ``savedir`` with
            ``xgbf.loadModel``.
        TestDataSet: Forwarded to ``xgbf.trainingDataSet`` when
            ``bool_pandas`` is true; otherwise unpacked as
            ``[test_Y, test_X]``.
        VegeTypes: Class names; its length is used as the number of classes.
        varnames: Feature names; its length sizes the single-sample row.
        params: Forwarded to ``xgbf.loadModel`` only.
        runtime: ``-1`` selects ``softmax_multiclass.model``; any other value
            selects ``softmax_multiclass_run<runtime>.model``.
        bool_pandas: Whether ``TestDataSet`` must go through
            ``xgbf.trainingDataSet`` first.
        bool_strclass: Forwarded to ``xgbf.trainingDataSet``; when true the
            labels are never passed to ``init.mergeCategories``.
        labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
        bool_save: Not used by this function.
        savedir: Directory the model file is loaded from.

    Returns:
        ``[pred_Y, pred_pY, test_Y]`` where ``pred_pY`` is the output of
        ``xgbf.Predict``, ``pred_Y`` is its argmax along axis 1, and
        ``test_Y`` is the (possibly merged) reference labels.
    """
    n_classes = len(VegeTypes)

    # Use the in-memory model when one was supplied, else load it from disk.
    if len(ModelList):
        booster = ModelList[0]
    else:
        file_name = ('softmax_multiclass.model' if runtime == -1
                     else 'softmax_multiclass_run' + str(runtime) + '.model')
        booster = xgbf.loadModel(savedir + os.sep + file_name, params)

    if bool_pandas:
        test_Y, test_X = xgbf.trainingDataSet(TestDataSet, VegeTypes, varnames,
                                              bool_strclass=bool_strclass,
                                              labelHeaderName=labelHeaderName)
    else:
        test_Y, test_X = TestDataSet

    # A single 1-D sample is promoted to a one-row 2-D array (features and
    # labels alike) so the rest of the function can assume 2-D input.
    if len(test_X.shape) == 1:
        feature_row = np.zeros([1, len(varnames)])
        feature_row[0, :] = test_X
        test_X = feature_row
        label_row = np.zeros([1, n_classes])
        label_row[0, :] = test_Y
        test_Y = label_row

    if not bool_strclass:
        if len(test_Y.shape) > 1:
            test_Y = init.mergeCategories(test_Y)

    pred_pY = xgbf.Predict(booster, test_X, bool_binary=False)
    return [np.argmax(pred_pY, axis=1), pred_pY, test_Y]
Example #2
0
def trainMulticlassCategoryModel(DataSet, VegeTypes, varnames, params, runtime=-1, bool_weight=False, bool_pandas=True,
                                 bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Train a "category"-style multiclass model and optionally save it.

    Args:
        DataSet: Forwarded to ``xgbf.trainingDataSet`` when ``bool_pandas``
            is true; otherwise unpacked as ``[Y, X]``.
        VegeTypes: Class names; its length is used as the number of classes.
        varnames: Feature names, forwarded to ``xgbf.trainingDataSet``.
        params: Training parameters passed to ``xgbf.TrainModel``. When
            ``bool_weight`` is true this dict is modified in place: the key
            ``'scale_pos_weight'`` is set on the caller's object.
        runtime: ``-1`` saves as ``category_multiclass.model``; any other
            value saves as ``category_multiclass_run<runtime>.model``.
        bool_weight: When true, ``scale_pos_weight`` is set to the number of
            formatted labels equal to 0 divided by the number equal to 1.
        bool_pandas: Whether ``DataSet`` must go through
            ``xgbf.trainingDataSet`` first.
        bool_strclass: Forwarded to ``xgbf.trainingDataSet``; when true the
            labels are never passed to ``init.mergeCategories``.
        labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
        bool_save: When true the trained model is written under ``savedir``.
        savedir: Output directory; created if it does not exist.

    Returns:
        A one-element list containing the trained model.
    """
    if bool_pandas:
        [Y, X] = xgbf.trainingDataSet(DataSet, VegeTypes, varnames,
                                      bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
    else:
        [Y, X] = DataSet
    if not bool_strclass and len(Y.shape) > 1:
        Y = init.mergeCategories(Y)

    num_class = len(VegeTypes)
    # Both the weighted and unweighted paths train on the same reformatted
    # input, so the conversion is done once up front.
    [Y, X] = init.formatMulticlassCategoryInput(Y, X, num_class, 1)
    if bool_weight:
        # Builtin float replaces the ``np.float`` alias, which was removed in
        # NumPy 1.24 and raises AttributeError there. The alias was the
        # builtin float, so the computed value is unchanged.
        ratio = float(np.sum(Y == 0)) / np.sum(Y == 1)
        params['scale_pos_weight'] = ratio
    model = xgbf.TrainModel(X, Y, params)

    if bool_save:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        if runtime == -1:
            modelName = 'category_multiclass.model'
        else:
            modelName = 'category_multiclass_run' + str(runtime) + '.model'
        modeldir = savedir + os.sep + modelName
        model.save_model(modeldir)
    return [model]
Example #3
0
def trainMulticlassSoftmaxModel(DataSet, VegeTypes, varnames, params, runtime=-1, bool_weight=False, bool_pandas=True,
                                bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Train a softmax multiclass model and optionally save it.

    Args:
        DataSet: Forwarded to ``xgbf.trainingDataSet`` when ``bool_pandas``
            is true; otherwise unpacked as ``[Y, X]``.
        VegeTypes: Class names, forwarded to ``xgbf.trainingDataSet``.
        varnames: Feature names, forwarded to ``xgbf.trainingDataSet``.
        params: Training parameters passed to ``xgbf.TrainModel``.
        runtime: ``-1`` saves as ``softmax_multiclass.model``; any other
            value saves as ``softmax_multiclass_run<runtime>.model``.
        bool_weight: When true, per-instance weights from
            ``_calcInstanceWeights`` are passed to ``xgbf.TrainModel``.
        bool_pandas: Whether ``DataSet`` must go through
            ``xgbf.trainingDataSet`` first.
        bool_strclass: Forwarded to ``xgbf.trainingDataSet``; when true the
            labels are never passed to ``init.mergeCategories``.
        labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
        bool_save: When true the trained model is written under ``savedir``.
        savedir: Output directory; created if it does not exist.

    Returns:
        A one-element list containing the trained model.
    """
    if bool_pandas:
        labels, features = xgbf.trainingDataSet(DataSet, VegeTypes, varnames,
                                                bool_strclass=bool_strclass,
                                                labelHeaderName=labelHeaderName)
    else:
        labels, features = DataSet

    # Labels with more than one dimension are merged unless string classes
    # are in use.
    if not bool_strclass:
        if len(labels.shape) > 1:
            labels = init.mergeCategories(labels)

    # The weight keyword is only passed when weighting was requested.
    train_kwargs = {}
    if bool_weight:
        train_kwargs['weight'] = _calcInstanceWeights(labels)
    booster = xgbf.TrainModel(features, labels, params, **train_kwargs)

    if bool_save:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        file_name = ('softmax_multiclass.model' if runtime == -1
                     else 'softmax_multiclass_run' + str(runtime) + '.model')
        booster.save_model(savedir + os.sep + file_name)

    return [booster]
Example #4
0
def createSMOTEDataSet(DataSet,
                       VegeTypes,
                       varnames,
                       method='regular',
                       tar_ratio=-1,
                       nthread=1,
                       bool_pandas=True,
                       bool_strclass=False,
                       labelHeaderName=""):
    """Oversample a labelled data set with SMOTE and return it re-packed.

    Args:
        DataSet: Forwarded to ``xgbf.trainingDataSet`` when ``bool_pandas``
            is true; otherwise unpacked as ``[Y, X]``.
        VegeTypes: Class names; used for the number of classes and as the
            label column names of the resampled frame.
        varnames: Feature names; used as the feature column names.
        method: Passed to ``SMOTE`` as ``kind``.
        tar_ratio: ``-1`` uses SMOTE's default ratio; any other value is
            turned into a ratio dict by ``_calc_smoteratio``.
        nthread: Passed to ``SMOTE`` as ``n_jobs``.
        bool_pandas: Whether the input goes through ``xgbf.trainingDataSet``
            and (for non-string classes) whether the output is a DataFrame.
        bool_strclass: When true the labels are not merged, and the output is
            always a DataFrame with one label column named
            ``labelHeaderName``.
        labelHeaderName: Name of the label column for string classes.

    Returns:
        A ``pandas.DataFrame`` with label columns followed by feature
        columns, or ``[Y_indicator, X_resampled]`` when neither
        ``bool_strclass`` nor ``bool_pandas`` is set.

    NOTE(review): ``kind=``, ``ratio=`` and ``fit_sample`` look like the
    legacy imbalanced-learn SMOTE API -- confirm against the installed
    version. Also, a class with at most one sample makes ``k_neighbors``
    non-positive here; verify that callers never pass such data.
    """
    if bool_pandas:
        Y, X = xgbf.trainingDataSet(DataSet, VegeTypes, varnames,
                                    bool_strclass=bool_strclass,
                                    labelHeaderName=labelHeaderName)
    else:
        Y, X = DataSet
    if not bool_strclass:
        Y = init.mergeCategories(Y)

    # The neighbour count is capped at 5 and shrinks with the rarest class.
    class_counts = np.bincount(Y)
    rarest = np.min(class_counts)
    k_neighbors = 5 if rarest > 5 else rarest - 1

    smote_kwargs = {}
    if tar_ratio != -1:
        smote_kwargs['ratio'] = _calc_smoteratio(Y, class_counts, tar_ratio=tar_ratio)
    sm = SMOTE(kind=method, k_neighbors=k_neighbors, n_jobs=nthread, **smote_kwargs)
    X_res, Y_res = sm.fit_sample(X, Y)

    # Plain-array output: only when neither string classes nor pandas are used.
    if not bool_strclass and not bool_pandas:
        return [init.expandCategories(Y_res, num_class=len(VegeTypes)), X_res]

    feature_frame = pd.DataFrame(X_res, columns=varnames)
    if bool_strclass:
        label_frame = pd.DataFrame(init.classNumToStr(Y_res, VegeTypes),
                                   columns=[labelHeaderName])
    else:
        label_frame = pd.DataFrame(init.expandCategories(Y_res, num_class=len(VegeTypes)),
                                   columns=VegeTypes)
    return pd.concat([label_frame, feature_frame], axis=1)
Example #5
0
def splitTrainTestData(DataSet, train_percent, bool_stratify=False):
    """Shuffle and split a ``[Y, X]`` data set into train and test parts.

    Args:
        DataSet: ``[Y, X]`` pair of labels and features.
        train_percent: Fraction kept for training; the test fraction passed
            to ``train_test_split`` is ``1 - train_percent``.
        bool_stratify: When true the split is stratified on class labels
            derived from ``Y`` (merged with ``init.mergeCategories`` when
            ``Y`` has several columns, the single column when it has one,
            or ``Y`` itself when it is 1-D).

    Returns:
        ``[train_x, test_x, train_y, test_y]``.
    """
    labels, features = DataSet

    extra_kwargs = {}
    if bool_stratify:
        if len(labels.shape) <= 1:
            strata = labels
        elif labels.shape[1] > 1:
            strata = init.mergeCategories(labels)
        else:
            strata = labels[:, 0]
        extra_kwargs['stratify'] = strata

    train_x, test_x, train_y, test_y = train_test_split(features,
                                                        labels,
                                                        test_size=1 - train_percent,
                                                        shuffle=True,
                                                        **extra_kwargs)
    return [train_x, test_x, train_y, test_y]
Example #6
0
def testMulticlassCategoryModel(ModelList, TestDataSet, VegeTypes, varnames, params, runtime=-1, bool_pandas=True,
                                bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Predict class labels for a test set with a "category"-style model.

    Args:
        ModelList: Sequence whose first element is the model to use. When it
            is empty the model is loaded from ``savedir`` with
            ``xgbf.loadModel``.
        TestDataSet: Forwarded to ``xgbf.trainingDataSet`` when
            ``bool_pandas`` is true; otherwise unpacked as
            ``[test_Y, test_X]``.
        VegeTypes: Class names; its length is used as the number of classes.
        varnames: Feature names; its length sizes the single-sample row.
        params: Forwarded to ``xgbf.loadModel`` only.
        runtime: ``-1`` selects ``category_multiclass.model``; any other
            value selects ``category_multiclass_run<runtime>.model``.
        bool_pandas: Whether ``TestDataSet`` must go through
            ``xgbf.trainingDataSet`` first.
        bool_strclass: Forwarded to ``xgbf.trainingDataSet``; when true the
            labels are never passed to ``init.mergeCategories``.
        labelHeaderName: Forwarded to ``xgbf.trainingDataSet``.
        bool_save: Not used by this function.
        savedir: Directory the model file is loaded from.

    Returns:
        ``[pred_Y, pred_pY, test_Y]`` where ``pred_pY`` has one row per test
        instance and one column per class, ``pred_Y`` is its argmax along
        axis 1, and ``test_Y`` is the (possibly merged) reference labels.
    """
    n_classes = len(VegeTypes)

    # Use the in-memory model when one was supplied, else load it from disk.
    if len(ModelList):
        booster = ModelList[0]
    else:
        file_name = ('category_multiclass.model' if runtime == -1
                     else 'category_multiclass_run' + str(runtime) + '.model')
        booster = xgbf.loadModel(savedir + os.sep + file_name, params)

    if bool_pandas:
        test_Y, test_X = xgbf.trainingDataSet(TestDataSet, VegeTypes, varnames,
                                              bool_strclass=bool_strclass,
                                              labelHeaderName=labelHeaderName)
    else:
        test_Y, test_X = TestDataSet

    # Remember whether a lone 1-D sample was given; it is promoted to a
    # one-row 2-D array here and its scores are repacked separately below.
    single_sample = len(test_X.shape) == 1
    if single_sample:
        feature_row = np.zeros([1, len(varnames)])
        feature_row[0, :] = test_X
        test_X = feature_row
        label_row = np.zeros([1, n_classes])
        label_row[0, :] = test_Y
        test_Y = label_row

    if not bool_strclass:
        if len(test_Y.shape) > 1:
            test_Y = init.mergeCategories(test_Y)

    n_rows = test_X.shape[0]
    test_X = init.formatMulticlassCategoryInput([], test_X, n_classes, 0)
    flat_scores = xgbf.Predict(booster, test_X, bool_binary=False)

    # The flat prediction vector holds n_classes consecutive scores per
    # instance; lay them out as an (instances x classes) table.
    if single_sample:
        pred_pY = np.zeros([1, n_classes])
        pred_pY[0, :] = flat_scores
    else:
        pred_pY = np.zeros([n_rows, n_classes])
        for row in range(n_rows):
            start = row * n_classes
            pred_pY[row, :] = flat_scores[start:start + n_classes]

    return [np.argmax(pred_pY, axis=1), pred_pY, test_Y]
def _trainAndPredictMulticlass(multiclassmethod, train_set, test_set, VegeTypes, feature_names, params,
                               bool_weight, bool_pandas, bool_strclass, labelHeaderName, bool_save, savedir):
    """Train one model with the chosen ``mtc`` method and predict on ``test_set``."""
    if multiclassmethod == 'softmax':
        train_fn = mtc.trainMulticlassSoftmaxModel
        test_fn = mtc.testMulticlassSoftmaxModel
    elif multiclassmethod == 'category':
        train_fn = mtc.trainMulticlassCategoryModel
        test_fn = mtc.testMulticlassCategoryModel
    else:
        raise ValueError("Invalid Multiclass Method Input! Got %r, expected 'softmax' or 'category'."
                         % (multiclassmethod,))
    ModelList = train_fn(train_set, VegeTypes, feature_names, params, bool_weight=bool_weight,
                         bool_pandas=bool_pandas, bool_strclass=bool_strclass,
                         labelHeaderName=labelHeaderName, bool_save=bool_save, savedir=savedir)
    return test_fn(ModelList, test_set, VegeTypes, feature_names, params, bool_pandas=bool_pandas,
                   bool_strclass=bool_strclass, labelHeaderName=labelHeaderName,
                   bool_save=bool_save, savedir=savedir)


def evalFeature(CPIDs,evaluate_feature,TrainDataSet,ValidDataSet,VegeTypes,feature_names,multiclassmethod,params,evalue_method,\
                    bool_cv,cv_num,skf_split,bool_gpu,n_gpus,n_parallels,bool_weight,bool_strclass,labelHeaderName,bool_save,savedir):
    """Train and score a multiclass model for one candidate feature set.

    Args:
        CPIDs: Mutable list of worker process ids. The current pid is
            appended while the list holds fewer than ``n_parallels`` entries,
            and its position in the list is used as the worker index.
        evaluate_feature: Label of the feature being evaluated; only used in
            the progress messages.
        TrainDataSet: Training data, passed through ``xgbf.trainingDataSet``
            (CV mode) or to the ``mtc`` train function (hold-out mode).
        ValidDataSet: Validation data; only used in hold-out mode.
        VegeTypes: Class names, forwarded to the ``xgbf``/``mtc`` helpers.
        feature_names: Feature names, forwarded to the ``xgbf``/``mtc`` helpers.
        multiclassmethod: ``'softmax'`` or ``'category'``.
        params: Model parameters; a deep copy is used so the caller's dict
            is not touched by this function.
        evalue_method: Forwarded to ``xgbf.Evaluate``.
        bool_cv: Cross-validation is used only when this equals ``1``.
        cv_num: Number of times the stratified K-fold split is repeated.
        skf_split: Number of folds per repetition.
        bool_gpu: When true, ``gpu_id`` is set to the worker index modulo
            ``n_gpus`` in the copied parameters.
        n_gpus: Number of GPUs to spread workers over.
        n_parallels: Maximum number of pids recorded in ``CPIDs``.
        bool_weight, bool_strclass, labelHeaderName, bool_save, savedir:
            Forwarded to the ``mtc`` train/test functions.

    Returns:
        The value returned by ``xgbf.Evaluate``.

    Raises:
        ValueError: If ``multiclassmethod`` is neither ``'softmax'`` nor
            ``'category'``. Previously this case only printed a message and
            then failed on an unbound local variable.
    """
    print("Trying to evalute feature: %s"%evaluate_feature)
    params_parallel = copy.deepcopy(params)
    process_pid = os.getpid()
    if len(CPIDs) < n_parallels:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d"%(process_pid_index,process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus

    # Fail fast, before any data preparation or training is attempted.
    if multiclassmethod not in ('softmax', 'category'):
        raise ValueError("Invalid Multiclass Method Input! Got %r, expected 'softmax' or 'category'."
                         % (multiclassmethod,))

    if bool_cv == 1:
        [Y, X] = xgbf.trainingDataSet(TrainDataSet, VegeTypes, feature_names,
                                      bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
        if not bool_strclass:
            class_labels = init.mergeCategories(Y)
        else:
            class_labels = Y
        # Every repetition predicts each sample once, so the pooled buffers
        # hold len(class_labels) * cv_num entries.
        total = len(class_labels) * cv_num
        pred_Y_cv = np.zeros(total, dtype=np.int32)
        # NOTE: the pooled probabilities are never filled in; xgbf.Evaluate
        # receives zeros for them in CV mode, as in the original code.
        pred_pY_cv = np.zeros(total)
        test_Y_cv = np.zeros(total, dtype=np.int32)
        last_cv_idx = 0
        for _ in range(cv_num):
            skf = StratifiedKFold(n_splits=skf_split, shuffle=True)
            for train, test in skf.split(X, class_labels):
                pred_Y, pred_pY, test_Y = _trainAndPredictMulticlass(
                    multiclassmethod, [Y[train], X[train]], [Y[test], X[test]],
                    VegeTypes, feature_names, params_parallel,
                    bool_weight, False, bool_strclass, labelHeaderName, bool_save, savedir)
                current_cv_idx = len(test_Y) + last_cv_idx
                pred_Y_cv[last_cv_idx:current_cv_idx] = pred_Y
                test_Y_cv[last_cv_idx:current_cv_idx] = test_Y
                last_cv_idx = current_cv_idx
        evalue = xgbf.Evaluate(test_Y_cv, pred_Y_cv, pred_pY_cv, evalue_method)
    else:
        pred_Y, pred_pY, test_Y = _trainAndPredictMulticlass(
            multiclassmethod, TrainDataSet, ValidDataSet,
            VegeTypes, feature_names, params_parallel,
            bool_weight, True, bool_strclass, labelHeaderName, bool_save, savedir)
        evalue = xgbf.Evaluate(test_Y, pred_Y, pred_pY, evalue_method)
    print("Feature: %s partial evalue = %f\n"%(evaluate_feature,evalue))
    return evalue