def testMulticlassSoftmaxModel(ModelList, TestDataSet, VegeTypes, varnames, params, runtime=-1, bool_pandas=True,
                               bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Run a softmax multiclass model on a test set and return its predictions.

    Args:
        ModelList: list of models; when non-empty its first element is used,
            otherwise a model is loaded from ``savedir``.
        TestDataSet: input understood by ``xgbf.trainingDataSet`` when
            ``bool_pandas`` is true, otherwise a ``[test_Y, test_X]`` pair.
        VegeTypes: the classes; its length is used as the number of classes.
        varnames: feature names; its length sizes the single-row feature buffer.
        params: forwarded to ``xgbf.loadModel`` when a model is loaded.
        runtime: run number embedded in the model file name; ``-1`` selects
            the un-numbered file name.
        bool_pandas: selects how ``TestDataSet`` is interpreted (see above).
        bool_strclass: when true, labels are not passed through
            ``init.mergeCategories``.
        labelHeaderName: forwarded to ``xgbf.trainingDataSet``.
        bool_save: accepted for signature symmetry; not used here.
        savedir: directory the model file is loaded from.

    Returns:
        ``[pred_Y, pred_pY, test_Y]`` where ``pred_pY`` is the output of
        ``xgbf.Predict`` and ``pred_Y`` is its arg-max along axis 1.
    """
    num_class = len(VegeTypes)

    # Prefer an in-memory model; fall back to the file written at training time.
    if len(ModelList) > 0:
        classifier = ModelList[0]
    else:
        model_file = ('softmax_multiclass.model' if runtime == -1
                      else 'softmax_multiclass_run' + str(runtime) + '.model')
        classifier = xgbf.loadModel(savedir + os.sep + model_file, params)

    if bool_pandas:
        test_Y, test_X = xgbf.trainingDataSet(TestDataSet, VegeTypes, varnames,
                                              bool_strclass=bool_strclass,
                                              labelHeaderName=labelHeaderName)
    else:
        test_Y, test_X = TestDataSet

    # A single instance arrives as a 1-D vector: promote features and labels
    # to one-row 2-D arrays so the rest of the pipeline sees matrices.
    if len(test_X.shape) == 1:
        row_X = np.zeros((1, len(varnames)))
        row_X[0, :] = test_X
        test_X = row_X
        row_Y = np.zeros((1, num_class))
        row_Y[0, :] = test_Y
        test_Y = row_Y

    if not bool_strclass and len(test_Y.shape) > 1:
        test_Y = init.mergeCategories(test_Y)

    pred_pY = xgbf.Predict(classifier, test_X, bool_binary=False)
    return [np.argmax(pred_pY, axis=1), pred_pY, test_Y]
def trainMulticlassCategoryModel(DataSet, VegeTypes, varnames, params, runtime=-1, bool_weight=False, bool_pandas=True,
                                 bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Train a "category" multiclass model and optionally save it.

    The labels and features are rewritten by
    ``init.formatMulticlassCategoryInput`` before a single model is trained
    with ``xgbf.TrainModel``.

    Args:
        DataSet: input understood by ``xgbf.trainingDataSet`` when
            ``bool_pandas`` is true, otherwise a ``[Y, X]`` pair.
        VegeTypes: the classes; its length is used as the number of classes.
        varnames: feature names, forwarded to ``xgbf.trainingDataSet``.
        params: training parameters. NOTE: when ``bool_weight`` is true this
            dict is modified in place (``'scale_pos_weight'`` is set).
        runtime: run number embedded in the saved file name; ``-1`` selects
            the un-numbered file name.
        bool_weight: when true, ``scale_pos_weight`` is set to the ratio of
            0-labels to 1-labels in the formatted label vector.
        bool_pandas: selects how ``DataSet`` is interpreted (see above).
        bool_strclass: when true, labels are not passed through
            ``init.mergeCategories``.
        labelHeaderName: forwarded to ``xgbf.trainingDataSet``.
        bool_save: when true, the model is written under ``savedir``
            (the directory is created if missing).
        savedir: output directory for the model file.

    Returns:
        A one-element list containing the trained model.
    """
    if bool_pandas:
        [Y, X] = xgbf.trainingDataSet(DataSet, VegeTypes, varnames,
                                      bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
    else:
        [Y, X] = DataSet
    if not bool_strclass and len(Y.shape) > 1:
        Y = init.mergeCategories(Y)

    num_class = len(VegeTypes)
    [Y, X] = init.formatMulticlassCategoryInput(Y, X, num_class, 1)

    if bool_weight:
        # `np.float` was only an alias of the builtin and was removed in
        # NumPy 1.24; the builtin keeps the exact same arithmetic.
        ratio = float(np.sum(Y == 0)) / np.sum(Y == 1)
        params['scale_pos_weight'] = ratio

    model = xgbf.TrainModel(X, Y, params)

    if bool_save:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        if runtime == -1:
            modelName = 'category_multiclass.model'
        else:
            modelName = 'category_multiclass_run' + str(runtime) + '.model'
        modeldir = savedir + os.sep + modelName
        model.save_model(modeldir)

    return [model]
def trainMulticlassSoftmaxModel(DataSet, VegeTypes, varnames, params, runtime=-1, bool_weight=False, bool_pandas=True,
                                bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Train a softmax multiclass model and optionally save it.

    Args:
        DataSet: input understood by ``xgbf.trainingDataSet`` when
            ``bool_pandas`` is true, otherwise a ``[Y, X]`` pair.
        VegeTypes: the classes, forwarded to ``xgbf.trainingDataSet``.
        varnames: feature names, forwarded to ``xgbf.trainingDataSet``.
        params: training parameters handed to ``xgbf.TrainModel``.
        runtime: run number embedded in the saved file name; ``-1`` selects
            the un-numbered file name.
        bool_weight: when true, per-instance weights from
            ``_calcInstanceWeights`` are passed to the trainer.
        bool_pandas: selects how ``DataSet`` is interpreted (see above).
        bool_strclass: when true, labels are not passed through
            ``init.mergeCategories``.
        labelHeaderName: forwarded to ``xgbf.trainingDataSet``.
        bool_save: when true, the model is written under ``savedir``
            (the directory is created if missing).
        savedir: output directory for the model file.

    Returns:
        A one-element list containing the trained model.
    """
    if bool_pandas:
        Y, X = xgbf.trainingDataSet(DataSet, VegeTypes, varnames,
                                    bool_strclass=bool_strclass,
                                    labelHeaderName=labelHeaderName)
    else:
        Y, X = DataSet

    if not bool_strclass and len(Y.shape) > 1:
        Y = init.mergeCategories(Y)

    # Only pass `weight` when weighting was requested, so the unweighted call
    # is made without that keyword at all.
    extra_train_args = {}
    if bool_weight:
        extra_train_args['weight'] = _calcInstanceWeights(Y)
    booster = xgbf.TrainModel(X, Y, params, **extra_train_args)

    if bool_save:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        file_name = ('softmax_multiclass.model' if runtime == -1
                     else 'softmax_multiclass_run' + str(runtime) + '.model')
        booster.save_model(savedir + os.sep + file_name)

    return [booster]
def createSMOTEDataSet(DataSet, VegeTypes, varnames, method='regular', tar_ratio=-1, nthread=1, bool_pandas=True,
                       bool_strclass=False, labelHeaderName=""):
    """Oversample a data set with SMOTE and return it in the input's layout.

    Args:
        DataSet: input understood by ``xgbf.trainingDataSet`` when
            ``bool_pandas`` is true, otherwise a ``[Y, X]`` pair.
        VegeTypes: the classes; used for the number of classes and as the
            label column names of the returned DataFrame.
        varnames: feature names; used as feature column names.
        method: forwarded to ``SMOTE`` as ``kind``.
        tar_ratio: ``-1`` lets SMOTE use its default ratio; any other value
            is turned into a per-class ratio by ``_calc_smoteratio``.
        nthread: forwarded to ``SMOTE`` as ``n_jobs``.
        bool_pandas: selects how ``DataSet`` is interpreted and, for
            non-string classes, whether a DataFrame or a ``[Y, X]`` pair is
            returned.
        bool_strclass: when true, labels are not merged and the result is
            always a DataFrame whose label column is ``labelHeaderName``.
        labelHeaderName: label column name used with ``bool_strclass``.

    Returns:
        A DataFrame (label columns followed by feature columns) or, for
        ``bool_pandas=False`` with non-string classes, ``[Y_indicator, X]``.
    """
    if bool_pandas:
        [Y, X] = xgbf.trainingDataSet(DataSet, VegeTypes, varnames,
                                      bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
    else:
        [Y, X] = DataSet
    if not bool_strclass:
        Y = init.mergeCategories(Y)

    class_counts = np.bincount(Y)
    # np.bincount reports 0 for every label index that does not occur below
    # the largest label; such absent classes must not drive k_neighbors
    # (they would yield k_neighbors == -1), so only present classes count.
    present_counts = class_counts[class_counts > 0]
    min_class_count = np.min(present_counts)
    # SMOTE's usual 5 neighbours, capped so the smallest class can supply them.
    k_neighbors = int(min(5, min_class_count - 1))

    # NOTE(review): `ratio`, `kind` and `fit_sample` look like the legacy
    # imbalanced-learn API — confirm against the pinned library version.
    if tar_ratio == -1:
        sm = SMOTE(kind=method, k_neighbors=k_neighbors, n_jobs=nthread)
    else:
        smoteratiodict = _calc_smoteratio(Y, class_counts, tar_ratio=tar_ratio)
        sm = SMOTE(ratio=smoteratiodict, kind=method, k_neighbors=k_neighbors, n_jobs=nthread)
    [X_res, Y_res] = sm.fit_sample(X, Y)

    if bool_strclass:
        X_res_pd = pd.DataFrame(X_res, columns=varnames)
        Y_indi = init.classNumToStr(Y_res, VegeTypes)
        Y_res_pd = pd.DataFrame(Y_indi, columns=[labelHeaderName])
        return pd.concat([Y_res_pd, X_res_pd], axis=1)

    Y_indi = init.expandCategories(Y_res, num_class=len(VegeTypes))
    if not bool_pandas:
        return [Y_indi, X_res]
    X_res_pd = pd.DataFrame(X_res, columns=varnames)
    Y_res_pd = pd.DataFrame(Y_indi, columns=VegeTypes)
    return pd.concat([Y_res_pd, X_res_pd], axis=1)
def splitTrainTestData(DataSet, train_percent, bool_stratify=False):
    """Shuffle-split a ``[Y, X]`` data set into training and test parts.

    Args:
        DataSet: a ``[Y, X]`` pair.
        train_percent: fraction kept for training; the test fraction is
            ``1 - train_percent``.
        bool_stratify: when true, the split is stratified on class labels
            derived from ``Y``.

    Returns:
        ``[train_x, test_x, train_y, test_y]`` as produced by
        ``train_test_split``.
    """
    Y, X = DataSet

    split_options = {}
    if bool_stratify:
        # Reduce Y to one label per row: multi-column Y is merged, a single
        # column is flattened, and 1-D Y is used as is.
        if len(Y.shape) <= 1:
            strata = Y
        elif Y.shape[1] > 1:
            strata = init.mergeCategories(Y)
        else:
            strata = Y[:, 0]
        split_options['stratify'] = strata

    train_x, test_x, train_y, test_y = train_test_split(
        X, Y, test_size=1 - train_percent, shuffle=True, **split_options)
    return [train_x, test_x, train_y, test_y]
def testMulticlassCategoryModel(ModelList, TestDataSet, VegeTypes, varnames, params, runtime=-1, bool_pandas=True,
                                bool_strclass=False, labelHeaderName="", bool_save=False, savedir=""):
    """Run a "category" multiclass model on a test set and return predictions.

    The features are rewritten by ``init.formatMulticlassCategoryInput``
    before prediction; the flat prediction vector is then folded back into
    one row per instance and one column per class.

    Args:
        ModelList: list of models; when non-empty its first element is used,
            otherwise a model is loaded from ``savedir``.
        TestDataSet: input understood by ``xgbf.trainingDataSet`` when
            ``bool_pandas`` is true, otherwise a ``[test_Y, test_X]`` pair.
        VegeTypes: the classes; its length is used as the number of classes.
        varnames: feature names; its length sizes the single-row feature buffer.
        params: forwarded to ``xgbf.loadModel`` when a model is loaded.
        runtime: run number embedded in the model file name; ``-1`` selects
            the un-numbered file name.
        bool_pandas: selects how ``TestDataSet`` is interpreted (see above).
        bool_strclass: when true, labels are not passed through
            ``init.mergeCategories``.
        labelHeaderName: forwarded to ``xgbf.trainingDataSet``.
        bool_save: accepted for signature symmetry; not used here.
        savedir: directory the model file is loaded from.

    Returns:
        ``[pred_Y, pred_pY, test_Y]`` where ``pred_pY`` has one row per
        instance and ``pred_Y`` is its arg-max along axis 1.
    """
    num_class = len(VegeTypes)

    # Prefer an in-memory model; fall back to the file written at training time.
    if len(ModelList) > 0:
        classifier = ModelList[0]
    else:
        model_file = ('category_multiclass.model' if runtime == -1
                      else 'category_multiclass_run' + str(runtime) + '.model')
        classifier = xgbf.loadModel(savedir + os.sep + model_file, params)

    if bool_pandas:
        test_Y, test_X = xgbf.trainingDataSet(TestDataSet, VegeTypes, varnames,
                                              bool_strclass=bool_strclass,
                                              labelHeaderName=labelHeaderName)
    else:
        test_Y, test_X = TestDataSet

    # Remember whether the caller supplied one bare instance (1-D features);
    # this is decided before the inputs are promoted to 2-D below.
    single_instance = len(test_X.shape) == 1
    if single_instance:
        row_X = np.zeros((1, len(varnames)))
        row_X[0, :] = test_X
        test_X = row_X
        row_Y = np.zeros((1, num_class))
        row_Y[0, :] = test_Y
        test_Y = row_Y

    if not bool_strclass and len(test_Y.shape) > 1:
        test_Y = init.mergeCategories(test_Y)

    num_instance = test_X.shape[0]
    test_X = init.formatMulticlassCategoryInput([], test_X, num_class, 0)
    flat_scores = xgbf.Predict(classifier, test_X, bool_binary=False)

    if single_instance:
        pred_pY = np.zeros((1, num_class))
        pred_pY[0, :] = flat_scores
    else:
        # Consecutive runs of `num_class` scores belong to one instance.
        pred_pY = np.zeros((num_instance, num_class))
        for row in range(num_instance):
            start = row * num_class
            pred_pY[row, :] = flat_scores[start:start + num_class]

    return [np.argmax(pred_pY, axis=1), pred_pY, test_Y]
def evalFeature(CPIDs, evaluate_feature, TrainDataSet, ValidDataSet, VegeTypes, feature_names, multiclassmethod, params,
                evalue_method, bool_cv, cv_num, skf_split, bool_gpu, n_gpus, n_parallels, bool_weight, bool_strclass,
                labelHeaderName, bool_save, savedir):
    """Train and score a multiclass model for one candidate feature set.

    Args:
        CPIDs: shared list of worker process IDs; the position of this
            process in the list is its worker index.
        evaluate_feature: name of the feature being evaluated (logging only).
        TrainDataSet: training data, read with ``xgbf.trainingDataSet``.
        ValidDataSet: validation data; only used when ``bool_cv != 1``.
        VegeTypes: the classes, forwarded to the train/test functions.
        feature_names: feature names, forwarded to the train/test functions.
        multiclassmethod: ``'softmax'`` or ``'category'``.
        params: training parameters; a deep copy is used so the caller's
            object is not modified by this function.
        evalue_method: forwarded to ``xgbf.Evaluate``.
        bool_cv: ``1`` selects repeated stratified k-fold cross-validation on
            ``TrainDataSet``; anything else trains on ``TrainDataSet`` and
            scores on ``ValidDataSet``.
        cv_num: number of cross-validation repetitions.
        skf_split: number of folds per repetition.
        bool_gpu: when true, ``'gpu_id'`` is set to worker index modulo
            ``n_gpus``.
        n_gpus: number of GPUs to spread workers over.
        n_parallels: kept for interface compatibility; no longer consulted
            when registering workers.
        bool_weight, bool_strclass, labelHeaderName, bool_save, savedir:
            forwarded to the train/test functions.

    Returns:
        The score returned by ``xgbf.Evaluate``.

    Raises:
        ValueError: if ``multiclassmethod`` is not a supported method.
    """
    print("Trying to evalute feature: %s"%evaluate_feature)
    params_parallel = copy.deepcopy(params)

    # Register each worker exactly once. Appending on every call until the
    # list reached n_parallels could fill it with duplicates of one PID and
    # leave a later worker unregistered, making .index() raise.
    process_pid = os.getpid()
    if process_pid not in CPIDs:
        CPIDs.append(process_pid)
    process_pid_index = CPIDs.index(process_pid)
    print("Worker #%d: PID = %d"%(process_pid_index,process_pid))
    if bool_gpu:
        params_parallel['gpu_id'] = process_pid_index % n_gpus

    if bool_cv == 1:
        [Y, X] = xgbf.trainingDataSet(TrainDataSet, VegeTypes, feature_names,
                                      bool_strclass=bool_strclass, labelHeaderName=labelHeaderName)
        if not bool_strclass:
            class_labels = init.mergeCategories(Y)
        else:
            class_labels = Y

        # Every repetition tests each sample once, hence len * cv_num slots.
        total_predictions = len(class_labels) * cv_num
        pred_Y_cv = np.zeros(total_predictions, dtype=np.int32)
        # NOTE(review): per-fold probabilities are never copied into this
        # array, so xgbf.Evaluate receives all zeros here — confirm that
        # evalue_method does not rely on probabilities in CV mode.
        pred_pY_cv = np.zeros(total_predictions)
        test_Y_cv = np.zeros(total_predictions, dtype=np.int32)

        last_cv_idx = 0
        for _ in range(cv_num):
            skf = StratifiedKFold(n_splits=skf_split, shuffle=True)
            for train, test in skf.split(X, class_labels):
                [pred_Y, pred_pY, test_Y] = _trainAndTestMulticlass(
                    multiclassmethod, [Y[train], X[train]], [Y[test], X[test]], VegeTypes, feature_names,
                    params_parallel, bool_weight, False, bool_strclass, labelHeaderName, bool_save, savedir)
                current_cv_idx = last_cv_idx + len(test_Y)
                pred_Y_cv[last_cv_idx:current_cv_idx] = pred_Y
                test_Y_cv[last_cv_idx:current_cv_idx] = test_Y
                last_cv_idx = current_cv_idx
        evalue = xgbf.Evaluate(test_Y_cv, pred_Y_cv, pred_pY_cv, evalue_method)
    else:
        [pred_Y, pred_pY, test_Y] = _trainAndTestMulticlass(
            multiclassmethod, TrainDataSet, ValidDataSet, VegeTypes, feature_names,
            params_parallel, bool_weight, True, bool_strclass, labelHeaderName, bool_save, savedir)
        evalue = xgbf.Evaluate(test_Y, pred_Y, pred_pY, evalue_method)

    print("Feature: %s partial evalue = %f\n"%(evaluate_feature,evalue))
    return evalue


def _trainAndTestMulticlass(multiclassmethod, TrainData, TestData, VegeTypes, feature_names, params, bool_weight,
                            bool_pandas, bool_strclass, labelHeaderName, bool_save, savedir):
    """Train with the selected multiclass method, then return its ``[pred_Y, pred_pY, test_Y]`` on ``TestData``."""
    if multiclassmethod == 'softmax':
        train_model = mtc.trainMulticlassSoftmaxModel
        test_model = mtc.testMulticlassSoftmaxModel
    elif multiclassmethod == 'category':
        train_model = mtc.trainMulticlassCategoryModel
        test_model = mtc.testMulticlassCategoryModel
    else:
        # Fail loudly instead of printing and continuing with unbound results.
        raise ValueError("Invalid Multiclass Method Input!")

    ModelList = train_model(TrainData, VegeTypes, feature_names, params,
                            bool_weight=bool_weight, bool_pandas=bool_pandas,
                            bool_strclass=bool_strclass, labelHeaderName=labelHeaderName,
                            bool_save=bool_save, savedir=savedir)
    return test_model(ModelList, TestData, VegeTypes, feature_names, params,
                      bool_pandas=bool_pandas,
                      bool_strclass=bool_strclass, labelHeaderName=labelHeaderName,
                      bool_save=bool_save, savedir=savedir)