def train_and_save_final_model(X, y, X_train, y_train, params, save_model_file_path, test_data):
    gnbc = GaussianNB()
    gnbc.set_params(**params)
    if test_data is None:
        # Without separate test data, fit on the training split only.
        gnbc.fit(X_train, y_train)
    else:
        # Test data is supplied separately, so fit on the full dataset.
        gnbc.fit(X, y)
    # Save model
    model_file_path = save_model_file_path + 'gnbc.sav'
    with open(model_file_path, 'wb') as f:
        pickle.dump(gnbc, f)
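# A minimal companion sketch (not part of the original code): loading the model
# saved above and predicting with it. 'save_model_file_path' is assumed to be
# the same directory prefix used by train_and_save_final_model.
def load_final_model(save_model_file_path):
    model_file_path = save_model_file_path + 'gnbc.sav'
    with open(model_file_path, 'rb') as f:
        return pickle.load(f)

# Usage: gnbc = load_final_model(save_model_file_path); gnbc.predict(X_new)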
class ModelGaussNB(Model, BaseEstimator, ClassifierMixin):

    def __init__(self, run_fold_name, priors=None, var_smoothing=1e-09):
        params = {'priors': priors, 'var_smoothing': var_smoothing}
        super().__init__(run_fold_name, params)
        self.model = GaussianNB(**self.params)

    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        self.model = self.model.fit(tr_x, tr_y)

    def fit(self, tr_x, tr_y):
        # scikit-learn compatible alias for train().
        self.train(tr_x, tr_y)
        return self

    def predict(self, te_x):
        return self.model.predict(te_x)

    def score(self, te_x, te_y):
        # Samples-averaged F1 over one-hot encoded labels (assumes 5 classes).
        y_pred = self.predict(te_x)
        return f1_score(np.identity(5)[te_y], np.identity(5)[y_pred], average='samples')

    def get_params(self, deep=True):
        dic = self.model.get_params(deep=deep)
        dic["run_fold_name"] = self.run_fold_name
        return dic

    def set_params(self, **parameters):
        if "run_fold_name" in parameters:
            self.run_fold_name = parameters["run_fold_name"]
            parameters.pop("run_fold_name", None)
        self.params.update(parameters)
        self.model.set_params(**self.params)
        return self

    def save_model(self, feature):
        model_path = os.path.join(f'../model/model/{feature}', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Util.dump(self.model, model_path)

    def load_model(self, feature):
        model_path = os.path.join(f'../model/model/{feature}', f'{self.run_fold_name}.model')
        self.model = Util.load(model_path)
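# A minimal usage sketch for ModelGaussNB (synthetic data; the run-fold name
# 'gnb-fold0' is made up for illustration). Labels must lie in 0..4 because
# score() one-hot encodes them through np.identity(5).
rng = np.random.default_rng(0)
tr_x = rng.normal(size=(40, 3))
tr_y = rng.integers(0, 2, size=40)
m = ModelGaussNB('gnb-fold0', var_smoothing=1e-8)
m.fit(tr_x, tr_y)
print(m.predict(tr_x[:5]))
print(m.score(tr_x, tr_y))  # samples-averaged F1 over one-hot labels
# Because the class implements fit/get_params/set_params, it should also plug
# into sklearn utilities such as cross_val_score and GridSearchCV.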
def demoOne():
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([1, 1, 1, 2, 2, 2])
    clf = GaussianNB(priors=None)
    clf.fit(X, y)
    print(clf.predict([[-0.8, -1]]))
    print('predict_prob: ', clf.predict_proba([[-0.8, -1]]))
    print('predict_log_prob: ', clf.predict_log_proba([[-0.8, -1]]))
    # Scoring against the model's own prediction is trivially 1.0.
    print(clf.score([[-0.8, -1]], clf.predict([[-0.8, -1]])))
    # classes must be given on the first partial_fit call.
    print(clf.partial_fit(X, y, classes=np.unique(y)))
    print(clf.set_params())
    return X, y
# Fit the scaler on the training data only, then apply it to both sets.
scaler = StandardScaler().fit(Xtrain)
Xtrain1 = scaler.transform(Xtrain)
Xtest1 = scaler.transform(Xtest)

k_range = range(1, 10)
param_grid = dict(n_neighbors=k_range)
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(Xtrain1, Ytrain)
knn = KNeighborsClassifier(n_neighbors=grid.best_params_["n_neighbors"])
knn.fit(Xtrain1, Ytrain)
ypred_knn = knn.predict(Xtest1)
taux_knn_opt = prediction(Ytest, ypred_knn)

# Gaussian classifier: sweep the prior of the first class.
taux_gaussienne = []
priors = np.linspace(0.001, 1, 1000, False)
for i in range(len(priors)):
    clf_gaussian.set_params(priors=[priors[i], 1 - priors[i]])
    clf_gaussian.fit(Xtrain, Ytrain)
    pred = clf_gaussian.predict(Xtest)
    taux_gaussienne.append(prediction(Ytest, pred))  # accumulate rather than overwrite
taux_opt_gauss = priors[np.argmin(taux_gaussienne)]

# Cross-validation
score_gaussian = cross_val_score(clf_gaussian, Xtrain, Ytrain, cv=5)
score_Nearest = cross_val_score(clf_Nearest, Xtrain, Ytrain, cv=5)
score_kneighbors = cross_val_score(knn, Xtrain1, Ytrain, cv=5)
score_lmlr = cross_val_score(clf_lmlr, Xtrain1, Ytrain, cv=5)

# Display results
plt.subplot(2, 2, 1)
plt.title("Gaussian")
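# A hedged alternative sketch: wrapping the scaler and KNN in a Pipeline lets
# GridSearchCV refit the scaler on each training fold, so no scaling leaks
# across the cross-validation splits (raw, unscaled training data goes in).
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])
pipe_grid = GridSearchCV(pipe, {'knn__n_neighbors': range(1, 10)}, cv=5)
pipe_grid.fit(Xtrain, Ytrain)
print(pipe_grid.best_params_)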
class GaussianNB(Classifier):
    r"""Implementation of Gaussian Naive Bayes classifier.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Reference:
        Murphy, Kevin P. "Naive bayes classifiers." University of British Columbia 18 (2006): 60.

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'Gaussian Naive Bayes'

    def __init__(self, **kwargs):
        r"""Initialize GaussianNB instance."""
        warnings.filterwarnings(action='ignore', category=ChangedBehaviorWarning)
        warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
        warnings.filterwarnings(action='ignore', category=DataConversionWarning)
        warnings.filterwarnings(action='ignore', category=DataDimensionalityWarning)
        warnings.filterwarnings(action='ignore', category=EfficiencyWarning)
        warnings.filterwarnings(action='ignore', category=FitFailedWarning)
        warnings.filterwarnings(action='ignore', category=NonBLASDotWarning)
        warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)
        self.__gaussian_nb = GNB()
        super(GaussianNB, self).__init__()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm."""
        self.__gaussian_nb.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Fit GaussianNB.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.
            y (pandas.core.series.Series): n classes of the samples in the x array.

        Returns:
            None
        """
        self.__gaussian_nb.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Predict class for each sample (row) in x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        return self.__gaussian_nb.predict(x)

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return Classifier.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__gaussian_nb.get_params()))
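# A minimal usage sketch for the wrapper above (pandas objects assumed, since
# fit/predict document DataFrame/Series inputs; data is made up):
import pandas as pd
x = pd.DataFrame({'f1': [-1, -2, 1, 2], 'f2': [-1, -2, 1, 2]})
y = pd.Series([0, 0, 1, 1])
clf = GaussianNB()
clf.set_parameters(var_smoothing=1e-8)  # forwarded to the underlying sklearn GNB
clf.fit(x, y)
print(clf.predict(x))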
# scikit-learn provides three naive Bayes classifiers:
# GaussianNB (Gaussian naive Bayes),
# MultinomialNB (multinomial naive Bayes),
# BernoulliNB (Bernoulli naive Bayes).
# 1. Gaussian naive Bayes: sklearn.naive_bayes.GaussianNB(priors=None)
X = np.array([[-1, -1], [-2, -2], [-3, -3], [-4, -4], [-5, -5], [1, 1], [2, 2], [3, 3]])
y = np.array([1, 1, 1, 1, 1, 2, 2, 2])
clf = GaussianNB()  # default priors=None
temp = clf.fit(X, y)
print('clf.fit(X,y): ', temp)
print('Prior probability of each class, clf.priors: ', clf.priors)
clf.set_params(priors=[0.625, 0.375])  # set the estimator's priors parameter
print('Prior probability of each class, clf.priors: ', clf.priors)
print('clf.class_prior_: ', clf.class_prior_)
print('Number of training samples per class, clf.class_count_: ', clf.class_count_)
print('Per-class feature means, clf.theta_: ', clf.theta_)
print('Per-class feature variances, clf.sigma_: ', clf.sigma_)
# Note: sigma_ was renamed var_ in scikit-learn 1.0 and removed in 1.2.

# fit(X, y, sample_weight=None): train on the samples;
# X is the feature matrix, y the class labels,
# sample_weight an array of per-sample weights.
# Assign different weights to the samples:
temp = clf.fit(X, y, np.array([0.05, 0.05, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2]))
print(temp)
print('Per-class feature means, clf.theta_:', clf.theta_)
print('Per-class feature variances, clf.sigma_: ', clf.sigma_)
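# For completeness, a minimal sketch of the other two variants named in the
# comment above. MultinomialNB expects nonnegative (count-like) features and
# BernoulliNB binarizes them, so a small count matrix is used instead of the
# signed X from the Gaussian example (data made up for illustration).
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
Xc = np.array([[2, 0], [3, 1], [0, 2], [1, 3]])
yc = np.array([0, 0, 1, 1])
print(MultinomialNB().fit(Xc, yc).predict([[2, 1]]))
print(BernoulliNB(binarize=1.0).fit(Xc, yc).predict([[2, 1]]))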
def get_estimator(self):
    estimator = self.kwargs.get("estimator", self.ESTIMATOR)
    # self.mlflow_log_param("model", estimator)
    # Both regressors (for predicting scores) and classifiers (for match
    # outcomes) were added; the regressors are kept here as comments.
    # elif estimator == 'Linear':
    #     model = LinearRegression()
    # elif estimator == 'RandomForestRegressor':
    #     model = RandomForestRegressor()
    # elif estimator == 'Lasso':
    #     model = Lasso()
    # elif estimator == "Ridge":
    #     model = Ridge()
    # elif estimator == "GBM":
    #     model = GradientBoostingRegressor()
    # elif estimator == "KNNRegressor":
    #     model = KNeighborsRegressor()
    if estimator == 'GaussianNB':  # no probability parameter needed
        model = GaussianNB()
    # elif estimator == 'LDA':
    #     self.model_params = {'solver': ['lsqr', 'eigen'],  # svd does not run with shrinkage; models using it are tuned separately
    #                          'n_components': [1.0, 2.0, 3.0, 4.0, 5.0]}
    #     model = LinearDiscriminantAnalysis()
    # elif estimator == "xgboost":
    #     model = XGBRegressor()
    # Classification models: a single if/elif chain, otherwise the trailing
    # else would overwrite an already selected model (e.g. GaussianNB).
    elif estimator == 'Logistic':  # no probability parameter needed
        self.model_params = {'C': np.arange(0.001, 1000)}
        # model = LogisticRegression(C=20.000999999999998)
        model = LogisticRegression()
    # elif estimator == 'LDA':
    #     model = LinearDiscriminantAnalysis()
    elif estimator == 'RandomForestClassifier':  # no probability parameter needed
        self.model_params = {
            'bootstrap': [True, False],
            'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
            'max_features': ['auto', 'sqrt'],
            'min_samples_leaf': [1, 2, 4],
            'min_samples_split': [2, 5, 10],
            'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
        }
        # model = RandomForestClassifier(n_estimators=1800, n_jobs=-1, max_depth=100,
        #                                min_samples_split=5, bootstrap=False)
        model = RandomForestClassifier()
    elif estimator == "RidgeClassifier":  # no predict_proba
        self.model_params = {"alpha": np.arange(0.001, 1000)}
        model = RidgeClassifier(alpha=106.00099999999999)
        # model = RidgeClassifier()
        # model = GridSearchCV(estimator=grid, param_grid=dict(alpha=alphas))
    elif estimator == "KNNClassifier":  # no probability parameter needed
        self.model_params = {
            "leaf_size": range(1, 1000),
            "n_neighbors": range(1, 1000),
            "p": [1, 2]
        }
        # model = KNeighborsClassifier(leaf_size=336, n_neighbors=913, p=2)  # positive results
        model = KNeighborsClassifier()
        # model = GridSearchCV(knn, hyperparameters, cv=10)
    elif estimator == "XGBClassifier":
        # predict_proba returns the probability of each sample belonging to each class.
        self.model_params = {
            'max_depth': range(2, 20, 2),
            'n_estimators': range(60, 220, 40),
            'learning_rate': [0.3, 0.1, 0.01, 0.05],
            'min_child_weight': [1.0, 3.0, 5.0],
            'gamma': [1.0, 3.0, 5.0]
        }
        # model = XGBClassifier(max_depth=14, n_estimators=60, learning_rate=0.1,
        #                       min_child_weight=1.0, gamma=5.0)  # positive results
        # model = XGBClassifier(max_depth=18, n_estimators=60, learning_rate=0.05,
        #                       min_child_weight=5, gamma=3.0)  # positive results
        model = XGBClassifier()
        # model = GridSearchCV(XGB, param_grid=params_1, cv=5)
    elif estimator == "Dummy":
        model = DummyClassifier(strategy='uniform', random_state=15)
    elif estimator == "SVC":
        self.model_params = {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [0.01, 0.001],
            'kernel': ['rbf', 'poly', 'sigmoid']
        }
        # model = SVC(kernel='sigmoid', C=80, gamma=0.001, probability=True)
        model = SVC(probability=True)
    elif estimator == "Sequential":
        model = Sequential()
        model.add(Flatten())
        model.add(BatchNormalization())
        model.add(Dense(32, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(8, kernel_regularizer=regularizers.l2(0.003),
                        activation='relu', input_shape=(10000,)))
        model.add(Dense(8, kernel_regularizer=regularizers.l2(0.003),
                        activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        # model.add(SimpleRNN(1, input_shape=[None, 1], activation='tanh'))
        model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    else:
        self.model_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
        model = LogisticRegression()

    estimator_params = self.kwargs.get("estimator_params", {})
    if estimator != "Sequential":
        model.set_params(**estimator_params)
    return model
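# How the final set_params call above behaves in isolation (hypothetical
# parameter values; any sklearn estimator from the chain works the same way):
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.set_params(**{'n_estimators': 400, 'max_depth': 20, 'n_jobs': -1})
print(model.get_params()['n_estimators'])  # 400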
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -2], [-3, -3], [-4, -4], [1, 1], [2, 2], [3, 3]])
y = np.array([1, 1, 1, 1, 2, 2, 2])
clf = GaussianNB()
re = clf.fit(X, y)
# print(re)  # GaussianNB(priors=None, var_smoothing=1e-09)
re1 = clf.priors
# print(re1)  # None

# Set the priors parameter.
re2 = clf.set_params(priors=[0.625, 0.375])
# print(re2)  # GaussianNB(priors=[0.625, 0.375], var_smoothing=1e-09)

# priors returns the list of per-class prior probabilities as passed in.
re3 = clf.priors
# print(re3)  # [0.625, 0.375]
re4 = clf.class_prior_
# print(re4)  # [0.57142857 0.42857143]
re5 = type(clf.class_prior_)
# print(re5)  # <class 'numpy.ndarray'>
re6 = clf.class_count_
    main_algorithm = RandomForestClassifier()
elif algorithm == "gaussian_process":
    main_algorithm = GaussianProcessClassifier()
elif algorithm == "stochastic_gradient_descent":
    main_algorithm = SGDClassifier()
elif algorithm == "multi_layer_perceptron":
    main_algorithm = MLPClassifier()
else:
    print("Unknown algorithm", algorithm)
    exit(1)

# Load the best known parameter set (if any).
name = step_name(main_algorithm)
best_params = custom_param_grids.get_best_parameter_set(name, do_prefix=False)
if best_params:
    main_algorithm.set_params(**best_params)

# Use StratifiedShuffleSplit instead of the default StratifiedKFold for cross-validation.
# See notes.md for a summary of the scikit-learn.org article on cross-validation.
# The rationale of stratification is that the relative frequencies of class labels
# (POI or not POI) should be the same in the training and test sets as in the whole data.
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=SEED)

# Optimizing for the f1 score brings a good mix of precision and recall.
scoring_function = "f1"
# scoring_function = "recall"

# Set up feature selection.
from sklearn.feature_selection import SelectKBest, SelectPercentile, RFECV, SelectFromModel
feature_selector = args.feature_selection
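# A hedged sketch of plugging the splitter and scoring choice above into an
# evaluation; X_demo/y_demo are synthetic stand-ins for this project's actual
# feature matrix and labels.
import numpy as np
from sklearn.model_selection import cross_val_score
X_demo = np.random.RandomState(SEED).normal(size=(60, 4))
y_demo = (X_demo[:, 0] > 0).astype(int)
scores = cross_val_score(main_algorithm, X_demo, y_demo, cv=sss, scoring=scoring_function)
print(scores.mean())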
# Core code. Note that fit also takes a third argument, fit(X, y, sample_weight=None),
# where sample_weight is an array of per-sample weights. For example, to train on
# 8 samples one could write:
# clf.fit(iris.data[:8], iris.target[:8],
#         sample_weight=np.array([0.05, 0.05, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2]))
clf = GaussianNB()
clf.fit(iris.data, iris.target)
'''
An important feature of GaussianNB is its partial_fit method, generally used
when the training set is too large to load into memory at once. The training
set can be split into chunks and partial_fit called repeatedly to learn the
data step by step, which is very convenient.
On the first call to partial_fit the classes argument must be given; later
calls may omit it.
clf.partial_fit(iris.data, iris.target, classes=[0, 1, 2])
'''
# Some attributes of the fitted model:
clf.set_params(priors=[0.333, 0.333, 0.333])
# The per-class priors must be set here; otherwise clf.priors returns None,
# because priors reflects the constructor argument, not the fitted class_prior_.
print(clf.priors)        # prior probability of each class, as passed in
print(clf.class_prior_)  # also per-class priors; priors is a list, class_prior_ an array
print(clf.get_params(deep=True))  # dict of priors and the other parameters
print(clf.class_count_)  # number of training samples per class
print(clf.theta_)        # per-class feature means
print(clf.sigma_)        # per-class feature variances

# Test data
data_test = np.array([6, 4, 6, 2])
data = data_test.reshape(1, -1)
Result_predict = clf.predict(data)
Score = clf.score([[6, 8, 5, 3], [5, 3, 4, 2], [4, 6, 7, 2]], [2, 0, 1])
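# A minimal sketch of the chunked partial_fit pattern described in the block
# comment above, reusing the iris arrays already in scope (the batch size of
# 50 is chosen arbitrarily for illustration):
clf_inc = GaussianNB()
classes = np.unique(iris.target)
for start in range(0, len(iris.data), 50):
    clf_inc.partial_fit(iris.data[start:start + 50],
                        iris.target[start:start + 50],
                        classes=classes)  # required on the first call; harmless later
print(clf_inc.score(iris.data, iris.target))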
elif np.cumprod(resultmale)[2] < np.cumprod(resultfemale)[2]:
    print('female')
else:
    print('undecided')

# Compute with scikit-learn's naive Bayes algorithms.
# 1. Gaussian naive Bayes
from sklearn.naive_bayes import GaussianNB
X = train.iloc[:, 1:]
y = train.iloc[:, 0]
clf = GaussianNB()
clf.fit(X, y)
clf.set_params(priors=[0.5, 0.5])  # note: only takes effect on the next fit call
print(clf.predict([[6, 130, 8]]))

# 2. Multinomial naive Bayes
from sklearn.naive_bayes import MultinomialNB
# X = train.iloc[:, 1:]
# y = train.iloc[:, 0]
clf2 = MultinomialNB(alpha=2.0, class_prior=None, fit_prior=False)
clf2.fit(X, y)
print(clf2.predict([[6, 130, 8]]))

# 3. Bernoulli naive Bayes: whether on test or train data, every prediction
# comes out 0. Does that mean this model is unsuitable for this data?
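# A likely answer to the question above: BernoulliNB binarizes features at
# binarize=0.0 by default, so every positive-valued feature maps to 1 and all
# rows look identical after binarization. A threshold inside the feature range
# separates them (the value below is only illustrative and should be tuned):
from sklearn.naive_bayes import BernoulliNB
clf3 = BernoulliNB(alpha=2.0, binarize=100.0)  # hypothetical threshold
clf3.fit(X, y)
print(clf3.predict([[6, 130, 8]]))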
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -2], [-3, -3], [-4, -4], [-5, -5], [1, 1], [2, 2], [3, 3]])
y = np.array([1, 1, 1, 1, 1, 2, 2, 2])
clf = GaussianNB(priors=[0.625, 0.375])  # default is priors=None
clf.fit(X, y, sample_weight=None)  # X: feature vectors, y: class labels, sample_weight: per-sample weights
print(clf.class_prior_)  # class_prior_: fitted prior probability of each class
print(clf.priors)        # priors: the priors passed to the constructor
print(clf.class_count_)  # class_count_: number of training samples per class
print(clf.theta_)        # theta_: per-class feature means
print(clf.sigma_)        # sigma_: per-class feature variances
print(clf.get_params(deep=True))  # get_params(deep=True): dict of priors and the other parameters
clf.set_params(priors=[0.6, 0.4])  # set_params(**params): set the estimator's priors parameter
print(clf.get_params(deep=True))
print(clf.predict([[-6, -6], [4, 5]]))  # predict class labels for the samples
print(clf.predict_proba([[-6, -6], [4, 5]]))  # predict_proba(X): per-class probability for each test sample
print(clf.predict_log_proba([[-6, -6], [4, 5]]))  # predict_log_proba(X): log of the per-class probabilities
print(clf.score([[-6, -6], [-4, -2], [-3, -4], [4, 5]], [1, 1, 2, 2]))  # score(X, y, sample_weight=None): mean accuracy on the given labels

# Output:
# [0.625 0.375]
# [0.625, 0.375]
# [5. 3.]
# [[-3. -3.]
#  [ 2. 2.]]