Example #1
import pickle

from sklearn.naive_bayes import GaussianNB


def train_and_save_final_model(X, y, X_train, y_train, params, save_model_file_path, test_data):
    gnbc = GaussianNB()
    gnbc.set_params(**params)

    if test_data is None:
        # no separate test data: fit on the training split only
        gnbc.fit(X_train, y_train)
    else:
        # a separate test set exists, so the final model can be fit on all of X, y
        gnbc.fit(X, y)

    # save the model
    model_file_path = save_model_file_path + 'gnbc.sav'
    with open(model_file_path, 'wb') as f:
        pickle.dump(gnbc, f)
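
# A minimal sketch of loading the saved model back for inference; it assumes the same
# save_model_file_path prefix used above, and `samples` is a placeholder feature array.
def load_final_model(save_model_file_path):
    with open(save_model_file_path + 'gnbc.sav', 'rb') as f:
        return pickle.load(f)

# loaded = load_final_model('models/')
# predictions = loaded.predict(samples)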
Example #2
import os

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB


# Model and Util are project-local helpers (a training-run base class and a dump/load utility)
class ModelGaussNB(Model, BaseEstimator, ClassifierMixin):
    def __init__(self, run_fold_name, priors=None, var_smoothing=1e-09):
        params = {'priors': priors, 'var_smoothing': var_smoothing}
        super().__init__(run_fold_name, params)
        self.model = GaussianNB(**self.params)

    def train(self, tr_x, tr_y, va_x=None, va_y=None):
        self.model = self.model.fit(tr_x, tr_y)

    def fit(self, tr_x, tr_y):
        self.train(tr_x, tr_y)
        return self

    def predict(self, te_x):
        return self.model.predict(te_x)

    def score(self, te_x, te_y):
        # one-hot encode true and predicted labels (assumes 5 classes, 0..4)
        # and return the samples-averaged F1 score
        y_pred = self.predict(te_x)
        return f1_score(np.identity(5)[te_y],
                        np.identity(5)[y_pred],
                        average='samples')

    def get_params(self, deep=True):
        dic = self.model.get_params(deep=deep)
        dic["run_fold_name"] = self.run_fold_name
        return dic

    def set_params(self, **parameters):
        if "run_fold_name" in parameters:
            self.run_fold_name = parameters["run_fold_name"]
            parameters.pop("run_fold_name", None)
        self.params.update(parameters)
        self.model.set_params(**self.params)
        return self

    def save_model(self, feature):
        model_path = os.path.join(f'../model/model/{feature}',
                                  f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Util.dump(self.model, model_path)

    def load_model(self, feature):
        model_path = os.path.join(f'../model/model/{feature}',
                                  f'{self.run_fold_name}.model')
        self.model = Util.load(model_path)
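
# Hypothetical usage of ModelGaussNB, assuming the project's Model base class stores
# run_fold_name and params as attributes (inferred from the super().__init__ call above).
# Synthetic data with integer labels 0..4, which the one-hot score method expects.
rng = np.random.RandomState(0)
tr_x = rng.randn(100, 4)
tr_y = rng.randint(0, 5, size=100)

m = ModelGaussNB('gnb-fold0', var_smoothing=1e-08).fit(tr_x, tr_y)
print(m.score(tr_x, tr_y))  # samples-averaged F1 over one-hot labels
m.set_params(var_smoothing=1e-09, run_fold_name='gnb-fold1')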
Example #3
import numpy as np
from sklearn.naive_bayes import GaussianNB


def demoOne():
    X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    y = np.array([1, 1, 1, 2, 2, 2])

    clf = GaussianNB(priors=None)
    clf.fit(X, y)
    print(clf.predict([[-0.8, -1]]))
    print('predict_prob: ', clf.predict_proba([[-0.8, -1]]))
    print('predict_log_prob: ', clf.predict_log_proba([[-0.8, -1]]))
    # scoring the model against its own prediction is trivially 1.0; it just exercises the API
    print(clf.score([[-0.8, -1]], clf.predict([[-0.8, -1]])))
    # classes is required on the first call to partial_fit
    print(clf.partial_fit(X, y, classes=np.unique(y)))
    print(clf.set_params())
    return X, y
Example #4
scaler = StandardScaler().fit(Xtrain)  # fit the scaler on the training data only
Xtrain1 = scaler.transform(Xtrain)
Xtest1 = scaler.transform(Xtest)
k_range = range(1, 10)
param_grid = dict(n_neighbors=k_range)
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(Xtrain1, Ytrain)
knn = KNeighborsClassifier(n_neighbors=grid.best_params_["n_neighbors"])
knn.fit(Xtrain1, Ytrain)
ypred_knn = knn.predict(Xtest1)
taux_knn_opt = prediction(Ytest, ypred_knn)
# Gaussian: sweep the class prior and record the result for each value
taux_gaussienne = []
priors = np.linspace(0.001, 1, 1000, False)
for i in range(0, len(priors)):
    clf_gaussian.set_params(priors=[priors[i], 1 - priors[i]])
    clf_gaussian.fit(Xtrain, Ytrain)
    pred = clf_gaussian.predict(Xtest)
    taux_gaussienne.append(prediction(Ytest, pred))  # append rather than overwrite the list

taux_opt_gauss = priors[np.argmin(taux_gaussienne)]  # argmin assumes prediction() returns an error rate

# Cross-validation
score_gaussian = cross_val_score(clf_gaussian, Xtrain, Ytrain, cv=5)
score_Nearest = cross_val_score(clf_Nearest, Xtrain, Ytrain, cv=5)
score_kneighbors = cross_val_score(knn, Xtrain1, Ytrain, cv=5)
score_lmlr = cross_val_score(clf_lmlr, Xtrain1, Ytrain, cv=5)
# 5. Display the results
plt.subplot(2, 2, 1)
plt.title("Gaussian")
Example #5
class GaussianNB(Classifier):
    r"""Implementation of gaussian Naive Bayes classifier.
    
    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT
    
    Reference:
        Murphy, Kevin P. "Naive bayes classifiers." University of British Columbia 18 (2006): 60.
    
    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html

    See Also:
        * :class:`niaaml.classifiers.Classifier`
    """
    Name = 'Gaussian Naive Bayes'

    def __init__(self, **kwargs):
        r"""Initialize GaussianNB instance.
        """
        warnings.filterwarnings(action='ignore',
                                category=ChangedBehaviorWarning)
        warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
        warnings.filterwarnings(action='ignore',
                                category=DataConversionWarning)
        warnings.filterwarnings(action='ignore',
                                category=DataDimensionalityWarning)
        warnings.filterwarnings(action='ignore', category=EfficiencyWarning)
        warnings.filterwarnings(action='ignore', category=FitFailedWarning)
        warnings.filterwarnings(action='ignore', category=NonBLASDotWarning)
        warnings.filterwarnings(action='ignore',
                                category=UndefinedMetricWarning)

        self.__gaussian_nb = GNB()
        super(GaussianNB, self).__init__()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm.
        """
        self.__gaussian_nb.set_params(**kwargs)

    def fit(self, x, y, **kwargs):
        r"""Fit GaussianNB.

        Arguments:
            x (pandas.core.frame.DataFrame): n training samples.
            y (pandas.core.series.Series): n classes of the samples in the x array.

        Returns:
            None
        """
        self.__gaussian_nb.fit(x, y)

    def predict(self, x, **kwargs):
        r"""Predict class for each sample (row) in x.

        Arguments:
            x (pandas.core.frame.DataFrame): n samples to classify.

        Returns:
            pandas.core.series.Series: n predicted classes.
        """
        return self.__gaussian_nb.predict(x)

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return Classifier.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(self.__gaussian_nb.get_params()))
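
# A short, hypothetical usage sketch of the wrapper above. The DataFrame/Series types
# follow the fit/predict docstrings; set_parameters forwards its keyword arguments to
# the underlying sklearn estimator.
import pandas as pd

features = pd.DataFrame({'f1': [-1, -2, -3, 1, 2, 3], 'f2': [-1, -1, -2, 1, 1, 2]})
labels = pd.Series([0, 0, 0, 1, 1, 1])

nb = GaussianNB()
nb.set_parameters(var_smoothing=1e-08)
nb.fit(features, labels)
print(nb.predict(features))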
Example #6

# scikit-learn provides three Naive Bayes classification algorithms:
# GaussianNB (Gaussian Naive Bayes),
# MultinomialNB (Multinomial Naive Bayes),
# BernoulliNB (Bernoulli Naive Bayes)

# 1. Gaussian Naive Bayes: sklearn.naive_bayes.GaussianNB(priors=None)
X = np.array([[-1, -1], [-2, -2], [-3, -3], [-4, -4], [-5, -5],
              [1, 1], [2, 2], [3, 3]])
y = np.array([1, 1, 1, 1, 1, 2, 2, 2])
clf = GaussianNB()  # priors=None by default
temp = clf.fit(X, y)
print('clf.fit(X,y): ', temp)
print('per-class prior probabilities, clf.priors: ', clf.priors)
clf.set_params(priors=[0.625, 0.375])    # set the estimator's priors parameter
print('per-class prior probabilities, clf.priors: ', clf.priors)
# note: class_prior_ holds the values estimated at fit time; set_params only affects the next fit
print('clf.class_prior_: ', clf.class_prior_)
print('training sample count per class, clf.class_count_: ', clf.class_count_)
print('per-class feature means, clf.theta_: ', clf.theta_)
print('per-class feature variances, clf.sigma_: ', clf.sigma_)

# fit(X, y, sample_weight=None): train on the samples;
# X is the feature matrix, y the class labels,
# sample_weight an array of per-sample weights

# give the samples different weights
temp = clf.fit(X, y, np.array([0.05, 0.05, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2]))
print(temp)
print('per-class feature means, clf.theta_:', clf.theta_)
print('per-class feature variances, clf.sigma_: ', clf.sigma_)
Example #7
    def get_estimator(self):
        estimator = self.kwargs.get("estimator", self.ESTIMATOR)
        # self.mlflow_log_param("model", estimator)
        # added both regressions for predicting scores and classifier for match outcomes
        # elif estimator == 'Linear':
        #     model = LinearRegression()
        # elif estimator == 'RandomForestRegressor':
        #     model = RandomForestRegressor()
        # elif estimator == 'Lasso':
        #     model = Lasso()
        # elif estimator == "Ridge":
        #     model = Ridge()
        # elif estimator == "GBM":
        #     model = GradientBoostingRegressor()
        # elif estimator == "KNNRegressor":
        #     model = KNeighborsRegressor()
        if estimator == 'GaussianNB':  # No proba parameter needed
            model = GaussianNB()
        # elif estimator == 'LDA':
        #     self.model_params = {'solver': ['lsqr','eigen'],  #note svd does not run with shrinkage and models using it will be tuned separately
        #                           'n_components': [1.0,2.0,3.0,4.0,5.0]}
        #     model = LinearDiscriminantAnalysis()
        # elif estimator == "xgboost":
        #     model = XGBRegressor()
        # classification models (elif, so the trailing else cannot overwrite the model chosen above)
        elif estimator == 'Logistic':  # No proba parameter needed
            self.model_params = {'C': np.arange(0.001, 1000)}
            #model = LogisticRegression(C=20.000999999999998)
            model = LogisticRegression()
        # elif estimator == 'LDA':
        #     model = LinearDiscriminantAnalysis()
        elif estimator == 'RandomForestClassifier':  # No proba parameter needed
            self.model_params = {
                'bootstrap': [True, False],
                'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                'max_features': ['auto', 'sqrt'],
                'min_samples_leaf': [1, 2, 4],
                'min_samples_split': [2, 5, 10],
                'n_estimators':
                [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
            }
            #model = RandomForestClassifier(n_estimators=1800, n_jobs=-1,max_depth=100,min_samples_split=5,bootstrap=False)
            model = RandomForestClassifier()
        elif estimator == "RidgeClassifier":  # No predict_proba
            self.model_params = {"alpha": np.arange(0.001, 1000)}
            model = RidgeClassifier(alpha=106.00099999999999)
            # model = RidgeClassifier()
            # model = GridSearchCV(estimator=grid, param_grid=dict(alpha=alphas))
        elif estimator == "KNNClassifier":  # No Proba parameter needed
            self.model_params = {
                "leaf_size": range(1, 1000),
                "n_neighbors": range(1, 1000),
                "p": [1.0, 2.0]
            }
            #model = KNeighborsClassifier(leaf_size=336,n_neighbors=913,p=2.0) #positive results
            model = KNeighborsClassifier()
            # model = GridSearchCV(knn, hyperparameters, cv=10)
        elif estimator == "XGBClassifier":  # Proba: Returns array with the probability of each data example being of a given class.
            self.model_params = {
                'max_depth': range(2, 20, 2),
                'n_estimators': range(60, 220, 40),
                'learning_rate': [0.3, 0.1, 0.01, 0.05],
                'min_child_weight': [1.0, 3.0, 5.0],
                'gamma': [1.0, 3.0, 5.0]
            }
            #model = XGBClassifier(max_depth=14,n_estimators=60,learning_rate=0.1,min_child_weight=1.0,gamma=5.0) #positive results
            # model = XGBClassifier(max_depth=18,n_estimators=60,learning_rate=0.05,min_child_weight=5,gamma=3.0) #positive results
            model = XGBClassifier()
            # model = GridSearchCV(XGB, param_grid=params_1, cv=5)
        elif estimator == "Dummy":
            model = DummyClassifier(strategy='uniform', random_state=15)
        elif estimator == "SVC":
            self.model_params = {
                'C': [0.1, 1, 10, 100, 1000],
                'gamma': [0.01, 0.001],
                'kernel': ['rbf', 'poly', 'sigmoid']
            }
            # model = SVC(kernel='sigmoid', C=80,gamma=0.001,probability=True)
            model = SVC(probability=True)

        elif estimator == "Sequential":
            model = Sequential()
            model.add(Flatten())
            model.add(BatchNormalization())
            model.add(Dense(32, activation='relu'))
            model.add(Dense(32, activation='relu'))
            model.add(Dense(16, activation='relu'))
            model.add(
                Dense(8,
                      kernel_regularizer=regularizers.l2(0.003),
                      activation='relu',
                      input_shape=(10000, )))
            model.add(
                Dense(8,
                      kernel_regularizer=regularizers.l2(0.003),
                      activation='relu'))
            model.add(Dense(1, activation='sigmoid'))
            # model.add(SimpleRNN(1, input_shape=[None, 1], activation='tanh'))
            model.compile(loss='binary_crossentropy',
                          optimizer='Adam',
                          metrics=['accuracy'])

        else:
            self.model_params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
            model = LogisticRegression()

        estimator_params = self.kwargs.get("estimator_params", {})
        if estimator != "Sequential":
            model.set_params(**estimator_params)
        return model
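
        # Hypothetical follow-up (the class's training code is not shown here): the
        # self.model_params grids set above could feed a hyperparameter search, e.g.
        #   search = RandomizedSearchCV(model, self.model_params, n_iter=20, cv=5)
        #   search.fit(X_train, y_train)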
Example #8
import numpy as np
from sklearn.naive_bayes import GaussianNB
X = np.array([[-1, -1], [-2, -2], [-3, -3], [-4, -4], [1, 1], [2, 2], [3, 3]])
y = np.array([1, 1, 1, 1, 2, 2, 2])
clf = GaussianNB()
re = clf.fit(X, y)
# print(re)
# GaussianNB(priors=None, var_smoothing=1e-09)

re1 = clf.priors
# print(re1)  #None

# set the priors parameter
re2 = clf.set_params(priors=[0.625, 0.375])
# print(re2)
# GaussianNB(priors=[0.625, 0.375], var_smoothing=1e-09)

# returns the list of per-class prior probabilities that were set
re3 = clf.priors
# print(re3)
# [0.625, 0.375]

re4 = clf.class_prior_
# print(re4)
# [0.57142857 0.42857143]

re5 = type(clf.class_prior_)
# print(re5)
# <class 'numpy.ndarray'>

re6 = clf.class_count_

Example #9
if algorithm == "random_forest":
    main_algorithm = RandomForestClassifier()
elif algorithm == "gaussian_process":
    main_algorithm = GaussianProcessClassifier()
elif algorithm == "stochastic_gradient_descent":
    main_algorithm = SGDClassifier()
elif algorithm == "multi_layer_perceptron":
    main_algorithm = MLPClassifier()
else:
    print "Unknown algorithm", algorithm
    exit(1)

# Load the best known parameter set (if any).
name = step_name(main_algorithm)
best_params = custom_param_grids.get_best_parameter_set(name, do_prefix=False)
if best_params:
    main_algorithm.set_params(**best_params)

# Use StratifiedShuffleSplit instead of the default StratifiedKFold for cross-validation:
# see notes.md for a summary of the scikit-learn.org article on cross-validation.
# The rationale of stratification is that the relative frequencies of class labels
# (POI or not POI) should be the same in training and test set as in the whole data.
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=100, test_size=0.3, random_state=SEED)

# Optimizing for the f1 score gives a good mix of precision and recall.
scoring_function = "f1"
# scoring_function = "recall"

# Setup feature selection
from sklearn.feature_selection import SelectKBest, SelectPercentile, RFECV, SelectFromModel
feature_selector = args.feature_selection
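
# A sketch of how these pieces might plug together (hypothetical wiring; the SelectKBest
# grid values are illustrative, not from the original project):
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ("select", SelectKBest()),   # univariate selection, default score_func=f_classif
    ("clf", main_algorithm),     # the classifier chosen above
])
param_grid = {"select__k": [5, 10, "all"]}  # hypothetical values
grid = GridSearchCV(pipeline, param_grid, scoring=scoring_function, cv=sss)
# grid.fit(features, labels)  # features/labels come from the surrounding project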

Example #10
# Core code: fit actually takes a third argument, fit(X, y, sample_weight=None), where
# sample_weight is an array of per-sample weights. With 8 training samples one could write:
# clf.fit(iris.data[:8], iris.target[:8], sample_weight=np.array([0.05, 0.05, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2]))
clf = GaussianNB()
clf.fit(iris.data, iris.target)
'''
# An important feature of GaussianNB is the partial_fit method, typically used when the
# training set is too large to load into memory at once. Split the training set into
# several parts and call partial_fit repeatedly to learn it step by step.
# The classes argument must be given on the first call to partial_fit and may be omitted afterwards.
clf.partial_fit(iris.data, iris.target, classes=[0, 1, 2])
'''
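# A short sketch of that batched pattern on a fresh estimator (so the clf fitted above is
# left untouched); classes is required only on the first partial_fit call.
clf_stream = GaussianNB()
for start in range(0, len(iris.data), 50):
    batch_X = iris.data[start:start + 50]
    batch_y = iris.target[start:start + 50]
    if start == 0:
        clf_stream.partial_fit(batch_X, batch_y, classes=[0, 1, 2])
    else:
        clf_stream.partial_fit(batch_X, batch_y)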

# Some of the model's attributes after fitting
clf.set_params(priors=[0.333, 0.333, 0.333])
# The per-class priors have to be set here (or in the constructor) for clf.priors to return
# them; otherwise clf.priors is None, because priors simply echoes the constructor argument
# while the values estimated at fit time live in class_prior_.
print(clf.priors)  # the user-specified per-class priors
print(clf.class_prior_)  # also per-class priors; priors returns a list, class_prior_ an ndarray
print(clf.get_params(deep=True))  # dict of priors and the other parameter values

print(clf.class_count_)  # number of training samples per class
print(clf.theta_)  # per-class feature means
print(clf.sigma_)  # per-class feature variances

# Test data
data_test = np.array([6, 4, 6, 2])
data = data_test.reshape(1, -1)
Result_predict = clf.predict(data)
Score = clf.score([[6, 8, 5, 3], [5, 3, 4, 2], [4, 6, 7, 2]], [2, 0, 1])
Example #11
elif np.cumprod(resultmale)[2] < np.cumprod(resultfemale)[2]:
    print('female')
else:
    print('undetermined')

# Compute with sklearn's Naive Bayes implementations
# 1. Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

X = train.iloc[:, 1:]
y = train.iloc[:, 0]

clf = GaussianNB()
clf.fit(X, y)

clf.set_params(priors=[0.5, 0.5])  # only takes effect on the next fit; the fitted class_prior_ is unchanged until then

print(clf.predict([[6, 130, 8]]))

# 2. Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

#X=train.iloc[:,1:]
#y=train.iloc[:,0]

clf2 = MultinomialNB(alpha=2.0, class_prior=None, fit_prior=False)
clf2.fit(X, y)

print(clf2.predict([[6, 130, 8]]))

# 3. Bernoulli Naive Bayes: on both the test and the training data every prediction is 0;
# does that mean this model is unsuitable for this data??
Example #12
import numpy as np
from sklearn.naive_bayes import GaussianNB

X = np.array([[-1, -1], [-2, -2], [-3, -3], [-4, -4], [-5, -5], [1, 1], [2, 2],
              [3, 3]])
y = np.array([1, 1, 1, 1, 1, 2, 2, 2])
clf = GaussianNB(priors=[0.625, 0.375])  # default is priors=None
clf.fit(X, y, sample_weight=None)  # train: X is the feature matrix, y the class labels, sample_weight per-sample weights
print(clf.class_prior_)  # class_prior_: fitted per-class prior probabilities
print(clf.priors)  # priors: the user-specified priors (the same values here)
print(clf.class_count_)  # class_count_: number of training samples per class
print(clf.theta_)  # theta_: per-class feature means
print(clf.sigma_)  # sigma_: per-class feature variances
print(clf.get_params(deep=True))  # get_params(deep=True): dict of priors and the other parameters
clf.set_params(priors=[0.6, 0.4])  # set_params(**params): set the estimator's priors
print(clf.get_params(deep=True))
print(clf.predict([[-6, -6], [4, 5]]))  # predict the class of each sample
print(clf.predict_proba([[-6, -6], [4, 5]]))  # predict_proba(X): per-class probabilities for each test sample
print(clf.predict_log_proba([[-6, -6], [4, 5]]))  # predict_log_proba(X): log of those per-class probabilities
print(clf.score([[-6, -6], [-4, -2], [-3, -4], [4, 5]], [1, 1, 2, 2]))  # score(X, y, sample_weight=None): mean accuracy on the given test data

# output:
# [0.625 0.375]
# [0.625, 0.375]
# [5. 3.]
# [[-3. -3.]
#  [ 2.  2.]]