Esempio n. 1
0
def test_categorical_data_digits_all_negative():
    """MixedNB must raise ValueError when every feature is declared
    categorical but the digits data is not valid categorical input."""
    dataset = load_digits()
    features, targets = dataset['data'], dataset['target']

    # Sanity check: plain GaussianNB accepts the raw data as-is.
    GaussianNB().fit(features, targets)

    clf = MixedNB(categorical_features='all')
    with pytest.raises(ValueError):
        clf.fit(features, targets)
Esempio n. 2
0
def test_categorical_data_digits():
    """Smoke test: MixedNB with all features categorical fits and scores
    on a digits subset when max_categories (17 bins per pixel) is given."""
    dataset = load_digits()
    features, targets = dataset['data'], dataset['target']

    reference = GaussianNB()
    reference.fit(features, targets)
    reference.score(features, targets)

    clf = MixedNB(categorical_features='all',
                  max_categories=np.repeat(17, 64))
    subset_X, subset_y = features[:1440], targets[:1440]
    clf.fit(subset_X, subset_y)
    clf.score(subset_X, subset_y)
Esempio n. 3
0
def test_continuous_data_digits():
    """With no categorical features, MixedNB must score the same as
    sklearn's GaussianNB on the digits dataset."""
    dataset = load_digits()
    features, targets = dataset['data'], dataset['target']

    reference = GaussianNB()
    reference.fit(features, targets)

    clf = MixedNB()
    clf.fit(features, targets)

    assert np.isclose(reference.score(features, targets),
                      clf.score(features, targets))
Esempio n. 4
0
def test_continuous_data_wine():
    """With no categorical features, MixedNB must score the same as
    sklearn's GaussianNB on the wine dataset."""
    dataset = load_wine()
    features, targets = dataset['data'], dataset['target']

    reference = GaussianNB()
    reference.fit(features, targets)

    clf = MixedNB()
    clf.fit(features, targets)

    assert np.isclose(reference.score(features, targets),
                      clf.score(features, targets))
Esempio n. 5
0
def test_continuous_data_iris():
    """With no categorical features, MixedNB must predict exactly the
    same labels as sklearn's GaussianNB on the iris dataset."""
    dataset = load_iris()
    features, targets = dataset['data'], dataset['target']

    reference = GaussianNB()
    reference.fit(features, targets)

    clf = MixedNB()
    clf.fit(features, targets)

    assert (clf.predict(features) == reference.predict(features)).all()
Esempio n. 6
0
def test_continuous_data_breast_cancer():
    """With no categorical features, MixedNB must score the same as
    sklearn's GaussianNB on the breast-cancer dataset."""
    dataset = load_breast_cancer()
    features, targets = dataset['data'], dataset['target']

    reference = GaussianNB()
    reference.fit(features, targets)

    clf = MixedNB()
    clf.fit(features, targets)

    assert np.isclose(reference.score(features, targets),
                      clf.score(features, targets))
Esempio n. 7
0
def test_input_y_not_encoded():
    """Targets with gaps (0, 8) are not label-encoded; fit must raise."""
    samples = [[1, 2], [2, 2], [3, 3]]
    labels = [0, 8, 0]
    clf = MixedNB()
    with pytest.raises(ValueError):
        clf.fit(samples, labels)
Esempio n. 8
0
def test_input_wrong_dims_2():
    """A 2-D target array is rejected with ValueError."""
    samples = [[0, 1, 2]]
    bad_labels = [[0, 1]]
    clf = MixedNB()
    with pytest.raises(ValueError):
        clf.fit(samples, bad_labels)
Esempio n. 9
0
def test_input_string_y():
    """A string mixed into the targets is rejected with TypeError."""
    samples = [[2], [1]]
    bad_labels = [0, '1']
    clf = MixedNB()
    with pytest.raises(TypeError):
        clf.fit(samples, bad_labels)
Esempio n. 10
0
def test_input_string_x():
    """String feature values are rejected with TypeError."""
    bad_samples = [['X'], ['y']]
    labels = [0, 1]
    clf = MixedNB()
    with pytest.raises(TypeError):
        clf.fit(bad_samples, labels)
Esempio n. 11
0
def test_input_param():
    """A non-numeric alpha must make fit raise TypeError."""
    # The bad parameter is accepted at construction time; the error is
    # expected to surface during fit.
    clf = MixedNB(alpha='l')
    with pytest.raises(TypeError):
        clf.fit([0, 1, 2], [0, 1, 0])
Esempio n. 12
0
def test_categorical_data_simple():
    """Smoke test on the bundled example dataset with the first two
    features treated as categorical."""
    features, targets = load_example()

    clf = MixedNB([0, 1])
    clf.fit(features, targets)
    clf.score(features, targets)
# Split the dataset into target y and predictors X.
# NOTE(review): `pro2` is defined outside this chunk — presumably a pandas
# DataFrame whose first column is 'Severity'; verify against the loading code.
y = pro2['Severity']
X = pro2.iloc[:,1:]

# Encode the target and the categorical predictor columns as integers.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# Label-encode each categorical column in place; the index list below is the
# same one passed to MixedNB as categorical_features further down.
X.iloc[:,[0,1,2,3,4,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,28,30,31,32,33,34,35]] = X.iloc[:,[0,1,2,3,4,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,28,30,31,32,33,34,35]].apply(LabelEncoder().fit_transform)

# Split into training and test sets (80/20, stratified on the target).
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =.2,random_state=1234, stratify=y)

# Build a mixed naive Bayes model (categorical NB for the listed columns,
# Gaussian NB for the rest) and predict on the test data.
gnb = MixedNB(categorical_features=[0,1,2,3,4,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,26,28,30,31,32,33,34,35])
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Report test-set accuracy.
accuracy = gnb.score(X_test, y_test)
print('Accuracy: {0:.2f}'.format(accuracy))

# Confusion matrix and per-class precision/recall/F1 report.
cm = metrics.confusion_matrix(y_test,y_pred)
print(metrics.classification_report(y_test,y_pred))


#Bayesian for PCA dataset
# Load dataset
import numpy as np
import pandas as pd
    filename2 = Output_name+'Naive_Bayes_fitted_model.mod'
    with open( filename2, "wb") as file:
        pickle.dump(best_clf, file)
    
    print(out)
    # Fit  model
    #best_clf.fit(SX_train.astype(float),y_train.astype(int))

    # Train the model using the training sets
    #scores = cross_val_score(best_clf, SX_train.astype(float), y_train.astype(int), n_jobs=16, cv=StratifiedKFold(5))
    #accuracy = scores.mean()
    #sd= (scores.std())     
    # 0.8330165782220578 +/- 0.029993214054194695

    # Fit optimised model
    best_clf.fit(SX_train.astype(float),y_train.astype(int))
    SX_test.to_csv(Output_name+'SX_test.csv')


    filename = Output_name+'Naive_Bayes_finalized_model.mod'
    with open( filename, "wb") as file:
        pickle.dump(best_clf, file)

    ### Evaluate performance on the training set ###
    y_train_pred = best_clf.predict(SX_train)

    # Print confusion matrix 
    cm_train = confusion_matrix(y_train, y_train_pred)
    print(cm_train)

# Notebook-style evaluation: bare expressions below display their value in a
# REPL/notebook; the inline comments record the observed results.
metrics.accuracy_score(ytest, multinomial_pred)
# Multinomial NB model accuracy is 0.774966

metrics.confusion_matrix(ytest, multinomial_pred)
#    [[10891,   469],
#    [ 2920,   780]]

############# Model 3 using Mixed NB
# Here we use Gaussian NB for continuous predictors and categorical NB
# for categorical predictors.
# Install the package first: pip install mixed-naive-bayes
from mixed_naive_bayes import MixedNB
# Inspect column dtypes to pick the categorical column indices below.
salary_train_raw.dtypes
mixed_model = MixedNB(categorical_features=[1, 2, 4, 5, 6, 7, 8, 12])
mixed_model_pred = mixed_model.fit(Xtrain, ytrain).predict(Xtest)

metrics.accuracy_score(ytest, mixed_model_pred)
# 0.8242

# The mixed model has the highest accuracy score of the models so far.

#################### Model 4 using Logistic regression

from sklearn.linear_model import LogisticRegression
sal_logreg = LogisticRegression()
sal_logreg.fit(Xtrain, ytrain)

logreg_pred = sal_logreg.predict(Xtest)

# Class balance of the test targets.
Counter(ytest)  # {0: 11360, 1: 3700}
Esempio n. 16
0
"""
Run benchmarks on toy datasets provided by sklearn.
This is to ensure our implementation of Gaussian Naive
Bayes is the same as sklearn's.
"""

from sklearn.datasets import load_iris, load_digits, \
    load_wine, load_breast_cancer
from sklearn.naive_bayes import GaussianNB
from mixed_naive_bayes import MixedNB

# Compare MixedNB (no categorical features) against sklearn's GaussianNB
# on each toy dataset; their training-set scores should agree.
for load_data in [load_iris, load_digits, load_wine,
                  load_breast_cancer]:

    # e.g. "load_breast_cancer" -> "breastcancer"
    dataset_name = ''.join(load_data.__name__.split('_')[1:])
    print(f"--- {dataset_name} ---")

    bunch = load_data()
    X, y = bunch['data'], bunch['target']

    sk_clf = GaussianNB()
    sk_clf.fit(X, y)
    sk_pred = sk_clf.predict(X)

    our_clf = MixedNB()
    our_clf.fit(X, y)
    our_pred = our_clf.predict(X)

    print(f"GaussianNB: {sk_clf.score(X,y)}")
    print(f"MixedNB   : {our_clf.score(X,y)}")