def test_categorical_data_digits_all_negative():
    """Fitting MixedNB with every feature marked categorical but no
    max_categories must raise ValueError on the digits data.

    NOTE(review): the original also fitted a GaussianNB whose result was
    never used; that dead computation has been removed.
    """
    digits = load_digits()
    X = digits['data']
    y = digits['target']
    mixed_nb = MixedNB(categorical_features='all')
    with pytest.raises(ValueError):
        mixed_nb.fit(X, y)
def test_categorical_data_digits():
    """Smoke test: MixedNB fits and scores the digits data with all 64
    features treated as categorical (17 possible pixel values each).

    NOTE(review): the original also fitted and scored a GaussianNB whose
    results were never used; that dead computation has been removed.
    """
    digits = load_digits()
    # Only the first 1440 samples are used, matching the original test.
    X = digits['data'][:1440]
    y = digits['target'][:1440]
    mixed_nb = MixedNB(categorical_features='all',
                       max_categories=np.repeat(17, 64))
    mixed_nb.fit(X, y)
    mixed_nb.score(X, y)
def test_continuous_data_digits():
    """With no categorical features, MixedNB must match sklearn's
    GaussianNB training accuracy on the digits dataset."""
    dataset = load_digits()
    features, labels = dataset['data'], dataset['target']

    reference = GaussianNB()
    reference.fit(features, labels)
    expected_score = reference.score(features, labels)

    candidate = MixedNB()
    candidate.fit(features, labels)
    actual_score = candidate.score(features, labels)

    assert np.isclose(expected_score, actual_score)
def test_continuous_data_wine():
    """With no categorical features, MixedNB must match sklearn's
    GaussianNB training accuracy on the wine dataset."""
    dataset = load_wine()
    features, labels = dataset['data'], dataset['target']

    reference = GaussianNB()
    reference.fit(features, labels)
    expected_score = reference.score(features, labels)

    candidate = MixedNB()
    candidate.fit(features, labels)
    actual_score = candidate.score(features, labels)

    assert np.isclose(expected_score, actual_score)
def test_continuous_data_iris():
    """With no categorical features, MixedNB must produce exactly the
    same predictions as sklearn's GaussianNB on the iris dataset."""
    dataset = load_iris()
    features, labels = dataset['data'], dataset['target']

    # Both fit() calls return the estimator, so chaining is safe.
    reference_pred = GaussianNB().fit(features, labels).predict(features)
    candidate_pred = MixedNB().fit(features, labels).predict(features)

    assert (candidate_pred == reference_pred).all()
def test_continuous_data_breast_cancer():
    """With no categorical features, MixedNB must match sklearn's
    GaussianNB training accuracy on the breast-cancer dataset."""
    dataset = load_breast_cancer()
    features, labels = dataset['data'], dataset['target']

    reference = GaussianNB()
    reference.fit(features, labels)
    expected_score = reference.score(features, labels)

    candidate = MixedNB()
    candidate.fit(features, labels)
    actual_score = candidate.score(features, labels)

    assert np.isclose(expected_score, actual_score)
def test_input_y_not_encoded():
    """Targets that are not label-encoded as 0..k-1 (here 0 and 8)
    must be rejected with ValueError."""
    features = [[1, 2], [2, 2], [3, 3]]
    targets = [0, 8, 0]
    with pytest.raises(ValueError):
        MixedNB().fit(features, targets)
def test_input_wrong_dims_2():
    """A 2-D target array must be rejected with ValueError."""
    features = [[0, 1, 2]]
    bad_targets = [[0, 1]]
    with pytest.raises(ValueError):
        MixedNB().fit(features, bad_targets)
def test_input_string_y():
    """A string among the targets must trigger TypeError."""
    features = [[2], [1]]
    with pytest.raises(TypeError):
        MixedNB().fit(features, [0, '1'])
def test_input_string_x():
    """String feature values must trigger TypeError."""
    bad_features = [['X'], ['y']]
    with pytest.raises(TypeError):
        MixedNB().fit(bad_features, [0, 1])
def test_input_param():
    """A non-numeric alpha must trigger TypeError at fit time."""
    classifier = MixedNB(alpha='l')
    with pytest.raises(TypeError):
        classifier.fit([0, 1, 2], [0, 1, 0])
def test_categorical_data_simple():
    """Smoke test: fit and score on the bundled example dataset,
    treating columns 0 and 1 as categorical."""
    features, targets = load_example()
    model = MixedNB([0, 1])
    model.fit(features, targets)
    model.score(features, targets)
# --- Mixed Naive Bayes on the raw (non-PCA) dataset ---

# Divide the dataset into target (first column, 'Severity') and predictors.
y = pro2['Severity']
X = pro2.iloc[:, 1:]

# Columns of X that hold categorical values and must be label-encoded.
# Named ONCE so the same list drives both the encoding and the model
# (the original repeated this 29-element list three times).
CATEGORICAL_COLS = [0, 1, 2, 3, 4, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                    19, 20, 21, 22, 23, 24, 26, 28, 30, 31, 32, 33, 34, 35]

# Change categorical variables into numerical (integer-coded) variables.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
X.iloc[:, CATEGORICAL_COLS] = X.iloc[:, CATEGORICAL_COLS].apply(
    LabelEncoder().fit_transform)

# Split the dataset into training and test sets (stratified 80/20,
# fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=1234, stratify=y)

# Build the Mixed Naive Bayes model and predict on the test data.
gnb = MixedNB(categorical_features=CATEGORICAL_COLS)
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Calculate the held-out accuracy.
accuracy = gnb.score(X_test, y_test)
print('Accuracy: {0:.2f}'.format(accuracy))

# Build a confusion matrix and per-class report.
cm = metrics.confusion_matrix(y_test, y_pred)
print(metrics.classification_report(y_test, y_pred))

# --- Bayesian for PCA dataset ---
# Load dataset
import numpy as np
import pandas as pd
# Persist the classifier as-is (before the final fit below) so this run
# can be reproduced; `best_clf` and `Output_name` come from earlier in
# the script — presumably a tuned estimator and an output path prefix
# (TODO confirm against the full file).
filename2 = Output_name+'Naive_Bayes_fitted_model.mod'
with open( filename2, "wb") as file:
    pickle.dump(best_clf, file)
print(out)

# Fit model (earlier cross-validation experiment, kept for reference):
#best_clf.fit(SX_train.astype(float),y_train.astype(int)) # Train the model using the training sets
#scores = cross_val_score(best_clf, SX_train.astype(float), y_train.astype(int), n_jobs=16, cv=StratifiedKFold(5))
#accuracy = scores.mean()
#sd= (scores.std())
# 0.8330165782220578 +/- 0.029993214054194695

# Fit optimised model on the full training set.
best_clf.fit(SX_train.astype(float),y_train.astype(int))
SX_test.to_csv(Output_name+'SX_test.csv')

# Persist the fitted model for later scoring.
filename = Output_name+'Naive_Bayes_finalized_model.mod'
with open( filename, "wb") as file:
    pickle.dump(best_clf, file)

### Evaluate performance on the training set ###
y_train_pred = best_clf.predict(SX_train)

# Print confusion matrix for the training predictions.
cm_train = confusion_matrix(y_train, y_train_pred)
print(cm_train)
metrics.accuracy_score(ytest, multinomial_pred) # Multinomial NB model accuracy is 0.774966 metrics.confusion_matrix(ytest, multinomial_pred) # [[10891, 469], # [ 2920, 780]] ############# Model 3 using Mixed NB # here we will use Gaussian NB for continuous predictors and MUltinomial NB # for categorical predictors # first install it using - pip install MixedNB from mixed_naive_bayes import MixedNB salary_train_raw.dtypes mixed_model = MixedNB(categorical_features=[1, 2, 4, 5, 6, 7, 8, 12]) mixed_model_pred = mixed_model.fit(Xtrain, ytrain).predict(Xtest) metrics.accuracy_score(ytest, mixed_model_pred) # 0.8242 # we can see that the mixed model has highest accuracy score #################### Model 4 using Logistic regression from sklearn.linear_model import LogisticRegression sal_logreg = LogisticRegression() sal_logreg.fit(Xtrain, ytrain) logreg_pred = sal_logreg.predict(Xtest) Counter(ytest) # {0: 11360, 1: 3700}
Run benchmarks on toy datasets provided by sklearn. This is to ensure
our implementation of Gaussian Naive Bayes is the same as sklearn's.
"""
from sklearn.datasets import load_iris, load_digits, \
    load_wine, load_breast_cancer
from sklearn.naive_bayes import GaussianNB

from mixed_naive_bayes import MixedNB

# Fit both classifiers on each full dataset and print training accuracy;
# with no categorical features MixedNB is expected to match GaussianNB.
for load_data in [load_iris, load_digits, load_wine, load_breast_cancer]:
    # e.g. load_breast_cancer -> "breastcancer" in the banner.
    print(f"--- {''.join(load_data.__name__.split('_')[1:])} ---")
    dataset = load_data()
    X = dataset['data']
    y = dataset['target']
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(X, y)
    gaussian_nb_pred = gaussian_nb.predict(X)  # NOTE(review): unused
    mixed_nb = MixedNB()
    mixed_nb.fit(X, y)
    mixed_nb_pred = mixed_nb.predict(X)  # NOTE(review): unused
    print(f"GaussianNB: {gaussian_nb.score(X,y)}")
    print(f"MixedNB : {mixed_nb.score(X,y)}")