def test_classification():
    # encode the iris string labels as integers
    t = zeros(len(target))
    t[target == 'setosa'] = 1
    t[target == 'versicolor'] = 2
    t[target == 'virginica'] = 3

    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(data, t)  # training on the iris dataset
    print(classifier.predict(data[:1]))  # predict() expects a 2-D array
    print(t[0])

    from sklearn.model_selection import train_test_split
    train, test, t_train, t_test = train_test_split(data, t, test_size=0.4, random_state=0)
    classifier.fit(train, t_train)          # train
    print(classifier.score(test, t_test))   # test

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(t_test, classifier.predict(test)))  # (y_true, y_pred)

    from sklearn.metrics import classification_report
    print(classification_report(t_test, classifier.predict(test),
                                target_names=['setosa', 'versicolor', 'virginica']))

    from sklearn.model_selection import cross_val_score
    # cross validation with 6 iterations
    scores = cross_val_score(classifier, data, t, cv=6)
    print(scores)
    from numpy import mean
    print(mean(scores))
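# A minimal self-contained sketch of the same pipeline under a modern
# scikit-learn (>= 0.20), where the old cross_validation module was replaced
# by model_selection; data and target come from load_iris here rather than
# the module-level globals the function above assumes.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)
clf = GaussianNB().fit(X_train, y_train)
print(clf.score(X_test, y_test))                                  # held-out accuracy
print(cross_val_score(clf, iris.data, iris.target, cv=6).mean())  # 6-fold CV mean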
def crossvalidate(X_trn, Y_trn):
    """Cross-validate, comparing against baselines that predict only one class."""
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    X_train, X_test, Y_train, Y_test = train_test_split(
        X_trn.toarray(), Y_trn, test_size=0.4, random_state=1)

    dumb_labels1 = Y_test.copy()
    dumb_labels2 = Y_test.copy()
    dumb_labels1[dumb_labels1 == 0] = 1  # labels all 1s
    dumb_labels2[dumb_labels2 == 1] = 0  # labels all 0s
    dumb_labels3 = np.random.randint(2, size=(len(Y_test),))

    clf = GaussianNB()
    # clf = Perceptron()
    # clf = SGDClassifier()
    # clf = MultinomialNB()
    # clf = KNeighborsClassifier()
    # clf = LinearSVC()
    clf.fit(X_train, Y_train)
    accuracy = clf.score(X_test, Y_test)
    # baseline accuracy: compare each dumb label vector against the true labels
    dumb_clf1_score = accuracy_score(Y_test, dumb_labels1)
    dumb_clf2_score = accuracy_score(Y_test, dumb_labels2)
    dumb_clf3_score = accuracy_score(Y_test, dumb_labels3)
    print("Classifier score:", accuracy)
    print("Dumb classifier with all 1s:", dumb_clf1_score)
    print("Dumb classifier with all 0s:", dumb_clf2_score)
    print("Dumb classifier with random sequence:", dumb_clf3_score)
    return accuracy
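# A hedged alternative to hand-built baselines: scikit-learn's DummyClassifier
# produces the same "all one class" / random predictions idiomatically. The
# X_train/X_test/Y_train/Y_test names reuse the split from crossvalidate above.
from sklearn.dummy import DummyClassifier

for strategy in ('most_frequent', 'uniform'):
    dummy = DummyClassifier(strategy=strategy, random_state=1)
    dummy.fit(X_train, Y_train)
    print(strategy, 'baseline accuracy:', dummy.score(X_test, Y_test))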
def get_GNB(Xtrain, Xtest, Ytrain, Ytest):
    gnb = GaussianNB()
    gnb.fit(Xtrain, Ytrain)
    scores = np.empty(2)  # train and test accuracy
    scores[0] = gnb.score(Xtrain, Ytrain)
    scores[1] = gnb.score(Xtest, Ytest)
    print('GNB, train: {0:.02f}% '.format(scores[0] * 100))
    print('GNB, test: {0:.02f}% '.format(scores[1] * 100))
    return gnb
def get_GNB(Xtrain, Ytrain, Xtest=None, Ytest=None, verbose=0):
    gnb = GaussianNB()
    gnb.fit(Xtrain, Ytrain)
    if verbose == 1:
        scores = np.empty(2)
        scores[0] = gnb.score(Xtrain, Ytrain)
        print('GNB, train: {0:.02f}% '.format(scores[0] * 100))
        if Xtest is not None:
            scores[1] = gnb.score(Xtest, Ytest)
            print('GNB, test: {0:.02f}% '.format(scores[1] * 100))
    return gnb
def cvalidate():
    from sklearn.model_selection import train_test_split
    targetset = np.genfromtxt(open('trainLabels.csv', 'r'), dtype='f16')
    y = [x for x in targetset]
    trainset = np.genfromtxt(open('train.csv', 'r'), delimiter=',', dtype='f16')
    X = np.array([x for x in trainset])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    gnb = GaussianNB()
    X_train, X_test = decomposition_pca(X_train, X_test)  # PCA helper defined elsewhere
    gnb.fit(X_train, y_train)
    print(gnb.score(X_test, y_test))
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """Compute the accuracy of your Naive Bayes classifier."""
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)

    ### calculate and return the accuracy on the test data; this is slightly
    ### different from the example, where we just print the accuracy.
    ### Equivalently:
    # from sklearn.metrics import accuracy_score
    # accuracy = accuracy_score(labels_test, pred)
    accuracy = clf.score(features_test, labels_test)
    return accuracy
def NB(text):
    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively; labels_train and labels_test are
    ### the corresponding item labels
    features_train, features_test, labels_train, labels_test = Preprocess()
    Ifeatures_train, Ifeatures_test, Ilabels_train = preprocess_input([text])

    # classification goes here
    clf = GaussianNB()

    # training
    train_t0 = time()
    clf.fit(features_train, labels_train)
    train_t1 = time()

    # prediction or testing
    test_t0 = time()
    predict = clf.predict(features_test)
    test_t1 = time()

    print("accuracy: ", clf.score(features_test, labels_test))
    print("#################################")
    print("train time: ", round(train_t1 - train_t0, 3), "s")
    print("prediction time: ", round(test_t1 - test_t0, 3), "s")
    print("#################################")

    clf.fit(Ifeatures_train, Ilabels_train)
    # str(...)[1] pulls the digit out of a string like "[0]"
    print("prediction of ", str(clf.predict(Ifeatures_test))[1])
    return str(clf.predict(Ifeatures_test))[1]
class GaussianNBcls(object):
    """Thin wrapper around sklearn's GaussianNB with error logging."""

    def __init__(self):
        self.gnb_cls = GaussianNB()
        self.prediction = None
        self.train_x = None
        self.train_y = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.gnb_cls.fit(train_x, train_y)
        except Exception:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.gnb_cls.predict(test_x)
            return self.prediction
        except Exception:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.gnb_cls.score(self.test_x, test_y)
        except Exception:
            print(traceback.format_exc())
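# Illustrative usage of GaussianNBcls on synthetic data; the make_classification
# parameters are arbitrary, not taken from the original code.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
wrapper = GaussianNBcls()
wrapper.train_model(X_tr, y_tr)
wrapper.predict(X_te)  # must run before accuracy_score, which reads self.test_x
print(wrapper.accuracy_score(y_te))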
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """Compute the accuracy of your Naive Bayes classifier."""
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()

    t0 = time()
    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)
    print("training time:", round(time() - t0, 3), "s")

    ### use the trained classifier to predict labels for the test features
    t1 = time()
    pred = clf.predict(features_test)
    print("predicting time:", round(time() - t1, 3), "s")

    ### calculate and return the accuracy on the test data
    accuracy = clf.score(features_test, labels_test)
    return accuracy
class PriceModel(object):
    """Classification model used to predict future price direction."""

    def __init__(self, algorithm='gnb'):
        self.algorithm = algorithm
        if algorithm == 'svm':
            self.clf = SVC(kernel='rbf')
        elif algorithm == 'rf':
            # min_samples_split must be >= 2 in current scikit-learn
            self.clf = RandomForestClassifier(n_estimators=10, max_depth=None,
                                              min_samples_split=2, random_state=0)
        elif algorithm == 'lr':
            self.clf = LogisticRegression()
        elif algorithm == 'knn':
            self.clf = KNeighborsClassifier(n_neighbors=3)
        else:  # Naive Bayes
            self.clf = GaussianNB()

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def score(self, X_test, y_test):
        return self.clf.score(X_test, y_test)
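# A sketch of how the PriceModel back-ends might be compared on one split;
# X_train, y_train, X_test, y_test are assumed to be provided by the caller.
for algo in ('gnb', 'lr', 'knn', 'rf', 'svm'):
    model = PriceModel(algorithm=algo)
    model.train(X_train, y_train)
    print(algo, model.score(X_test, y_test))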
def trainData(username):
    """
    Train on the user's performance so far.
    Returns a trained Gaussian Naive Bayes model and updates the result collection.
    """
    X = getFeatures(username)
    Y = getClassList(username)
    trainX = np.array(X)
    trainY = np.array(Y)
    gnb = GaussianNB()
    gnb.fit(trainX, trainY)
    print("Score with Naive Bayes: ", gnb.score(trainX, trainY))

    testData = words.posts.find({}, {'id': 1, 'points': 1, 'diff': 1, '_id': 0})
    testData = [(x['id'], x['points'], x['diff']) for x in testData]
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for data in testData:
            testWord = words.posts.find_one({'id': data[0]}, {'word': 1, '_id': 0})['word']
            # predict_proba expects a 2-D array; wrap the feature tuple in a list
            wordClass = setWordClass(gnb.predict_proba([data])[0])
            classWord = result.posts.update({'username': username},
                                            {'$set': {testWord: wordClass}},
                                            upsert=True)
def gaussian_bayes_test(self):
    print('gaussian bayes test')
    g_bayes_clf = GaussianNB()
    print('cross validation score', cross_val_score(g_bayes_clf, self.x_data, self.y_data))
    start_time = time.time()
    g_bayes_clf.fit(self.x_train, self.y_train)
    print('score', g_bayes_clf.score(self.x_test, self.y_test))
    print('time cost', time.time() - start_time)
def Accuracy(features_train, labels_train, features_test, labels_test):
    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    return clf.score(features_test, labels_test)
def run_naive_bayes(self):
    print("Running......")
    clf = GaussianNB()
    clf.fit(self.features_train, self.labels_train)
    pred = clf.predict(self.features_test)
    accuracy = clf.score(self.features_test, self.labels_test)

    # Save model and performance
    self.save_model(clf, "Naive Bayes")
    self.save_performance("Naive Bayes", accuracy)
def Gaussian_NB(X, y, tst_size):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=tst_size, random_state=0)
    clf = GaussianNB()
    score = 0
    for i in range(100):
        clf.fit(X_train, y_train)
        score += clf.score(X_test, y_test)
    score = score / 100
    return score
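# Note: GaussianNB.fit is deterministic, so the 100-iteration average above
# simply repeats one number. To average over genuinely different splits,
# k-fold cross-validation is the usual tool -- a sketch:
from sklearn.model_selection import cross_val_score

def Gaussian_NB_cv(X, y, folds=10):
    return cross_val_score(GaussianNB(), X, y, cv=folds).mean()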
def naiveBayesClassifierTraining(compounds_all):
    print("Building naive Bayes classifier (" + str(NB_FOLDS) + "-fold cross-validation)...")

    # get the data
    keys = compounds_all.keys()
    fingerprint_data = numpy.asarray([compounds_all[cmpnd_id]['fingerprint'] for cmpnd_id in keys])
    activity_data = numpy.asarray([compounds_all[cmpnd_id]['active'] for cmpnd_id in keys])

    # perform K-fold cross-validation (model_selection.StratifiedKFold replaces
    # the old cross_validation.StratifiedKFold(y, n_folds, indices=False) API)
    from sklearn.model_selection import StratifiedKFold
    classifier = GaussianNB()
    kfold_xv_strat = StratifiedKFold(n_splits=NB_FOLDS)
    confusion_matrices = []
    probabilities = []
    scores = []
    models = []
    true_activities = []
    aucs = []
    for train, test in kfold_xv_strat.split(fingerprint_data, activity_data):
        fingerprint_data_train = fingerprint_data[train]
        fingerprint_data_test = fingerprint_data[test]
        activity_data_train = activity_data[train]
        activity_data_test = activity_data[test]
        # model building
        classifier.fit(fingerprint_data_train, activity_data_train)
        # testing
        activity_data_predictions = classifier.predict(fingerprint_data_test)
        models.append(classifier)
        probability_estimates = classifier.predict_proba(fingerprint_data_test)
        probabilities.append(probability_estimates)
        scores.append(classifier.score(fingerprint_data_test, activity_data_test))
        activity_confusion_matrix = confusion_matrix(activity_data_test, activity_data_predictions)
        confusion_matrices.append(activity_confusion_matrix)
        true_activities.append(activity_data_test)
        # ROC curves
        fpr, tpr, thresholds = roc_curve(activity_data_test, probability_estimates[:, 1])
        aucs.append(auc(fpr, tpr))

    # final model on all data
    classifier.fit(fingerprint_data, activity_data)
    print("Done.")
    return {'confusion_matrices': confusion_matrices,
            'probabilities': probabilities,
            'scores': scores,
            'models': models,
            'true_activity_data': true_activities,
            'AUCs': aucs,
            'fingerprint_data': fingerprint_data,
            'activity_data': activity_data,
            'final_model': classifier}
def author_id(f_train, f_test, l_train, l_test):
    clf = GaussianNB()
    t0 = time()
    clf.fit(f_train, l_train)
    print("training time:", round(time() - t0, 3), "s")
    t0 = time()
    pred = clf.predict(f_test)
    print("prediction time:", round(time() - t0, 3), "s")
    return 'accuracy: %f' % clf.score(f_test, l_test)
def run_test(trainData, trainLabels, testData, testLabels):
    start_time = time()
    classifier = GaussianNB()
    classifier.fit(trainData, trainLabels)
    score = classifier.score(testData, testLabels)
    duration = time() - start_time
    print("training set size: " + str(len(trainData)))
    print("score: " + str(score))
    print("time: " + str(duration) + "\n")
def classify(features_train, labels_train, features_test, labels_test):
    classifier = GaussianNB()
    t0 = time()
    classifier.fit(features_train, labels_train)
    print("training time: ", round(time() - t0), "s")
    t1 = time()
    classifier.predict(features_test)
    print("predicting time: ", round(time() - t1), "s")
    return classifier.score(features_test, labels_test)
def GNB_select_cv(X, Y, num_features):
    scores = []
    # modern replacement for cross_validation.StratifiedKFold(Y, n_folds=10)
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=10)
    for train, test in skf.split(X, Y):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        XRF_train, imp, ind, std = fitRF(X_train, y_train, est=2000)  # RF-based feature selection, defined elsewhere
        XRF_test = X_test[:, ind]  # reorder test set after RF selection
        clf = GaussianNB()
        clf.fit(XRF_train[:, 0:num_features], y_train)
        scores.append(clf.score(XRF_test[:, 0:num_features], y_test))
    score = np.mean(scores)
    return score
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """Compute the accuracy of your Naive Bayes classifier."""
    # import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    # create classifier
    clf = GaussianNB()

    # fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    # use the trained classifier to score the test data,
    # then return the accuracy
    accuracy = clf.score(features_test, labels_test)
    return accuracy
def NBClassifier(filename, split_ratio):
    print("-" * 15, "Naive Bayes Classifier", "-" * 15)
    # split_data returns train features, test features, train labels, test labels
    X, Y, X_labels, Y_labels = split_data(filename, split_ratio)
    # print(X.shape, Y.shape, X_labels.shape, Y_labels.shape)
    nb_model = GaussianNB()
    nb_model.fit(X, X_labels)
    print("\n accuracy =", nb_model.score(Y, Y_labels, sample_weight=None))
    print("-" * 50)
def sklearn_model():
    """Fits the (parametric) Gaussian Naive Bayes classifier from sklearn
    on the iris dataset."""
    from sklearn.model_selection import train_test_split  # replaces cross_validation

    # load iris data, perform train/test split
    iris = load_iris()
    tts = train_test_split(iris.data, iris.target, train_size=TRAIN_PCT)
    train_features, test_features, train_labels, test_labels = tts

    # train (gaussian) Naive Bayes model, make predictions on test set
    gnb = GaussianNB().fit(train_features, train_labels)
    predicted_labels = gnb.predict(test_features)

    # show accuracy pct
    print("accuracy = {0} %".format(round(100 * gnb.score(test_features, test_labels))))
class NaiveBayes():
    def __init__(self):
        self.clf = GaussianNB()
        self.accuracy = 0
        self.y_out = []

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train.ravel())

    def test(self, X_test):
        self.y_out = self.clf.predict(X_test)

    def score(self, X_test, y_test):
        self.accuracy = self.clf.score(X_test, y_test.ravel())
def bayes():
    ## Naive Bayes on the training set plus three validation splits
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    gnb.fit(Xtrn, Ytrn)
    print(gnb.score(Xtrn, Ytrn))
    print(gnb.score(Xval1, Yval1))
    print(gnb.score(Xval2, Yval2))
    print(gnb.score(Xval3, Yval3))
def clsfr():
    train1_err = []
    # test1_err = []
    train2_err = []
    test2_err = []
    t = zeros(len(target))
    t[target == 'setosa'] = 1
    t[target == 'versicolor'] = 2
    t[target == 'virginica'] = 3
    classifier = GaussianNB()
    classifier.fit(data, t)  # training on the iris dataset
    for i in range(len(t)):
        # predict() expects a 2-D array, hence the i:i+1 slice
        if classifier.predict(data[i:i + 1]) != array(t[i]):
            train1_err.append((classifier.predict(data[i:i + 1]), t[i]))

    from sklearn.model_selection import train_test_split
    train, test, t_train, t_test = train_test_split(data, t, test_size=0.4, random_state=0)
    classifier.fit(train, t_train)  # train
    for i in range(len(t_train)):
        if classifier.predict(train[i:i + 1]) != array(t_train[i]):
            train2_err.append((classifier.predict(train[i:i + 1]), t_train[i]))
    for i in range(len(t_test)):
        if classifier.predict(test[i:i + 1]) != array(t_test[i]):
            test2_err.append((classifier.predict(test[i:i + 1]), t_test[i]))

    print('train error: ', train1_err)
    print('train count: ', len(t))
    print('train error count: ', len(train1_err))
    print('accuracy rate: ', classifier.score(data, t))
    print('*******************************************')
    print('train error: ', train2_err)
    print('test error: ', test2_err)
    print('train count: ', len(train))
    print('train error count: ', len(train2_err))
    print('test count: ', len(test))
    print('test error count: ', len(test2_err))
    print('test accuracy rate: ', classifier.score(test, t_test))  # test
    print('train accuracy rate: ', classifier.score(train, t_train))
    print('****************************************************')
    print(classification_report(t_test, classifier.predict(test),
                                target_names=['setosa', 'versicolor', 'virginica']))
def nb_classify(self):
    print("Naive Bayes")
    clf = GaussianNB()
    clf.fit(self.descr, self.target)
    mean = clf.score(self.test_descr, self.test_target)
    pred = clf.predict(self.test_descr)
    accuracy = np.where(pred == self.test_target, 1, 0).sum() / float(len(self.test_target))
    print("Accuracy: %3f" % accuracy)
    print("Mean : %3f" % mean)
    print("Probability ", clf.class_prior_)
    print("Mean of each feature per class ", clf.theta_)
    # sigma_ was renamed var_ in scikit-learn 1.0
    print("Variance of each feature per class ", clf.var_)
    print("Predict Probability ", clf.predict_proba(self.descr))
def compute_bayes_error():
    np.random.seed(0)
    mu1 = [0, 0]
    cov_mat_1 = 1 * np.eye(2)
    mu2 = [0, 0]
    cov_mat_2 = 16 * np.eye(2)

    # create unified training set from two normal distributions
    X_vect = np.concatenate([np.random.multivariate_normal(mu1, cov_mat_1, 5000),
                             np.random.multivariate_normal(mu2, cov_mat_2, 5000)])
    y = np.zeros(10000)
    y[5000:] = 1

    # Fit the Naive Bayes classifier
    clf = GaussianNB()
    clf.fit(X_vect, y)

    # predict the classification probabilities on a grid
    xlim = (-5, 5)
    ylim = (-5, 5)
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 50),
                         np.linspace(ylim[0], ylim[1], 70))
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)

    acc = clf.score(X_vect, y)
    error = 1 - acc  # error rate

    # Add decision boundary plot
    fig = plt.figure(figsize=(8, 8))
    fig.suptitle('decision boundary', fontsize=12)
    # set display window title (canvas.set_window_title was deprecated in
    # newer matplotlib in favor of canvas.manager.set_window_title)
    fig.canvas.manager.set_window_title('Decision Boundary')
    ax = fig.add_subplot(111)
    p1 = ax.scatter(X_vect[:, 0], X_vect[:, 1], c=y, cmap=plt.get_cmap('Set3'), zorder=5)
    p2 = ax.contour(xx, yy, Z, [0.5], linewidths=3, colors='k')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xlabel('$x1$')
    ax.set_ylabel('$x2$')
    plt.clabel(p2, inline=3, fontsize=5)
    p2.collections[0].set_label("Decision Boundary")
    ax.legend(loc='lower right')
    return error
def gaussian_data(X, y):
    """
    Naive Bayes classification.
    :param X: feature matrix
    :param y: labels
    :return: accuracy score and confusion matrix
    """
    from sklearn import metrics
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    score = model.score(X, y)
    # print(metrics.classification_report(expected, predicted, labels=[0, 1],
    #                                     target_names=['benign URL', 'malicious URL']))
    cm = metrics.confusion_matrix(expected, predicted)
    return score, cm
def gnb(training_data, training_target, testing_data, testing_target):
    """
    DESCRIPTION: fit a Gaussian Naive Bayes classifier on the training data
        and return its accuracy on the test data.
    INPUTS: training/testing feature arrays and their target vectors.
    OUTPUTS: mean accuracy on the test set, as returned by GaussianNB.score.
    EXAMPLE USAGE: see the sketch after this function.
    """
    clf = GaussianNB()
    clf.fit(training_data, training_target)
    return clf.score(testing_data, testing_target)
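# The usage sketch referenced in the docstring, on synthetic data (the
# make_classification parameters are illustrative only):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
print(gnb(X_tr, y_tr, X_te, y_te))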
# exploratory plotting / inspection, left commented out:
# if features[obs][4] > 10:
#     print(features[obs][4])
# plt.plot(np.array(features[:, 0]), np.array(features[:, 1]))
# for k in data_dict:
#     for j in data_dict[k]:
#         print(data_dict["salary"][j])
# for k in data_dict:
#     print(data_dict[k]["bonus"])

### GAUSSIAN
from sklearn.naive_bayes import GaussianNB
clfGAU = GaussianNB().fit(features, labels)
print("Gaussian cf score is %f " % clfGAU.score(features, labels))

### SVM
from sklearn import svm
clfSVM = svm.SVC(kernel="rbf", C=0.001, gamma=0.001).fit(features, labels)
print("classic SVM score is %f " % clfSVM.score(features, labels))
# predSVM = clfSVM.predict(features)  # fit() returns the estimator, not predictions
# print("classic accuracy_score is %f " % accuracy_score(labels, predSVM))

### Decision Tree
from sklearn import tree
clfDT = tree.DecisionTreeClassifier(min_samples_split=50).fit(features, labels)
print("decision tree score %f" % clfDT.score(features, labels))
print("features_list", features_list)
print('most important features DT', clfDT.feature_importances_)
# GaussianNB
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(x_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

score = classifier.score(x_test, y_test)
print(score)
# from sklearn import metrics
# metrics.accuracy_score(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
x_set, y_set = x_train, y_train
X1, X2 = nm.meshgrid(
    nm.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
    nm.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(logreg.score(X_train, y_train))

svc = SVC()
svc.fit(X_train, y_train)
print(svc.score(X_train, y_train))

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
print(knn.score(X_train, y_train))

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
print(gaussian.score(X_train, y_train))

linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
print(linear_svc.score(X_train, y_train))

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
print(sgd.score(X_train, y_train))

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
print(decision_tree.score(X_train, y_train))

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
predictions = clf.predict(features_test)

total = len(labels_test)
errors = 0
for i in range(total):
    if predictions[i] != labels_test[i]:
        errors += 1
correct = float(total - errors)
total = float(total)
accuracy = correct / total

print(clf.score(features_test, labels_test))  # can also use print(accuracy)
#########################################################
Chris has label 1
"""
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.naive_bayes import GaussianNB

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
gnb = GaussianNB()

t0 = time()
gnb.fit(features_train, labels_train)  # fit returns the estimator, not predictions
print("training time:", round(time() - t0, 3), "s")

t0 = time()
y_pred = gnb.predict(features_test)
print("predicting time:", round(time() - t0, 3), "s")

accuracy = gnb.score(features_test, labels_test)
print(accuracy)
#########################################################
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

df = pd.read_csv('./glass.csv')
y = df["Type"]
df1 = df.drop("Type", axis=1).copy()

# create training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(df1, y, test_size=0.15)

model = GaussianNB()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
acc_gnb = round(model.score(X_test, Y_test) * 100, 2)
print("Naive Bayes accuracy with test is:", acc_gnb)

"""plt.plot(Y_test, label="Y_test")
plt.plot(Y_pred, label="Y_pred")
plt.legend()
plt.show()"""
for item in survived:
    if item == 0:
        colors.append('Red')
    else:
        colors.append('Green')
# plt.scatter(ages, fares, s=50, color=colors)  # s means size, we want size to be bigger
# plt.show()

# Step 3: Build a NB Model
Features = dataframe.drop(['Survived'], axis=1).values
Targets = dataframe['Survived'].values
# there are 887 data points in total, and 80% of that is about 710
Features_Train, Target_Train = Features[:710], Targets[:710]
Features_Test, Targets_test = Features[710:], Targets[710:]
# print(Features_Test)

model = GaussianNB()
model.fit(Features_Train, Target_Train)

# Step 4: Print Predicted vs Actuals
predicted_values = model.predict(Features_Test)
for item in zip(Targets_test, predicted_values):
    print('Actual was:', item[0], 'Predicted was', item[1])

# Step 5: Estimate Error
print('Accuracy is:', model.score(Features_Test, Targets_test))
# We didn't pass targets_test and predicted_values because the score method
# itself computes predictions from features_test, compares them with
# target_test, and returns the score.
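# The equivalence described in the comment above, spelled out: both lines
# print the same number.
from sklearn.metrics import accuracy_score
print(model.score(Features_Test, Targets_test))
print(accuracy_score(Targets_test, model.predict(Features_Test)))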
batch_x = uncompress(batch_x, 86796)
# print(batch_x.shape)
batch_x = np.sum(batch_x, axis=1)
# print(batch_x.shape)
batch_x = np.squeeze(batch_x)
# print(batch_x.shape)
# print('y')
# print(batch_y.shape)
batch_y = np.repeat(batch_y, 50, axis=0)
# print(batch_y.shape)
# gnb.partial_fit(batch_x, batch_y, classes=[0, 1])
x = gnb.score(batch_x, batch_y)
print(x)
s += x
i += 1

print('average : ', s / i)
# gnb.fit(X, Y)
# print(s / i)

# persist the fitted model (pickle replaces Python 2's cPickle)
import pickle
fp = open(os.path.join('nb_logs', 'nb_object' + '.save'), 'wb')
pickle.dump(gnb, fp, protocol=pickle.HIGHEST_PROTOCOL)
fp.close()
      % (NB_NonScaled_cross_val_scores.mean(), NB_NonScaled_cross_val_scores.std() * 2))

# In[24]:

if NB_NonScaled_cross_val_scores.mean() > 0.97:
    print("The Naive Bayes Model (Non Scaled) is overfitting in this case.")
else:
    NB_classifier.fit(X_train, y_train)
    NB_NonScaled_predicted = NB_classifier.predict(X_test)
    NB_NonScaled_prob_default = np.sum(NB_NonScaled_predicted) / len(NB_NonScaled_predicted)
    print("The Default Probability based on Naive Bayes Model (Non Scaled) is :",
          '%.3f' % NB_NonScaled_prob_default)
    NB_NonScaled_accuracy = NB_classifier.score(X_test, y_test)
    print("The accuracy of Naive Bayes Model (Non Scaled) on test set is : ",
          '%.3f' % NB_NonScaled_accuracy)

# In[25]:

# output the result into the existing evaluation dataframe to compare with other models
new_evaluation = pd.DataFrame({
    'Model': ["Naive Bayes_NonScaled"],
    'Default_Probability': [NB_NonScaled_prob_default],
    'Cross_Validation_Accuracy': [NB_NonScaled_cross_val_scores.mean()],
    'Test_Accuracy': [NB_NonScaled_accuracy]
})
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
evaluation = pd.concat([evaluation, new_evaluation])
evaluation = evaluation[[
    'Model', 'Default_Probability', 'Cross_Validation_Accuracy',
sub.to_csv('svm.csv', index=False)

## knn
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
sub = pd.DataFrame({'PassengerId': df_test["PassengerId"], 'Survived': Y_pred})
sub.to_csv('knn.csv', index=False)

# Gaussian Naive Bayes
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian
sub = pd.DataFrame({'PassengerId': df_test["PassengerId"], 'Survived': Y_pred})
sub.to_csv('gnb.csv', index=False)

# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree
sub = pd.DataFrame({'PassengerId': df_test["PassengerId"], 'Survived': Y_pred})
sub.to_csv('tree.csv', index=False)

# Random Forest
        return vectors

    # Vectorizes Y with 0 being neg and 1 being pos
    def CreateYVector(self):
        print("......building Y matrix")
        vector = np.zeros(5331 + 5331, dtype=int)
        count = 0
        for entry in vector:
            if count > 5330:
                vector[count] = 1  # second half of the data is the positive class
            count += 1
        return vector


data = DataPrep("rt-polaritydata/rt-polaritydata/rt-polarity.neg",
                "rt-polaritydata/rt-polaritydata/rt-polarity.pos")
# print(data.X)
# print(data.Y)
print("......splitting")
X_train, X_test, y_train, y_test = train_test_split(data.X, data.Y, test_size=0.33)

clf = GaussianNB(var_smoothing=.0001)
print("......training")
clf.fit(X_train, y_train)
print("Accuracy: ")
print(clf.score(X_test, y_test))
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
result = classifier.score(x_test, y_test)

from matplotlib.colors import ListedColormap
x_set, y_set = x_train, y_train
x1, x2 = np.meshgrid(
    np.arange(x_set[:, 0].min() - 1, x_set[:, 0].max() + 1, 0.01),
    np.arange(x_set[:, 1].min() - 1, x_set[:, 1].max() + 1, 0.01))
y_d = np.array([x1.ravel(), x2.ravel()]).T
plt.contourf(x1, x2,
             classifier.predict(np.array([x1.ravel(), x2.ravel()]).T).reshape(x1.shape),
             alpha=0.4, cmap=ListedColormap(('red', 'green')))
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
svm = SVC(random_state=1)
svm.fit(x_train.T, y_train.T)
acc = svm.score(x_test.T, y_test.T) * 100
accuracies['SVM'] = acc
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(acc))

# In[35]:

# Naive Bayes Algorithm
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train.T, y_train.T)
acc = nb.score(x_test.T, y_test.T) * 100
accuracies['Naive Bayes'] = acc
print("Accuracy of Naive Bayes: {:.2f}%".format(acc))

# In[36]:

# Decision Tree Algorithm
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train.T, y_train.T)
acc = dtc.score(x_test.T, y_test.T) * 100
accuracies['Decision Tree'] = acc
print("Decision Tree Test Accuracy {:.2f}%".format(acc))
color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('Predicted Class', fontsize=12) plt.xlabel('Actual Class', fontsize=12) # predict data for i in range(3): #Use Gaussian Naive Bayes method classifier = GaussianNB() #Fit the model classifier.fit(train[i], classification_train) #Calculate the result & accuracy result = classifier.predict(test[i]) accuracy = classifier.score(test[i], classification_test) #Calculate the probability estimates of the positive class prob_data = classifier.predict_proba(test[i]) prob_data = prob_data[:, 1] #Calculate fpr & ftr fpr, tpr, thresholds = metrics.roc_curve(classification_test, prob_data) fprs.append(fpr) tprs.append(tpr) #Calculate confusion matrix, precision & recall conf_mat = metrics.confusion_matrix(classification_test, result) precision = metrics.precision_score(classification_test, result) recall = metrics.recall_score(classification_test, result) roc_auc = metrics.auc(fpr, tpr) print 'min_df = ' + str(min_df[i]) print 'dimension reduction method: ' + str(method[i])
                   'AST', 'BLK']

# Pandas DataFrame allows you to select columns.
# We use column selection to split the data into features and class.
nba_feature = nba[feature_columns]
nba_class = nba[class_column]

print(nba_feature[0:3])
print(list(nba_class[0:3]))

train_feature, test_feature, train_class, test_class = \
    train_test_split(nba_feature, nba_class, stratify=nba_class,
                     train_size=0.75, test_size=0.25, random_state=0)

training_accuracy = []
test_accuracy = []

nb = GaussianNB().fit(train_feature, train_class)
print("Test set score: {:.3f}".format(nb.score(test_feature, test_class)))

prediction = nb.predict(test_feature)
print("Confusion matrix:")
print(pd.crosstab(test_class, prediction,
                  rownames=['True'], colnames=['Predicted'], margins=True))

scores = cross_val_score(nb, nba_feature, nba_class, cv=10)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))
logic_reg = LogisticRegression()
logic_reg.fit(x_train, y_train)
print("Test accuracy: {:.2f}%".format(logic_reg.score(x_test, y_test) * 100))

# for Knn model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
print("Test accuracy of knn is {:.2f}%".format(knn.score(x_test, y_test) * 100))

'''# for Svm model, not working
from sklearn.svm import SVC
sps = SVC(random_state=1, kernel='rbf')
sps.fit(x_train, y_train)  # the original called fit_transpose(x_train, y_test), which does not exist
print("SVM Accuracy report {:.2f}%".format(sps.score(x_test, y_test) * 100))
'''

# naive bayes
from sklearn.naive_bayes import GaussianNB
nai = GaussianNB()
nai.fit(x_train, y_train)
print("Naive Bayes Accuracy report {:.2f}%".format(nai.score(x_test, y_test) * 100))

# Random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, random_state=1)
rf.fit(x_train, y_train)
print("Random Forest Accuracy report {:.2f}%".format(rf.score(x_test, y_test) * 100))
# predict method
print(" 1 means survived, 0 means not survived")
print('Classified as :', classifier.predict([data[0]]))
print('Classified as :', classifier.predict([[3, 27, 0]]))
print('Classified as :', classifier.predict([data[2], data[4]]))

# TEST DATA - training and classification
# split into 60 percent train and 40 percent test of the total data
from sklearn.model_selection import train_test_split
train, test, t_train, t_test = train_test_split(data, t, test_size=0.4, random_state=0)
print('Number of records used for training', train.shape)
print('Number of records used for testing', test.shape)

# train and test
classifier.fit(train, t_train)  # train with 1st part: 60 percent
print('Accuracy is =', classifier.score(test, t_test))  # test with 2nd part: 40 percent

# CONFUSION MATRIX TO SHOW ACCURACY
from sklearn.metrics import confusion_matrix
print('confusion matrix\n', confusion_matrix(t_test, classifier.predict(test)))

# Function that gives us a complete report on the performance
from sklearn.metrics import classification_report
print(classification_report(t_test, classifier.predict(test),
                            target_names=['Survived', 'Not Survived']))

# A more sophisticated evaluation model is cross validation. The idea behind it
# is simple: the data is split into train and test sets several consecutive
# times, and the averaged prediction score over the different splits is the
# evaluation of the classifier.
from sklearn.model_selection import cross_val_score
# cross validation with 20 iterations
scores = cross_val_score(classifier, data, t, cv=20)
# print(scores)
def bayes():
    # load the dataset
    dataset = pd.read_csv("Dataset_Bayes.csv")

    # print how many users won and lost
    print(dataset.groupby('Gano').size())

    # print a bar chart of Gano vs. the other variables
    dataset.drop(['Gano'], axis=1).hist()
    plt.show()

    # drop userId and completer; they are irrelevant for the method
    dataset_limpio = dataset.drop(['userId', 'completer'], axis=1)
    dataset_limpio.describe()

    # clean NaN / Inf values from the dataset
    dataset_limpio = limpiar_dataset_Para_Bayes(dataset_limpio)

    # separate the Gano variable in order to find the 5 variables that best
    # predict whether the user won or lost
    a = dataset_limpio.drop(['Gano'], axis=1)
    b = dataset_limpio['Gano']
    best = SelectKBest(k=5)
    a_new = best.fit_transform(a, b)
    a_new.shape
    selected = best.get_support(indices=True)
    print("Best 5 variables")
    print(a.columns[selected])

    # plot the Pearson correlation of the 5 best variables
    used_features = a.columns[selected]
    colormap = plt.viridis()
    plt.figure(figsize=(12, 12))
    plt.title('Pearson correlation coefficient', y=1.05, size=15)
    sns.heatmap(dataset_limpio[used_features].astype(float).corr(),
                linewidths=0.1, vmax=1.0, square=True, cmap=colormap,
                linecolor='white', annot=True)
    plt.show()

    # split the input data into training and test sets
    a_entrenamiento, a_pruebas = train_test_split(dataset_limpio, test_size=0.2, random_state=6)
    b_entrenamiento = a_entrenamiento["Gano"]
    b_pruebas = a_pruebas["Gano"]

    gnb = GaussianNB()
    gnb.fit(a_entrenamiento[used_features].values, b_entrenamiento)
    y_pred = gnb.predict(a_pruebas[used_features])
    print('Accuracy on the training set: {:.2f}'.format(
        gnb.score(a_entrenamiento[used_features], b_entrenamiento)))
    print('Accuracy on the test set: {:.2f}'.format(
        gnb.score(a_pruebas[used_features], b_pruebas)))

    # five best variables: 'SRL', 'Atry to lecture', 'num_events', 'grade', 'cluster'
    # predict for one losing and one winning user (0, 1) using the 5 best variables
    print(gnb.predict([[1.666666667, 0, 2, 5.999999866, 0],
                       [2.041666667, 150, 151, 62.00000048, 1]]))
    Data.extend(i)

X = [i[0] for i in Data]
Y = [i[1] for i in Data]
# print(t)
split = len(corpus) - len(corpus) // 5

tf = CountVectorizer()
t = tf.fit_transform(X).toarray()
print(t.shape)
print(len(Y))

x_train = t[:split]
x_test = t[split:]
y_train = Y[:split]
y_test = Y[split:]

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(x_train, y_train)
print("Finished Training")
print(clf.score(x_test, y_test))

# from nltk.tag import tnt
# tnt_pos_tagger = tnt.TnT()
# tnt_pos_tagger.train(train)
# print(word_tokenize(word_test))
# print(tnt_pos_tagger.evaluate(test))
# print(tnt_pos_tagger.tag(word_tokenize(word_test)))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

iris = datasets.load_iris()
x = iris.data[:, 2:][0:140]
y = iris.target[0:140]
x_test = iris.data[:, 2:][140:150]  # was [141:150], which skipped sample 140
y_test = iris.target[140:150]

'''NAIVE BAYES'''
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x, y)
nb = model.score(x, y)
pred = model.predict(x_test)
sum(x == 0 for x in pred - y_test) / len(pred)  # fraction of correct test predictions

'''DECISION TREES'''
from sklearn import tree
# min_samples_split must be >= 2, and presort was removed in current sklearn
model = tree.DecisionTreeClassifier(class_weight=None, criterion='entropy',
                                    max_depth=20, max_features=x.shape[1],
                                    max_leaf_nodes=4, min_samples_leaf=1,
                                    min_samples_split=2,
                                    min_weight_fraction_leaf=0.0)
temp = X_test.groupby(['label'])
t = temp.packets.count()
label_predicted = [-1 if e == 0 else e for e in label_predicted]
test_labels = [-1 if e == 0 else e for e in test_labels]
pred = [a * b for a, b in zip(label_predicted, t)]
act = [a * b for a, b in zip(test_labels, t)]

# tally packet counts by predicted/actual sign; the 'np' counter is renamed
# 'npos' so it no longer shadows the usual numpy alias
pp = 0
pn = 0
nn = 0
npos = 0
for a, b in zip(pred, act):
    if a > 0 and b > 0:
        pp += a
    elif a < 0 and b < 0:
        nn -= a
    elif a > 0 and b < 0:
        pn += a
    else:
        npos += b

print(pp)
print(nn)
print(npos)
print(pn)

###############################################################################
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_cluster_features, train_labels)
label_predicted = gnb.predict(test_cluster_features)
print(gnb.score(test_cluster_features, test_labels))
###############################################################################
# Training set and targets
X = bank.drop(columns='y').values
t = bank['y'].values

# experiment 1
from sklearn.model_selection import train_test_split
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size=0.2, shuffle=True)

# experiment 2
from sklearn.naive_bayes import GaussianNB
gaussian_clf = GaussianNB()
gaussian_clf.fit(X_train, t_train)

# experiment 3
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score  # was "sklearn.metrix"
gaussian_score = gaussian_clf.score(X_test, t_test)
gaussian_pred = gaussian_clf.predict(X_test)  # was "gaussian_pred - ...", a typo
cm = confusion_matrix(t_test, gaussian_pred)
gaussian_proba = gaussian_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(t_test, gaussian_proba)
auc = roc_auc_score(t_test, gaussian_proba)
print("Gaussian CLF Score: " + str(gaussian_score))
print("Confusion Matrix ")
print(cm)
print("Gaussian CLF auc Score: " + str(auc))  # was printing the roc_auc_score function itself
plt.figure()
plt.plot(fpr, tpr)
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

digits = load_digits()
x = digits.data    # samples
y = digits.target  # labels

# split into training and test sets; the test set is 30% of the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

"""Gaussian naive Bayes classifier: GaussianNB"""
gnb = GaussianNB().fit(x_train, y_train)  # train the model on the training set
gnb_predict = gnb.predict(x_test)         # predict on the test set
for i in range(10):  # compare the first ten predictions with the actual labels
    print(f"actual:{y_test[i]},predict:{gnb_predict[i]}")
gnb_score = gnb.score(x_test, y_test)
print(f"accuracy(GaussianNB):{gnb_score}")
print("-------------------")

"""Multinomial naive Bayes classifier: MultinomialNB"""
mnb = MultinomialNB().fit(x_train, y_train)
mnb_predict = mnb.predict(x_test)
for i in range(10):
    print(f"actual:{y_test[i]},predict:{mnb_predict[i]}")
mnb_score = mnb.score(x_test, y_test)
print(f"accuracy(MultinomialNB):{mnb_score}")
print("-------------------")

"""Bernoulli naive Bayes classifier: BernoulliNB"""
bnb = BernoulliNB().fit(x_train, y_train)
bnb_predict = bnb.predict(x_test)
label_names = data['target_names']
labels = data['target']
feature_names = data['feature_names']
features = data['data']

# Look at our data
print(label_names)
print(labels[0])
print(feature_names[0])
print(features[0])

# Split the data
train, test, train_labels, test_labels = train_test_split(features, labels,
                                                          test_size=0.33,
                                                          random_state=42)

# Classifier
gnb = GaussianNB()

# Training
model = gnb.fit(train, train_labels)

# Make prediction
preds = gnb.predict(test)
print(preds)

# Evaluate accuracy
print(accuracy_score(test_labels, preds))
print(gnb.score(test, test_labels))
dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=3)
dt_model.fit(train_set, train_labels)
dt_model.score(test_set, test_labels)

y_predict = dt_model.predict(test_set)
y_predict[:5]
test_set.head(5)

"""### Naive Bayes"""

naive_model = GaussianNB()
naive_model.fit(train_set, train_labels)
prediction = naive_model.predict(test_set)
naive_model.score(test_set, test_labels)

"""### Random Forest classifier"""

randomforest_model = RandomForestClassifier(max_depth=2, random_state=0)
randomforest_model.fit(train_set, train_labels)

Importance = pd.DataFrame(
    {'Importance': randomforest_model.feature_importances_ * 100},
    index=train_set.columns)
Importance.sort_values('Importance', axis=0, ascending=True).plot(
    kind='barh', color='r')

predicted_random = randomforest_model.predict(test_set)
randomforest_model.score(test_set, test_labels)
plt.ylabel("Accuracy") plt.legend() # ### Question 2 # In[169]: pca = PCA(n_components=2) pca.fit(X) X_New = pca.transform(X) X_Test_New = pca.transform(X_Test) # Naive Baye classifier clf1 = GaussianNB() clf1 = clf1.fit(X_New, Y) print("GaussianNB Acc: {}".format(clf1.score(X_Test_New, Y_Test) * 100)) # KNeighborsClassifier clf2 = KNeighborsClassifier(n_jobs=-1) clf2 = clf2.fit(X_New, Y) print("KNeighborsClassifier Acc: {}".format( clf2.score(X_Test_New, Y_Test) * 100)) # DecisionTreeClassifier clf3 = DecisionTreeClassifier(max_depth=BestDep) clf3 = clf3.fit(X_New, Y) print("DecisionTreeClassifier Acc: {}".format( clf3.score(X_Test_New, Y_Test) * 100)) for clf in [clf1, clf2, clf3]: print(clf.score(X_New, Y))
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
rfc = RandomForestClassifier(n_estimators=10)
lor = LogisticRegression(random_state=1)
gnb = GaussianNB()
vot = VotingClassifier(estimators=[('lr', lor), ('rf', rfc), ('gnb', gnb), ('knn', knn)],
                       voting='hard')

lr.fit(x_train, y_train)
svc.fit(x_train, y_train)
knn.fit(x_train, y_train)
rfc.fit(x_train, y_train)
lor.fit(x_train, y_train)
gnb.fit(x_train, y_train)
vot.fit(x_train, y_train)

print("LogisticRegression", lor.score(x_test, y_test))
print("GaussianNB", gnb.score(x_test, y_test))
print("RandomForestClassifier ", rfc.score(x_test, y_test))
print("KNeighborsClassifier ", knn.score(x_test, y_test))
print("SVC ", svc.score(x_test, y_test))
print("LinearRegression ", lr.score(x_test, y_test))
print('VotingClassifier', vot.score(x_test, y_test))

N = 7
x = range(N)
y = [lor.score(x_test, y_test),
     gnb.score(x_test, y_test),
     rfc.score(x_test, y_test),
     knn.score(x_test, y_test),
     svc.score(x_test, y_test),
     lr.score(x_test, y_test),
     vot.score(x_test, y_test)
plt.figure()
plt.bar(np.arange(2) + 0.2, trainsc, width=0.4, color='c', align='center')
plt.bar(np.arange(2) + 0.6, testsc, width=0.4, color='r', align='center')
plt.xticks(np.arange(2) + 0.4, alg)
plt.title('Linear Discriminant Analysis accuracy')
plt.ylabel('Accuracy')
plt.legend(['Train', 'Test'])
plt.show()

#%% Naive Bayes Gaussian
if GNB_cl == 1:
    nb = GaussianNB()
    nb.fit(Xtrain, Ytrain)
    scores = np.empty(4)
    scores[0] = nb.score(Xtrain, Ytrain)
    scores[1] = nb.score(Xtest, Ytest)
    print('Gaussian Naive Bayes, train: {0:.02f}% '.format(scores[0] * 100))
    print('Gaussian Naive Bayes, test: {0:.02f}% '.format(scores[1] * 100))

    bnb = BaggingClassifier(GaussianNB(), max_samples=0.5, n_jobs=-1)
    bnb.fit(Xtrain, Ytrain)
    scores[2] = bnb.score(Xtrain, Ytrain)
    scores[3] = bnb.score(Xtest, Ytest)
    print('Bagging Naive Bayes, train: {0:.02f}% '.format(scores[2] * 100))
    print('Bagging Naive Bayes, test: {0:.02f}% '.format(scores[3] * 100))

    alg = ['Naive Bayes', 'Bagged Naive Bayes']
    trainsc = [scores[0], scores[2]]
    testsc = [scores[1], scores[3]]
    plt.figure()
def modeloNaiveBayesSampling():
    # load the dataset stored as csv
    dataset = pd.read_csv('dataset2.csv')

    # dimensionality reduction via feature selection, using sklearn's SelectKBest
    X = dataset.drop(['Plag'], axis=1)
    y = dataset['Plag']
    best = SelectKBest(k=50)
    X_new = best.fit_transform(X, y)
    X_new.shape
    selected = best.get_support(indices=True)
    # print(X.columns[selected])
    used_features = X.columns[selected]

    # split the dataset into training and test sets
    X_train, X_test = train_test_split(dataset, test_size=0.3, random_state=6)
    y_train = X_train["Plag"]
    y_test = X_test["Plag"]

    # configure resampling that combines oversampling and undersampling
    os = make_pipeline(
        SMOTE(sampling_strategy={1: 5000}),
        NearMiss(sampling_strategy={0: 15000}))
    X_train_res, y_train_res = os.fit_resample(X_train, y_train)
    X_test_res, y_test_res = (X_test, y_test)

    # use the Gaussian classifier
    gnb = GaussianNB()
    # with the model created, use fit() to learn
    gnb.fit(X_train_res[used_features].values, y_train_res)
    y_pred = gnb.predict(X_test_res[used_features])

    # compute the accuracy
    print('Accuracy on the training set: {:.2f}'.format(
        gnb.score(X_train_res[used_features], y_train_res)))
    print('Accuracy on the test set: {:.2f}'.format(
        gnb.score(X_test_res[used_features], y_test_res)))

    # compute the confusion matrix
    print(confusion_matrix(y_test_res, y_pred))
    print("Initial training distribution: {}".format(Counter(y_train)))
    print("Final training distribution: {}".format(Counter(y_train_res)))
    print("Initial test distribution: {}".format(Counter(y_test)))
    print("Final test distribution: {}".format(Counter(y_test_res)))
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Importing the data set
train_df = pd.read_csv('glass.csv')
X = train_df.drop("Type", axis=1)
Y = train_df["Type"]

# Training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Using naive Bayes
gnb = GaussianNB()

# Showing the result on the test data
Y_prediction = gnb.fit(X_train, y_train).predict(X_test)

# Calculating the accuracy
acc_gnb = round(gnb.score(X_test, y_test) * 100)
print("Accuracy is:", acc_gnb)
print(classification_report(y_test, Y_prediction))