from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

##Building the Multinomial Naive Bayes model
classifier_mb = MB()
classifier_mb.fit(x_train, y_train)

##accuracy on training data
pred_mb = classifier_mb.predict(x_train)
accuracy_mb_train = np.mean(pred_mb == y_train)  ##77%
pd.crosstab(pred_mb, y_train)

##for test data
pred_mb_test = classifier_mb.predict(x_test)
accuracy_mb_test = np.mean(pred_mb_test == y_test)  ##77%
pd.crosstab(pred_mb_test, y_test)

##Building the Gaussian Naive Bayes model
classifier_gb = GB()
classifier_gb.fit(x_train, y_train)

##accuracy on training data
pred_gb = classifier_gb.predict(x_train)
accuracy_gb_train = np.mean(pred_gb == y_train)  ##80%
pd.crosstab(pred_gb, y_train)

##for test data
pred_gb_test = classifier_gb.predict(x_test)
accuracy_gb_test = np.mean(pred_gb_test == y_test)  ##80%
pd.crosstab(pred_gb_test, y_test)
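##As a cross-check on the np.mean/crosstab evaluation above, sklearn.metrics
##offers accuracy_score and confusion_matrix directly. A minimal sketch,
##assuming the same classifier_gb and test split as above:
from sklearn.metrics import accuracy_score, confusion_matrix

print(accuracy_score(y_test, pred_gb_test))    # same value as accuracy_gb_test
print(confusion_matrix(y_test, pred_gb_test))  # rows = actual, columns = predicted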
Chris has label 1
"""
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
from sklearn.naive_bayes import GaussianNB as GB

clf = GB()

t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time() - t0, 3), "s")

t1 = time()
pred = clf.predict(features_test)
print("predicting time:", round(time() - t1, 3), "s")

accuracy = clf.score(features_test, labels_test)
print(accuracy)
#########################################################
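### GaussianNB also supports incremental training via partial_fit, which can
### help when the email feature matrix is too large to fit comfortably in
### memory. A minimal sketch, assuming the same preprocessed arrays as above
### (the chunk size of 1000 is illustrative):
import numpy as np

clf_inc = GB()
X = np.asarray(features_train)
y = np.asarray(labels_train)
classes = np.unique(y)  # partial_fit needs the full class list up front
for start in range(0, len(X), 1000):
    clf_inc.partial_fit(X[start:start + 1000], y[start:start + 1000], classes=classes)
print(clf_inc.score(features_test, labels_test))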
'average_montly_hours', 'last_evaluation']  # 'average_montly_hours' is the dataset's own (misspelled) column name
print(Important_features)

Pred_var = ["left"]
print(Pred_var)

# In[ ]:

# Let us make a list of models; Classification_model (defined elsewhere) is
# applied to each one, as shown in the sketch after this cell.
models = ["RandomForestClassifier", "Gaussian Naive Bayes", "KNN",
          "Logistic_Regression", "Support_Vector"]
Classification_models = [RandomForestClassifier(n_estimators=100), GB(),
                         knn(n_neighbors=7), LogisticRegression(), SVC()]
Model_Accuracy = []
for model in Classification_models:
    Accuracy = Classification_model(model, Data, All_features, Pred_var)
    Model_Accuracy.append(Accuracy)

# In[ ]:

Accuracy_with_all_features = pd.DataFrame({
    "Classification Model": models,
    "Accuracy with all features": Model_Accuracy
})
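# In[ ]:

# Classification_model is not shown in this excerpt; a minimal sketch of what
# such a helper might look like, assuming it trains on a hold-out split and
# returns test accuracy (the split parameters are illustrative, not the
# notebook's actual settings):
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def Classification_model(model, data, features, pred_var):
    X_train, X_test, y_train, y_test = train_test_split(
        data[features], data[pred_var[0]], test_size=0.3, random_state=42)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))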
#%% DecisionTree
from sklearn.tree import DecisionTreeClassifier as DT

dt = DT(criterion="entropy")
dt.fit(x, y)
dt.predict(x)
pde = dt.predict_proba(x)
print("DTE accuracy", dt.score(x, y))

dt = DT(criterion="gini")
dt.fit(x, y)
dt.predict(x)
pdg = dt.predict_proba(x)
print("DTG accuracy", dt.score(x, y))
# z = dt.predict([[40, 1, 15, 14, 55, 5.5, 0.856075, 2.16892]])

####!!!task 2: a decision tree can split on one of two criteria, gini or entropy; try to understand the difference (see the sketch below)
####!!!task 3: try these two decision tree models on the bank loan default problem

from sklearn.naive_bayes import GaussianNB as GB
gb = GB()
gb.fit(x, y)
gb.predict(x)
pgb = gb.predict_proba(x)
print("NB accuracy", gb.score(x, y))

from sklearn.svm import SVC
svc = SVC()
svc.fit(x, y)
svc.predict(x)
print("svc accuracy", svc.score(x, y))
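#%% Task 2 pointer: gini impurity and entropy usually rank candidate splits
# similarly but are not identical. A minimal sketch computing both for a toy
# two-class node (the 70/30 proportions are illustrative, not from the data):
import numpy as np

p = np.array([0.7, 0.3])                   # class proportions at a node
gini = 1.0 - np.sum(p ** 2)                # Gini impurity: 1 - sum(p_i^2)
entropy = -np.sum(p * np.log2(p))          # Shannon entropy: -sum(p_i * log2(p_i))
print("gini:", gini, "entropy:", entropy)  # gini: 0.42, entropy: ~0.881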
classifier_nb = MB()
classifier_nb.fit(train_emails_matrix, y_train)

##accuracy on training data
train_pred_nb = classifier_nb.predict(train_emails_matrix)
accuracy_nb = np.mean(train_pred_nb == y_train)  ##98.8%
pd.crosstab(train_pred_nb, y_train)

##predicting on test data
test_pred_nb = classifier_nb.predict(test_emails_matrix)
accuracy_test_nb = np.mean(test_pred_nb == y_test)  ##96.82%
pd.crosstab(test_pred_nb, y_test)

##Building the Gaussian model; GaussianNB does not accept sparse input, hence .toarray()
classifier_gb = GB()
classifier_gb.fit(train_emails_matrix.toarray(), y_train.values)
train_pred_gb = classifier_gb.predict(train_emails_matrix.toarray())
accuracy_gb = np.mean(train_pred_gb == y_train)  ##92%
pd.crosstab(train_pred_gb, y_train)

##predicting on test data
test_pred_gb = classifier_gb.predict(test_emails_matrix.toarray())
accuracy_test_gb = np.mean(test_pred_gb == y_test)  ##85%
pd.crosstab(test_pred_gb, y_test)

###Building with TF-IDF transformation
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(all_emails_matrix)
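###A minimal sketch of how the fitted transformer would feed back into the
###multinomial model, assuming the same count matrices and labels as above:
train_tfidf = tfidf_transformer.transform(train_emails_matrix)
test_tfidf = tfidf_transformer.transform(test_emails_matrix)

classifier_tfidf = MB()
classifier_tfidf.fit(train_tfidf, y_train)
print(np.mean(classifier_tfidf.predict(test_tfidf) == y_test))  # test accuracy with TF-IDF features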
x_train = salary_train.iloc[:, 0:13]  # columns 0-12 are the features
y_train = salary_train.iloc[:, 13]    # column 13 is the Salary label
x_test = salary_test.iloc[:, 0:13]
y_test = salary_test.iloc[:, 13]

#######Importing the Naive Bayes classifiers######
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

classifiers_mb = MB()
classifiers_mb.fit(x_train, y_train)
train_pred_mb = classifiers_mb.predict(x_train)
train_accu_mb = np.mean(train_pred_mb == y_train)  ##77%
pd.crosstab(train_pred_mb, y_train)
test_pred_mb = classifiers_mb.predict(x_test)
test_accu_mb = np.mean(test_pred_mb == y_test)  ##77%
pd.crosstab(test_pred_mb, y_test)

classifiers_gb = GB()
classifiers_gb.fit(x_train, y_train)
train_pred_gb = classifiers_gb.predict(x_train)
train_accu_gb = np.mean(train_pred_gb == y_train)  ##80%
pd.crosstab(train_pred_gb, y_train)
test_pred_gb = classifiers_gb.predict(x_test)
test_accu_gb = np.mean(test_pred_gb == y_test)  ##80%

#####From the above, Gaussian Naive Bayes achieves higher accuracy than Multinomial NB.
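#####A quick side-by-side check with sklearn.metrics (a minimal sketch, assuming
#####the same fitted classifiers and test split as above):
from sklearn.metrics import classification_report

print(classification_report(y_test, test_pred_mb))  # per-class precision/recall for Multinomial NB
print(classification_report(y_test, test_pred_gb))  # per-class precision/recall for Gaussian NB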
def crossRun(self):
    window = Toplevel(self)
    ttk.Label(window, text='Result').grid(column=0, row=0, sticky=W)
    self.crossResult = ScrolledText(window, height=15, width=70)
    self.crossResult.grid(column=0, row=1, sticky=W, columnspan=10, rowspan=5)
    self.crossResult.config(state=DISABLED)

    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.neighbors import KNeighborsClassifier as knn
    from sklearn.naive_bayes import GaussianNB as GB
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_validate
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # Move the target column 'left' to the front, then split features/target
    df_drop = df.drop(labels=['sales', 'salary'], axis=1)
    left_col = df_drop['left']
    df_drop.drop(labels=['left'], axis=1, inplace=True)
    df_drop.insert(0, 'left', left_col)
    df_drop.head()
    X = df_drop.iloc[:, 1:8].values
    y = df_drop.iloc[:, 0].values

    # Standardize the features, then reduce to six principal components
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=6)
    X_pca = sklearn_pca.fit_transform(X_std)

    models = ["RandomForestClassifier", "Gaussian Naive Bayes", "KNN",
              "Logistic_Regression", "Support_Vector"]
    Classification_models = [RandomForestClassifier(n_estimators=100), GB(),
                             knn(n_neighbors=7), LogisticRegression(), SVC()]
    Model_Accuracy = []
    scoring = {'acc': 'accuracy', 'f1': 'f1', 'precision': 'precision',
               'recall': 'recall', 'roc_auc': 'roc_auc'}

    # 10-fold cross-validation of each model on the PCA-reduced features
    for model, model_name in zip(Classification_models, models):
        print(model_name)
        scores = cross_validate(model, X_pca, y, scoring=scoring, cv=10,
                                return_train_score=True)
        Model_Accuracy.append(scores)

    # Write the mean of each metric into the scrolled-text widget
    self.crossResult.config(state=NORMAL)
    self.crossResult.delete(1.0, END)
    for i, m in zip(Model_Accuracy, models):
        self.crossResult.insert(END, "\n" + m)
        self.crossResult.insert(END, "\n--------\n")
        for j in i:
            self.crossResult.insert(END, str(j) + ": " + str(i[j].mean()) + "\n")
    self.crossResult.config(state=DISABLED)
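# Outside the GUI, the same cross_validate output can be summarized in a table;
# a minimal sketch, assuming Model_Accuracy and models as computed above
# ('test_acc' is the key cross_validate derives from the 'acc' scoring entry):
import pandas as pd

summary = pd.DataFrame({
    "Model": models,
    "Mean CV accuracy": [s["test_acc"].mean() for s in Model_Accuracy],
})
print(summary.sort_values("Mean CV accuracy", ascending=False))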
print(final_test_df)

train_X = final_train_df.iloc[:, 0:13]
train_y = final_train_df.iloc[:, 13]
print(train_X.head())
print(train_y.head())
input()

test_X = final_test_df.iloc[:, 0:13]
test_y = final_test_df.iloc[:, 13]
print(test_X.head())
print(test_y.head())
input()

# Naive Bayes models
ignb = GB()
imnb = MB()

# Building and predicting at the same time
pred_gnb = ignb.fit(train_X, train_y).predict(test_X)  # GaussianNB model
pred_mnb = imnb.fit(train_X, train_y).predict(test_X)  # Multinomial model

# Confusion matrix for the GaussianNB model
print(confusion_matrix(test_y, pred_gnb))
print(pd.crosstab(test_y.values.flatten(), pred_gnb))
print(classification_report(test_y, pred_gnb))  # classification report
print(np.mean(pred_gnb == test_y.values.flatten()))  #>> Accuracy = 0.7946879150066402
input()

# Confusion matrix for the Multinomial model
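# Mirroring the GaussianNB evaluation above, the multinomial model's results
# can be inspected the same way (a sketch, assuming the same variables):
print(confusion_matrix(test_y, pred_mnb))
print(pd.crosstab(test_y.values.flatten(), pred_mnb))
print(classification_report(test_y, pred_mnb))
print(np.mean(pred_mnb == test_y.values.flatten()))  # Multinomial NB test accuracy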