Example #1
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

##Building the Multinomial Naive Bayes Model
classifier_mb = MB()
classifier_mb.fit(x_train, y_train)
pred_mb = classifier_mb.predict(x_train)
accuracy_mb_train = np.mean(pred_mb == y_train)
##77%
pd.crosstab(pred_mb, y_train)

##for test data
pred_mb_test = classifier_mb.predict(x_test)
accuracy_mb_test = np.mean(pred_mb_test == y_test)
##77%
pd.crosstab(pred_mb_test, y_test)

##Building Gaussian model
classifier_gb = GB()
classifier_gb.fit(x_train, y_train)
pred_gb = classifier_gb.predict(x_train)
accuracy_gb_train = np.mean(pred_gb == y_train)
##80%
pd.crosstab(pred_gb, y_train)

##for test data
pred_gb_test = classifier_gb.predict(x_test)
accuracy_gb_test = np.mean(pred_gb_test == y_test)
##80%
pd.crosstab(pred_gb_test, y_test)
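
##(Added note, not part of the original example:) the same numbers can also be
##obtained with sklearn's metrics helpers instead of np.mean and pd.crosstab:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, pred_gb_test))    # matches accuracy_gb_test
print(confusion_matrix(y_test, pred_gb_test))  # same counts as the crosstab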
Example #2
    Chris has label 1

"""
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

from sklearn.naive_bayes import GaussianNB as GB

clf = GB()
t0 = time()
clf.fit(features_train, labels_train)
print("training time:", round(time() - t0, 3), "s")
t1 = time()
pred = clf.predict(features_test)
print("predicting time:", round(time() - t1, 3), "s")

accuracy = clf.score(features_test, labels_test)
print("accuracy:", accuracy)
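
# (Added sketch, not from the original file:) when training on a large text
# feature set is slow, a common trade-off is to fit on a small slice of the
# data; the 1% fraction here is an illustrative choice, not the author's.
small_train = features_train[:len(features_train) // 100]
small_labels = labels_train[:len(labels_train) // 100]
clf_small = GB()
clf_small.fit(small_train, small_labels)
print("accuracy on 1% of the data:", clf_small.score(features_test, labels_test))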

#########################################################
Example #3
    'average_montly_hours', 'last_evaluation'
]
print(Important_features)
Pred_var = ["left"]
print(Pred_var)

# In[ ]:
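
# (Added sketch:) the imports and the Classification_model helper are not shown
# in this snippet; minimal, hypothetical versions could look like this
# (train_test_split and the 70/30 split are assumptions, not the original code):
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB as GB
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

def Classification_model(model, data, predictors, outcome):
    # Split features/target, fit, and return held-out accuracy.
    x_tr, x_te, y_tr, y_te = train_test_split(
        data[predictors], data[outcome].values.ravel(),
        test_size=0.3, random_state=0)
    model.fit(x_tr, y_tr)
    return model.score(x_te, y_te)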

# Let's make a list of models
models = [
    "RandomForestClassifier", "Gaussian Naive Bays", "KNN",
    "Logistic_Regression", "Support_Vector"
]
Classification_models = [
    RandomForestClassifier(n_estimators=100),
    GB(),
    knn(n_neighbors=7),
    LogisticRegression(),
    SVC()
]
Model_Accuracy = []
for model in Classification_models:
    Accuracy = Classification_model(model, Data, All_features, Pred_var)
    Model_Accuracy.append(Accuracy)

# In[ ]:

Accuracy_with_all_features = pd.DataFrame({
    "Classification Model": models,
    "Accuracy with all features": Model_Accuracy
})
Example #4
#%% DecisionTree
from sklearn.tree import DecisionTreeClassifier as DT
dt = DT(criterion="entropy")
dt.fit(x, y)
pde = dt.predict_proba(x)  # class probabilities on the training data
print("DTE accuracy", dt.score(x, y))  # accuracy on the training data itself

dt = DT(criterion="gini")
dt.fit(x, y)
pdg = dt.predict_proba(x)
print("DTG accuracy", dt.score(x, y))  # accuracy on the training data itself
# z = dt.predict([[40, 1, 15, 14, 55, 5.5, 0.856075, 2.16892]])

####!!!task 2: there are two criteria for the decision-tree model, gini and entropy; try to understand the difference (see the sketch below)
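# (Added sketch for task 2, not in the original file:) compare the two impurity
# measures on a binary class balance p vs 1-p.
import numpy as np
p = np.linspace(0.05, 0.95, 7)
gini = 2 * p * (1 - p)                                  # 1 - p**2 - (1-p)**2
entropy = -(p * np.log2(p) + (1 - p) * np.log2(1 - p))  # Shannon entropy, bits
for pi, g, e in zip(p, gini, entropy):
    print(f"p={pi:.2f}  gini={g:.3f}  entropy={e:.3f}")
# Both vanish at pure nodes and peak at p = 0.5; the trees they grow are
# usually very similar in practice.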

####!!!task 3: try these two decision-tree models on the bank-loan default problem

from sklearn.naive_bayes import GaussianNB as GB
gb = GB()
gb.fit(x, y)
pgb = gb.predict_proba(x)
print("NB accuracy", gb.score(x, y))  # training accuracy

from sklearn.svm import SVC
svc = SVC()
svc.fit(x, y)
print("svc accuracy", svc.score(x, y))  # training accuracy
Example #5
# imports assumed from the truncated top of the script
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

classifier_nb = MB()
classifier_nb.fit(train_emails_matrix, y_train)
train_pred_nb = classifier_nb.predict(train_emails_matrix)
accuracy_nb = np.mean(train_pred_nb == y_train)
##98.8%
pd.crosstab(train_pred_nb, y_train)

##predicting on test data
test_pred_nb = classifier_nb.predict(test_emails_matrix)
accuracy_test_nb = np.mean(test_pred_nb == y_test)
##96.82%
pd.crosstab(test_pred_nb, y_test)

##Building Gaussian model

classifier_gb = GB()
classifier_gb.fit(train_emails_matrix.toarray(), y_train.values)
train_pred_gb = classifier_gb.predict(train_emails_matrix.toarray())
accuracy_gb = np.mean(train_pred_gb == y_train)
##92%
pd.crosstab(train_pred_gb, y_train)

##predicting on test data
test_pred_gb = classifier_gb.predict(test_emails_matrix.toarray())
accuracy_test_gb = np.mean(test_pred_gb == y_test)
##85%
pd.crosstab(test_pred_gb, y_test)

###Building with TFIDF transformation
tfidf_transformer = TfidfTransformer().fit(all_emails_matrix)
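
# (Added sketch, not the original continuation:) the fitted transformer would
# then be applied to both matrices and the multinomial model refit on TF-IDF
# weights:
train_tfidf = tfidf_transformer.transform(train_emails_matrix)
test_tfidf = tfidf_transformer.transform(test_emails_matrix)
classifier_tfidf = MB()
classifier_tfidf.fit(train_tfidf, y_train)
print(np.mean(classifier_tfidf.predict(test_tfidf) == y_test))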
Example #6
x_train = salary_train.iloc[:, 0:13]  # columns 0-12 are the features
y_train = salary_train.iloc[:, 13]    # column 13 is the target
x_test = salary_test.iloc[:, 0:13]
y_test = salary_test.iloc[:, 13]

####### Importing the naive Bayes classifiers #######
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.naive_bayes import GaussianNB as GB

classifiers_mb = MB()
classifiers_mb.fit(x_train, y_train)
train_pred_mb = classifiers_mb.predict(x_train)
train_accu_mb = np.mean(train_pred_mb == y_train)  ##77%
pd.crosstab(train_pred_mb, y_train)

test_pred_mb = classifiers_mb.predict(x_test)
test_accu_mb = np.mean(test_pred_mb == y_test)  ##77%
pd.crosstab(test_pred_mb, y_test)

classifiers_gb = GB()
classifiers_gb.fit(x_train, y_train)
train_pred_gb = classifiers_gb.predict(x_train)
train_accu_gb = np.mean(train_pred_gb == y_train)  ##80%
pd.crosstab(train_pred_gb, y_train)

test_pred_gb = classifiers_gb.predict(x_test)
test_accu_gb = np.mean(test_pred_gb == y_test)  ##80%


##### From the runs above, Gaussian naive Bayes gives higher accuracy than multinomial NB on this data.
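
# (Added sketch, not in the original script:) MultinomialNB models non-negative,
# count-like features, which may explain the gap; scaling the features into
# [0, 1] before fitting is a common workaround:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(x_train)
classifiers_mb_scaled = MB()
classifiers_mb_scaled.fit(scaler.transform(x_train), y_train)
print(classifiers_mb_scaled.score(scaler.transform(x_test), y_test))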
Example #7
    def crossRun(self):

        window = Toplevel(self)
        ttk.Label(window, text='Result').grid(column=0, row=0, sticky=W)
        self.crossResult = ScrolledText(window, height=15, width=70)
        self.crossResult.grid(column=0,
                              row=1,
                              sticky=W,
                              columnspan=10,
                              rowspan=5)
        self.crossResult.config(state=DISABLED)
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.neighbors import KNeighborsClassifier as knn
        from sklearn.naive_bayes import GaussianNB as GB
        from sklearn.svm import SVC
        from sklearn.model_selection import cross_validate
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import StandardScaler
        df_drop = df.drop(labels=['sales', 'salary'], axis=1)
        left_col = df_drop['left']
        df_drop.drop(labels=['left'], axis=1, inplace=True)
        df_drop.insert(0, 'left', left_col)
        df_drop.head()
        X = df_drop.iloc[:, 1:8].values
        y = df_drop.iloc[:, 0].values
        X_std = StandardScaler().fit_transform(X)
        sklearn_pca = PCA(n_components=6)
        X_pca = sklearn_pca.fit_transform(X_std)
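        # (Added check, not in the original:) see how much variance the 6
        # retained components actually explain before trusting the scores below.
        print("explained variance kept:",
              sklearn_pca.explained_variance_ratio_.sum())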
        models = [
            "RandomForestClassifier", "Gaussian Naive Bays", "KNN",
            "Logistic_Regression", "Support_Vector"
        ]
        Classification_models = [
            RandomForestClassifier(n_estimators=100),
            GB(),
            knn(n_neighbors=7),
            LogisticRegression(),
            SVC()
        ]
        Model_Accuracy = []
        scoring = {
            'acc': 'accuracy',
            'f1': 'f1',
            'precision': 'precision',
            'recall': 'recall',
            'roc_auc': 'roc_auc'
        }
        for model, model_name in zip(Classification_models, models):
            print(model_name)
            scores = cross_validate(model,
                                    X_pca,
                                    y,
                                    scoring=scoring,
                                    cv=10,
                                    return_train_score=True)
            Model_Accuracy.append(scores)
        self.crossResult.config(state=NORMAL)
        self.crossResult.delete(1.0, END)
        for i, m in zip(Model_Accuracy, models):
            self.crossResult.insert(END, "\n" + m)
            self.crossResult.insert(END, "\n--------\n")
            for j in i:
                self.crossResult.insert(
                    END,
                    str(j) + ": " + str(i[j].mean()) + "\n")
        self.crossResult.config(state=DISABLED)
Example #8
# (Imports assumed from the truncated top of this script:)
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB as GB
from sklearn.naive_bayes import MultinomialNB as MB
from sklearn.metrics import confusion_matrix, classification_report

print(final_test_df)

train_X = final_train_df.iloc[:, 0:13]
train_y = final_train_df.iloc[:, 13]
print(train_X.head())
print(train_y.head())
input()

test_X = final_test_df.iloc[:, 0:13]
test_y = final_test_df.iloc[:, 13]
print(test_X.head())
print(test_y.head())
input()

# Naive Bayes model
ignb = GB()
imnb = MB()

# Building and predicting at the same time
pred_gnb = ignb.fit(train_X, train_y).predict(test_X)  # GaussianNB model
pred_mnb = imnb.fit(train_X, train_y).predict(test_X)  # MultinomialNB model

# Confusion matrix GaussianNB model
print(confusion_matrix(test_y, pred_gnb))
print(pd.crosstab(test_y.values.flatten(), pred_gnb))
print(classification_report(test_y, pred_gnb))  # classification report
print(np.mean(
    pred_gnb == test_y.values.flatten()))  #>> Accuracy = 0.7946879150066402
input()

# Confusion matrix Multinomial model