Example 1
def main():
    data = MushroomData()
    ourTest = MushroomDataDemo()
    y_test, X_test, y_train, X_train = data.get_datasets(eliminate_missing=True)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    # Read one sample description from stdin; convert() turns the raw
    # string into the demo object's encoded features (ourTest.X, ourTest.y).
    test = stdin.read()
    print(test)
    test = ourTest.convert(test)
    print(ourTest.y)

    # clf.predict expects a 2-D array; ourTest.X is assumed to be one.
    y_prediction = clf.predict(ourTest.X)
    print("Our Prediction =", "Edible" if y_prediction[0] == 1 else "Poisonous")
Example 2
def main():
    data = MushroomData()

    print('\nGaussian Naive Bayes: (eliminating missing elements)')
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=True)
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_prediction = clf.predict(X_test)

    y_true = np.array(y_test)
    print('accuracy = %f' % np.mean(y_true == y_prediction))
    print(
        metrics.classification_report(y_true,
                                      y_prediction,
                                      target_names=data.class_labels,
                                      digits=6))

    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False)

    print('\nGaussian Naive Bayes: (using all elements)')
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_prediction = clf.predict(X_test)

    y_true = np.array(y_test)
    print('accuracy = %f' % np.mean(y_true == y_prediction))
    print(
        metrics.classification_report(y_true,
                                      y_prediction,
                                      target_names=data.class_labels,
                                      digits=6))

    print('\nGaussian Naive Bayes: (Ignore stalk-root)')
    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False, ignore=['stalk-root'])

    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_prediction = clf.predict(X_test)

    y_true = np.array(y_test)
    print('accuracy = %f' % np.mean(y_true == y_prediction))
    print(
        metrics.classification_report(y_true,
                                      y_prediction,
                                      target_names=data.class_labels,
                                      digits=6))
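The accuracy line above computes the fraction of matching labels directly with NumPy; sklearn.metrics.accuracy_score gives the same number. A small check of the equivalence on toy labels (not the mushroom data):

import numpy as np
from sklearn.metrics import accuracy_score

y_test = np.array([1, -1, 1, 1, -1])
y_prediction = np.array([1, 1, 1, -1, -1])

# Fraction of positions where predicted and true labels agree.
manual = np.mean(y_test == y_prediction)
library = accuracy_score(y_test, y_prediction)
assert manual == library
print('accuracy = %f' % manual)  # 0.600000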
Example 3
def main():
    # print("testing data class")

    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=True)

    print('missing elements')

    # target = y_test.target
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    y_prediction = clf.predict(X_test)

    # Forming plot
    plot_calibration_curve(clf, 'SVM', 1, y_test, X_test, y_train, X_train)
    plt.show()

    y_true = np.array(y_test)
    print "macro precision , recall , fscore = " + str(
        precision_recall_fscore_support(y_true, y_prediction,
                                        average='macro')) + "\n"
    print "micro precision , recall , fscore = " + str(
        precision_recall_fscore_support(y_true, y_prediction,
                                        average='micro')) + "\n"
    print "weighted precision , recall , fscore = " + str(
        precision_recall_fscore_support(
            y_true, y_prediction, average='weighted')) + "\n"

    print 'accuracy = %f' % (np.mean((list(y_test) - y_prediction) == 0))

    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False)

    print('\nAll Elements')

    # target = y_test.target
    clf = svm.SVC()
    clf.fit(X_train, y_train)
    y_prediction = clf.predict(X_test)

    # Forming plot
    plot_calibration_curve(clf, 'SVM', 1, y_test, X_test, y_train, X_train)
    plt.show()

    y_true = np.array(y_test)
    print "macro precision , recall , fscore = " + str(
        precision_recall_fscore_support(y_true, y_prediction,
                                        average='macro')) + "\n"
    print "micro precision , recall , fscore = " + str(
        precision_recall_fscore_support(y_true, y_prediction,
                                        average='micro')) + "\n"
    print "weighted precision , recall , fscore = " + str(
        precision_recall_fscore_support(
            y_true, y_prediction, average='weighted')) + "\n"

    print 'accuracy = %f' % (np.mean((list(y_test) - y_prediction) == 0))

    pass
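plot_calibration_curve is a project helper whose definition is not shown in these examples. For orientation only, here is a minimal sketch of what such a helper could look like, built on sklearn.calibration.calibration_curve; the body is a guess at the interface, not the repository's implementation, and it assumes binary labels and a classifier exposing predict_proba or decision_function:

import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve


def plot_calibration_curve(clf, name, fig_index, y_test, X_test, y_train, X_train):
    clf.fit(X_train, y_train)
    if hasattr(clf, "predict_proba"):
        prob_pos = clf.predict_proba(X_test)[:, 1]
    else:
        # Min-max scale decision_function output into [0, 1] for binning.
        scores = clf.decision_function(X_test)
        prob_pos = (scores - scores.min()) / (scores.max() - scores.min())
    frac_pos, mean_pred = calibration_curve(y_test, prob_pos, n_bins=10)
    plt.figure(fig_index)
    plt.plot([0, 1], [0, 1], 'k:', label='perfectly calibrated')
    plt.plot(mean_pred, frac_pos, 's-', label=name)
    plt.xlabel('Mean predicted probability')
    plt.ylabel('Fraction of positives')
    plt.legend(loc='lower right')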
Example 4
def main():

    # Linear SVC: (eliminating missing elements)
    print('\nLinear SVC: (eliminating missing elements)')
    data = MushroomData()
    y_test,X_test,y_train,X_train = data.get_datasets(eliminate_missing=True)
    clf = LinearSVC()
    clf.fit(X_train,y_train)
    y_prediction = clf.predict(X_test)

    # Metrics
    y_true = np.array(y_test)
    print('accuracy = %f' % np.mean(y_true == y_prediction))
    print(metrics.classification_report(y_true, y_prediction, target_names=data.class_labels, digits=6))

    # Linear SVC: (using all elements)
    print('\nLinear SVC: (using all elements)')
    data = MushroomData()
    y_test,X_test,y_train,X_train = data.get_datasets(eliminate_missing=False)
    clf = LinearSVC()
    clf.fit(X_train,y_train)
    y_prediction = clf.predict(X_test)

    # Metrics
    y_true = np.array(y_test)
    print('accuracy = %f' % np.mean(y_true == y_prediction))
    print(metrics.classification_report(y_true, y_prediction, target_names=data.class_labels, digits=6))
Example 5
def main():

    # Decision Tree: (eliminating missing elements)
    print('\nDecision Tree: (eliminating missing elements)')
    data = MushroomData()
    y_test,X_test,y_train,X_train = data.get_datasets(eliminate_missing=True)
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    clf.fit(X_train,y_train)
    y_prediction = clf.predict(X_test)
    
    # Calibration Curve Plot 
    plot_calibration_curve(clf, 'Decision Tree', 1, y_test, X_test, y_train, X_train)
    plt.show()

    # Metrics
    y_true = np.array(y_test)
    print('accuracy = %f' % np.mean(y_true == y_prediction))
    print(metrics.classification_report(y_true, y_prediction, target_names=data.class_labels, digits=6))

    # Feature Importances 
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure()
    y_pos = np.arange(len(importances))
    plt.bar(y_pos, importances[indices], align="center", alpha=0.5)
    plt.title("Feature Importance: Decision Tree (missing elements)")
    plt.ylabel('Importance')
    plt.xlabel('Attribute Number')
    plt.xticks(y_pos, indices)
    #plt.show()

    # Decision Tree: (using all elements)
    print('\nDecision Tree: (using all elements)')
    data = MushroomData()
    y_test,X_test,y_train,X_train = data.get_datasets(eliminate_missing=False)
    clf = tree.DecisionTreeClassifier(criterion='entropy')
    clf.fit(X_train,y_train)
    y_prediction = clf.predict(X_test)

    # Calibration Curve Plot 
    plot_calibration_curve(clf, 'Decision Tree', 1, y_test, X_test, y_train, X_train)
    plt.show()

    # Metrics
    y_true = np.array(y_test)
    print('accuracy = %f' % np.mean(y_true == y_prediction))
    print(metrics.classification_report(y_true, y_prediction, target_names=data.class_labels, digits=6))

    # Feature Importances
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    plt.figure()
    y_pos = np.arange(len(importances))
    plt.bar(y_pos, importances[indices], align="center", alpha=0.5)
    plt.title("Feature Importance: Decision Tree (all elements)")
    plt.ylabel('Importance')
    plt.xlabel('Attribute Number')
    plt.xticks(y_pos, indices)
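The feature-importance charts above label bars with bare column indices. Mapping indices back to attribute names makes the ranking much easier to read; a minimal sketch, using a hypothetical feature_names list (whether MushroomData exposes the attribute names is not shown here):

import numpy as np

# Hypothetical names for a few mushroom attributes (UCI dataset naming).
feature_names = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor']
importances = np.array([0.05, 0.02, 0.08, 0.10, 0.75])
indices = np.argsort(importances)[::-1]

# Print a readable ranking instead of bare attribute numbers.
for rank, idx in enumerate(indices, start=1):
    print('%d. %s (%.3f)' % (rank, feature_names[idx], importances[idx]))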
Example 6
def main():

    # Gaussian Naive Bayes: (eliminating missing elements)
    print('Gaussian Naive Bayes: (eliminating missing elements)')
    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=True)
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_prediction = clf.predict(X_test)

    # Calibration Curve Plot
    plot_calibration_curve(clf, 'NAIVE BAYES', 1, y_test, X_test, y_train,
                           X_train)
    plt.show()

    # Metrics
    y_true = np.array(y_test)
    #print "macro precision , recall , fscore = " + str(precision_recall_fscore_support(y_true, y_prediction, average='macro'))+ "\n"
    #print "micro precision , recall , fscore = " + str(precision_recall_fscore_support(y_true, y_prediction, average='micro'))+ "\n"
    #print "weighted precision , recall , fscore = " + str(precision_recall_fscore_support(y_true, y_prediction, average='weighted'))+ "\n"
    print 'accuracy = %f' % (np.mean((list(y_test) - y_prediction) == 0))
    print(
        metrics.classification_report(y_true,
                                      y_prediction,
                                      target_names=data.class_labels,
                                      digits=6))

    # Gaussian Naive Bayes: (using all elements)
    print('\nGaussian Naive Bayes: (using all elements)')
    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False)

    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_prediction = clf.predict(X_test)

    # Calibration Curve Plot
    plot_calibration_curve(clf, 'NAIVE BAYES', 1, y_test, X_test, y_train,
                           X_train)
    plt.show()

    # Metrics
    y_true = np.array(y_test)
    #print "macro precision , recall , fscore = " + str(precision_recall_fscore_support(y_true, y_prediction, average='macro'))+ "\n"
    #print "micro precision , recall , fscore = " + str(precision_recall_fscore_support(y_true, y_prediction, average='micro'))+ "\n"
    #print "weighted precision , recall , fscore = " + str(precision_recall_fscore_support(y_true, y_prediction, average='weighted'))+ "\n"
    print 'accuracy = %f' % (np.mean((list(y_test) - y_prediction) == 0))
    print(
        metrics.classification_report(y_true,
                                      y_prediction,
                                      target_names=data.class_labels,
                                      digits=6))

    print('\nIgnore stalk-root')
    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False, ignore=['stalk-root'])

    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_prediction = clf.predict(X_test)

    # Forming plot
    plot_calibration_curve(clf, 'NAIVE BAYES', 1, y_test, X_test, y_train,
                           X_train)
    plt.show()

    y_true = np.array(y_test)
    print "macro precision , recall , fscore = " + str(
        precision_recall_fscore_support(y_true, y_prediction,
                                        average='macro')) + "\n"
    print "micro precision , recall , fscore = " + str(
        precision_recall_fscore_support(y_true, y_prediction,
                                        average='micro')) + "\n"
    print "weighted precision , recall , fscore = " + str(
        precision_recall_fscore_support(
            y_true, y_prediction, average='weighted')) + "\n"

    print 'accuracy = %f' % (np.mean((list(y_test) - y_prediction) == 0))
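The three average modes reported above combine per-class scores differently: 'macro' is the unweighted mean of per-class scores, 'micro' pools all decisions globally before computing the score, and 'weighted' is a support-weighted mean of the per-class scores. A small runnable comparison on toy labels (unrelated to the mushroom data):

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

y_true = np.array([1, 1, 1, 1, -1, -1])
y_pred = np.array([1, 1, 1, -1, -1, 1])

for avg in ('macro', 'micro', 'weighted'):
    p, r, f, _ = precision_recall_fscore_support(y_true, y_pred, average=avg)
    print('%-8s precision=%.3f recall=%.3f fscore=%.3f' % (avg, p, r, f))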
Example 7
def main():
    # Get dataset from MushroomData
    n_classes = 1
    data = MushroomData()
    typelist = ["missing data", "all data", "stalk-root"]

    fprList = []
    tprList = []
    roc_aucList = []
    for j in range(3):
        if j == 0:
            y_test, X_test, y_train, X_train = data.get_datasets(
                eliminate_missing=True)
        elif j == 1:
            y_test, X_test, y_train, X_train = data.get_datasets(
                eliminate_missing=False)
        elif j == 2:
            y_test, X_test, y_train, X_train = data.get_datasets(
                eliminate_missing=False, ignore=['stalk-root'])

        X = np.array(data.X)
        y = np.array(data.y)
        y = label_binarize(y, classes=[1, -1])

        clf = LinearSVC()
        scoreList = ["linearSVC"]

        # Re-split the binarized labels, then fit and score the classifier.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.5,
                                                            random_state=0)
        y_score = clf.fit(X_train, y_train).decision_function(X_test)
        y_score = np.reshape(y_score, y_test.shape)

        y_test_prediction = clf.predict(X_test)

        # One ROC curve per binarized class (a single column here).
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        fprList.append(fpr)
        tprList.append(tpr)
        roc_aucList.append(roc_auc)

        y_true = np.array(y_test)
        print(
            metrics.classification_report(y_true,
                                          y_test_prediction,
                                          target_names=data.class_labels,
                                          digits=6))

    plotROC(fprList, tprList, roc_aucList, scoreList, j + 1, typelist)
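Why n_classes = 1 above: with only two classes, label_binarize returns a single indicator column rather than one column per class, so there is exactly one ROC curve to draw. A quick illustration of the binary collapse:

import numpy as np
from sklearn.preprocessing import label_binarize

y = np.array([1, -1, -1, 1])
# Binary input collapses to one column: 0 for classes[0] (here 1) and
# 1 for classes[1] (here -1), so the column's positive class is -1.
print(label_binarize(y, classes=[1, -1]))
# [[0]
#  [1]
#  [1]
#  [0]]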
Example 8
def main():
    # Get dataset from MushroomData
    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=True)

    X = np.array(data.X)
    y = np.array(data.y)
    y = label_binarize(y, classes=[1, -1])

    clf1 = GaussianNB()
    clf2 = svm.SVC()
    clf3 = LinearSVC()
    clfList = [clf1, clf2, clf3]
    scoreList = [score("GaussianNB"), score("SVM"), score("linearSVC")]

    # For each classifier, fit it and plot its calibration curve.
    count = 0
    for clf in clfList:
        fitandgraph(clf, scoreList, count, y_test, X_test, y_train, X_train,
                    data)
        count += 1

# ---------------------------------------------------------------------------------------------
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False)

    X = np.array(data.X)
    y = np.array(data.y)
    y = label_binarize(y, classes=[1, -1])

    clf1 = GaussianNB()
    clf2 = svm.SVC()
    clf3 = LinearSVC()
    clfList = [clf1, clf2, clf3]
    scoreList = [score("GaussianNB"), score("SVM"), score("linearSVC")]

    count = 0
    for clf in clfList:
        fitandgraph(clf, scoreList, count, y_test, X_test, y_train, X_train,
                    data)
        count += 1


# ------------------------------------------------------------------------------------------------
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False, ignore=['stalk-root'])

    X = np.array(data.X)
    y = np.array(data.y)
    y = label_binarize(y, classes=[1, -1])

    clf1 = GaussianNB()
    clf2 = svm.SVC()
    clf3 = LinearSVC()
    clfList = [clf1, clf2, clf3]
    scoreList = [score("GaussianNB"), score("SVM"), score("linearSVC")]

    count = 0
    for clf in clfList:
        fitandgraph(clf, scoreList, count, y_test, X_test, y_train, X_train,
                    data)
        count += 1
Example 9
def main():
    # Get dataset from MushroomData
    data = MushroomData()
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=True)
    totalCount = 0

    X = np.array(data.X)
    y = np.array(data.y)
    y = label_binarize(y, classes=[1, -1])

    clf1 = GaussianNB()
    clf2 = svm.SVC()
    clf3 = LinearSVC()
    clfList = [clf1, clf2, clf3]
    scoreList = ["GaussianNB", "SVM", "linearSVC"]

    fprList = []
    tprList = []
    roc_aucList = []

    # For each classifier, build an ROC curve from its test-set predictions.
    count = 0
    for clf in clfList:

        clf.fit(X_train, y_train)
        y_test_prediction = clf.predict(X_test)

        fpr, tpr, _ = roc_curve(y_test, y_test_prediction)
        roc_auc = auc(fpr, tpr)

        fprList.append(fpr)
        tprList.append(tpr)
        roc_aucList.append(roc_auc)

        y_true = np.array(y_test)

        print(scoreList[count])
        print('accuracy = %f' % np.mean(y_true == y_test_prediction))
        print(
            metrics.classification_report(y_true,
                                          y_test_prediction,
                                          target_names=data.class_labels,
                                          digits=6))

        count += 1
    totalCount += count
    # ---------------------------------------------------------------------------------------------
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False)
    X = np.array(data.X)
    y = np.array(data.y)
    y = label_binarize(y, classes=[1, -1])

    clf1 = GaussianNB()
    clf2 = svm.SVC()
    clf3 = LinearSVC()
    clfList = [clf1, clf2, clf3]
    scoreList.extend(["GaussianNB", "SVM", "linearSVC"])

    count = 0
    for clf in clfList:
        clf.fit(X_train, y_train)
        y_test_prediction = clf.predict(X_test)

        fpr, tpr, _ = roc_curve(y_test, y_test_prediction)
        roc_auc = auc(fpr, tpr)

        fprList.append(fpr)
        tprList.append(tpr)
        roc_aucList.append(roc_auc)

        y_true = np.array(y_test)

        print(scoreList[count])
        print('accuracy = %f' % np.mean(y_true == y_test_prediction))
        print(
            metrics.classification_report(y_true,
                                          y_test_prediction,
                                          target_names=data.class_labels,
                                          digits=6))

        count += 1
    totalCount += count

    # ------------------------------------------------------------------------------------------------
    y_test, X_test, y_train, X_train = data.get_datasets(
        eliminate_missing=False, ignore=['stalk-root'])

    X = np.array(data.X)
    y = np.array(data.y)
    y = label_binarize(y, classes=[1, -1])

    clf1 = GaussianNB()
    clf2 = svm.SVC()
    clf3 = LinearSVC()
    clfList = [clf1, clf2, clf3]
    scoreList.extend(["GaussianNB", "SVM", "linearSVC"])

    count = 0
    for clf in clfList:
        clf.fit(X_train, y_train)
        y_test_prediction = clf.predict(X_test)

        fpr, tpr, _ = roc_curve(y_test, y_test_prediction)
        roc_auc = auc(fpr, tpr)

        fprList.append(fpr)
        tprList.append(tpr)
        roc_aucList.append(roc_auc)

        y_true = np.array(y_test)

        print(scoreList[count])
        print('accuracy = %f' % np.mean(y_true == y_test_prediction))
        print(
            metrics.classification_report(y_true,
                                          y_test_prediction,
                                          target_names=data.class_labels,
                                          digits=6))
        count += 1
    totalCount += count
    plotROC(fprList, tprList, roc_aucList, scoreList, totalCount)
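A caveat on the ROC computation in this example: passing hard predict() labels to roc_curve yields at most one interior point, so the curve and AUC are much coarser than those built from continuous scores (as Example 7 does with decision_function). A minimal sketch of the difference on toy data:

import numpy as np
from sklearn.metrics import auc, roc_curve
from sklearn.svm import LinearSVC

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = (X[:, 0] > 0.5).astype(int)

clf = LinearSVC()
clf.fit(X, y)

# Hard labels give a three-point "curve"; scores trace the full curve.
fpr_hard, tpr_hard, _ = roc_curve(y, clf.predict(X))
fpr_soft, tpr_soft, _ = roc_curve(y, clf.decision_function(X))
print('AUC from labels: %.3f' % auc(fpr_hard, tpr_hard))
print('AUC from scores: %.3f' % auc(fpr_soft, tpr_soft))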
Example 10
# Group: Alix Voorthuyzen,
#        Alice Gibbons,
#        Jason Curt,
#        Matthew Clarkson,
#        Zachary Seselja
#
# Purpose: A Naive Bayes classifier for the mushroom dataset
#          with weighted attributes
#

import numpy as np
from MushroomData import MushroomData
from sklearn import tree
from sklearn import metrics

data = MushroomData()

y_test, X_test, y_train, X_train = data.get_datasets()

edible_probs = [{} for x in range(22)]
inedible_probs = [{} for x in range(22)]
feat_counts = data.feat_counts()


def fit(X, y):
    e_count = 0
    total = len(X)
    for i in range(total):
        if y[i] == 1:
            # edible
            e_count += 1.0