Example #1
def test_classification():
    from numpy import zeros, mean
    t = zeros(len(target))
    t[target == 'setosa'] = 1
    t[target == 'versicolor'] = 2
    t[target == 'virginica'] = 3

    from sklearn.naive_bayes import GaussianNB
    classifier = GaussianNB()
    classifier.fit(data, t)  # training on the iris dataset

    print(classifier.predict(data[:1]))  # predict expects a 2D array, so slice rather than index
    print(t[0])

    from sklearn.model_selection import train_test_split
    train, test, t_train, t_test = train_test_split(data, t, test_size=0.4, random_state=0)

    classifier.fit(train, t_train)  # train
    print(classifier.score(test, t_test))  # test

    from sklearn.metrics import confusion_matrix
    print(confusion_matrix(classifier.predict(test), t_test))

    from sklearn.metrics import classification_report
    print(classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica']))

    from sklearn.model_selection import cross_val_score
    # cross-validation with 6 folds
    scores = cross_val_score(classifier, data, t, cv=6)
    print(scores)

    print(mean(scores))
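
# The hand-rolled label mapping above can also be done with sklearn's LabelEncoder;
# a minimal sketch (assumes the same `target` string array; note it assigns 0..2, not 1..3):
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
t_encoded = le.fit_transform(target)  # 'setosa' -> 0, 'versicolor' -> 1, 'virginica' -> 2
print(le.classes_)                    # the original label names, in encoded order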
Example #2
def crossvalidate(X_trn, Y_trn):
    """Cross-validation with comparison to baselines that predict only good or only bad"""
    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    X_train, X_test, Y_train, Y_test = train_test_split(X_trn.toarray(), Y_trn, test_size=0.4, random_state=1)
    dumb_labels1 = np.ones(len(Y_test), dtype=int)            # labels all 1s
    dumb_labels2 = np.zeros(len(Y_test), dtype=int)           # labels all 0s
    dumb_labels3 = np.random.randint(2, size=(len(Y_test),))  # random labels
    clf = GaussianNB()
    #clf = Perceptron()
    #clf = SGDClassifier()
    #clf = MultinomialNB()
    #clf = KNeighborsClassifier()
    #clf = LinearSVC()
    clf.fit(X_train, Y_train)
    accuracy = clf.score(X_test, Y_test)
    # score each baseline against the true test labels; the original scored clf
    # against the dumb labels, which measures agreement, not baseline accuracy
    dumb_clf1_score = accuracy_score(Y_test, dumb_labels1)
    dumb_clf2_score = accuracy_score(Y_test, dumb_labels2)
    dumb_clf3_score = accuracy_score(Y_test, dumb_labels3)
    print("Classifier score : ", accuracy)
    print("Dumb classifier with all 1s : ", dumb_clf1_score)
    print("Dumb classifier with all 0s : ", dumb_clf2_score)
    print("Dumb classifier with random sequence : ", dumb_clf3_score)
    return accuracy
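
# sklearn ships ready-made baselines for this comparison; a minimal sketch with
# DummyClassifier (reuses X_train/Y_train/X_test/Y_test from the split above):
from sklearn.dummy import DummyClassifier
for name, strategy, constant in [("all 1s", "constant", 1),
                                 ("all 0s", "constant", 0),
                                 ("random", "uniform", None)]:
    dummy = DummyClassifier(strategy=strategy, constant=constant, random_state=0)
    dummy.fit(X_train, Y_train)  # fitting is required even for constant strategies
    print(name, ":", dummy.score(X_test, Y_test))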
Example #3
def get_GNB(Xtrain, Xtest, Ytrain, Ytest):
    gnb = GaussianNB()
    gnb.fit(Xtrain, Ytrain)
    scores = np.empty(2)  # only two scores are stored: train and test
    scores[0] = gnb.score(Xtrain, Ytrain)
    scores[1] = gnb.score(Xtest, Ytest)
    print('GNB, train: {0:.02f}% '.format(scores[0] * 100))
    print('GNB, test: {0:.02f}% '.format(scores[1] * 100))
    return gnb
Example #4
def get_GNB(Xtrain, Ytrain, Xtest=None, Ytest=None, verbose=0):
    gnb = GaussianNB()
    gnb.fit(Xtrain, Ytrain)

    if verbose == 1:
        scores = np.empty(2)
        scores[0] = gnb.score(Xtrain, Ytrain)
        print('GNB, train: {0:.02f}% '.format(scores[0] * 100))
        if Xtest is not None:
            scores[1] = gnb.score(Xtest, Ytest)
            print('GNB, test: {0:.02f}% '.format(scores[1] * 100))
    return gnb
Example #5
def cvalidate():
    from sklearn.model_selection import train_test_split
    targetset = np.genfromtxt(open('trainLabels.csv', 'r'), dtype='f16')
    y = [x for x in targetset]

    trainset = np.genfromtxt(open('train.csv', 'r'), delimiter=',', dtype='f16')
    X = np.array([x for x in trainset])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    gnb = GaussianNB()
    X_train, X_test = decomposition_pca(X_train, X_test)
    gnb.fit(X_train, y_train)

    print(gnb.score(X_test, y_test))
Example #6
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()

    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)


    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example, 
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    
    #from sklearn.metrics import accuracy_score
    #accuracy = accuracy_score(labels_test, pred)
    
    accuracy = clf.score(features_test, labels_test)
    return accuracy
def NB(text):
    ### features_train and features_test are the features for the training
    ### and testing datasets, respectively
    ### labels_train and labels_test are the corresponding item labels
    features_train, features_test, labels_train, labels_test = Preprocess()
    Ifeatures_train, Ifeatures_test, Ilabels_train = preprocess_input([text])

    # classification goes here

    clf = GaussianNB()

    # training
    train_t0 = time()
    clf.fit(features_train, labels_train)
    train_t1 = time()

    # prediction or testing
    test_t0 = time()
    predict = clf.predict(features_test)
    test_t1 = time()

    print("accuracy: ", clf.score(features_test, labels_test))
    print("#################################")
    print("train time: ", round(train_t1 - train_t0, 3), "s")
    print("prediction time: ", round(test_t1 - test_t0, 3), "s")

    print("#################################")

    clf.fit(Ifeatures_train, Ilabels_train)
    # str(...)[1] pulls the first digit out of the predicted array's repr, e.g. "[1]" -> "1"
    print("prediction of ", str(clf.predict(Ifeatures_test))[1])

    #print("prediction of ", clf.predict(preprocess_input(text)))
    return str(clf.predict(Ifeatures_test))[1]
import traceback  # used by the error-logging except blocks below

class GaussianNBcls(object):
    """Thin wrapper around GaussianNB that logs any exception traceback"""
    def __init__(self):
        self.gnb_cls = GaussianNB()
        self.prediction = None
        self.train_x = None
        self.train_y = None
        self.test_x = None

    def train_model(self, train_x, train_y):
        try:
            self.train_x = train_x
            self.train_y = train_y
            self.gnb_cls.fit(train_x, train_y)
        except:
            print(traceback.format_exc())

    def predict(self, test_x):
        try:
            self.test_x = test_x
            self.prediction = self.gnb_cls.predict(test_x)
            return self.prediction
        except:
            print(traceback.format_exc())

    def accuracy_score(self, test_y):
        try:
            # return r2_score(test_y, self.prediction)
            return self.gnb_cls.score(self.test_x, test_y)
        except:
            print(traceback.format_exc())
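
# A usage sketch of the wrapper above on toy data (arrays are made up for illustration):
import numpy as np
X_toy = np.array([[0.0], [0.1], [1.0], [1.1]])
y_toy = np.array([0, 0, 1, 1])
model = GaussianNBcls()
model.train_model(X_toy, y_toy)
print(model.predict(X_toy))         # also stores test_x internally for scoring
print(model.accuracy_score(y_toy))  # scores against the last predicted inputs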
Example #9
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier
    clf = GaussianNB()

    t0 = time()
    ### fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)
    print("training time:", round(time() - t0, 3), "s")

    ### use the trained classifier to predict labels for the test features
    t1 = time()
    pred = clf.predict(features_test)
    print("predicting time:", round(time() - t1, 3), "s")

    ### calculate and return the accuracy on the test data
    ### this is slightly different than the example,
    ### where we just print the accuracy
    ### you might need to import an sklearn module
    accuracy = clf.score(features_test, labels_test)
    return accuracy
class PriceModel(object):
    """Classification model used to predict future price movements"""
    def __init__(self, algorithm='gnb'):
        self.algorithm = algorithm

        if algorithm == 'svm':
            self.clf = SVC(kernel='rbf')
        elif algorithm == 'rf':
            self.clf = RandomForestClassifier(n_estimators=10,
                                              max_depth=None,
                                              min_samples_split=2,  # sklearn requires min_samples_split >= 2
                                              random_state=0)
        elif algorithm == 'lr':
            self.clf = LogisticRegression()
        elif algorithm == 'knn':
            self.clf = KNeighborsClassifier(n_neighbors=3)
        else:
            # Naive Bayes
            self.clf = GaussianNB()

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def score(self, X_test, y_test):
        return self.clf.score(X_test, y_test)
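
# A usage sketch of PriceModel (X_train/y_train/X_test/y_test are assumed to exist):
model = PriceModel(algorithm='gnb')  # any unrecognized name also falls through to GaussianNB
model.train(X_train, y_train)
print(model.score(X_test, y_test))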
Example #11
def trainData(username):
    """
    Trains the data based on the user's performance so far
    Returns a trained Gaussian Naive Bayes model and updates the result collection
    """
    X = getFeatures(username)
    Y = getClassList(username)

    trainX = np.array(X)
    trainY = np.array(Y)

    gnb = GaussianNB()
    gnb.fit(trainX, trainY)
    print("Score with Naive Bayes: ", gnb.score(trainX, trainY))

    testData = words.posts.find({}, {'id': 1,
                                     'points': 1,
                                     'diff': 1,
                                     '_id': 0})
    testData = map(lambda x: (x['id'], x['points'], x['diff']), testData)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        for data in testData:
            testWord = words.posts.find_one({'id': data[0]}, {'word': 1, '_id': 0})['word']
            wordClass = setWordClass(gnb.predict_proba([data])[0])  # predict_proba expects a 2D array
            classWord = result.posts.update_one({'username': username},
                                                {'$set': {testWord: wordClass}},
                                                upsert=True)  # update() was removed in pymongo 4
def gaussian_bayes_test(self):
    print('gaussian bayes test')
    g_bayes_clf = GaussianNB()
    print('cross validation score', cross_val_score(g_bayes_clf, self.x_data, self.y_data))
    start_time = time.time()
    g_bayes_clf.fit(self.x_train, self.y_train)
    print('score', g_bayes_clf.score(self.x_test, self.y_test))
    print('time cost', time.time() - start_time)
Example #13
def Accuracy(features_train,labels_train,features_test,labels_test):

    clf = GaussianNB()

    clf.fit(features_train,labels_train)

    pred = clf.predict(features_test)

    return clf.score(features_test,labels_test)    
def run_naive_bayes(self):
    print("Running......")
    clf = GaussianNB()
    clf.fit(self.features_train, self.labels_train)
    pred = clf.predict(self.features_test)
    accuracy = clf.score(self.features_test, self.labels_test)
    #Save model and performance
    self.save_model(clf, "Naive Bayes")
    self.save_performance("Naive Bayes", accuracy)
def Gaussian_NB(X, y, tst_size):
    # train_test_split now lives in sklearn.model_selection
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=tst_size, random_state=0)
    clf = GaussianNB()
    score = 0
    for i in range(100):
        clf.fit(X_train, y_train)
        score += clf.score(X_test, y_test)
    score = score / 100
    return score
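
# The loop above refits the same deterministic model on one fixed split, so every
# iteration returns the same score. To average over genuinely different splits, a
# sketch with ShuffleSplit (assumes the same X and y):
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.naive_bayes import GaussianNB
cv = ShuffleSplit(n_splits=100, test_size=0.4, random_state=0)
scores = cross_val_score(GaussianNB(), X, y, cv=cv)
print(scores.mean())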
def naiveBayesClassifierTraining(compounds_all):
    print("Building naive Bayes classifier (" + str(NB_FOLDS) + "-fold cross-validation)...")
    # get the data
    keys = compounds_all.keys()
    fingerprint_data = [compounds_all[cmpnd_id]['fingerprint'] for cmpnd_id in keys]
    fingerprint_data = numpy.asarray(fingerprint_data)
    activity_data = [compounds_all[cmpnd_id]['active'] for cmpnd_id in keys]
    activity_data = numpy.asarray(activity_data)

    # perform stratified K-fold cross-validation
    classifier = GaussianNB()
    # StratifiedKFold now lives in sklearn.model_selection and splits via .split()
    kfold_xv_strat = StratifiedKFold(n_splits=NB_FOLDS)
    confusion_matrices = []
    probabilities = []
    scores = []
    models = []
    true_activities = []
    aucs = []
    for train, test in kfold_xv_strat.split(fingerprint_data, activity_data):
        fingerprint_data_train = fingerprint_data[train]
        fingerprint_data_test = fingerprint_data[test]
        activity_data_train = activity_data[train]
        activity_data_test = activity_data[test]

        # model building
        classifier.fit(fingerprint_data_train, activity_data_train)

        # testing
        activity_data_predictions = classifier.predict(fingerprint_data_test)
        models.append(classifier)

        probability_estimates = classifier.predict_proba(fingerprint_data_test)
        probabilities.append(probability_estimates)

        scores.append(classifier.score(fingerprint_data_test, activity_data_test))

        activity_confusion_matrix = confusion_matrix(activity_data_test, activity_data_predictions)
        confusion_matrices.append(activity_confusion_matrix)

        true_activities.append(activity_data_test)

        # ROC curves
        fpr, tpr, thresholds = roc_curve(activity_data_test, probability_estimates[:, 1])
        aucs.append(auc(fpr, tpr))
    classifier.fit(fingerprint_data, activity_data)
    print("Done.")
    return {
        'confusion_matrices' : confusion_matrices
        , 'probabilities' : probabilities
        , 'scores' : scores
        , 'models' : models
        , 'true_activity_data' : true_activities
        , 'AUCs' : aucs
        , 'fingerprint_data' : fingerprint_data
        , 'activity_data' : activity_data
        , 'final_model' : classifier
    }
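
# Note that `models` above holds one reference per fold to the same estimator, which
# is refit each fold (and finally on all data). To keep an independent snapshot per
# fold, a sketch using sklearn.base.clone inside the loop:
from sklearn.base import clone
fold_model = clone(classifier).fit(fingerprint_data_train, activity_data_train)
models.append(fold_model)  # each entry is now its own fitted estimator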
Example #17
def author_id(f_train, f_test, l_train, l_test):
    clf = GaussianNB()
    t0 = time()
    clf.fit(f_train, l_train)
    print("training time:", round(time() - t0, 3), "s")
    t0 = time()
    pred = clf.predict(f_test)
    print("prediction time:", round(time() - t0, 3), "s")
    return 'accuracy: %f' % clf.score(f_test, l_test)
Example #18
def run_test(trainData, trainLabels, testData, testLabels):
  start_time = time()
  classifier = GaussianNB()
  classifier.fit(trainData, trainLabels)
  score = classifier.score(testData, testLabels)
  duration = time() - start_time
  print("training set size: " + str(len(trainData)))
  print("score: " + str(score))
  print("time: " + str(duration) + "\n")
def classify(features_train, labels_train, features_test, labels_test):
  classifier = GaussianNB()
  t0 = time()
  classifier.fit(features_train, labels_train)
  print("training time: ", round(time() - t0), "s")
  t1 = time()
  classifier.predict(features_test)
  print("predicting time: ", round(time() - t1), "s")
  return classifier.score(features_test, labels_test)
Example #20
def GNB_select_cv(X, Y, num_features):
    scores = []
    skf = StratifiedKFold(n_splits=10)  # from sklearn.model_selection import StratifiedKFold
    for train, test in skf.split(X, Y):
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        XRF_train, imp, ind, std = fitRF(X_train, y_train, est=2000)  # RF-based feature selection
        XRF_test = X_test[:, ind]  # reorder test set after RF selection
        clf = GaussianNB()
        clf.fit(XRF_train[:, 0:num_features], y_train)
        scores.append(clf.score(XRF_test[:, 0:num_features], y_test))
    score = np.mean(scores)
    return score
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """ compute the accuracy of your Naive Bayes classifier """
    # import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB
    # create classifier
    clf = GaussianNB()
    # fit the classifier on the training features and labels
    clf.fit(features_train, labels_train)

    # use the trained classifier to predict labels for the test features
    # calculate and return the accuracy on the test data
    accuracy = clf.score(features_test, labels_test)
    return accuracy
Example #22
def NBClassifier(filename, split_ratio):
    print("-" * 15, "Naive Bayes Classifier", "-" * 15)

    X, Y, X_labels, Y_labels = split_data(filename, split_ratio)

    # print(X.shape, Y.shape, X_labels.shape, Y_labels.shape)

    nb_model = GaussianNB()
    nb_model.fit(X, X_labels)

    print("\n accuracy =", nb_model.score(Y, Y_labels, sample_weight=None))

    print("-" * 50)
Example #23
def sklearn_model():
    """Fits the (parametric) Gaussian Naive Bayes classifier from sklearn on the iris
    dataset."""
    # load iris data, perform train/test split
    iris = load_iris()
    tts = train_test_split(iris.data, iris.target, train_size=TRAIN_PCT)  # from sklearn.model_selection
    train_features, test_features, train_labels, test_labels = tts

    # train (gaussian) Naive Bayes model, make predictions on test set
    gnb = GaussianNB().fit(train_features, train_labels)
    predicted_labels = gnb.predict(test_features)

    # show accuracy pct
    print("accuracy = {0} %".format(round(100 * gnb.score(test_features, test_labels))))
class NaiveBayes():
    def __init__(self):
        self.clf = GaussianNB()
        self.accuracy = 0
        self.y_out = []

    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train.ravel())

    def test(self, X_test):
        self.y_out = self.clf.predict(X_test)

    def score(self, X_test, y_test):
        self.accuracy = self.clf.score(X_test, y_test.ravel())
def bayes():
    ## Naive Bayes
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    gnb.fit(Xtrn, Ytrn)
    print(gnb.score(Xtrn, Ytrn))
    print(gnb.score(Xval1, Yval1))
    print(gnb.score(Xval2, Yval2))
    print(gnb.score(Xval3, Yval3))
Example #26
def clsfr():
    # needs: from numpy import zeros; from sklearn.model_selection import train_test_split;
    # from sklearn.metrics import classification_report
    train1_err = []
    # test1_err = []
    train2_err = []
    test2_err = []
    t = zeros(len(target))
    t[target == 'setosa'] = 1
    t[target == 'versicolor'] = 2
    t[target == 'virginica'] = 3
    classifier = GaussianNB()
    classifier.fit(data, t)  # training on the iris dataset
    for i in range(len(t)):
        if classifier.predict(data[i:i+1])[0] != t[i]:  # predict expects a 2D array
            train1_err.append((classifier.predict(data[i:i+1]), t[i]))
    train, test, t_train, t_test = train_test_split(data, t, test_size=0.4, random_state=0)
    classifier.fit(train, t_train)  # train
    for i in range(len(t_train)):
        if classifier.predict(train[i:i+1])[0] != t_train[i]:
            train2_err.append((classifier.predict(train[i:i+1]), t_train[i]))
    for i in range(len(t_test)):
        if classifier.predict(test[i:i+1])[0] != t_test[i]:
            test2_err.append((classifier.predict(test[i:i+1]), t_test[i]))
    print('train error: ', train1_err)
    print('train count: ', len(t))
    print('train error count: ', len(train1_err))
    print('accuracy rate: ', classifier.score(data, t))
    print('*******************************************')
    print('train error: ', train2_err)
    print('test error: ', test2_err)
    print('train count: ', len(train))
    print('train error count: ', len(train2_err))
    print('test count: ', len(test))
    print('test error count: ', len(test2_err))
    print('test accuracy rate: ', classifier.score(test, t_test))  # test
    print('train accuracy rate: ', classifier.score(train, t_train))
    print('****************************************************')
    print(classification_report(classifier.predict(test), t_test, target_names=['setosa', 'versicolor', 'virginica']))
Example #27
def nb_classify(self):
    print("Naive Bayes")

    clf = GaussianNB()
    clf.fit(self.descr, self.target)
    mean = clf.score(self.test_descr, self.test_target)

    pred = clf.predict(self.test_descr)
    accuracy = np.where(pred == self.test_target, 1, 0).sum() / float(len(self.test_target))
    print("Accuracy: %3f" % accuracy)

    print("Mean : %3f" % mean)
    print("Probability ", clf.class_prior_)
    print("Mean of each feature per class ", clf.theta_)
    print("Variance of each feature per class ", clf.var_)  # sigma_ was renamed var_ in sklearn 1.0
    print("Predict Probability ", clf.predict_proba(self.descr))
Example #28
def compute_bayes_error():
    np.random.seed(0)
    mu1 = [0, 0]
    cov_mat_1 = 1 * np.eye(2)

    mu2 = [0, 0]
    cov_mat_2 = 16 * np.eye(2)

    #create unified training set from two normal distributions 
    X_vect = np.concatenate([np.random.multivariate_normal(mu1, cov_mat_1, 5000),
                        np.random.multivariate_normal(mu2, cov_mat_2, 5000)])
    y = np.zeros(10000)
    y[5000:] = 1

    # fit the naive Bayes classifier
    clf = GaussianNB()
    clf.fit(X_vect, y)
    # predict the classification probabilities on a grid
    xlim = (-5, 5)
    ylim = (-5, 5)
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], 50),
                         np.linspace(ylim[0], ylim[1], 70))
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z[:, 1].reshape(xx.shape)

    acc = clf.score(X_vect, y)
    # error rate
    error = 1 - acc

    # add decision boundary plot
    fig = plt.figure(figsize=(8, 8))
    fig.suptitle('decision boundary', fontsize=12)
    fig = plt.gcf()
    # set display window title (canvas.set_window_title moved to canvas.manager in newer matplotlib)
    fig.canvas.manager.set_window_title('Decision Boundary')
    ax = fig.add_subplot(111)
    p1 = ax.scatter(X_vect[:, 0], X_vect[:, 1], c=y, cmap=plt.get_cmap('Set3'), zorder=5)
    p2 = ax.contour(xx, yy, Z, [0.5],linewidths=3, colors='k')
   
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    ax.set_xlabel('$x1$')
    ax.set_ylabel('$x2$')
    plt.clabel(p2, inline=3, fontsize=5)
    p2.collections[0].set_label("Decision Boundary")
    ax.legend(loc='lower right')
    return error
def gaussian_data(X, y):
    """
    Naive Bayes algorithm
    :param X: feature matrix
    :param y: labels
    :return: accuracy score and confusion matrix
    """
    from sklearn import metrics
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(X, y)
    expected = y
    predicted = model.predict(X)
    score = model.score(X, y)
    # print(metrics.classification_report(expected, predicted, labels=[0, 1], target_names=['benign URL', 'malicious URL']))
    cm = metrics.confusion_matrix(expected, predicted)
    return score, cm
Example #30
def gnb(training_data, training_target, testing_data, testing_target):
    """
    Fits a Gaussian naive Bayes classifier on the training data and
    returns its mean accuracy on the testing data.
    """
    clf = GaussianNB()
    clf.fit(training_data, training_target)
    return clf.score(testing_data, testing_target)
# if features[obs][4] > 10:
#     print(features[obs][4])

#plt.plot(np.array(features[:,0]), np.array(features[:,1]))

# for k in data_dict:
#     for j in data_dict[k]:
#         print(data_dict["salary"][j])

# for k in data_dict:
#     print(data_dict[k]["bonus"])

###GAUSSIAN
from sklearn.naive_bayes import GaussianNB
clfGAU = GaussianNB().fit(features, labels)
print("Gaussian cf score is %f " % clfGAU.score(features, labels))

###SVM
from sklearn import svm
clfSVM = svm.SVC(kernel="rbf", C=0.001, gamma=0.001).fit(features, labels)
print("classic SVM score is %f " % clfSVM.score(features, labels))
# predSVM = clfSVM.predict(features)
#print("classic accuracy_score score is %f " % accuracy_score(labels, predSVM))

###Decision Tree
from sklearn import tree
clfDT = tree.DecisionTreeClassifier(min_samples_split=50).fit(features, labels)
print("decision tree score % f" % clfDT.score(features, labels))

print("features_list", features_list)
print('most important features DT', clfDT.feature_importances_)
Example #32
# GaussianNB

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(x_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

score = classifier.score(x_test, y_test)
print(score)

# from sklearn import metrics
# metrics.accuracy_score(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
x_set, y_set = x_train, y_train
X1, X2 = nm.meshgrid(
    nm.arange(start=x_set[:, 0].min() - 1,
              stop=x_set[:, 0].max() + 1,
              step=0.01),
    nm.arange(start=x_set[:, 1].min() - 1,
              stop=x_set[:, 1].max() + 1,
              step=0.01))
Example #33
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print(logreg.score(X_train, y_train))

svc = SVC()
svc.fit(X_train, y_train)
print(svc.score(X_train, y_train))

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
print(knn.score(X_train, y_train))

gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
print(gaussian.score(X_train, y_train))

linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)
print(linear_svc.score(X_train, y_train))

sgd = SGDClassifier()
sgd.fit(X_train, y_train)
print(sgd.score(X_train, y_train))

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
print(decision_tree.score(X_train, y_train))

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
Example #34
import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()
clf.fit(features_train, labels_train)
predictions = clf.predict(features_test)

total = len(labels_test)
errors = 0

for i in range(total):
    if predictions[i] != labels_test[i]:
        errors += 1
correct = float(total - errors)
total = float(total)
accuracy = correct / total
print(clf.score(features_test, labels_test))  # can also use print(accuracy)

#########################################################
Example #35
    Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
from sklearn.naive_bayes import GaussianNB

### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

#########################################################
### your code goes here ###
gnb = GaussianNB()

t0 = time()
gnb.fit(features_train, labels_train)  # fit returns the estimator itself, not predictions
print("training time:", round(time() - t0, 3), "s")

t0 = time()
y_pred = gnb.predict(features_test)
print("predicting time:", round(time() - t0, 3), "s")

accuracy = gnb.score(features_test, labels_test)
print(accuracy)

#########################################################
Example #36
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

df = pd.read_csv('./glass.csv')
y = df["Type"]
df1 = df.drop("Type", axis=1).copy()
# create training and testing
X_train, X_test, Y_train, Y_test = train_test_split(df1, y, test_size=0.15)

model = GaussianNB()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
acc_gnb = round(model.score(X_test, Y_test) * 100, 2)
print("Naive Bayes accuracy with test is:", acc_gnb)
"""plt.plot(Y_test,label="Y_test")
plt.plot(Y_pred,label="Y_pred")
plt.legend()
plt.show()"""
for item in survived:
    if(item==0):
        colors.append('Red')
    else:
        colors.append('Green')
# plt.scatter(ages, fares, s=50, color=colors)
# s means size, we want size to be bigger
# plt.show()

# Step 3: Build a NB Model
Features = dataframe.drop(['Survived'], axis=1).values
Targets = dataframe['Survived'].values
Features_Train, Target_Train = Features[:710], Targets[:710]
# there are total 887 data points and 80% of that will be 710
Features_Test, Targets_test = Features[710:], Targets[710:]
# print(Features_Test)

model = GaussianNB()
model.fit(Features_Train, Target_Train)

# Step 4: Print Predicted vs Actuals
predicted_values = model.predict(Features_Test)
for item in zip(Targets_test, predicted_values):
    print('Actual was:', item[0], 'Predicted was', item[1])

# Step 5: Estimate Error
print('Accuracy is:', model.score(Features_Test, Targets_test))
# we didn't pass targets_test and predicted_values because
# the score method computes predictions from features_test itself, compares them
# with target_test, and returns the accuracy
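
# That claim is easy to verify; a small sketch reusing model, Features_Test, and
# Targets_test from above:
from sklearn.metrics import accuracy_score
manual = accuracy_score(Targets_test, model.predict(Features_Test))
assert manual == model.score(Features_Test, Targets_test)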
Example #38
        batch_x = uncompress(batch_x, 86796)
        # print(batch_x.shape)

        batch_x = np.sum(batch_x, axis=1)
        # print(batch_x.shape)
        batch_x = np.squeeze(batch_x)
        # print(batch_x.shape)

        # print('y')
        # print(batch_y.shape)
        batch_y = np.repeat(batch_y, 50, axis=0)
        # print(batch_y.shape)

        # gnb.partial_fit(batch_x, batch_y, classes=[0, 1])

        x = gnb.score(batch_x, batch_y)

        print(x)

        s += x
        i += 1

        print('average : ', s / i)

    # gnb.fit(X, Y)
    #
    print(s / i)

    # cPickle was merged into pickle in Python 3
    fp = open(os.path.join('nb_logs', 'nb_object' + '.save'), 'wb')
    pickle.dump(gnb, fp, protocol=pickle.HIGHEST_PROTOCOL)
    fp.close()
print("Naive Bayes (non-scaled) cross-validation accuracy: %0.2f (+/- %0.2f)"
      % (NB_NonScaled_cross_val_scores.mean(),
         NB_NonScaled_cross_val_scores.std() * 2))

# In[24]:

if NB_NonScaled_cross_val_scores.mean() > 0.97:
    print("The Naive Bayes Model (Non Scaled) is overfitting in this case.")
else:
    NB_classifier.fit(X_train, y_train)
    NB_NonScaled_predicted = NB_classifier.predict(X_test)
    NB_NonScaled_prob_default = np.sum(NB_NonScaled_predicted) / len(
        NB_NonScaled_predicted)
    print(
        "The Default Probability based on Naive Bayes Model(Non Scaled) is :",
        '%.3f' % NB_NonScaled_prob_default)
    NB_NonScaled_accuracy = NB_classifier.score(X_test, y_test)
    print("The accuracy of Naive Bayes Model(Non Scaled) on test set is : ",
          '%.3f' % NB_NonScaled_accuracy)

# In[25]:

#output the result into the existing evaluation dataframe to compare with other models
new_evaluation = pd.DataFrame({
    'Model': ["Naive Bayes_NonScaled"],
    'Default_Probability': [NB_NonScaled_prob_default],
    'Cross_Validation_Accuracy': [NB_NonScaled_cross_val_scores.mean()],
    'Test_Accuracy': [NB_NonScaled_accuracy]
})
evaluation = pd.concat([evaluation, new_evaluation], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
evaluation = evaluation[[
    'Model', 'Default_Probability', 'Cross_Validation_Accuracy',
sub.to_csv('svm.csv', index=False)
## knn

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
sub = pd.DataFrame({'PassengerId': df_test["PassengerId"], 'Survived': Y_pred})
sub.to_csv('knn.csv', index=False)
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian
sub = pd.DataFrame({'PassengerId': df_test["PassengerId"], 'Survived': Y_pred})
sub.to_csv('gnb.csv', index=False)

# Decision Tree

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree
sub = pd.DataFrame({'PassengerId': df_test["PassengerId"], 'Survived': Y_pred})
sub.to_csv('tree.csv', index=False)
# Random Forest
Example #41
        return vectors

    #Vectorizes Y with 0 being neg and 1 being pos
    def CreateYVector(self):
        print("......building Y matrix")
        vector = np.zeros(5331 + 5331, dtype=int)
        count = 0

        for entry in vector:
            if count > 5330:
                vector[count] = 1
            count += 1

        return vector


data = DataPrep("rt-polaritydata/rt-polaritydata/rt-polarity.neg",
                "rt-polaritydata/rt-polaritydata/rt-polarity.pos")
#print(data.X)
#print(data.Y)
print("......splitting")
X_train, X_test, y_train, y_test = train_test_split(data.X,
                                                    data.Y,
                                                    test_size=0.33)
clf = GaussianNB(var_smoothing=.0001)
print("......training")
clf.fit(X_train, y_train)
print("Accuracy:  ")
print(clf.score(X_test, y_test))
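
# GaussianNB assumes continuous, roughly Gaussian features, which raw term counts
# are not; for CountVectorizer output, MultinomialNB is the usual fit and also
# accepts the sparse matrix without .toarray(). A sketch on the same split:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train, y_train)
print(mnb.score(x_test, y_test))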
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

result = classifier.score(x_test, y_test)

from matplotlib.colors import ListedColormap
x_set, y_set = x_train, y_train
x1, x2 = np.meshgrid(
    np.arange(x_set[:, 0].min() - 1, x_set[:, 0].max() + 1, 0.01),
    np.arange(x_set[:, 1].min() - 1, x_set[:, 1].max() + 1, 0.01))
y_d = np.array([x1.ravel(), x2.ravel()]).T
plt.contourf(x1,
             x2,
             classifier.predict(np.array([x1.ravel(),
                                          x2.ravel()]).T).reshape(x1.shape),
             alpha=0.4,
             cmap=ListedColormap(('red', 'green')))
plt.xlim(x1.min(), x1.max())
plt.ylim(x2.min(), x2.max())
svm = SVC(random_state=1)
svm.fit(x_train.T, y_train.T)

acc = svm.score(x_test.T, y_test.T) * 100
accuracies['SVM'] = acc
print("Test Accuracy of SVM Algorithm: {:.2f}%".format(acc))

# In[35]:

#Naive Bayes Algorithm

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train.T, y_train.T)

acc = nb.score(x_test.T, y_test.T) * 100
accuracies['Naive Bayes'] = acc
print("Accuracy of Naive Bayes: {:.2f}%".format(acc))

# In[36]:

#Decision Tree Algorithm

from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(x_train.T, y_train.T)

acc = dtc.score(x_test.T, y_test.T) * 100
accuracies['Decision Tree'] = acc
print("Decision Tree Test Accuracy {:.2f}%".format(acc))
Example #44
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('Predicted Class', fontsize=12)
    plt.xlabel('Actual Class', fontsize=12)


# predict data
for i in range(3):
    #Use Gaussian Naive Bayes method
    classifier = GaussianNB()
    #Fit the model
    classifier.fit(train[i], classification_train)
    #Calculate the result & accuracy
    result = classifier.predict(test[i])
    accuracy = classifier.score(test[i], classification_test)
    #Calculate the probability estimates of the positive class
    prob_data = classifier.predict_proba(test[i])
    prob_data = prob_data[:, 1]
    #Calculate fpr & ftr
    fpr, tpr, thresholds = metrics.roc_curve(classification_test, prob_data)
    fprs.append(fpr)
    tprs.append(tpr)
    #Calculate confusion matrix, precision & recall
    conf_mat = metrics.confusion_matrix(classification_test, result)
    precision = metrics.precision_score(classification_test, result)
    recall = metrics.recall_score(classification_test, result)
    roc_auc = metrics.auc(fpr, tpr)

    print('min_df = ' + str(min_df[i]))
    print('dimension reduction method: ' + str(method[i]))
    'AST', 'BLK']

#Pandas DataFrame allows you to select columns.
#We use column selection to split the data into features and class.
nba_feature = nba[feature_columns]
nba_class = nba[class_column]

print(nba_feature[0:3])
print(list(nba_class[0:3]))

train_feature, test_feature, train_class, test_class = \
    train_test_split(nba_feature, nba_class, stratify=nba_class, \
    train_size=0.75, test_size=0.25, random_state=0)

training_accuracy = []
test_accuracy = []

nb = GaussianNB().fit(train_feature, train_class)
print("Test set score: {:.3f}".format(nb.score(test_feature, test_class)))
prediction = nb.predict(test_feature)
print("Confusion matrix:")
print(
    pd.crosstab(test_class,
                prediction,
                rownames=['True'],
                colnames=['Predicted'],
                margins=True))

scores = cross_val_score(nb, nba_feature, nba_class, cv=10)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))
Example #46
logic_reg = LogisticRegression()
logic_reg.fit(x_train, y_train)
print("Test accuracy: {:.2f}%".format(logic_reg.score(x_test, y_test) * 100))

#for Knn model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
print("Test accuracy of knn is {:.2f}%".format(
    knn.score(x_test, y_test) * 100))
'''#for Svm model   (previously failing: fit_transpose doesn't exist, and y_test was passed for training)
from sklearn.svm import SVC
sps=SVC(random_state=1,kernel='rbf')
sps.fit(x_train,y_train)
print("SVM Accuracy report {:.2f}%".format(sps.score(x_test,y_test)*100))
'''
#naive bayes
from sklearn.naive_bayes import GaussianNB
nai = GaussianNB()
nai.fit(x_train, y_train)
print("Naive Bayes Accuracy report {:.2f}%".format(
    nai.score(x_test, y_test) * 100))

#Random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=1000, random_state=1)
rf.fit(x_train, y_train)
print("Random Forest Accuracy report {:.2f}%".format(
    rf.score(x_test, y_test) * 100))
#predict method
print(" 1 means survived, 0 means not survived")
print('Classified as :', classifier.predict([data[0]]))
print('Classified as :', classifier.predict([[3, 27, 0]]))
print('Classified as :', classifier.predict([data[2], data[4]]))

#TEST DATA - training and classification
#split total data into 60 percent train and 40 percent test
from sklearn.model_selection import train_test_split
train, test, t_train, t_test = train_test_split(data, t, test_size=0.4, random_state=0)
print('Number of records used for training', train.shape)
print('Number of records used for testing', test.shape)

#train and test
classifier.fit(train, t_train)  # train with 1st part: 60 percent
print('Accuracy is =', classifier.score(test, t_test))  # test with 2nd part: 40 percent

#CONFUSION MATRIX TO SHOW ACCURACY
from sklearn.metrics import confusion_matrix
print('confusion matrix\n', confusion_matrix(classifier.predict(test), t_test))

#Function that gives us a complete report on the performance
from sklearn.metrics import classification_report
print(classification_report(classifier.predict(test), t_test, target_names=['Survived', 'Not Survived']))

#A more sophisticated evaluation model is cross-validation. The idea is simple: the data is
#split into train and test sets several consecutive times, and the averaged prediction score
#over the different splits is the evaluation of the classifier.
from sklearn.model_selection import cross_val_score
# cross-validation with 20 folds
scores = cross_val_score(classifier, data, t, cv=20)
#print(scores)
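
# A common single-number summary of those 20 fold scores (a small sketch using numpy):
import numpy as np
print("mean accuracy: %0.3f (+/- %0.3f)" % (np.mean(scores), 2 * np.std(scores)))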
Example #48
def bayes():
    # load the dataset
    dataset = pd.read_csv("Dataset_Bayes.csv")

    # print the number of users who won and lost
    print(dataset.groupby('Gano').size())

    # print bar charts of Gano vs. the other variables
    dataset.drop(['Gano'], axis=1).hist()
    plt.show()

    # drop userId and completer; they are irrelevant for applying the method
    dataset_limpio = dataset.drop(['userId', 'completer'], axis=1)
    dataset_limpio.describe()

    # clean the dataset of NaN and Inf values
    dataset_limpio = limpiar_dataset_Para_Bayes(dataset_limpio)

    # remove and extract the Gano variable in order to find the 5 variables
    # that best predict whether the user won or lost
    a = dataset_limpio.drop(['Gano'], axis=1)
    b = dataset_limpio['Gano']
    best = SelectKBest(k=5)
    a_new = best.fit_transform(a, b)
    a_new.shape
    selected = best.get_support(indices=True)
    print("Best 5 variables")
    print(a.columns[selected])

    # plot the Pearson correlation for the 5 best variables
    used_features = a.columns[selected]
    colormap = plt.cm.viridis  # plt.viridis() only sets the current colormap and returns None
    plt.figure(figsize=(12, 12))
    plt.title('Pearson correlation coefficient', y=1.05, size=15)
    sns.heatmap(dataset_limpio[used_features].astype(float).corr(),
                linewidths=0.1,
                vmax=1.0,
                square=True,
                cmap=colormap,
                linecolor='white',
                annot=True)
    plt.show()

    # split the input data into training and test sets
    a_entrenamiento, a_pruebas = train_test_split(dataset_limpio,
                                                  test_size=0.2,
                                                  random_state=6)
    b_entrenamiento = a_entrenamiento["Gano"]
    b_pruebas = a_pruebas["Gano"]

    gnb = GaussianNB()
    gnb.fit(a_entrenamiento[used_features].values, b_entrenamiento)
    y_pred = gnb.predict(a_pruebas[used_features])

    print('Accuracy on the training set: {:.2f}'.format(
        gnb.score(a_entrenamiento[used_features], b_entrenamiento)))
    print('Accuracy on the test set: {:.2f}'.format(
        gnb.score(a_pruebas[used_features], b_pruebas)))

    # five best variables:
    # 'SRL', 'Atry to lecture', 'num_events', 'grade', 'cluster'
    # take rows from the dataset where a user lost and won (0, 1) over the 5 best variables
    print(
        gnb.predict([[1.666666667, 0, 2, 5.999999866, 0],
                     [2.041666667, 150, 151, 62.00000048, 1]]))
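
# SelectKBest above is fit on the full dataset before the split, which leaks test
# information into feature selection. A leakage-free sketch wraps selection and the
# classifier in one pipeline fit only on training data (reuses a and b from above):
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
a_tr, a_te, b_tr, b_te = train_test_split(a, b, test_size=0.2, random_state=6)
pipe = make_pipeline(SelectKBest(k=5), GaussianNB())
pipe.fit(a_tr, b_tr)  # SelectKBest now sees only the training fold
print(pipe.score(a_te, b_te))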
Example #49
    Data.extend(i)
X = [i[0] for i in Data]
Y = [i[1] for i in Data]
#print(t)
split = len(corpus)-len(corpus)//5


tf = CountVectorizer()
t = tf.fit_transform(X).toarray()
print(t.shape)
print(len(Y))
x_train = t[:split]
x_test = t[split:]
y_train = Y[:split]
y_test = Y[split:]
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(x_train, y_train)
print("Finished Training")
print(clf.score(x_test, y_test))

#from nltk.tag import tnt
#tnt_pos_tagger = tnt.TnT()
#tnt_pos_tagger.train(train)

#print(word_tokenize(word_test))

#print(tnt_pos_tagger.evaluate(test))

#print(tnt_pos_tagger.tag(word_tokenize(word_test)))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
iris = datasets.load_iris()
x = iris.data[:, 2:][0:140]
y = iris.target[0:140]
x_test = iris.data[:, 2:][140:150]  # was 141:150, which skipped sample 140
y_test = iris.target[140:150]
'''NAIVE BAYES'''

from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(x, y)
nb = model.score(x, y)

pred = model.predict(x_test)
sum(x == 0 for x in pred - y_test) / len(pred)
'''DECISION TREES'''

from sklearn import tree
model = tree.DecisionTreeClassifier(class_weight=None,
                                    criterion='entropy',
                                    max_depth=20,
                                    max_features=x.shape[1],
                                    max_leaf_nodes=4,
                                    min_samples_leaf=1,
                                    min_samples_split=1,
                                    min_weight_fraction_leaf=0.0,
                                    presort=False,
Example #51
temp = X_test.groupby(['label'])
t = temp.packets.count()
label_predicted = [-1 if e == 0 else e for e in label_predicted]
test_labels = [-1 if e == 0 else e for e in test_labels]

pred = [a*b for a,b in zip(label_predicted,t)]
act =  [a*b for a,b in zip(test_labels,t)]
pp = 0
pn = 0
nn = 0
n_p = 0  # renamed from np to avoid shadowing the numpy alias
for a, b in zip(pred, act):
    if a > 0 and b > 0:
        pp += a
    elif a < 0 and b < 0:
        nn -= a
    elif a > 0 and b < 0:
        pn += a
    else:
        n_p += b
print(pp)
print(nn)
print(n_p)
print(pn)
###############################################################################
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_cluster_features, train_labels)
label_predicted = gnb.predict(test_cluster_features)
print(gnb.score(test_cluster_features, test_labels))  # print so the score is visible
###############################################################################
# Training set and targets
X = bank.drop(columns='y').values
t = bank['y'].values

#experiment 1
from sklearn.model_selection import train_test_split
X_train, X_test, t_train, t_test = train_test_split(X, t, test_size = 0.2, shuffle = True)

#experiment 2
from sklearn.naive_bayes import GaussianNB
gaussian_clf = GaussianNB()
gaussian_clf.fit(X_train, t_train)

#experiment 3
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
gaussian_score = gaussian_clf.score(X_test, t_test)

gaussian_pred = gaussian_clf.predict(X_test)  # was: gaussian_pred - ..., a typo
cm = confusion_matrix(t_test, gaussian_pred)

gaussian_proba = gaussian_clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(t_test, gaussian_proba)
auc = roc_auc_score(t_test, gaussian_proba)

print("Gaussian CLF Score: " + str(gaussian_score))
print("Confusion Matrix ")
print(cm)
print("Gaussian CLF AUC Score: " + str(auc))  # was printing the roc_auc_score function itself

plt.figure()
plt.plot(fpr, tpr)
Example #53
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

digits = load_digits()

x = digits.data    # samples
y = digits.target  # labels

# split into training and test sets, with a test fraction of 0.3
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

"""Gaussian naive Bayes classifier: GaussianNB"""
gnb = GaussianNB().fit(x_train, y_train)  # train the model on the training set
gnb_predict = gnb.predict(x_test)  # predict on the test set
for i in range(10):  # compare the first ten predictions with the actual labels
    print(f"actual:{y_test[i]},predict:{gnb_predict[i]}")
gnb_score = gnb.score(x_test, y_test)  # moved out of the loop; it was recomputed every iteration
print(f"accuracy(GaussianNB):{gnb_score}")

print("-------------------")

"""Multinomial naive Bayes classifier: MultinomialNB"""
mnb = MultinomialNB().fit(x_train, y_train)
mnb_predict = mnb.predict(x_test)
for i in range(10):
    print(f"actual:{y_test[i]},predict:{mnb_predict[i]}")
mnb_score = mnb.score(x_test, y_test)
print(f"accuracy(MultinomialNB):{mnb_score}")

print("-------------------")

"""Bernoulli naive Bayes classifier: BernoulliNB"""
bnb = BernoulliNB().fit(x_train, y_train)
bnb_predict = bnb.predict(x_test)
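
# The three variants can also be compared uniformly in one loop; a small sketch
# reusing the x_train/x_test split from above:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
for name, est in [("GaussianNB", GaussianNB()),
                  ("MultinomialNB", MultinomialNB()),
                  ("BernoulliNB", BernoulliNB())]:
    print(name, est.fit(x_train, y_train).score(x_test, y_test))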
Example #54
label_names = data['target_names']
labels = data['target']
feature_names = data['feature_names']
features = data['data']

# Look at our data
print(label_names)
print(labels[0])
print(feature_names[0])
print(features[0])

#Split the data
train, test, train_labels, test_labels = train_test_split(features,
                                                          labels,
                                                          test_size=0.33,
                                                          random_state=42)

#Classifier
gnb = GaussianNB()

#Training
model = gnb.fit(train, train_labels)

#Make prediction
preds = gnb.predict(test)
print(preds)

#Evaluate accuracy
print(accuracy_score(test_labels, preds))
print(gnb.score(test, test_labels))
Example #55
dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=3)
dt_model.fit(train_set, train_labels)

dt_model.score(test_set, test_labels)

y_predict = dt_model.predict(test_set)
y_predict[:5]

test_set.head(5)
"""### Naive Bayes"""

naive_model = GaussianNB()
naive_model.fit(train_set, train_labels)

prediction = naive_model.predict(test_set)
naive_model.score(test_set, test_labels)
"""### Random Forest classifier"""

randomforest_model = RandomForestClassifier(max_depth=2, random_state=0)
randomforest_model.fit(train_set, train_labels)

Importance = pd.DataFrame(
    {'Importance': randomforest_model.feature_importances_ * 100},
    index=train_set.columns)
Importance.sort_values('Importance', axis=0, ascending=True).plot(
    kind='barh',
    color='r',
)

predicted_random = randomforest_model.predict(test_set)
randomforest_model.score(test_set, test_labels)
plt.ylabel("Accuracy")
plt.legend()

# ### Question 2

# In[169]:

pca = PCA(n_components=2)
pca.fit(X)
X_New = pca.transform(X)
X_Test_New = pca.transform(X_Test)

# Naive Bayes classifier
clf1 = GaussianNB()
clf1 = clf1.fit(X_New, Y)
print("GaussianNB Acc: {}".format(clf1.score(X_Test_New, Y_Test) * 100))

# KNeighborsClassifier
clf2 = KNeighborsClassifier(n_jobs=-1)
clf2 = clf2.fit(X_New, Y)
print("KNeighborsClassifier Acc: {}".format(
    clf2.score(X_Test_New, Y_Test) * 100))

# DecisionTreeClassifier
clf3 = DecisionTreeClassifier(max_depth=BestDep)
clf3 = clf3.fit(X_New, Y)
print("DecisionTreeClassifier Acc: {}".format(
    clf3.score(X_Test_New, Y_Test) * 100))

for clf in [clf1, clf2, clf3]:
    print(clf.score(X_New, Y))
Example #57
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
rfc = RandomForestClassifier(n_estimators=10)
lor = LogisticRegression(random_state=1)
gnb = GaussianNB()
vot = VotingClassifier(estimators=[('lr', lor), ('rf', rfc), ('gnb', gnb),
                                   ('knn', knn)],
                       voting='hard')
lr.fit(x_train, y_train)
svc.fit(x_train, y_train)
knn.fit(x_train, y_train)
rfc.fit(x_train, y_train)
lor.fit(x_train, y_train)
gnb.fit(x_train, y_train)
vot.fit(x_train, y_train)
print("LogisticRegression", lor.score(x_test, y_test))
print("GaussianNB", gnb.score(x_test, y_test))
print("RandomForestClassifier ", rfc.score(x_test, y_test))
print("KNeighborsClassifier ", knn.score(x_test, y_test))
print("SVC ", svc.score(x_test, y_test))
print("LinearRegression ", lr.score(x_test, y_test))
print('VotingClassifier', vot.score(x_test, y_test))
N = 7
x = range(N)
y = [
    lor.score(x_test, y_test),
    gnb.score(x_test, y_test),
    rfc.score(x_test, y_test),
    knn.score(x_test, y_test),
    svc.score(x_test, y_test),
    lr.score(x_test, y_test),
    vot.score(x_test, y_test)
Example #58
    plt.figure()
    plt.bar(np.arange(2) + 0.2, trainsc, width=0.4, color='c', align='center')
    plt.bar(np.arange(2) + 0.6, testsc, width=0.4, color='r', align='center')
    plt.xticks(np.arange(2) + 0.4, alg)
    plt.title('Linear Discriminant Analysis accuracy')
    plt.ylabel('Accuracy')
    plt.legend(['Train', 'Test'])
    plt.show()

#%% Naive Bayes Gaussian

if (GNB_cl == 1):
    nb = GaussianNB()
    nb.fit(Xtrain, Ytrain)
    scores = np.empty(4)
    scores[0] = nb.score(Xtrain, Ytrain)
    scores[1] = nb.score(Xtest, Ytest)
    print('Gaussian Naive Bayes, train: {0:.02f}% '.format(scores[0] * 100))
    print('Gaussian Naive Bayes, test: {0:.02f}% '.format(scores[1] * 100))

    bnb = BaggingClassifier(GaussianNB(), max_samples=0.5, n_jobs=-1)
    bnb.fit(Xtrain, Ytrain)
    scores[2] = bnb.score(Xtrain, Ytrain)
    scores[3] = bnb.score(Xtest, Ytest)
    print('Bagging Naive Bayes, train: {0:.02f}% '.format(scores[2] * 100))
    print('Bagging Naive Bayes, test: {0:.02f}% '.format(scores[3] * 100))

    alg = ['Naive Bayes', 'Bagged Naive Bayes']
    trainsc = [scores[0], scores[2]]
    testsc = [scores[1], scores[3]]
    plt.figure()
Example #59
def modeloNaiveBayesSampling():

    # load the dataset stored as csv
    dataset = pd.read_csv('dataset2.csv')

    # dimensionality reduction via feature selection, using sklearn's SelectKBest
    X = dataset.drop(['Plag'], axis=1)
    y = dataset['Plag']

    best = SelectKBest(k=50)
    X_new = best.fit_transform(X, y)
    X_new.shape
    selected = best.get_support(indices=True)
    #print(X.columns[selected])
    used_features = X.columns[selected]

    # split the dataset into training and test sets:
    X_train, X_test = train_test_split(dataset, test_size=0.3, random_state=6)
    y_train = X_train["Plag"]
    y_test = X_test["Plag"]

    # configure the resampling that combines oversampling and undersampling:
    os = make_pipeline(
        SMOTE(sampling_strategy={1: 5000}),
        NearMiss(sampling_strategy={0: 15000}))

    X_train_res, y_train_res = os.fit_resample(X_train, y_train)

    X_test_res, y_test_res = (X_test, y_test)

    # use the Gaussian classifier:
    gnb = GaussianNB()

    # with the model created, use fit() for the learning step
    gnb.fit(
        X_train_res[used_features].values,
        y_train_res
    )
    y_pred = gnb.predict(X_test_res[used_features])

    # compute the accuracy
    print('Accuracy on the training set: {:.2f}'
          .format(gnb.score(X_train_res[used_features], y_train_res)))
    print('Accuracy on the test set: {:.2f}'
          .format(gnb.score(X_test_res[used_features], y_test_res)))

    # compute the confusion matrix
    print(confusion_matrix(y_test_res, y_pred))

    print("Initial training distribution: {}".format(Counter(y_train)))
    print("Final training distribution: {}".format(Counter(y_train_res)))

    print("Initial test distribution: {}".format(Counter(y_test)))
    print("Final test distribution: {}".format(Counter(y_test_res)))
Example #60
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Importing data set
train_df = pd.read_csv('glass.csv')
X = train_df.drop("Type", axis=1)
Y = train_df["Type"]

# Training and testing data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# using naive Bayes
gnb = GaussianNB()

# Showing the result of test data
Y_prediction = gnb.fit(X_train, y_train).predict(X_test)
acc_gnb = round(gnb.score(X_test, y_test) * 100)

# Calculating the accuracy
print("Accuracy is:", acc_gnb)
print(classification_report(y_test, Y_prediction))