예제 #1
0
 def test_prob_density_function(self):
     # Compares prob_den_func against two precomputed reference values,
     # re-assigning the classifier's mean/variance before each call.
     print("test_prob_density_function")
     classifier = NaiveBayesClassifier()
     # Each case: (mean, variance, prob_den_func arguments, expected value).
     cases = (
         ([1], [3], (0, 3), 0.11825507),
         ([1, 2], [3, 1], (1, 0.1), 0.06561581),
     )
     for mean, variance, call_args, expected in cases:
         classifier.mean = mean
         classifier.variance = variance
         self.assertAlmostEqual(classifier.prob_den_func(*call_args), expected)
예제 #2
0
 def test_fit(self):
     # Checks that fit() returns None and moves both prior probabilities
     # from 0 to 0.5 for a two-sample, two-class training set.
     print("test_fit")
     model = NaiveBayesClassifier()
     samples = np.array([[3, 4], [2, 3]])
     labels = [0, 1]
     model.prior_probs = np.zeros(2, dtype=np.float64)
     # Before fitting, the priors still hold the zeros assigned above.
     for class_idx in (0, 1):
         self.assertEqual(model.prior_probs[class_idx], 0)
     # fit() is expected to update the model in place and return nothing.
     self.assertEqual(model.fit(samples, labels), None)
     # One sample per class, so each prior is expected to be one half.
     for class_idx in (0, 1):
         self.assertEqual(model.prior_probs[class_idx], 0.5)
예제 #3
0
    def __init__(self, file_location):
        """Build bag-of-words features from a headline CSV and train a classifier.

        The CSV at ``file_location`` is read with ``csv.DictReader`` and must
        provide a 'headline' column plus the six emotion columns listed in
        ``emotion_columns`` (note the leading space in each column name).
        Every emotion score is divided by 100.

        Tokens whose total count across all headlines is strictly greater than
        ``self.threshold`` become features. ``self.threshold`` is not assigned
        in this method, so it must already be reachable on the instance.
        """
        # Position in this tuple is the numeric label passed to the classifier.
        emotion_columns = (
            ' anger', ' disgust', ' fear', ' joy', ' sadness', ' surprise')

        self.features = set()
        token_counts = {}
        samples = []

        with open(file_location, 'rb') as handle:
            for record in csv.DictReader(handle):
                tokens = nltk.word_tokenize(record['headline'].lower())

                # Corpus-wide frequency of every token.
                for tok in tokens:
                    token_counts[tok] = token_counts.get(tok, 0) + 1

                # One (tokens, label, weight) sample per emotion column.
                for label, column in enumerate(emotion_columns):
                    samples.append(
                        (tokens, label, float(record[column]) / 100))

        # Keep only tokens that occur more often than the threshold.
        for word, count in token_counts.items():
            if count > self.threshold:
                self.features.add(word)

        print("F-vec size: " + str(len(self.features)))

        # Encode each sample as a 0/1 presence vector over the feature set.
        training_data = []
        for tokens, label, weight in samples:
            presence = [1 if feat in tokens else 0 for feat in self.features]
            training_data.append((presence, label, weight))

        self.classifier = NaiveBayesClassifier(
            len(emotion_columns), len(self.features))
        self.classifier.train(training_data)
예제 #4
0
 def test_pred(self):
     # Exercises predict() twice: first with only mean/variance assigned
     # (expected result None), then with in_classes and prior_probs set too.
     print("test_pred")
     model = NaiveBayesClassifier()
     model.mean = [1, 2]
     model.variance = [3, 1]
     # in_classes / prior_probs are not assigned yet, i.e. the model is
     # treated as untrained and predict() is expected to return None.
     untrained_inputs = (
         [1.4, 12, 3, 9],
         [2, 3, 4, 5],
         [1],
         [2, -3, 4, -5, -7, -7],
     )
     for sample in untrained_inputs:
         self.assertEqual(model.predict(sample), None)
     # Simulate a trained model by filling in the remaining attributes.
     model.mean = [1, 1]
     model.variance = [1, 5]
     model.in_classes = [1, 0]
     model.prior_probs = [0.3, 0.2]
     pred = model.predict([5, 3, 4, 7])
     # Each of the four predictions is expected to compare equal to [0].
     for position in range(4):
         self.assertEqual(pred[position], [0])
예제 #5
0
from NaiveBayes import NaiveBayesClassifier
#importing the class

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt


def accuracy(y_actual, yhat):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_actual: True labels (any array-like: list, tuple, ndarray).
        yhat: Predicted labels, comparable element-wise with ``y_actual``.

    Returns:
        The number of matching positions divided by ``len(y_actual)``.
    """
    # Coerce both inputs to arrays so ``==`` is always element-wise. Comparing
    # two plain Python lists yields a single bool, which would make the score
    # either 0 or 1/len instead of the real match ratio.
    actual = np.asarray(y_actual)
    predicted = np.asarray(yhat)
    return np.sum(actual == predicted) / len(actual)


# Generate a synthetic two-class dataset: 1000 samples with 10 features each.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123
)

# Hold out 20% of the samples for testing; the remaining 80% are for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# Fit the classifier on the training split, then predict the held-out split.
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

# Report how many held-out predictions match the true labels.
print("Accuracy", accuracy(y_test, predictions))