def test_pred(self):
    """Exercise ``predict`` before and after the model attributes are filled in."""
    print("test_pred")
    nb = NaiveBayesClassifier()
    nb.mean, nb.variance = [1, 2], [3, 1]

    # Only mean/variance are set at this point; the test treats that as the
    # "not trained" state and expects predict() to hand back None, whatever
    # the length or sign of the input values.
    untrained_inputs = (
        [1.4, 12, 3, 9],
        [2, 3, 4, 5],
        [1],
        [2, -3, 4, -5, -7, -7],
    )
    for sample in untrained_inputs:
        self.assertEqual(nb.predict(sample), None)

    # Simulate a trained model by assigning every attribute by hand.
    nb.mean = [1, 1]
    nb.variance = [1, 5]
    nb.in_classes = [1, 0]
    nb.prior_probs = [0.3, 0.2]
    pred = nb.predict([5, 3, 4, 7])

    # Each of the four inputs is expected to yield [0].
    for index in range(4):
        self.assertEqual(pred[index], [0])
from NaiveBayes import NaiveBayesClassifier  # classifier under evaluation
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt


def accuracy(y_actual, yhat):
    """Return the fraction of predictions that equal the true labels.

    Computed as (number of positions where ``y_actual == yhat``) divided by
    ``len(y_actual)``.

    Args:
        y_actual: Sequence or array of true labels.
        yhat: Sequence or array of predicted labels, aligned with
            ``y_actual``.

    Returns:
        The matching fraction as a NumPy float.
    """
    # Coerce to arrays first: comparing two plain Python lists with ``==``
    # yields a single bool, not an element-wise result, which would make the
    # count below meaningless.
    actual = np.asarray(y_actual)
    predicted = np.asarray(yhat)
    return np.sum(actual == predicted) / len(actual)


# Synthetic two-class dataset: 1000 samples with 10 features each.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123
)

# Hold out 20% of the samples for testing (80/20 split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)  # fit on the training portion
predictions = nb.predict(X_test)  # predict on the held-out portion

print("Accuracy", accuracy(y_test, predictions))
class SentimentAnalyzer:
    """Emotion classifier for headlines built on ``NaiveBayesClassifier``.

    Training data comes from a CSV file with a ``headline`` column and six
    emotion-score columns (see ``_EMOTION_COLUMNS``). Each headline is turned
    into a binary bag-of-words vector over ``self.features`` and handed to the
    classifier once per emotion, together with that emotion's score / 100.

    The code is written to run unchanged on Python 2 and Python 3.
    """

    # A token becomes a feature only if it occurs MORE than this many times
    # across all training headlines.
    threshold = 1

    # CSV column names of the emotion scores; the position in this tuple is
    # the label index passed to the classifier. The leading space is part of
    # the column name as it appears in the data files.
    _EMOTION_COLUMNS = (
        ' anger',
        ' disgust',
        ' fear',
        ' joy',
        ' sadness',
        ' surprise',
    )

    def __init__(self, file_location):
        """Read the training CSV, select features and train the classifier.

        Args:
            file_location: Path of the training CSV file.
        """
        self.features = set()
        rows = []  # (tokens, [score per emotion]) for every CSV row
        word_freq = {}

        with self._open_csv(file_location) as data:
            for row in csv.DictReader(data):
                h_tokens = nltk.word_tokenize(row['headline'].lower())
                for token in h_tokens:
                    word_freq[token] = word_freq.get(token, 0) + 1
                scores = [
                    float(row[column]) / 100
                    for column in self._EMOTION_COLUMNS
                ]
                rows.append((h_tokens, scores))

        for token, count in word_freq.items():
            if count > self.threshold:
                self.features.add(token)
        print("F-vec size: " + str(len(self.features)))

        training_data = []
        for h_tokens, scores in rows:
            # Build the vector once per headline (set membership instead of
            # scanning the token list for every feature) ...
            f_vector = self._featurize(set(h_tokens))
            for label, score in enumerate(scores):
                # ... but give every training tuple its own copy so the
                # classifier never sees aliased lists.
                training_data.append((list(f_vector), label, score))

        self.classifier = NaiveBayesClassifier(
            len(self._EMOTION_COLUMNS), len(self.features)
        )
        self.classifier.train(training_data)

    @staticmethod
    def _open_csv(path):
        """Open *path* in the mode the running interpreter's csv module needs."""
        import sys

        if sys.version_info[0] < 3:
            # Python 2's csv module expects a file opened in binary mode.
            return open(path, 'rb')
        # Python 3's csv module expects text mode with newline=''.
        return open(path, 'r', newline='')

    @staticmethod
    def _tokenize(text):
        """Return the set of lower-cased tokens found in *text*."""
        return set(nltk.word_tokenize(text.lower()))

    def _featurize(self, token_set):
        """Return a 0/1 list marking which features occur in *token_set*.

        The order follows iteration over ``self.features``, which stays the
        same as long as that set is not modified after training.
        """
        return [1 if f in token_set else 0 for f in self.features]

    def predict(self, text):
        """Return ``self.classifier.predict`` for the feature vector of *text*."""
        return self.classifier.predict(self._featurize(self._tokenize(text)))

    def predict_all(self, text):
        """Return ``self.classifier.predict_all`` for the feature vector of *text*."""
        return self.classifier.predict_all(
            self._featurize(self._tokenize(text))
        )

    def test(self, test_file_location):
        """Return the fraction of rows in a test CSV that are predicted correctly.

        For each row, the emotions whose raw score is greater than 1 are
        ranked by score and the top three are kept. The row counts as correct
        when element 0 of ``self.predict(headline)`` is one of those emotion
        indices.

        Args:
            test_file_location: Path of a CSV file with the same columns as
                the training file.

        Returns:
            ``correct / total`` as a float, or ``0.0`` when the file has no
            data rows.
        """
        total = 0
        correct = 0
        with self._open_csv(test_file_location) as test_data:
            for row in csv.DictReader(test_data):
                total += 1
                emotions = [
                    float(row[column]) for column in self._EMOTION_COLUMNS
                ]
                candidates = [
                    index
                    for index, score in enumerate(emotions)
                    if score > 1
                ]
                acceptable_emotions = sorted(
                    candidates, reverse=True, key=emotions.__getitem__
                )[:3]
                prediction = self.predict(row['headline'])[0]
                if prediction in acceptable_emotions:
                    correct += 1
        if total == 0:
            # No rows to score: avoid dividing by zero.
            return 0.0
        return float(correct) / total