def train(data_dir, outfile, test_pct=0.3, verbose=True):
    '''
    Train a naive Bayes classifier using ML estimates and store it.

    Every entry of data_dir is used as a class label, and every file
    inside data_dir/<label> is loaded with load_words() as one example
    of that label.

    data_dir -- directory holding one sub-directory per class label
    outfile  -- destination handed to store_classifier()
    test_pct -- probability with which each file is held out of training;
                if > 0 the held-out files are classified afterwards and
                F1 / precision / recall are printed (overall and per label)
    verbose  -- if true, print the list of labels before training

    Returns None.
    '''
    labels = os.listdir(data_dir)
    if verbose:
        print(labels)

    nbc = NaiveBayesClassifier(labels)

    def load_label_dir(ddir):
        ''' load all datafiles within a single label directory '''
        label_path = os.path.join(data_dir, ddir)
        return [(fname, load_words(os.path.join(label_path, fname)))
                for fname in os.listdir(label_path)]

    def show_stats(s, lbl=None):
        ''' display F1 stats (as percentages), optionally prefixed by a label '''
        f1 = s['F1'] * 100.0
        pr = s['precision'] * 100.0
        rc = s['recall'] * 100.0
        if lbl is not None:
            print('%s : F1=%f precision=%f recall=%f' % (lbl, f1, pr, rc))
        else:
            print('F1=%f precision=%f recall=%f' % (f1, pr, rc))

    # One random draw per file decides whether it trains the classifier
    # or is kept aside as a (filename, words, label) test example.
    held_out = []
    for label in labels:
        for (fname, words) in load_label_dir(label):
            if random.random() < test_pct:
                held_out.append((fname, words, label))
            else:
                nbc.add_example(label, words)

    # if we've picked a test set, use it
    if test_pct > 0.0:
        obs = [label for (_, _, label) in held_out]
        preds = [nbc.classify(words) for (_, words, _) in held_out]
        sts = calc_F1(preds, obs)
        show_stats(sts['overall'])
        for label in labels:
            show_stats(sts[label], label)

    # store trained classifier
    store_classifier(outfile, nbc)
def test_fit(self):
    """Check that fit() returns None and sets both class priors to 0.5.

    The training set has one sample per class (labels 0 and 1), so each
    prior is expected to become 0.5.
    """
    print("test_fit")
    nb = NaiveBayesClassifier()
    Xis = np.array([[3, 4], [2, 3]])
    yis = [0, 1]
    nb.prior_probs = np.zeros(2, dtype=np.float64)

    # Priors start at zero so the update made by fit() is observable.
    self.assertEqual(nb.prior_probs[0], 0)
    self.assertEqual(nb.prior_probs[1], 0)

    # assertIsNone reports a clean failure even if fit() ever returns an
    # object (e.g. an array) whose == comparison with None is not a bool.
    self.assertIsNone(nb.fit(Xis, yis))

    self.assertEqual(nb.prior_probs[0], 0.5)
    self.assertEqual(nb.prior_probs[1], 0.5)
def __init__(self, file_location):
    """Train the analyzer from a CSV file of scored headlines.

    Each row must provide a 'headline' column and the six score columns
    ' anger', ' disgust', ' fear', ' joy', ' sadness' and ' surprise'
    (note the leading space in those names).  Every row yields six
    training examples, one per emotion index 0..5, each carrying the
    same binary bag-of-words vector and that emotion's score divided
    by 100.

    Tokens occurring more than ``self.threshold`` times over all
    headlines become the features.  NOTE(review): ``threshold`` is read
    from the instance/class and is not defined in this method.

    file_location -- path of the training CSV file.
    """
    # Position in this tuple is the emotion's class index.
    emotion_columns = (' anger', ' disgust', ' fear',
                       ' joy', ' sadness', ' surprise')

    self.features = set([])
    headlines = []   # one (token set, [six scaled scores]) pair per row
    word_freq = {}

    # NOTE(review): binary mode is what Python 2's csv module expects;
    # Python 3 would need text mode with newline='' -- confirm target.
    with open(file_location, 'rb') as csv_file:
        for row in csv.DictReader(csv_file):
            h_tokens = nltk.word_tokenize(row['headline'].lower())
            for token in h_tokens:
                word_freq[token] = word_freq.get(token, 0) + 1
            scores = [float(row[column]) / 100 for column in emotion_columns]
            # A set gives O(1) membership tests when vectorising below.
            headlines.append((set(h_tokens), scores))

    for key in word_freq:
        if word_freq[key] > self.threshold:
            self.features.add(key)

    print("F-vec size: " + str(len(self.features)))

    training_data = []
    for token_set, scores in headlines:
        # The vector depends only on the headline, so build it once and
        # hand every emotion its own copy (no list is shared).
        f_vector = [1 if f in token_set else 0 for f in self.features]
        for emotion_index, score in enumerate(scores):
            training_data.append((list(f_vector), emotion_index, score))

    self.classifier = NaiveBayesClassifier(
        len(emotion_columns), len(self.features))
    self.classifier.train(training_data)
def test_prob_density_function(self):
    """Compare prob_den_func() with two precomputed density values.

    Each case installs a mean/variance pair on the classifier and calls
    prob_den_func(index, x); the expected numbers agree with a Gaussian
    density evaluated with the mean and variance stored at that index.
    """
    print("test_prob_density_function")

    # One classifier instance is reused for every case.
    nb = NaiveBayesClassifier()

    # (mean, variance, prob_den_func arguments, expected value)
    cases = (
        ([1], [3], (0, 3), 0.11825507),
        ([1, 2], [3, 1], (1, 0.1), 0.06561581),
    )
    for mean, variance, args, expected in cases:
        nb.mean = mean
        nb.variance = variance
        self.assertAlmostEqual(nb.prob_den_func(*args), expected)
def test_pred(self):
    """Check predict() before and after the model state is populated.

    With only mean/variance set (no classes or priors) predict() is
    expected to return None for any input; once in_classes and
    prior_probs are set, every one of the four predictions is expected
    to compare equal to [0].
    """
    print("test_pred")
    nb = NaiveBayesClassifier()
    nb.mean = [1, 2]
    nb.variance = [3, 1]

    # Condition where the classifier has not been trained.
    untrained_inputs = (
        [1.4, 12, 3, 9],
        [2, 3, 4, 5],
        [1],
        [2, -3, 4, -5, -7, -7],
    )
    for sample in untrained_inputs:
        # assertIsNone fails cleanly even if predict() returns an array,
        # where an == comparison with None would not yield a plain bool.
        self.assertIsNone(nb.predict(sample))

    # Condition simulating a trained model.
    nb.mean = [1, 1]
    nb.variance = [1, 5]
    nb.in_classes = [1, 0]
    nb.prior_probs = [0.3, 0.2]
    pred = nb.predict([5, 3, 4, 7])
    for position in range(4):
        self.assertEqual(pred[position], [0])
from NaiveBayes import NaiveBayesClassifier  # the classifier under evaluation
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt


def accuracy(y_actual, yhat):
    """Return the fraction of positions where y_actual equals yhat.

    Computed as (number of matching entries) / (number of entries in
    y_actual).
    """
    matches = np.sum(y_actual == yhat)
    return matches / len(y_actual)


# Synthetic two-class problem; the fixed random_state makes it repeatable.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123)

# 80/20 split into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123)

nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)            # fit on the training portion
predictions = nb.predict(X_test)    # predict the held-out portion

print("Accuracy", accuracy(y_test, predictions))
x_train, x_test, y_train, y_test = train_test_split(car_data, car_targets, test_size=0.3) classifier = GaussianNB() classifier.fit(x_train, y_train) predicted = classifier.predict(x_test) total_correct = 0 total = 0 for i in range(len(predicted)): if np.array_equal(predicted[i], y_test[i]): total_correct += 1 total += 1 print("Accuracy of existing classifier: " + str(total_correct/total)) classifier = NaiveBayesClassifier() # x_train = [["comedy", "deep", "yes"], # ["comedy", "shallow", "yes"], # ["drama", "deep", "yes"], # ["drama", "shallow", "no"], # ["comedy", "deep", "no"], # ["comedy", "shallow", "no"], # ["drama", "deep", "no"]] # y_train = ["low", "high", "high", "low", "high", "high", "low"] classifier.fit(x_train, y_train) predicted = classifier.predict(x_test) total_correct = 0 total = 0 for i in range(len(predicted)):
arff.entropy_discretize_numerics("class", gain_threshold=GAIN_THRESHOLD) print("DONE PREPARING DATA") validation_results = [] for run_num in range(1, NUMBER_OF_TRIALS + 1): print("SELECTING TRAINING DATA") # We just need to remove a random 10% of records from arff.data ten_percent = len(arff.data) // 10 training_records = [] for i in range(ten_percent): index = random.randrange(len(arff.data)) training_records.append(arff.data.pop(index)) print("DONE SELECTING TRAINING DATA") print("BUILDING MODEL") nb = NaiveBayesClassifier(arff) nb.build_model("class") print("DONE BUILDING MODEL") confusion_matrices = {} for core_value in arff.attributes[arff.attr_position["class"]][1]: confusion_matrices[core_value] = {} for core_value in arff.attributes[arff.attr_position["class"]][1]: confusion_matrices[core_value]["tp"] = 0 confusion_matrices[core_value]["tn"] = 0 confusion_matrices[core_value]["fp"] = 0 confusion_matrices[core_value]["fn"] = 0 for record in training_records: classification = nb.classify_record(record) if classification == record[arff.attr_position["class"]]:
class SentimentAnalyzer:
    """Emotion classifier for headlines, backed by NaiveBayesClassifier.

    Headlines are turned into binary bag-of-words vectors over the
    tokens that occur more than ``threshold`` times in the training
    file.  The six emotions are indexed 0..5 in the order anger,
    disgust, fear, joy, sadness, surprise.
    """

    # A token must occur MORE than this many times to become a feature.
    threshold = 1

    # CSV score columns (note the leading space); the position in this
    # tuple is the emotion's class index.
    _EMOTION_COLUMNS = (' anger', ' disgust', ' fear',
                        ' joy', ' sadness', ' surprise')

    def __init__(self, file_location):
        """Train the analyzer from a CSV file of scored headlines.

        Each row must provide a 'headline' column and the six score
        columns.  Every row yields six training examples, one per
        emotion index, each carrying the same feature vector and that
        emotion's score divided by 100.

        file_location -- path of the training CSV file.
        """
        self.features = set([])
        headlines = []   # one (token set, [six scaled scores]) pair per row
        word_freq = {}

        # NOTE(review): binary mode is what Python 2's csv module
        # expects; Python 3 would need text mode with newline=''.
        with open(file_location, 'rb') as csv_file:
            for row in csv.DictReader(csv_file):
                h_tokens = nltk.word_tokenize(row['headline'].lower())
                for token in h_tokens:
                    word_freq[token] = word_freq.get(token, 0) + 1
                scores = [float(row[column]) / 100
                          for column in self._EMOTION_COLUMNS]
                # A set gives O(1) membership tests when vectorising.
                headlines.append((set(h_tokens), scores))

        for key in word_freq:
            if word_freq[key] > self.threshold:
                self.features.add(key)

        print("F-vec size: " + str(len(self.features)))

        training_data = []
        for token_set, scores in headlines:
            # The vector depends only on the headline, so build it once
            # and hand every emotion its own copy (no list is shared).
            f_vector = [1 if f in token_set else 0 for f in self.features]
            for emotion_index, score in enumerate(scores):
                training_data.append((list(f_vector), emotion_index, score))

        self.classifier = NaiveBayesClassifier(
            len(self._EMOTION_COLUMNS), len(self.features))
        self.classifier.train(training_data)

    def _vectorize(self, text):
        """Return the 0/1 feature vector of ``text`` over self.features."""
        token_set = set(nltk.word_tokenize(text.lower()))
        return [1 if f in token_set else 0 for f in self.features]

    def predict(self, text):
        """Return ``self.classifier.predict`` applied to the vector of ``text``.

        test() treats element 0 of the result as the predicted emotion
        index.
        """
        return self.classifier.predict(self._vectorize(text))

    def predict_all(self, text):
        """Return ``self.classifier.predict_all`` applied to the vector of ``text``."""
        return self.classifier.predict_all(self._vectorize(text))

    def test(self, test_file_location):
        """Return the fraction of rows in a test CSV predicted acceptably.

        A prediction is accepted when it is one of the (at most) three
        highest-scoring emotions whose raw score in the row exceeds 1.

        test_file_location -- path of a CSV file with the same columns
                              as the training file.

        Returns a float in [0, 1]; 0.0 when the file has no data rows
        (previously this raised ZeroDivisionError).
        """
        total = 0
        correct = 0
        # The with-block closes the file even if a row fails to parse.
        with open(test_file_location, 'rb') as test_data:
            for row in csv.DictReader(test_data):
                total += 1
                emotions = [float(row[column])
                            for column in self._EMOTION_COLUMNS]
                acceptable_emotions = [
                    index for index, score in enumerate(emotions)
                    if score > 1]
                acceptable_emotions = sorted(
                    acceptable_emotions, reverse=True,
                    key=lambda index: emotions[index])[:3]
                prediction = self.predict(row['headline'])[0]
                if prediction in acceptable_emotions:
                    correct += 1

        if total == 0:
            return 0.0
        return float(correct) / total
# Collect the subject line(s) of one message file.
# NOTE(review): `fn`, `is_spam` and `data` are defined outside this
# excerpt -- confirm against the surrounding code.
with open(fn, 'r') as file:
    for line in file:
        if line.startswith("Subject:"):
            subject = re.sub(r"^Subject:", "", line).strip()
            data.append((subject, is_spam))


def split_data(data, p):
    """Split data into a leading part of fraction p and the remainder."""
    cut = int(len(data) * p)
    return data[:cut], data[cut:]


def in_random_order(data):
    """Return a new list holding the elements of data in shuffled order."""
    indices = list(range(len(data)))
    random.shuffle(indices)
    return [data[i] for i in indices]


# Fixed seed so the shuffle, and therefore the split, is repeatable.
random.seed(0)
train_data, test_data = split_data(in_random_order(data), 0.75)

classifier = NaiveBayesClassifier()
classifier.train(train_data)

# (subject, actual label, value returned by classify) per test message.
classified = [(message, is_spam, classifier.classify(message))
              for message, is_spam in test_data]

# Tally (actual label, predicted label) pairs; a classify() result above
# 0.5 counts as a spam prediction.
counts = Counter(
    (is_spam, spam_prob > 0.5)
    for (_, is_spam, spam_prob) in classified)
def load_classifier(infile):
    '''
    Load a classifier from a JSON file.

    infile -- path of a file containing JSON text

    Returns the result of NaiveBayesClassifier.unfold_classifier()
    applied to the decoded JSON data.
    '''
    # The with-block closes the file even when reading or decoding fails.
    with open(infile, 'r') as inp:
        jd = json.load(inp)
    return NaiveBayesClassifier.unfold_classifier(jd)