def testExample1(self):
    """Train on 'allelectronics.csv' and expect 'yes' for the sample tuple.

    The model is kept on ``self.model``; the prediction is printed once and
    then computed again for the assertion.
    """
    self.model = NaiveBayes(pd.read_csv('allelectronics.csv'))
    self.model.train()
    query = dict(
        age='youth',
        income='medium',
        student='yes',
        credit_rating='fair',
    )
    print(self.model.predict(query))
    self.assertEqual(self.model.predict(query), 'yes')
def runCV(folders, test_sample):
    """Cross-validate NaiveBayes over ``folders`` and tally votes for one sample.

    Every fold is used once as the test set while all remaining folds are
    concatenated into the training set. Each fold's model is also asked to
    classify ``test_sample``.

    Args:
        folders: sequence of folds, each fold being a list of samples.
        test_sample: extra sample passed to ``test_one`` of every fold model.

    Returns:
        A tuple ``(results_AVG, results_tests)``. ``results_AVG`` maps
        'Accuracy', 'Precision', 'Recall' and 'F1' to their mean over all
        folds (0.0 when ``folders`` is empty). ``results_tests`` maps each
        label returned by ``test_one`` to the number of folds that chose it;
        'yes_8_10' and 'no_1_7' are always present.
    """
    metric_names = ('Accuracy', 'Precision', 'Recall', 'F1')
    results = dict((name, []) for name in metric_names)
    results_tests = {'yes_8_10': 0, 'no_1_7': 0}

    for index, test in enumerate(folders):
        train = []
        for index2, fold in enumerate(folders):
            if index2 != index:
                train.extend(fold)

        NV = NaiveBayes(train, test, False)
        NV.train()
        result = NV.test()

        # Count any label the model returns instead of failing with KeyError
        # on one that is not pre-seeded above.
        label = NV.test_one(test_sample)
        results_tests[label] = results_tests.get(label, 0) + 1

        for metric, value in result.items():
            results[metric].append(value)

    results_AVG = {}
    for metric, values in results.items():
        # float() avoids integer truncation; the guard avoids dividing by
        # zero when no fold produced a value.
        results_AVG[metric] = float(sum(values)) / len(values) if values else 0.0
    return (results_AVG, results_tests)
def test_nb_using_iris(self):
    """Fit NaiveBayes on the iris data and require accuracy above 0.9.

    The model is scored on the same data it was fitted on.
    """
    iris = load_iris()
    features, classes = iris['data'], iris['target']
    model = NaiveBayes()
    model.fit(features, classes)
    predicted = model.predict(features)
    assert accuracy_score(predicted, classes) > 0.9
def runCV(folders, new_data):
    """Classify ``new_data`` with one NaiveBayes model per cross-validation fold.

    Every fold is used once as the test set while all remaining folds are
    concatenated into the training set. Each resulting model classifies
    every sample in ``new_data``.

    Args:
        folders: sequence of folds, each fold being a list of samples.
        new_data: iterable of samples exposing a ``file_name`` attribute.

    Returns:
        A nested dict ``{file_name: {predicted_class: count}}`` where
        ``count`` is the number of fold models that predicted that class
        for that file. Empty when there are no folds or no samples.
    """
    results = {}
    for index, test in enumerate(folders):
        train = []
        for index2, fold in enumerate(folders):
            if index2 != index:
                train.extend(fold)

        NV = NaiveBayes(train, test, False)
        NV.train()

        for sample in new_data:
            result_class = NV.test_one(sample)
            votes = results.setdefault(sample.file_name, {})
            votes[result_class] = votes.get(result_class, 0) + 1
    return results
def test_naive_bayes(self):
    """Fit NaiveBayes on a 15-row two-feature table and require accuracy above 0.7.

    The model is scored on the same rows it was fitted on.
    """
    # First column: 1, 2 or 3 (five rows each); second column: 'S', 'M' or 'L'.
    numbers = [1] * 5 + [2] * 5 + [3] * 5
    sizes = list('SMMSS' + 'SMMLL' + 'LMMLL')
    data = array([[number, size] for number, size in zip(numbers, sizes)])
    labels = array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

    model = NaiveBayes()
    model.fit(data, labels)
    predicted = model.predict(data)
    assert accuracy_score(predicted, labels) > 0.7
from dataset_parser import parse_dataset
from bayes import NaiveBayes
from test_train_split import dataset_split

# Parse the dataset and split it in two; 0.2 is presumably the held-out
# fraction -- confirm against dataset_split.
dataset = parse_dataset()
train, test = dataset_split(dataset, 0.2)

# 'class' is the second constructor argument -- presumably the name of the
# target column; verify against NaiveBayes.
naive_bayes = NaiveBayes(train, 'class')

# Evaluate on the second part of the split and print the result.
score = naive_bayes.evaluate(test)
print(score)
+##+ +#+ ### +#+ +##+ +##+ +## +##+ +#+ +##+ +#+ +## +#+ +#+ +##+ +##+ ###+ +##+ +####++###++ +######### ++####### +###+++ """ digitPercep = PerceptronNetwork(digitWidth * digitHeight, digitY) digitPercep.train(digitWidth, digitHeight, digitTrainingImagesPath, digitTrainingLabelsPath) print "Perceptron guess:" print digitPercep.test_one(digitWidth, digitHeight, digit) digitBayes = NaiveBayes(digitWidth * digitHeight, 10, 2) digitBayes.train(digitWidth, digitHeight, digitTrainingImagesPath, digitTrainingLabelsPath) print "Naive Bayes guess:" print digitBayes.test_one(digitWidth, digitHeight, digit)
from bayes import NaiveBayes
from util import FileOperate
from util import train_test_split
from metrics import accuracy_score

# When running this code, set the playML folder as the source root.
if __name__ == '__main__':
    # 1. Load the data: "spam" marks junk SMS (1), "ham" marks non-junk SMS (0).
    separator = '\t'
    loader = FileOperate('../input/SMSSpamCollection', separator)
    X, y = loader.load_data()

    # 2. Split the dataset into a training set and a test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=666)

    # Train the classifier.
    classifier = NaiveBayes()
    classifier.fit(X_train, y_train)

    # Predict on the test set.
    y_pred = classifier.predict(X_test)

    # Compute and print the accuracy score.
    score = accuracy_score(y_test, y_pred)
    print('准确率:', score)
def __init__(self, categories, path):
    """Build the NaiveBayes model from ``categories`` and remember ``path``.

    ``classified_examples`` starts out as an empty dict.
    """
    self.naive_bayes = NaiveBayes(categories)
    self.classified_examples = {}
    self.path = path
class SpamHamDetector(object):
    """Train a NaiveBayes model on labelled e-mails and classify unlabelled ones.

    All files are read relative to ``path``: ``labels.csv`` (columns ``Id``
    and ``Prediction``), training mails ``TR/TRAIN_<Id>.eml`` and test mails
    ``TT/TEST_<n>.eml``. Predictions are written to ``results.csv``.
    """

    # Ids 1..CORPUS_SIZE are shuffled by train_and_evaluate(); the first
    # TRAINING_SIZE are used for training, the rest for scoring.
    CORPUS_SIZE = 2500
    TRAINING_SIZE = 2250

    def __init__(self, categories, path):
        """Create the model from ``categories`` and remember the data ``path``."""
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        self.classified_examples = dict()

    def _labelled_emails(self):
        """Return ``(email_id, label, filename)`` for every row of labels.csv."""
        with open('%s/labels.csv' % self.path, 'r') as labels_csv:
            return [
                # Uses self.path: the bare name ``path`` is not defined in
                # this class.
                (row['Id'], row['Prediction'],
                 '%s/TR/TRAIN_%s.eml' % (self.path, row['Id']))
                for row in csv.DictReader(labels_csv)
            ]

    def _train_one(self, email_id, label, filename):
        """Train on a single e-mail; a failure is logged and skipped."""
        try:
            body = extract_body(filename)
            self.naive_bayes.train(int(label), body)
        except Exception as e:
            # Best effort: one unreadable mail must not abort training.
            # ``e`` is logged directly because ``e.message`` is not
            # available on Python 3 exceptions.
            logger.info("Error training email %s: %s", email_id, e)

    def train(self):
        """Train the model on every e-mail listed in labels.csv."""
        for email_id, label, filename in self._labelled_emails():
            self._train_one(email_id, label, filename)

    def train_and_evaluate(self):
        """Train on a random subset of ids and score the model on the rest.

        Returns:
            The summary string produced by ``_calculate_results``.
        """
        all_ids = list(range(1, self.CORPUS_SIZE + 1))
        random.shuffle(all_ids)
        # Sets give O(1) membership tests in the loops below.
        training_ids = set(all_ids[:self.TRAINING_SIZE])
        labeling_ids = set(all_ids[self.TRAINING_SIZE:])

        emails = self._labelled_emails()
        for email_id, label, filename in emails:
            if int(email_id) in training_ids:
                self._train_one(email_id, label, filename)

        correct, incorrect = 0, 0
        for email_id, label, filename in emails:
            if int(email_id) not in labeling_ids:
                continue
            try:
                test_body = extract_body(filename)
                result = self.naive_bayes.classify(test_body)
                if result == int(label):
                    correct += 1
                else:
                    incorrect += 1
            except Exception as e:
                logger.info("Error classifying email %s: %s", email_id, e)
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify test mails 1..``size`` and write the results to results.csv.

        Each prediction is stored in ``classified_examples`` under the mail
        number as a string; mails that fail are logged and skipped.
        """
        test = self.path + '/TT/TEST_%s.eml'
        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(
                    self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.info("Error classifying email %s: %s", counter, e)
        self._store_results()

    def display_results(self):
        """Return a text summary of ``classified_examples``.

        Category '0' is counted as spam and '1' as ham. The two shares are
        fractions of all classified examples, 0.0 when there are none.
        """
        categories = list(self.classified_examples.values())
        spam = categories.count('0')
        ham = categories.count('1')
        total = len(categories)
        spam_share = float(spam) / total if total else 0.0
        ham_share = float(ham) / total if total else 0.0
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" \
            % (spam, ham, spam_share, ham_share)

    def _calculate_results(self, correct, incorrect):
        """Format the counts and the fraction correct (0.0 if nothing was scored)."""
        total = correct + incorrect
        performance = float(correct) / total if total else 0.0
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct, incorrect, performance)

    def _store_results(self):
        """Write ``classified_examples`` to results.csv as id/Prediction rows."""
        with open('%s/results.csv' % self.path, 'w+') as resultscsv:
            writer = csv.DictWriter(resultscsv, fieldnames=['id', 'Prediction'])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({'id': example_num, 'Prediction': category})
('fm', 'tech news', 'cnn'), ('fm', 'tech news', 'cnn'), ('fm', 'tech news', 'cnn'), ('fm', 'tech news', 'cnn'), ('fm', 'tech news', 'cnn'), ('fm', 'tech news', 'cnn'), ('fm', 'tech news', 'cnn'), ('fm', 'tech news', 'cnn'), ('fm', 'tech news', 'cnn'), ) cmap = pickle.load(open('../hadoop/thread_views/var/cmap.b', 'rb')) clsfr = FMClassifier(cmap) backend = RedisBackend() bayes = NaiveBayes(backend=backend) bayes.train(training_data) # no we are ready to test the bayes filter # TODO add support for subfeatures in features import time _start = time.time() # bayes.classify(clsfr, ('aldfksjalskdjfasdflapoliticsadlskfajsldfj',), 'cnn') # bayes.classify(clsfr, ('politics', 'aldfksjalspoliticskdjfasdflbusinessapmusicadlhomeskfajsldfj', 'music'), 'cnn') # bayes.classify(clsfr, ('business', 'music', 'love', 'living', 'politics', 'music'), 'cnn', linear_weight_vector=True) # bayes.classify(clsfr, ('tech', 'computers', 'news'), 'cnn') line = 'gaming.www.myvidster.com/video/2797926/PornoTubecom_-_Keymon_Phoenix_Mister_Buck_Dee_Truth_Intrigue_and_Jermany_-_Browsin' bayes.classify(clsfr, line.split('/'), 'myvidster.com', linear_weight_vector=True) print (time.time() - _start), 'seconds'
labels.append('0') for review in islice(reviews,200000,None): if 'Restaurants' in business_dict[review['business_id']]['categories']: if review['votes']['useful'] >= 1: test.append(review['text']) correct_labels.append('1') elif review['votes']['useful'] == 0: test.append(review['text']) correct_labels.append('0') print "data loaded" clfr = NaiveBayes(data,labels, 60, 2000, 50) print "training done" stops = clfr.find_n_most_common_words(50) for i in range(len(stops)): print i, stops[i] max_ent = clfr.max_entropy(20) for i in range(len(max_ent)): print i, max_ent[i][1] """a, b = stops['1'], stops['0'] for i in range(len(a)): print i, a[i], b[i]""" #clfr.find_max_prob_dif()
# -*- coding:utf-8 -*-
from bayes import NaiveBayes


def loadDataSet():
    """Return the toy corpus as ``(train_samples, train_classes, test_samples)``.

    Training samples are token lists; their classes are 0 (good) or 1 (bad).
    Test samples are unlabelled token lists.
    """
    # (class, tokens) pairs -- 0:good; 1:bad
    labelled = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        # NOTE(review): ' and' carries a leading space in the data; kept as-is.
        (0, ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    train_samples = [tokens for _, tokens in labelled]
    train_classes = [klass for klass, _ in labelled]
    test_samples = [
        ['love', 'my', 'girl', 'friend'],
        ['stupid', 'garbage'],
        ['Haha', 'I', 'really', "Love", "You"],
        ['This', 'is', "my", "dog"],
    ]
    return train_samples, train_classes, test_samples


if __name__ == "__main__":
    samples, classes, unseen = loadDataSet()
    clf = NaiveBayes()
    clf.train(samples, classes)
    # Classify every test sample; the return value of classify() is not used here.
    for tokens in unseen:
        clf.classify(tokens)
# Report the perceptron measurements collected earlier in the script.
print "times: {}".format(digitPercepTimes)
print "means: {}".format(digitPercepAvgs)
print "stds: {}".format(digitPercepStds)

# naive bayes classification
#print "---------- Naive Bayes ----------"
# One entry per value in `percents`: mean accuracy, accuracy standard
# deviation and mean training time.
digitBayesAvgs = []
digitBayesStds = []
digitBayesTimes = []
for percent in percents:
    # Passed as the last argument to train(); presumably the fraction of the
    # training data to use -- confirm against NaiveBayes.train.
    p = percent / 10.0
    # NOTE(review): x is never read in the visible code; the repetition count
    # below is the literal 5.
    x = 5
    res = []
    times = []
    for i in range(0, 5):
        # A fresh classifier for every repetition.
        digitBayes = NaiveBayes(digitWidth * digitHeight, 10, 2)
        # Only the train() call is timed.
        t1 = time.time()
        digitBayes.train(digitWidth, digitHeight, digitTrainingImagesPath, digitTrainingLabelsPath, p)
        dt = time.time() - t1
        percentageCorrect = digitBayes.test(digitWidth, digitHeight, digitTestImagesPath, digitTestLabelsPath)
        res.append(percentageCorrect)
        times.append(dt)
    # Aggregate the five repetitions for this percentage.
    avgTime = mean(times)
    avgAcc = mean(res)
    stdAcc = stddev(res)
    digitBayesAvgs.append(avgAcc)
    digitBayesStds.append(stdAcc)
    digitBayesTimes.append(avgTime)
#!/usr/bin/env python import cPickle as pickle import json import time from backends import RedisBackend from bayes import NaiveBayes from classifiers import FMClassifier from optparse import OptionParser parser = OptionParser(conflict_handler='resolve') parser.add_option('-h', dest='host') parser.add_option('-p', '--port', dest='port') options, args = parser.parse_args() clsfr = FMClassifier(pickle.load(open('/Users/georgecourtsunis/projects/disqus/disqus/analytics/hadoop/thread_views/var/cmap.b', 'rb'))) backend = RedisBackend(host=options.host, port=options.port) bayes = NaiveBayes(backend=backend) _start = time.time() for file_name in args: print 'Training file %s' % file_name fd = open(file_name, 'r') _counter = 0 for line in fd: _counter += 1 if _counter % 100000 == 0: print _counter, (time.time() - _start) # if _counter % 1000000 == 0: # break # grab args vector, count = line.split('\t') vector = json.loads(vector)
labels.append('0') for review in islice(reviews,200000,None): if 'Restaurants' in business_dict[review['business_id']]['categories']: if review['votes']['useful'] >= 1: test.append(review['text']) correct_labels.append('1') elif review['votes']['useful'] == 0: test.append(review['text']) correct_labels.append('0') print "data loaded" clfr = NaiveBayes(data,labels) print "training done" #stops = clfr.find_n_most_common_words(50) #for i in range(len(stops)): # print i, stops[i] max_ent = clfr.max_entropy(20) for i in range(len(max_ent)): print i, max_ent[i] """a, b = stops['1'], stops['0'] for i in range(len(a)): print i, a[i], b[i]""" #clfr.find_max_prob_dif()
from backends import RedisBackend
from bayes import NaiveBayes
from classifiers import FMClassifier
from optparse import OptionParser

# conflict_handler='resolve' lets '-h' be redefined as the host option
# instead of optparse's built-in help flag.
parser = OptionParser(conflict_handler='resolve')
parser.add_option('-h', dest='host')
parser.add_option('-p', '--port', dest='port')
# Positional arguments are the training files processed below.
options, args = parser.parse_args()

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a trusted cmap. The path is an absolute, machine-specific location and
# the file handle is never explicitly closed.
clsfr = FMClassifier(
    pickle.load(
        open(
            '/Users/georgecourtsunis/projects/disqus/disqus/analytics/hadoop/thread_views/var/cmap.b',
            'rb')))
backend = RedisBackend(host=options.host, port=options.port)
bayes = NaiveBayes(backend=backend)

_start = time.time()
for file_name in args:
    print 'Training file %s' % file_name
    # NOTE(review): fd is not closed in the visible part of this loop.
    fd = open(file_name, 'r')
    _counter = 0
    for line in fd:
        _counter += 1
        # Progress report every 100000 lines: line count and seconds elapsed.
        if _counter % 100000 == 0:
            print _counter, (time.time() - _start)
        # if _counter % 1000000 == 0:
        #     break
        # grab args
        # Each line holds exactly two tab-separated fields; the first is JSON.
        vector, count = line.split('\t')
        vector = json.loads(vector)
def __init__(self, categories, path):
    """Set up the detector.

    A NaiveBayes model is created from ``categories``, ``path`` is stored
    unchanged, and ``classified_examples`` begins as an empty dict.
    """
    self.naive_bayes = NaiveBayes(categories)
    self.classified_examples = {}
    self.path = path
class SpamHamDetector(object):
    """Train a NaiveBayes model on labelled e-mails and classify unlabelled ones.

    All files are read relative to ``path``: ``labels.csv`` (columns ``Id``
    and ``Prediction``), training mails ``TR/TRAIN_<Id>.eml`` and test mails
    ``TT/TEST_<n>.eml``. Predictions are written to ``results.csv``.
    """

    # Ids 1..CORPUS_SIZE are shuffled by train_and_evaluate(); the first
    # TRAINING_SIZE are used for training, the rest for scoring.
    CORPUS_SIZE = 2500
    TRAINING_SIZE = 2250

    def __init__(self, categories, path):
        """Create the model from ``categories`` and remember the data ``path``."""
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        self.classified_examples = dict()

    def _labelled_emails(self):
        """Return ``(email_id, label, filename)`` for every row of labels.csv."""
        with open("{0}/labels.csv".format(self.path), "r") as labels_csv:
            return [
                # Uses self.path: the bare name ``path`` is not defined in
                # this class.
                (
                    row["Id"],
                    row["Prediction"],
                    "%s/TR/TRAIN_%s.eml" % (self.path, row["Id"]),
                )
                for row in csv.DictReader(labels_csv)
            ]

    def _train_one(self, email_id, label, filename):
        """Train on a single e-mail; a failure is logged and skipped."""
        try:
            body = extract_body(filename)
            self.naive_bayes.train(int(label), body)
        except Exception as e:
            # Best effort: one unreadable mail must not abort training.
            # ``e`` is logged directly because ``e.message`` is not
            # available on Python 3 exceptions.
            logger.info("Error training email %s: %s", email_id, e)

    def train(self):
        """Train the model on every e-mail listed in labels.csv."""
        for email_id, label, filename in self._labelled_emails():
            self._train_one(email_id, label, filename)

    def train_and_evaluate(self):
        """Train on a random subset of ids and score the model on the rest.

        Returns:
            The summary string produced by ``_calculate_results``.
        """
        all_ids = list(range(1, self.CORPUS_SIZE + 1))
        random.shuffle(all_ids)
        # Sets give O(1) membership tests in the loops below.
        training_ids = set(all_ids[: self.TRAINING_SIZE])
        labeling_ids = set(all_ids[self.TRAINING_SIZE :])

        emails = self._labelled_emails()
        for email_id, label, filename in emails:
            if int(email_id) in training_ids:
                self._train_one(email_id, label, filename)

        correct, incorrect = 0, 0
        for email_id, label, filename in emails:
            if int(email_id) not in labeling_ids:
                continue
            try:
                test_body = extract_body(filename)
                result = self.naive_bayes.classify(test_body)
                if result == int(label):
                    correct += 1
                else:
                    incorrect += 1
            except Exception as e:
                logger.info("Error classifying email %s: %s", email_id, e)
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify test mails 1..``size`` and write the results to results.csv.

        Each prediction is stored in ``classified_examples`` under the mail
        number as a string; mails that fail are logged and skipped.
        """
        test = self.path + "/TT/TEST_%s.eml"
        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.info("Error classifying email %s: %s", counter, e)
        self._store_results()

    def display_results(self):
        """Return a text summary of ``classified_examples``.

        Category "0" is counted as spam and "1" as ham. The two shares are
        fractions of all classified examples, 0.0 when there are none.
        """
        categories = list(self.classified_examples.values())
        spam = categories.count("0")
        ham = categories.count("1")
        total = len(categories)
        spam_share = float(spam) / total if total else 0.0
        ham_share = float(ham) / total if total else 0.0
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" % (
            spam,
            ham,
            spam_share,
            ham_share,
        )

    def _calculate_results(self, correct, incorrect):
        """Format the counts and the fraction correct (0.0 if nothing was scored)."""
        total = correct + incorrect
        performance = float(correct) / total if total else 0.0
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct,
            incorrect,
            performance,
        )

    def _store_results(self):
        """Write ``classified_examples`` to results.csv as id/Prediction rows."""
        with open("%s/results.csv" % self.path, "w+") as resultscsv:
            writer = csv.DictWriter(resultscsv, fieldnames=["id", "Prediction"])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({"id": example_num, "Prediction": category})