Ejemplo n.º 1
0
 def testExample1(self):
     """Train on the AllElectronics CSV and check a single prediction.

     Reads ``allelectronics.csv`` with pandas, trains a ``NaiveBayes`` model
     on it, prints the prediction for one sample and asserts that the
     prediction equals ``'yes'``.
     """
     self.model = NaiveBayes(pd.read_csv('allelectronics.csv'))
     self.model.train()
     sample = {
         'age': 'youth',
         'income': 'medium',
         'student': 'yes',
         'credit_rating': 'fair',
     }
     # The model is queried twice: once for the console output and once
     # for the assertion.
     print(self.model.predict(sample))
     self.assertEqual(self.model.predict(sample), 'yes')
Ejemplo n.º 2
0
def runCV(folders, test_sample):
    """Cross-validate a NaiveBayes classifier, leaving one fold out per round.

    For every fold a classifier is constructed from the concatenation of all
    other folds (the fold itself is passed as the second constructor
    argument), trained, and its ``test()`` metrics are collected. Each
    round's classifier is also asked to classify ``test_sample`` through
    ``test_one()``.

    Args:
        folders: sequence of folds; each fold is a list of samples.
        test_sample: one extra sample classified by every round's model.

    Returns:
        A tuple ``(results_AVG, results_tests)``. ``results_AVG`` maps each
        metric name to its mean over the rounds (``0.0`` when no value was
        collected, e.g. when ``folders`` is empty). ``results_tests`` counts
        how many rounds returned each label for ``test_sample``.

    Raises:
        KeyError: if ``test()`` reports an unknown metric or ``test_one()``
            returns a label other than ``'yes_8_10'`` / ``'no_1_7'``.
    """
    metric_names = ('Accuracy', 'Precision', 'Recall', 'F1')
    results = dict((metric, []) for metric in metric_names)
    results_tests = {'yes_8_10': 0, 'no_1_7': 0}

    for index, test in enumerate(folders):
        # Training data is every fold except the one currently held out.
        train = []
        for index2, fold in enumerate(folders):
            if index2 != index:
                train.extend(fold)

        NV = NaiveBayes(train, test, False)
        NV.train()
        result = NV.test()
        results_tests[NV.test_one(test_sample)] += 1

        for metric, value in result.items():
            results[metric].append(value)

    results_AVG = {}
    for metric, values in results.items():
        # float() keeps integer metrics from being truncated by Python 2
        # integer division; the guard avoids dividing by zero.
        results_AVG[metric] = float(sum(values)) / len(values) if values else 0.0

    return (results_AVG, results_tests)
 def test_nb_using_iris(self):
     """Fit NaiveBayes on the iris data and require accuracy above 0.9.

     Accuracy is measured on the same samples the model was fitted on.
     """
     bunch = load_iris()
     features, classes = bunch['data'], bunch['target']
     model = NaiveBayes()
     model.fit(features, classes)
     predicted = model.predict(features)
     assert accuracy_score(predicted, classes) > 0.9
Ejemplo n.º 4
0
def runCV(folders, new_data):
    """Classify every new sample with one NaiveBayes model per fold.

    For each fold a classifier is constructed from the concatenation of all
    other folds (the fold itself is passed as the second constructor
    argument) and trained. Every sample in ``new_data`` is then classified
    with ``test_one`` and the returned class is tallied under the sample's
    ``file_name``.

    Returns:
        dict mapping ``sample.file_name`` to a dict of
        ``{result_class: number of models that returned it}``.
    """
    results = {}

    for held_out, test in enumerate(folders):
        # Training data is every fold except the one at ``held_out``.
        train = []
        for position, fold in enumerate(folders):
            if position != held_out:
                train.extend(fold)

        NV = NaiveBayes(train, test, False)
        NV.train()

        for sample in new_data:
            result_class = NV.test_one(sample)
            per_file = results.setdefault(sample.file_name, {})
            per_file[result_class] = per_file.get(result_class, 0) + 1

    return results
    def test_naive_bayes(self):
        """Fit NaiveBayes on a 15-row toy dataset and require accuracy > 0.7.

        Every row pairs a number (1-3) with a letter ('S', 'M' or 'L');
        labels are -1 or 1. Accuracy is measured on the same rows the model
        was fitted on.
        """
        rows = [
            [1, 'S'], [1, 'M'], [1, 'M'], [1, 'S'], [1, 'S'],
            [2, 'S'], [2, 'M'], [2, 'M'], [2, 'L'], [2, 'L'],
            [3, 'L'], [3, 'M'], [3, 'M'], [3, 'L'], [3, 'L'],
        ]
        data = array(rows)
        labels = array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

        model = NaiveBayes()
        model.fit(data, labels)
        predicted = model.predict(data)
        assert accuracy_score(predicted, labels) > 0.7
Ejemplo n.º 6
0
from dataset_parser import parse_dataset
from bayes import NaiveBayes
from test_train_split import dataset_split

# Parse the dataset, split it (second argument 0.2), build the classifier
# from the training part with 'class' as its second argument, then print
# the score it reports for the test part.
dataset = parse_dataset()
train, test = dataset_split(dataset, 0.2)
naive_bayes = NaiveBayes(train, 'class')
score = naive_bayes.evaluate(test)
print(score)
Ejemplo n.º 7
0
         +##+     +#+       
         ###      +#+       
        +##+      +##+      
        +##       +##+      
        +#+       +##+      
        +#+       +##       
        +#+       +#+       
        +##+     +##+       
         ###+    +##+       
         +####++###++       
         +#########         
          ++#######         
            +###+++         
                            
                            
                            
                            """

# Train a perceptron on the digit training files and print what it
# returns for the ``digit`` image defined above.
digitPercep = PerceptronNetwork(digitWidth * digitHeight, digitY)
digitPercep.train(
    digitWidth, digitHeight, digitTrainingImagesPath, digitTrainingLabelsPath)

print("Perceptron guess:")
print(digitPercep.test_one(digitWidth, digitHeight, digit))

# Do the same with a Naive Bayes classifier.
digitBayes = NaiveBayes(digitWidth * digitHeight, 10, 2)
digitBayes.train(
    digitWidth, digitHeight, digitTrainingImagesPath, digitTrainingLabelsPath)

print("Naive Bayes guess:")
print(digitBayes.test_one(digitWidth, digitHeight, digit))
Ejemplo n.º 8
0
from bayes import NaiveBayes
from util import FileOperate
from util import train_test_split
from metrics import accuracy_score

# When running this code, set the playML folder as the source root folder.

if __name__ == '__main__':
    # 1. Load the data: "spam" marks a junk SMS (1), "ham" a normal SMS (0).
    data_path = '../input/SMSSpamCollection'
    label = '\t'
    fo = FileOperate(data_path, label)
    X, y = fo.load_data()

    # 2. Split the data into a training set and a test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=666)

    # 3. Train.
    nb = NaiveBayes()
    nb.fit(X_train, y_train)

    # 4. Predict on the test set.
    y_pred = nb.predict(X_test)

    # 5. Compute and print the accuracy score.
    score = accuracy_score(y_test, y_pred)
    print('准确率:', score)
 def __init__(self, categories, path):
     """Set up the detector.

     Builds a ``NaiveBayes`` classifier from ``categories``, remembers
     ``path`` and starts with an empty mapping of classified examples.
     """
     self.naive_bayes = NaiveBayes(categories)
     self.path = path
     self.classified_examples = {}
Ejemplo n.º 10
0
class SpamHamDetector(object):
    """Train and apply a NaiveBayes classifier on ``.eml`` files.

    The directory ``path`` is read for ``labels.csv`` (columns ``Id`` and
    ``Prediction``), training mails ``TR/TRAIN_<id>.eml`` and test mails
    ``TT/TEST_<n>.eml``. Results are written to ``results.csv`` in the same
    directory. ``display_results`` counts category ``'0'`` as spam and
    ``'1'`` as ham.
    """

    def __init__(self, categories, path):
        """Build the classifier for ``categories`` and remember ``path``."""
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        # Maps test mail number (str) to predicted category (str).
        self.classified_examples = dict()

    def _labelled_rows(self):
        """Yield ``(mail_id, label, filename)`` for each row of labels.csv."""
        with open('%s/labels.csv' % self.path, 'r') as labels_csv:
            for row in csv.DictReader(labels_csv):
                # Build the name from self.path; the previous code read a
                # module-level ``path`` that this class never defines.
                filename = '%s/TR/TRAIN_%s.eml' % (self.path, row['Id'])
                yield row['Id'], row['Prediction'], filename

    def _train_one(self, mail_id, label, filename):
        """Train on one mail; log and skip it when anything goes wrong."""
        try:
            body = extract_body(filename)
            self.naive_bayes.train(int(label), body)
        except Exception as e:
            # Log the exception itself: ``e.message`` is absent on Python 3.
            logger.info("Error training email %s: %s", mail_id, e)

    def train(self):
        """Train the classifier on every mail listed in labels.csv."""
        for mail_id, label, filename in self._labelled_rows():
            self._train_one(mail_id, label, filename)

    def train_and_evaluate(self):
        """Train on a random 2250 of the ids 1-2500 and score on the rest.

        Returns:
            The summary string produced by ``_calculate_results``.
        """
        all_ids = list(range(1, 2501))
        random.shuffle(all_ids)
        # Sets give O(1) membership tests for every labels.csv row.
        training_ids = set(all_ids[:2250])
        labeling_ids = set(all_ids[2250:])

        for mail_id, label, filename in self._labelled_rows():
            if int(mail_id) in training_ids:
                self._train_one(mail_id, label, filename)

        correct, incorrect = 0, 0
        for mail_id, label, filename in self._labelled_rows():
            if int(mail_id) not in labeling_ids:
                continue
            try:
                test_body = extract_body(filename)
                result = self.naive_bayes.classify(test_body)
                if result == int(label):
                    correct += 1
                else:
                    incorrect += 1
            except Exception as e:
                logger.info("Error classifying email %s: %s", mail_id, e)
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify test mails 1..``size`` and write them to results.csv.

        Mails that cannot be read or classified are logged and skipped.
        """
        test = self.path + '/TT/TEST_%s.eml'

        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(
                    self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.info("Error classifying email %s: %s", counter, e)

        self._store_results()

    def display_results(self):
        """Return a text summary of spam/ham counts and their shares.

        The shares are ``0.0`` when nothing has been classified yet.
        """
        categories = list(self.classified_examples.values())
        total = len(categories)
        spam = categories.count('0')
        ham = categories.count('1')
        spam_share = float(spam) / total if total else 0.0
        ham_share = float(ham) / total if total else 0.0
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" \
               % (spam, ham, spam_share, ham_share)

    def _calculate_results(self, correct, incorrect):
        """Return a summary string; the ratio is ``0.0`` with no samples."""
        total = correct + incorrect
        performance = float(correct) / total if total else 0.0
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct, incorrect, performance)

    def _store_results(self):
        """Write ``classified_examples`` to ``results.csv`` under ``path``."""
        with open('%s/results.csv' % self.path, 'w+') as resultscsv:
            writer = csv.DictWriter(resultscsv,
                                    fieldnames=['id', 'Prediction'])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({'id': example_num, 'Prediction': category})
Ejemplo n.º 11
0
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
)

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a cmap.b that comes from a trusted source.
# The ``with`` block closes the file handle once the map is loaded.
with open('../hadoop/thread_views/var/cmap.b', 'rb') as cmap_file:
    cmap = pickle.load(cmap_file)
clsfr = FMClassifier(cmap)
backend = RedisBackend()
bayes = NaiveBayes(backend=backend)
bayes.train(training_data)
# now we are ready to test the bayes filter

# TODO add support for subfeatures in features

import time
_start = time.time()
# bayes.classify(clsfr, ('aldfksjalskdjfasdflapoliticsadlskfajsldfj',), 'cnn')
# bayes.classify(clsfr, ('politics', 'aldfksjalspoliticskdjfasdflbusinessapmusicadlhomeskfajsldfj', 'music'), 'cnn')
# bayes.classify(clsfr, ('business', 'music', 'love', 'living', 'politics', 'music'), 'cnn', linear_weight_vector=True)
# bayes.classify(clsfr, ('tech', 'computers', 'news'), 'cnn')
line = 'gaming.www.myvidster.com/video/2797926/PornoTubecom_-_Keymon_Phoenix_Mister_Buck_Dee_Truth_Intrigue_and_Jermany_-_Browsin'
bayes.classify(clsfr, line.split('/'), 'myvidster.com', linear_weight_vector=True)
# Single-argument form prints "<elapsed> seconds" on both Python 2 and 3.
print('%s seconds' % (time.time() - _start))
Ejemplo n.º 12
0
            labels.append('0')

# Build the evaluation set from the reviews after the first 200000.
# Only reviews of businesses whose categories include 'Restaurants' are
# used: label '1' for at least one "useful" vote, '0' for exactly zero.
# Any other vote count matches neither branch and is skipped.
for review in islice(reviews,200000,None):
    if 'Restaurants' in business_dict[review['business_id']]['categories']:

        if review['votes']['useful'] >= 1:
            test.append(review['text'])
            correct_labels.append('1')
        elif review['votes']['useful'] == 0:
            test.append(review['text'])
            correct_labels.append('0')

print "data loaded"


# NOTE(review): the meaning of the three numeric arguments is not visible
# here; the message below suggests the constructor does the training --
# confirm against the NaiveBayes class.
clfr = NaiveBayes(data,labels, 60, 2000, 50)
print "training done"
# Print each entry returned by find_n_most_common_words(50) with its index.
stops = clfr.find_n_most_common_words(50)
for i in range(len(stops)):
    print i, stops[i]

# Print element [1] of each entry returned by max_entropy(20) with its index.
max_ent = clfr.max_entropy(20)
for i in range(len(max_ent)):
    print i, max_ent[i][1]

# The string literal below is disabled code kept as a bare expression;
# it has no effect at runtime.
"""a, b = stops['1'], stops['0']
for i in range(len(a)):
    print i, a[i], b[i]"""
#clfr.find_max_prob_dif()

Ejemplo n.º 13
0
# -*- coding:utf-8 -*-
from bayes import NaiveBayes


def loadDataSet():
    """Return the toy corpus used by the demo.

    Returns:
        A tuple ``(train_samples, train_classes, test_samples)`` where
        ``train_samples`` is a list of token lists, ``train_classes`` holds
        the matching labels (0: good, 1: bad) and ``test_samples`` is a list
        of unlabeled token lists.
    """
    # Each training entry is written as (label, tokens) and split below.
    labelled = [
        (0, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (0, ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (0, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    train_samples = [tokens for _, tokens in labelled]
    train_classes = [label for label, _ in labelled]
    test_samples = [
        ['love', 'my', 'girl', 'friend'],
        ['stupid', 'garbage'],
        ['Haha', 'I', 'really', "Love", "You"],
        ['This', 'is', "my", "dog"],
    ]
    return train_samples, train_classes, test_samples


if __name__ == "__main__":
    # Fit the classifier on the toy corpus, then classify every test
    # sample; the return value of classify() is not used here.
    train_samples, train_classes, test_samples = loadDataSet()
    clf = NaiveBayes()
    clf.train(train_samples, train_classes)
    for item in test_samples:
        clf.classify(item)
# Report the perceptron measurements collected earlier in the script.
print("times: {}".format(digitPercepTimes))
print("means: {}".format(digitPercepAvgs))
print("stds: {}".format(digitPercepStds))

# Naive Bayes classification: for every entry of ``percents`` run five
# train/test rounds and record the mean training time plus the mean and
# standard deviation of the value returned by test().
digitBayesAvgs = []
digitBayesStds = []
digitBayesTimes = []
for percent in percents:
    # Passed to train() as its last argument; presumably the fraction of
    # the training data to use -- confirm against NaiveBayes.train.
    p = percent / 10.0
    x = 5
    res, times = [], []
    for i in range(5):
        digitBayes = NaiveBayes(digitWidth * digitHeight, 10, 2)
        # Only the train() call is timed.
        t1 = time.time()
        digitBayes.train(
            digitWidth, digitHeight,
            digitTrainingImagesPath, digitTrainingLabelsPath, p)
        dt = time.time() - t1
        percentageCorrect = digitBayes.test(
            digitWidth, digitHeight, digitTestImagesPath, digitTestLabelsPath)
        res.append(percentageCorrect)
        times.append(dt)
    avgTime = mean(times)
    avgAcc = mean(res)
    stdAcc = stddev(res)
    digitBayesAvgs.append(avgAcc)
    digitBayesStds.append(stdAcc)
    digitBayesTimes.append(avgTime)
Ejemplo n.º 15
0
#!/usr/bin/env python
import cPickle as pickle
import json
import time
from backends import RedisBackend
from bayes import NaiveBayes
from classifiers import FMClassifier
from optparse import OptionParser
# Command-line options: -h <host> and -p/--port <port>; the remaining
# positional arguments are the training files to read.
# conflict_handler='resolve' lets '-h' be redefined as the host option
# instead of optparse's built-in help flag.
parser = OptionParser(conflict_handler='resolve')
parser.add_option('-h', dest='host')
parser.add_option('-p', '--port', dest='port')
options, args = parser.parse_args()

# NOTE(review): the pickle file is opened without being closed, and
# pickle.load should only ever be given a trusted file.
clsfr = FMClassifier(pickle.load(open('/Users/georgecourtsunis/projects/disqus/disqus/analytics/hadoop/thread_views/var/cmap.b', 'rb')))
backend = RedisBackend(host=options.host, port=options.port)
bayes = NaiveBayes(backend=backend)

_start = time.time()
for file_name in args:
    print 'Training file %s' % file_name
    # NOTE(review): fd is not closed anywhere in the visible code.
    fd = open(file_name, 'r')
    _counter = 0
    for line in fd:
        _counter += 1
        # Progress report every 100000 lines: line count and elapsed seconds.
        if _counter % 100000 == 0:
            print _counter, (time.time() - _start)
        # if _counter % 1000000 == 0:
        #     break
        # grab args
        # Each line is "<JSON vector>\t<count>".
        vector, count = line.split('\t')
        vector = json.loads(vector)
        # NOTE(review): the snippet appears to be cut off here; vector and
        # count are not used in the visible code.
Ejemplo n.º 16
0
            labels.append('0')

# Build the evaluation set from the reviews after the first 200000.
# Only reviews of businesses whose categories include 'Restaurants' are
# used: label '1' for at least one "useful" vote, '0' for exactly zero.
# Any other vote count matches neither branch and is skipped.
for review in islice(reviews,200000,None):
    if 'Restaurants' in business_dict[review['business_id']]['categories']:

        if review['votes']['useful'] >= 1:
            test.append(review['text'])
            correct_labels.append('1')
        elif review['votes']['useful'] == 0:
            test.append(review['text'])
            correct_labels.append('0')

print "data loaded"


# NOTE(review): the message below suggests the constructor does the
# training -- confirm against the NaiveBayes class.
clfr = NaiveBayes(data,labels)
print "training done"
#stops = clfr.find_n_most_common_words(50)
#for i in range(len(stops)):
#    print i, stops[i]

# Print each entry returned by max_entropy(20) together with its index.
max_ent = clfr.max_entropy(20)
for i in range(len(max_ent)):
    print i, max_ent[i]

# The string literal below is disabled code kept as a bare expression;
# it has no effect at runtime.
"""a, b = stops['1'], stops['0']
for i in range(len(a)):
    print i, a[i], b[i]"""
#clfr.find_max_prob_dif()

Ejemplo n.º 17
0
from backends import RedisBackend
from bayes import NaiveBayes
from classifiers import FMClassifier
from optparse import OptionParser
# Command-line options: -h <host> and -p/--port <port>; the remaining
# positional arguments are the training files to read.
# conflict_handler='resolve' lets '-h' be redefined as the host option
# instead of optparse's built-in help flag.
parser = OptionParser(conflict_handler='resolve')
parser.add_option('-h', dest='host')
parser.add_option('-p', '--port', dest='port')
options, args = parser.parse_args()

# NOTE(review): the pickle file is opened without being closed, and
# pickle.load should only ever be given a trusted file.
clsfr = FMClassifier(
    pickle.load(
        open(
            '/Users/georgecourtsunis/projects/disqus/disqus/analytics/hadoop/thread_views/var/cmap.b',
            'rb')))
backend = RedisBackend(host=options.host, port=options.port)
bayes = NaiveBayes(backend=backend)

_start = time.time()
for file_name in args:
    print 'Training file %s' % file_name
    # NOTE(review): fd is not closed anywhere in the visible code.
    fd = open(file_name, 'r')
    _counter = 0
    for line in fd:
        _counter += 1
        # Progress report every 100000 lines: line count and elapsed seconds.
        if _counter % 100000 == 0:
            print _counter, (time.time() - _start)
        # if _counter % 1000000 == 0:
        #     break
        # grab args
        # Each line is "<JSON vector>\t<count>".
        vector, count = line.split('\t')
        vector = json.loads(vector)
        # NOTE(review): the snippet appears to be cut off here; vector and
        # count are not used in the visible code.
Ejemplo n.º 18
0
 def __init__(self, categories, path):
     """Set up the detector.

     Builds a ``NaiveBayes`` classifier from ``categories``, remembers
     ``path`` and starts with an empty mapping of classified examples.
     """
     self.naive_bayes = NaiveBayes(categories)
     self.path = path
     self.classified_examples = {}
class SpamHamDetector(object):
    """Train and apply a NaiveBayes classifier on ``.eml`` files.

    The directory ``path`` is read for ``labels.csv`` (columns ``Id`` and
    ``Prediction``), training mails ``TR/TRAIN_<id>.eml`` and test mails
    ``TT/TEST_<n>.eml``. Results are written to ``results.csv`` in the same
    directory. ``display_results`` counts category ``"0"`` as spam and
    ``"1"`` as ham.
    """

    def __init__(self, categories, path):
        """Build the classifier for ``categories`` and remember ``path``."""
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        # Maps test mail number (str) to predicted category (str).
        self.classified_examples = dict()

    def _labelled_rows(self):
        """Yield ``(mail_id, label, filename)`` for each row of labels.csv."""
        with open("%s/labels.csv" % self.path, "r") as labels_csv:
            for row in csv.DictReader(labels_csv):
                # Build the name from self.path; the previous code read a
                # module-level ``path`` that this class never defines.
                filename = "%s/TR/TRAIN_%s.eml" % (self.path, row["Id"])
                yield row["Id"], row["Prediction"], filename

    def _train_one(self, mail_id, label, filename):
        """Train on one mail; log and skip it when anything goes wrong."""
        try:
            body = extract_body(filename)
            self.naive_bayes.train(int(label), body)
        except Exception as e:
            # Log the exception itself: ``e.message`` is absent on Python 3.
            logger.info("Error training email %s: %s", mail_id, e)

    def train(self):
        """Train the classifier on every mail listed in labels.csv."""
        for mail_id, label, filename in self._labelled_rows():
            self._train_one(mail_id, label, filename)

    def train_and_evaluate(self):
        """Train on a random 2250 of the ids 1-2500 and score on the rest.

        Returns:
            The summary string produced by ``_calculate_results``.
        """
        all_ids = list(range(1, 2501))
        random.shuffle(all_ids)
        # Sets give O(1) membership tests for every labels.csv row.
        training_ids = set(all_ids[:2250])
        labeling_ids = set(all_ids[2250:])

        for mail_id, label, filename in self._labelled_rows():
            if int(mail_id) in training_ids:
                self._train_one(mail_id, label, filename)

        correct, incorrect = 0, 0
        for mail_id, label, filename in self._labelled_rows():
            if int(mail_id) not in labeling_ids:
                continue
            try:
                test_body = extract_body(filename)
                result = self.naive_bayes.classify(test_body)
                if result == int(label):
                    correct += 1
                else:
                    incorrect += 1
            except Exception as e:
                logger.info("Error classifying email %s: %s", mail_id, e)
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        """Classify test mails 1..``size`` and write them to results.csv.

        Mails that cannot be read or classified are logged and skipped.
        """
        test = self.path + "/TT/TEST_%s.eml"

        for counter in range(1, size + 1):
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.info("Error classifying email %s: %s", counter, e)

        self._store_results()

    def display_results(self):
        """Return a text summary of spam/ham counts and their shares.

        The shares are ``0.0`` when nothing has been classified yet.
        """
        categories = list(self.classified_examples.values())
        total = len(categories)
        spam = categories.count("0")
        ham = categories.count("1")
        spam_share = float(spam) / total if total else 0.0
        ham_share = float(ham) / total if total else 0.0
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" % (
            spam,
            ham,
            spam_share,
            ham_share,
        )

    def _calculate_results(self, correct, incorrect):
        """Return a summary string; the ratio is ``0.0`` with no samples."""
        total = correct + incorrect
        performance = float(correct) / total if total else 0.0
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct,
            incorrect,
            performance,
        )

    def _store_results(self):
        """Write ``classified_examples`` to ``results.csv`` under ``path``."""
        with open("%s/results.csv" % self.path, "w+") as resultscsv:
            writer = csv.DictWriter(resultscsv, fieldnames=["id", "Prediction"])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({"id": example_num, "Prediction": category})