Code Example #1
File: bayesTest.py  Project: dineshdb/data-mining
 def testExample1(self):
     dataset = pd.read_csv('allelectronics.csv')
     self.model = NaiveBayes(dataset)
     self.model.train()
     datatuple = {'age':'youth','income':'medium','student':'yes','credit_rating':'fair'}
     print(self.model.predict(datatuple))
     self.assertEqual(self.model.predict(datatuple),'yes')
Code Example #2
def runCV(folders,test_sample):
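	# Cross-validation over `folders`: each fold serves once as the test set while the
	# remaining folds are concatenated into the training set; per-fold metrics are
	# averaged, and each fold model's prediction for `test_sample` is tallied.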
	
	results = {'Accuracy':[],'Precision':[],'Recall':[],'F1':[]}
	results_AVG = {'Accuracy':[],'Precision':[],'Recall':[],'F1':[]}
	
	results_tests = {'yes_8_10':0,'no_1_7':0}
	
	for index in xrange(len(folders)):
		test = folders[index]
		train = []
		
		for index2 in xrange(len(folders)):
			if index2 != index:
				train.extend(folders[index2])	
		
		NV = NaiveBayes(train,test,False)
		NV.train()
		result = NV.test()
		results_tests[NV.test_one(test_sample)] += 1
		
		for metric,value in result.iteritems():
			results[metric].append(value)
		
	for metric, values in results.iteritems():
		results_AVG[metric] = sum(values)/len(values)

	return (results_AVG,results_tests)
Code Example #3
 def test_nb_using_iris(self):
     iris = load_iris()
     data = iris['data']
     target = iris['target']
     nb = NaiveBayes()
     nb.fit(data, target)
     preds = nb.predict(data)
     assert accuracy_score(preds, target) > 0.9
Code Example #4
def runCV(folders, new_data):
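	# Cross-validation variant: every fold model classifies each sample in `new_data`,
	# and the counts of predicted classes are accumulated per sample file name.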
	
	# results = {'Accuracy':[],'Precision':[],'Recall':[],'F1':[]}
# 	results_AVG = {'Accuracy':[],'Precision':[],'Recall':[],'F1':[]}
	results = {}
	
	for index in xrange(len(folders)):
		test = folders[index]
		train = []
		
		for index2 in xrange(len(folders)):
			if index2 != index:
				train.extend(folders[index2])	
		
		NV = NaiveBayes(train,test,False)
		NV.train()
		
		for sample in new_data:
			result_class = NV.test_one(sample)
			
			if sample.file_name not in results:
				results[sample.file_name] = {}
			
			if result_class not in results[sample.file_name]:
				results[sample.file_name][result_class] = 0
			
			results[sample.file_name][result_class] += 1
		
# 		result = NV.test()
# 		for metric,value in result.iteritems():
# 			results[metric].append(value)
		
# 	for metric, values in results.iteritems():
# 		results_AVG[metric] = sum(values)/len(values)

# 	return (results_AVG,results_tests)

	return results
Code Example #5
    def test_naive_bayes(self):
        data = array([
            [1, 'S'],
            [1, 'M'],
            [1, 'M'],
            [1, 'S'],
            [1, 'S'],
            [2, 'S'],
            [2, 'M'],
            [2, 'M'],
            [2, 'L'],
            [2, 'L'],
            [3, 'L'],
            [3, 'M'],
            [3, 'M'],
            [3, 'L'],
            [3, 'L'],
        ])
        labels = array([-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1])

        nb = NaiveBayes()
        nb.fit(data, labels)
        preds = nb.predict(data)
        assert accuracy_score(preds, labels) > 0.7
Code Example #6
from dataset_parser import parse_dataset
from bayes import NaiveBayes
from test_train_split import dataset_split

dataset = parse_dataset()
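# dataset_split is assumed to hold out a fraction of the rows (here 0.2) as the test set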
(train, test) = dataset_split(dataset, 0.2)
naive_bayes = NaiveBayes(train, 'class')
score = naive_bayes.evaluate(test)

print(score)
Code Example #7
         +##+     +#+       
         ###      +#+       
        +##+      +##+      
        +##       +##+      
        +#+       +##+      
        +#+       +##       
        +#+       +#+       
        +##+     +##+       
         ###+    +##+       
         +####++###++       
         +#########         
          ++#######         
            +###+++         
                            
                            
                            
                            """

digitPercep = PerceptronNetwork(digitWidth * digitHeight, digitY)
digitPercep.train(digitWidth, digitHeight, digitTrainingImagesPath,
                  digitTrainingLabelsPath)

print "Perceptron guess:"
print digitPercep.test_one(digitWidth, digitHeight, digit)

digitBayes = NaiveBayes(digitWidth * digitHeight, 10, 2)
digitBayes.train(digitWidth, digitHeight, digitTrainingImagesPath,
                 digitTrainingLabelsPath)

print "Naive Bayes guess:"
print digitBayes.test_one(digitWidth, digitHeight, digit)
Code Example #8
from bayes import NaiveBayes
from util import FileOperate
from util import train_test_split
from metrics import accuracy_score

# When running this code, set the playML folder as the source root directory

if __name__ == '__main__':
    # 1. Load the data: spam denotes a spam message (1), ham a non-spam message (0)
    data_path = '../input/SMSSpamCollection'
    label = '\t'
    fo = FileOperate(data_path, label)
    X, y = fo.load_data()

    # 2. Split the data into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.25,
                                                        random_state=666)

    # Train the classifier
    nb = NaiveBayes()
    nb.fit(X_train, y_train)

    # Make predictions
    y_pred = nb.predict(X_test)

    # Compute the accuracy score
    score = accuracy_score(y_test, y_pred)
    print('Accuracy:', score)
Code Example #9
 def __init__(self, categories, path):
     self.naive_bayes = NaiveBayes(categories)
     self.path = path
     self.classified_examples = dict()
Code Example #10
class SpamHamDetector(object):
    def __init__(self, categories, path):
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        self.classified_examples = dict()

    def train(self):
        with open('%s/labels.csv' % self.path, 'r') as labels_csv:
            reader = csv.DictReader(labels_csv)
            for row in reader:
                label = (row['Prediction'])
                filename = '%s/TR/TRAIN_%s.eml' % (path, row['Id'])
                try:
                    body = extract_body(filename)
                    self.naive_bayes.train(int(label), body)

                except Exception as e:
                    logger.info("Error training email %s: %s", row['Id'],
                                e.message)

    def train_and_evaluate(self):
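        # Shuffle email IDs 1..2500, train on the first 2250 and evaluate on the
        # remaining 250, then report correct/incorrect counts and accuracy.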
        all_ids = list(range(1, 2501))
        random.shuffle(all_ids)
        training_ids, labeling_ids = all_ids[:2250], all_ids[2250:]

        with open('%s/labels.csv' % self.path, 'r') as labels_csv:
            reader = csv.DictReader(labels_csv)
            for row in reader:
                label = (row['Prediction'])
                filename = '%s/TR/TRAIN_%s.eml' % (path, row['Id'])
                if int(row['Id']) in training_ids:
                    try:
                        body = extract_body(filename)
                        self.naive_bayes.train(int(label), body)
                    except Exception as e:
                        logger.info("Error training email %s: %s", row['Id'],
                                    e.message)

        correct, incorrect = 0, 0
        with open('%s/labels.csv' % self.path, 'r') as labels_csv:
            reader = csv.DictReader(labels_csv)
            for row in reader:
                label = (row['Prediction'])
                filename = '%s/TR/TRAIN_%s.eml' % (path, row['Id'])
                if int(row['Id']) in labeling_ids:
                    try:
                        test_body = extract_body(filename)
                        result = self.naive_bayes.classify(test_body)
                        if result == int(label):
                            correct += 1
                        else:
                            incorrect += 1
                    except Exception as e:
                        logger.info("Error classifying email %s: %s",
                                    row['Id'], e.message)
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
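        # Classify TEST_1.eml through TEST_<size>.eml, record each prediction, and
        # write the results to results.csv via _store_results().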
        counter = 1
        test = self.path + '/TT/TEST_%s.eml'

        while counter < size + 1:
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(
                    self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.info("Error classifying email %s: %s", counter,
                            e.message)
            counter += 1

        self._store_results()

    def display_results(self):
        spam = sum(1 for category in self.classified_examples.values()
                   if category == '0')
        ham = sum(1 for category in self.classified_examples.values()
                  if category == '1')
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" \
               % (spam, ham, (float(spam) / len(self.classified_examples)),
                  (float(ham) / len(self.classified_examples)))

    def _calculate_results(self, correct, incorrect):
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct, incorrect, (float(correct) / (correct + incorrect)))

    def _store_results(self):
        with open('%s/results.csv' % self.path, 'w+') as resultscsv:
            writer = csv.DictWriter(resultscsv,
                                    fieldnames=['id', 'Prediction'])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({'id': example_num, 'Prediction': category})
Code Example #11
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
    ('fm', 'tech news', 'cnn'),
)

cmap = pickle.load(open('../hadoop/thread_views/var/cmap.b',
        'rb'))
clsfr = FMClassifier(cmap)
backend = RedisBackend()
bayes = NaiveBayes(backend=backend)
bayes.train(training_data)
# now we are ready to test the bayes filter

# TODO add support for subfeatures in features

import time
_start = time.time()
# bayes.classify(clsfr, ('aldfksjalskdjfasdflapoliticsadlskfajsldfj',), 'cnn')
# bayes.classify(clsfr, ('politics', 'aldfksjalspoliticskdjfasdflbusinessapmusicadlhomeskfajsldfj', 'music'), 'cnn')
# bayes.classify(clsfr, ('business', 'music', 'love', 'living', 'politics', 'music'), 'cnn', linear_weight_vector=True)
# bayes.classify(clsfr, ('tech', 'computers', 'news'), 'cnn')
line = 'gaming.www.myvidster.com/video/2797926/PornoTubecom_-_Keymon_Phoenix_Mister_Buck_Dee_Truth_Intrigue_and_Jermany_-_Browsin'
bayes.classify(clsfr, line.split('/'), 'myvidster.com', linear_weight_vector=True)
print (time.time() - _start), 'seconds'
Code Example #12
            labels.append('0')

for review in islice(reviews,200000,None):
    if 'Restaurants' in business_dict[review['business_id']]['categories']:

        if review['votes']['useful'] >= 1:
            test.append(review['text'])
            correct_labels.append('1')
        elif review['votes']['useful'] == 0:
            test.append(review['text'])
            correct_labels.append('0')

print "data loaded"


clfr = NaiveBayes(data,labels, 60, 2000, 50)
print "training done"
stops = clfr.find_n_most_common_words(50)
for i in range(len(stops)):
    print i, stops[i]

max_ent = clfr.max_entropy(20)
for i in range(len(max_ent)):
    print i, max_ent[i][1]

"""a, b = stops['1'], stops['0']
for i in range(len(a)):
    print i, a[i], b[i]"""
#clfr.find_max_prob_dif()

Code Example #13
File: test.py  Project: Renl1001/MachineLearning
# -*- coding:utf-8 -*-
from bayes import NaiveBayes


def loadDataSet():
    train_samples = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']
    ]
    test_samples = [['love', 'my', 'girl', 'friend'], ['stupid', 'garbage'],
                    ['Haha', 'I', 'really', "Love", "You"],
                    ['This', 'is', "my", "dog"]]
    train_classes = [0, 1, 0, 1, 0, 1]  # 0:good; 1:bad
    return train_samples, train_classes, test_samples


if __name__ == "__main__":
    train_samples, train_classes, test_samples = loadDataSet()

    clf = NaiveBayes()
    clf.train(train_samples, train_classes)
    # test:
    for item in test_samples:
        clf.classify(item)
Code Example #14
print "times: {}".format(digitPercepTimes)
print "means: {}".format(digitPercepAvgs)
print "stds: {}".format(digitPercepStds)

# naive bayes classification
#print "---------- Naive Bayes ----------"
digitBayesAvgs = []
digitBayesStds = []
digitBayesTimes = []
for percent in percents:
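    # For each value in `percents`, train on a fraction p of the data (presumably
    # percent/10) five times, then record mean accuracy, its standard deviation,
    # and mean training time.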
    p = percent / 10.0
    x = 5
    res = []
    times = []
    for i in range(0, 5):
        digitBayes = NaiveBayes(digitWidth * digitHeight, 10, 2)
        t1 = time.time()
        digitBayes.train(digitWidth, digitHeight, digitTrainingImagesPath,
                         digitTrainingLabelsPath, p)
        dt = time.time() - t1
        percentageCorrect = digitBayes.test(digitWidth, digitHeight,
                                            digitTestImagesPath,
                                            digitTestLabelsPath)
        res.append(percentageCorrect)
        times.append(dt)
    avgTime = mean(times)
    avgAcc = mean(res)
    stdAcc = stddev(res)
    digitBayesAvgs.append(avgAcc)
    digitBayesStds.append(stdAcc)
    digitBayesTimes.append(avgTime)
Code Example #15
File: train.py  Project: gjcourt/bayes
#!/usr/bin/env python
import cPickle as pickle
import json
import time
from backends import RedisBackend
from bayes import NaiveBayes
from classifiers import FMClassifier
from optparse import OptionParser
parser = OptionParser(conflict_handler='resolve')
parser.add_option('-h', dest='host')
parser.add_option('-p', '--port', dest='port')
options, args = parser.parse_args()

clsfr = FMClassifier(pickle.load(open('/Users/georgecourtsunis/projects/disqus/disqus/analytics/hadoop/thread_views/var/cmap.b', 'rb')))
backend = RedisBackend(host=options.host, port=options.port)
bayes = NaiveBayes(backend=backend)

_start = time.time()
for file_name in args:
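    # Each input file is tab-separated: a JSON-encoded feature vector and a count per
    # line; progress is printed every 100,000 lines.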
    print 'Training file %s' % file_name
    fd = open(file_name, 'r')
    _counter = 0
    for line in fd:
        _counter += 1
        if _counter % 100000 == 0:
            print _counter, (time.time() - _start)
        # if _counter % 1000000 == 0:
        #     break
        # grab args
        vector, count = line.split('\t')
        vector = json.loads(vector)
Code Example #16
            labels.append('0')

for review in islice(reviews,200000,None):
    if 'Restaurants' in business_dict[review['business_id']]['categories']:

        if review['votes']['useful'] >= 1:
            test.append(review['text'])
            correct_labels.append('1')
        elif review['votes']['useful'] == 0:
            test.append(review['text'])
            correct_labels.append('0')

print "data loaded"


clfr = NaiveBayes(data,labels)
print "training done"
#stops = clfr.find_n_most_common_words(50)
#for i in range(len(stops)):
#    print i, stops[i]

max_ent = clfr.max_entropy(20)
for i in range(len(max_ent)):
    print i, max_ent[i]

"""a, b = stops['1'], stops['0']
for i in range(len(a)):
    print i, a[i], b[i]"""
#clfr.find_max_prob_dif()

Code Example #17
from backends import RedisBackend
from bayes import NaiveBayes
from classifiers import FMClassifier
from optparse import OptionParser
parser = OptionParser(conflict_handler='resolve')
parser.add_option('-h', dest='host')
parser.add_option('-p', '--port', dest='port')
options, args = parser.parse_args()

clsfr = FMClassifier(
    pickle.load(
        open(
            '/Users/georgecourtsunis/projects/disqus/disqus/analytics/hadoop/thread_views/var/cmap.b',
            'rb')))
backend = RedisBackend(host=options.host, port=options.port)
bayes = NaiveBayes(backend=backend)

_start = time.time()
for file_name in args:
    print 'Training file %s' % file_name
    fd = open(file_name, 'r')
    _counter = 0
    for line in fd:
        _counter += 1
        if _counter % 100000 == 0:
            print _counter, (time.time() - _start)
        # if _counter % 1000000 == 0:
        #     break
        # grab args
        vector, count = line.split('\t')
        vector = json.loads(vector)
Code Example #19
class SpamHamDetector(object):
    def __init__(self, categories, path):
        self.naive_bayes = NaiveBayes(categories)
        self.path = path
        self.classified_examples = dict()

    def train(self):
        with open("{0}/labels.csv".format(self.path), "r") as labels_csv:
            reader = csv.DictReader(labels_csv)
            for row in reader:
                label = row["Prediction"]
                filename = "%s/TR/TRAIN_%s.eml" % (path, row["Id"])
                try:
                    body = extract_body(filename)
                    self.naive_bayes.train(int(label), body)

                except Exception as e:
                    logger.info("Error training email %s: %s", row["Id"], e.message)

    def train_and_evaluate(self):
        all_ids = list(range(1, 2501))
        random.shuffle(all_ids)
        training_ids, labeling_ids = all_ids[:2250], all_ids[2250:]

        with open("{0}/labels.csv".format(self.path), "r") as labels_csv:
            reader = csv.DictReader(labels_csv)
            for row in reader:
                label = row["Prediction"]
                filename = "%s/TR/TRAIN_%s.eml" % (path, row["Id"])
                if int(row["Id"]) in training_ids:
                    try:
                        body = extract_body(filename)
                        self.naive_bayes.train(int(label), body)
                    except Exception as e:
                        logger.info("Error training email %s: %s", row["Id"], e.message)

        correct, incorrect = 0, 0
        with open("%s/labels.csv" % self.path, "r") as labels_csv:
            reader = csv.DictReader(labels_csv)
            for row in reader:
                label = row["Prediction"]
                filename = "%s/TR/TRAIN_%s.eml" % (path, row["Id"])
                if int(row["Id"]) in labeling_ids:
                    try:
                        test_body = extract_body(filename)
                        result = self.naive_bayes.classify(test_body)
                        if result == int(label):
                            correct += 1
                        else:
                            incorrect += 1
                    except Exception as e:
                        logger.info("Error classifying email %s: %s", row["Id"], e.message)
        return self._calculate_results(correct, incorrect)

    def classify(self, size):
        counter = 1
        test = self.path + "/TT/TEST_%s.eml"

        while counter < size + 1:
            try:
                test_body = extract_body(test % counter)
                self.classified_examples[str(counter)] = str(self.naive_bayes.classify(test_body))
            except Exception as e:
                logger.info("Error classifying email %s: %s", counter, e.message)
            counter += 1

        self._store_results()

    def display_results(self):
        spam = sum(1 for category in self.classified_examples.values() if category == "0")
        ham = sum(1 for category in self.classified_examples.values() if category == "1")
        return "Spam Emails: %s\nHam Emails: %s\nSpam Percent: %s\nHam Percent: %s" % (
            spam,
            ham,
            (float(spam) / len(self.classified_examples)),
            (float(ham) / len(self.classified_examples)),
        )

    def _calculate_results(self, correct, incorrect):
        return "correct %s, incorrect %s, performance measurement %s" % (
            correct,
            incorrect,
            (float(correct) / (correct + incorrect)),
        )

    def _store_results(self):
        with open("%s/results.csv" % self.path, "w+") as resultscsv:
            writer = csv.DictWriter(resultscsv, fieldnames=["id", "Prediction"])
            writer.writeheader()
            for example_num, category in self.classified_examples.items():
                writer.writerow({"id": example_num, "Prediction": category})