def simpleTest():
    """Classify one held-out message with the persisted naive Bayes model.

    Restores the trained model parameters, reads the test file and prints
    the predicted class of the first message it contains.
    """
    # Restore the model produced by a previous training run.
    vocabularyList, pWordsSpamicity, pWordsHealthy, pSpam = \
        naiveBayes.getTrainedModelInfo()

    # Read the held-out message(s); labels are loaded but unused here.
    testFile = '../emails/test/test.txt'
    wordLists, labels = naiveBayes.loadSMSData(testFile)

    # Only the first message in the file is classified.
    predicted = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                    pWordsHealthy, pSpam, wordLists[0])
    print(predicted)
def simpleTest():
    """Load the saved naive Bayes model and classify one test message.

    Prints the predicted class of the first message in the test file.
    Presumably getTrainedModelInfo() reads the files written by the
    training script — TODO confirm against that module.
    """
    # load saved model from training
    vocabularyList, pWordsSpamicity, pWordsHealthy, pSpam = \
        naiveBayes.getTrainedModelInfo()
    # load test data
    filename = '../emails/test/test.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)
    smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                  pWordsHealthy, pSpam, smsWords[0])
    # print() with a single argument behaves identically on Python 2 and 3,
    # unlike the original Python 2-only `print smsType` statement.
    print(smsType)
def simpleTest():
    """Load the saved naive Bayes model and classify one test message.

    Prints the predicted class of the first message in the test file.
    """
    # Load the trained model parameters.
    vocabularyList, pWordsSpamicity, pWordsHealthy, pSpam = \
        naiveBayes.getTrainedModelInfo()
    # Load the test data.
    filename = '../emails/test/test.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)
    smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                  pWordsHealthy, pSpam, smsWords[0])
    # print() with a single argument behaves identically on Python 2 and 3,
    # unlike the original Python 2-only `print smsType` statement.
    print(smsType)
def testClassifyErrorRate():
    """Estimate the classifier's error rate with a random hold-out split.

    Loads the labelled SMS corpus, moves `testCount` randomly chosen
    samples into a hold-out set, trains naive Bayes on the remainder,
    then prints the number and rate of misclassified hold-out samples.
    """
    # Load and preprocess the data set.
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)

    # Randomly move testCount samples out of the training set.
    testWords = []
    testWordsType = []
    testCount = 1000
    for i in range(testCount):
        # Bug fix: random.uniform(0, n) may return n itself (the upper
        # bound can be included due to float rounding), so
        # int(random.uniform(0, len(...))) could produce an out-of-range
        # index. randrange(n) is always in [0, n).
        randomIndex = random.randrange(len(smsWords))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        # Remove the drawn sample so it is neither trained on nor re-drawn.
        del smsWords[randomIndex]
        del classLables[randomIndex]

    # Build the vocabulary over the remaining training messages.
    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    print("生成语料库!")
    # Mark each message as a set-of-words vector over the vocabulary.
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("数据标记完成!")
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵!")

    # Estimate P(S), P(Wi|S), P(Wi|H) from the marked training vectors.
    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    # Classify every hold-out sample and count the mistakes.
    errorCount = 0.0
    for i in range(testCount):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, testWords[i])
        print('预测类别:', smsType, '实际类别:', testWordsType[i])
        if smsType != testWordsType[i]:
            errorCount += 1
    print('错误个数:', errorCount, '错误率:', errorCount / testCount)
def testClassifyErrorRate():
    """Error-rate test via a random hold-out split.

    Trains naive Bayes on the corpus minus `testCount` randomly drawn
    samples and prints the misclassification count and rate on those
    held-out samples.
    :return: None
    """
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)

    # cross validation: draw a random hold-out set
    testWords = []
    testWordsType = []
    testCount = 1000
    for i in range(testCount):
        # Bug fix: int(random.uniform(0, n)) can yield n (uniform's upper
        # bound may be included), giving an out-of-range index.
        # randrange(n) is always in [0, n).
        randomIndex = random.randrange(len(smsWords))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        del smsWords[randomIndex]
        del classLables[randomIndex]

    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    # Python 3 print() calls; the original Python 2 print statements are
    # a syntax error on Python 3.
    print("generate one hot vector based on the word set!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("mark data!")
    # convert to nd array
    trainMarkedWords = np.array(trainMarkedWords)
    print("data -> matrix!")

    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    errorCount = 0.0
    for i in range(testCount):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, testWords[i])
        print('predict type:', smsType, 'actual type:', testWordsType[i])
        if smsType != testWordsType[i]:
            errorCount += 1
    print('error count:', errorCount, 'error rate:', errorCount / testCount)
def testClassifyErrorRate():
    """Measure the classification error rate on a random hold-out split.
    :return: None
    """
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)

    # Cross validation: draw a random hold-out set.
    testWords = []
    testWordsType = []
    testCount = 1000
    for i in range(testCount):
        # Bug fix: int(random.uniform(0, n)) can yield n (uniform's upper
        # bound may be included), giving an out-of-range index.
        # randrange(n) is always in [0, n).
        randomIndex = random.randrange(len(smsWords))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        del smsWords[randomIndex]
        del classLables[randomIndex]

    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    # Python 3 print() calls; the original Python 2 print statements are
    # a syntax error on Python 3. Message strings are kept verbatim.
    print("生成语料库!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("数据标记完成!")
    # Convert to an ndarray.
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵!")

    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    errorCount = 0.0
    for i in range(testCount):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, testWords[i])
        print('预测类别:', smsType, '实际类别:', testWordsType[i])
        if smsType != testWordsType[i]:
            errorCount += 1
    print('错误个数:', errorCount, '错误率:', errorCount / testCount)
def testClassifyErrorRate():
    """Measure the classification error rate on a random hold-out split.
    :return: None
    """
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)

    # Cross validation: draw a random hold-out set.
    testWords = []
    testWordsType = []
    testCount = 1000
    for i in range(testCount):
        # Bug fix: int(random.uniform(0, n)) can yield n (uniform's upper
        # bound may be included), giving an out-of-range index.
        # randrange(n) is always in [0, n).
        randomIndex = random.randrange(len(smsWords))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        del smsWords[randomIndex]
        del classLables[randomIndex]

    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    # Python 3 print() calls; the original Python 2 print statements are
    # a syntax error on Python 3. Message strings are kept verbatim.
    print("生成语料库!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
    print("数据标记完成!")
    # Convert to an ndarray.
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵!")

    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)

    errorCount = 0.0
    for i in range(testCount):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, testWords[i])
        print('预测类别:', smsType, '实际类别:', testWordsType[i])
        if smsType != testWordsType[i]:
            errorCount += 1
    print('错误个数:', errorCount, '错误率:', errorCount / testCount)
#!/usr/bin/env python
# _*_ coding: utf-8 _*_
"""
@Author: MarkLiu

Train the naive Bayes spam filter on the SMS training corpus and persist
the learned model: class prior, vocabulary, and per-word probabilities.
"""
import numpy as np

import SimpleNavieBayes.NavieBayes as naiveBayes

# Load the labelled training corpus.
filename = '../emails/training/SMSCollection.txt'
smsWords, classLables = naiveBayes.loadSMSData(filename)

# Build the vocabulary over the whole training set.
vocabularyList = naiveBayes.createVocabularyList(smsWords)
print("生成语料库!")

# Mark each message as a set-of-words vector over the vocabulary.
trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
print("数据标记完成!")

# Convert to an ndarray for training.
trainMarkedWords = np.array(trainMarkedWords)
print("数据转成矩阵!")

# Estimate P(S) and the per-word spam/ham probability vectors.
pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)
print('pSpam:', pSpam)

# Persist the class prior. `with` guarantees the handle is closed even on
# error; str(pSpam) replaces the direct pSpam.__str__() dunder call.
with open('pSpam.txt', 'w') as fpSpam:
    fpSpam.write(str(pSpam))

# Persist the vocabulary, tab-separated.
with open('vocabularyList.txt', 'w') as fw:
    for word in vocabularyList:
        fw.write(word + '\t')

# Persist the per-word probability vectors as well — the original script
# omitted them, leaving the saved model incomplete (the companion training
# script saves both; presumably the classifier loads them — TODO confirm).
np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')
"""Train the naive Bayes model on `training.txt` and persist every model
component: class prior, vocabulary, and per-word probability vectors."""
import numpy as np

import SimpleNavieBayes.NavieBayes as naiveBayes

# Load the labelled training corpus.
filename = 'training.txt'
smsWords, classLables = naiveBayes.loadSMSData(filename)

# Build the vocabulary over the whole training set.
vocabularyList = naiveBayes.createVocabularyList(smsWords)
print("Create Vocabulary List")

# Mark each message as a set-of-words vector over the vocabulary.
trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
print("Complete Mark word-vector")

# Convert to an ndarray for training.
trainMarkedWords = np.array(trainMarkedWords)
print("Complete word matrix")

# Estimate P(S) and the per-word spam/ham probability vectors.
pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)
print('pSpam:', pSpam)

# Persist the class prior. `with` guarantees the handle is closed even on
# error; str(pSpam) replaces the direct pSpam.__str__() dunder call, and the
# redundant flush()-before-close() is gone.
with open('pSpam.txt', 'w') as fpSpam:
    fpSpam.write(str(pSpam))

# Persist the vocabulary, tab-separated.
with open('vocabularyList.txt', 'w') as fw:
    for word in vocabularyList:
        fw.write(word + '\t')

# Persist the per-word probability vectors.
np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')