Esempio n. 1
0
def testClassifyErrorRate(filename='../emails/training/SMSCollection.txt',
                          testCount=1000):
    """Measure the classifier's error rate on a random hold-out split.

    Loads the data with ``naiveBayes.loadSMSData``, moves ``testCount``
    randomly chosen samples (and their labels) out of the training data,
    builds the vocabulary and word vectors from what is left, calls
    ``naiveBayes.trainingNaiveBayes`` and then ``naiveBayes.classify`` on
    every held-out sample. Each prediction, the error count and the error
    rate are printed.

    :param filename: path handed to ``naiveBayes.loadSMSData``.
    :param testCount: number of samples to hold out for testing.
    :raises ValueError: if ``testCount`` is smaller than 1 or larger than
        the number of loaded samples.
    :return: None
    """
    if testCount < 1:
        raise ValueError('testCount must be a positive integer')

    # Load and pre-process the data set.
    smsWords, classLables = naiveBayes.loadSMSData(filename)
    if testCount > len(smsWords):
        raise ValueError('testCount exceeds the number of loaded samples')

    # Randomly pick the test samples and remove them from the training set.
    testWords = []
    testWordsType = []
    for _ in range(testCount):
        # randrange() never returns len(smsWords); int(random.uniform(0, n))
        # can, because uniform() may return its upper bound after rounding.
        randomIndex = random.randrange(len(smsWords))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        # Drop the sample from the training data so it is only used for testing.
        del smsWords[randomIndex]
        del classLables[randomIndex]

    # Build the vocabulary from the remaining training samples.
    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    print("生成语料库!")

    # Build the word vectors for the training samples.
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("数据标记完成!")
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵!")

    # Compute P(S), P(Wi|S) and P(Wi|H) from the vocabulary and word vectors.
    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    # Classify every held-out sample and count the mismatches.
    errorCount = 0.0
    for words, actualType in zip(testWords, testWordsType):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, words)
        print('预测类别:', smsType, '实际类别:', actualType)
        if smsType != actualType:
            errorCount += 1

    print('错误个数:', errorCount, '错误率:', errorCount / testCount)
Esempio n. 2
0
def testClassifyErrorRate(filename='../emails/training/SMSCollection.txt',
                          testCount=1000):
    """
    Error rate test on a random hold-out split.

    Loads the data with ``naiveBayes.loadSMSData``, moves ``testCount``
    randomly chosen samples (and their labels) out of the training data,
    trains on the remainder via ``naiveBayes.trainingNaiveBayes`` and
    classifies each held-out sample with ``naiveBayes.classify``. Every
    prediction, the error count and the error rate are printed.

    Each ``print`` call receives one pre-formatted string, so the emitted
    text is the same under Python 2 and Python 3.

    :param filename: path handed to ``naiveBayes.loadSMSData``.
    :param testCount: number of samples to hold out for testing.
    :raises ValueError: if ``testCount`` is smaller than 1 or larger than
        the number of loaded samples.
    :return: None
    """
    if testCount < 1:
        raise ValueError('testCount must be a positive integer')

    smsWords, classLables = naiveBayes.loadSMSData(filename)
    if testCount > len(smsWords):
        raise ValueError('testCount exceeds the number of loaded samples')

    # Hold-out split: move random samples from the training data to the
    # test lists.
    testWords = []
    testWordsType = []
    for _ in range(testCount):
        # randrange() never returns len(smsWords); int(random.uniform(0, n))
        # can, because uniform() may return its upper bound after rounding.
        randomIndex = random.randrange(len(smsWords))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        del smsWords[randomIndex]
        del classLables[randomIndex]

    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    print("generate one hot vector based on the word set!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("mark data!")
    # Convert to an ndarray.
    trainMarkedWords = np.array(trainMarkedWords)
    print("data -> matrix!")
    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    # Classify every held-out sample and count the mismatches.
    errorCount = 0.0
    for words, actualType in zip(testWords, testWordsType):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, words)
        print('predict type: %s actual type: %s' % (smsType, actualType))
        if smsType != actualType:
            errorCount += 1

    print('error count: %s error rate: %s'
          % (errorCount, errorCount / testCount))
Esempio n. 3
0
def testClassifyErrorRate(filename='../emails/training/SMSCollection.txt',
                          testCount=1000):
    """
    Test the classification error rate on a random hold-out split.

    The data returned by ``naiveBayes.loadSMSData`` is split by moving
    ``testCount`` random samples (with their labels) into test lists. The
    remaining samples are used to build the vocabulary and the word vectors
    passed to ``naiveBayes.trainingNaiveBayes``; each held-out sample is then
    classified with ``naiveBayes.classify``. Predictions, the error count and
    the error rate are printed.

    Each ``print`` call receives one pre-formatted string, so the call is
    valid syntax on both Python 2 and Python 3.

    :param filename: path handed to ``naiveBayes.loadSMSData``.
    :param testCount: number of samples to hold out for testing.
    :raises ValueError: if ``testCount`` is smaller than 1 or larger than
        the number of loaded samples.
    :return: None
    """
    if testCount < 1:
        raise ValueError('testCount must be a positive integer')

    smsWords, classLables = naiveBayes.loadSMSData(filename)
    if testCount > len(smsWords):
        raise ValueError('testCount exceeds the number of loaded samples')

    # Hold-out split (the original comment called this cross validation).
    testWords = []
    testWordsType = []
    for _ in range(testCount):
        # int(random.uniform(0, n)) may evaluate to n because uniform() can
        # return its upper bound; randrange(n) is always a valid index.
        randomIndex = random.randrange(len(smsWords))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        del smsWords[randomIndex]
        del classLables[randomIndex]

    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    print("生成语料库!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("数据标记完成!")
    # Convert to a NumPy array.
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵!")
    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    # Classify every held-out sample and count the mismatches.
    errorCount = 0.0
    for words, actualType in zip(testWords, testWordsType):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, words)
        print('预测类别: %s 实际类别: %s' % (smsType, actualType))
        if smsType != actualType:
            errorCount += 1

    print('错误个数: %s 错误率: %s' % (errorCount, errorCount / testCount))
Esempio n. 4
0
def testClassifyErrorRate(filename='../emails/training/SMSCollection.txt',
                          testCount=1000):
    """
    Test the classification error rate.

    Splits the data loaded by ``naiveBayes.loadSMSData`` into a training part
    and ``testCount`` randomly selected held-out samples, trains with
    ``naiveBayes.trainingNaiveBayes`` on the training part, runs
    ``naiveBayes.classify`` on each held-out sample and prints every
    prediction plus the final error count and error rate.

    Each ``print`` call receives one pre-formatted string, so the call is
    valid syntax on both Python 2 and Python 3.

    :param filename: path handed to ``naiveBayes.loadSMSData``.
    :param testCount: number of samples to hold out for testing.
    :raises ValueError: if ``testCount`` is smaller than 1 or larger than
        the number of loaded samples.
    :return: None
    """
    if testCount < 1:
        raise ValueError('testCount must be a positive integer')

    smsWords, classLables = naiveBayes.loadSMSData(filename)
    if testCount > len(smsWords):
        raise ValueError('testCount exceeds the number of loaded samples')

    # Hold-out split: pull random samples out of the training data.
    testWords = []
    testWordsType = []
    for _ in range(testCount):
        # uniform() may return its upper bound, which made the former
        # int(random.uniform(0, n)) an occasional out-of-range index.
        randomIndex = random.randrange(len(smsWords))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        del smsWords[randomIndex]
        del classLables[randomIndex]

    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    print("生成语料库!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
    print("数据标记完成!")
    # Convert to a NumPy array.
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵!")
    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)

    # Classify every held-out sample and count the mismatches.
    errorCount = 0.0
    for words, actualType in zip(testWords, testWordsType):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, words)
        print('预测类别: %s 实际类别: %s' % (smsType, actualType))
        if smsType != actualType:
            errorCount += 1

    print('错误个数: %s 错误率: %s' % (errorCount, errorCount / testCount))
"""
@Author: MarkLiu
"""
import numpy as np
import SimpleNavieBayes.NavieBayes as naiveBayes

# Train on the whole data file and write the resulting vocabulary and
# parameters to text files in the current directory.
filename = '../emails/training/SMSCollection.txt'
smsWords, classLables = naiveBayes.loadSMSData(filename)
vocabularyList = naiveBayes.createVocabularyList(smsWords)
print("生成语料库!")
trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
print("数据标记完成!")
# Convert to a NumPy array.
trainMarkedWords = np.array(trainMarkedWords)
print("数据转成矩阵!")
pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)
# Single pre-formatted argument: valid on both Python 2 and Python 3.
print('pSpam: %s' % (pSpam,))

# Save pSpam as text; the with-block closes the file even if write() raises.
spam = str(pSpam)
with open('pSpam.txt', 'w') as fpSpam:
    fpSpam.write(spam)

# Save the vocabulary produced by training: every word followed by a tab.
with open('vocabularyList.txt', 'w') as fw:
    fw.writelines(word + '\t' for word in vocabularyList)

# Save the parameters obtained during training: pWordsSpamicity and
# pWordsHealthy.
np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')
import numpy as np
import SimpleNavieBayes.NavieBayes as naiveBayes

# Train on the data in training.txt and write the resulting vocabulary and
# parameters to text files in the current directory.
filename = 'training.txt'
smsWords, classLables = naiveBayes.loadSMSData(filename)

vocabularyList = naiveBayes.createVocabularyList(smsWords)
print("Create Vocabulary List")

trainMarkedWords = naiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
print("Complete Mark word-vector")

trainMarkedWords = np.array(trainMarkedWords)
print("Complete word matrix")

pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)
# Single pre-formatted argument: valid on both Python 2 and Python 3.
print('pSpam: %s' % (pSpam,))

# Save pSpam as text; the with-block closes the file even if write() raises.
spam = str(pSpam)
with open('pSpam.txt', 'w') as fpSpam:
    fpSpam.write(spam)

# Save the vocabulary: every word followed by a tab.
with open('vocabularyList.txt', 'w') as fw:
    fw.writelines(word + '\t' for word in vocabularyList)

# Save the two parameter arrays returned by trainingNaiveBayes.
np.savetxt('pWordsSpamicity.txt', pWordsSpamicity, delimiter='\t')
np.savetxt('pWordsHealthy.txt', pWordsHealthy, delimiter='\t')