コード例 #1
0
def simpleTest():
    """Classify the first message of the test file and print the result.

    Fetches the previously trained model from ``naiveBayes``, loads the
    messages stored in ``../emails/test/test.txt`` and prints whatever
    ``naiveBayes.classify`` returns for the first of them.
    """
    # The trained model comes back as four values, which are handed to
    # classify() unchanged and in the same order.
    model = naiveBayes.getTrainedModelInfo()
    vocabulary, spamWordProbs, healthyWordProbs, spamPrior = model

    # The loader also returns the labels, but this smoke test never
    # compares against them.
    messages, _labels = naiveBayes.loadSMSData('../emails/test/test.txt')

    firstMessage = messages[0]
    verdict = naiveBayes.classify(
        vocabulary, spamWordProbs, healthyWordProbs, spamPrior, firstMessage)
    print(verdict)
コード例 #2
0
def simpleTest():
    """Classify the first message of the test file and print the result.

    Loads the saved model via ``naiveBayes.getTrainedModelInfo()``, reads
    the messages in ``../emails/test/test.txt`` and prints the value that
    ``naiveBayes.classify`` returns for the first message.

    :return: None; the predicted type is written to stdout.
    """
    # Load the model saved by a previous training run.
    vocabularyList, pWordsSpamicity, pWordsHealthy, pSpam = \
        naiveBayes.getTrainedModelInfo()

    # Load the test data; the labels are not needed for a single prediction.
    filename = '../emails/test/test.txt'
    smsWords, _ = naiveBayes.loadSMSData(filename)

    smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                  pWordsHealthy, pSpam, smsWords[0])
    # Call form instead of the Python 2 print statement: with a single
    # argument the output is the same on Python 2 and Python 3.
    print(smsType)
コード例 #3
0
def simpleTest():
    """Run a one-message smoke test of the trained classifier.

    Uses the model returned by ``naiveBayes.getTrainedModelInfo()`` to
    classify the first message found in ``../emails/test/test.txt`` and
    prints the predicted type.

    :return: None; the predicted type is written to stdout.
    """
    # Load the information of the already-trained model.
    vocabularyList, pWordsSpamicity, pWordsHealthy, pSpam = \
        naiveBayes.getTrainedModelInfo()

    # Load the test data; only the messages are used, the labels are ignored.
    filename = '../emails/test/test.txt'
    smsWords, _ = naiveBayes.loadSMSData(filename)

    smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                  pWordsHealthy, pSpam, smsWords[0])
    # print() with one argument behaves identically on Python 2 and 3,
    # whereas the former print statement is a SyntaxError on Python 3.
    print(smsType)
コード例 #4
0
def testClassifyErrorRate():
    """Print the classifier's error rate on a random hold-out sample.

    Loads ``../emails/training/SMSCollection.txt``, moves 1000 randomly
    chosen messages (with their labels) out of the loaded data, trains on
    what is left, classifies every held-out message and prints each
    prediction next to its actual label, followed by the error count and
    the error rate.
    """
    # Load the data set.
    corpusPath = '../emails/training/SMSCollection.txt'
    messages, labels = naiveBayes.loadSMSData(corpusPath)

    # Randomly move a test set out of the training data.
    holdoutSize = 1000
    heldOutMessages, heldOutLabels = [], []
    for _ in range(holdoutSize):
        pick = int(random.uniform(0, len(messages)))
        heldOutLabels.append(labels[pick])
        heldOutMessages.append(messages[pick])
        # Delete the sampled entry so it is neither trained on nor drawn again.
        del messages[pick]
        del labels[pick]

    # Build the vocabulary from the remaining training messages.
    vocabulary = naiveBayes.createVocabularyList(messages)
    print("生成语料库!")

    # Turn every training message into a vector over the vocabulary.
    trainingVectors = naiveBayes.setOfWordsListToVecTor(vocabulary, messages)
    print("数据标记完成!")
    trainingMatrix = np.array(trainingVectors)
    print("数据转成矩阵!")

    # Train: the three returned values are passed on to classify() below.
    spamWordProbs, healthyWordProbs, spamPrior = \
        naiveBayes.trainingNaiveBayes(trainingMatrix, labels)

    # Classify each held-out message and count the mismatches.
    # The counter is a float so the final division yields a fraction.
    mistakes = 0.0
    for message, expected in zip(heldOutMessages, heldOutLabels):
        predicted = naiveBayes.classify(
            vocabulary, spamWordProbs, healthyWordProbs, spamPrior, message)
        print('预测类别:', predicted, '实际类别:', expected)
        if predicted != expected:
            mistakes += 1

    print('错误个数:', mistakes, '错误率:', mistakes / holdoutSize)
コード例 #5
0
def testClassifyErrorRate():
    """
    Measure the classifier's error rate on a random hold-out sample.

    Loads the SMS collection, moves 1000 randomly chosen messages (and
    their labels) out of the loaded data, trains on the remainder,
    classifies every held-out message and prints each prediction next to
    the actual label, followed by the error count and the error rate.

    :return: None; all results are printed to stdout.
    """
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)

    # Single random hold-out split (this is not k-fold cross validation):
    # sampled messages are moved from the training data into the test set.
    testWords = []
    testWordsType = []

    testCount = 1000
    for _ in range(testCount):
        randomIndex = int(random.uniform(0, len(smsWords)))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        # Remove the sample so it is neither trained on nor drawn twice.
        del smsWords[randomIndex]
        del classLables[randomIndex]

    # Every print below is a single-argument call, so the emitted text is
    # the same on Python 2 and Python 3 (the old print statements were a
    # SyntaxError on Python 3).
    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    print("generate one hot vector based on the word set!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("mark data!")
    # Convert to an ndarray before training.
    trainMarkedWords = np.array(trainMarkedWords)
    print("data -> matrix!")
    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    # Float counter keeps the final division fractional on Python 2 as well.
    errorCount = 0.0
    for words, actualType in zip(testWords, testWordsType):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, words)
        print('predict type: {0} actual type: {1}'.format(
            smsType, actualType))
        if smsType != actualType:
            errorCount += 1

    print('error count: {0} error rate: {1}'.format(
        errorCount, errorCount / testCount))
コード例 #6
0
def testClassifyErrorRate():
    """
    Test the error rate of the classifier.

    Loads the SMS collection, moves 1000 randomly chosen messages (and
    their labels) out of the loaded data, trains on the remainder,
    classifies every held-out message and prints each prediction next to
    the actual label, followed by the error count and the error rate.

    :return: None; all results are printed to stdout.
    """
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)

    # Single random hold-out split (the original comment called this cross
    # validation, but only one split is made): sampled messages are moved
    # from the training data into the test set.
    testWords = []
    testWordsType = []

    testCount = 1000
    for _ in range(testCount):
        randomIndex = int(random.uniform(0, len(smsWords)))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        # Remove the sample so it is neither trained on nor drawn twice.
        del smsWords[randomIndex]
        del classLables[randomIndex]

    # Every print below is a single-argument call, so the emitted text is
    # the same on Python 2 and Python 3 (the old print statements were a
    # SyntaxError on Python 3).
    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    print("生成语料库!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("数据标记完成!")
    # Convert to an ndarray before training.
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵!")
    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    # Float counter keeps the final division fractional on Python 2 as well.
    errorCount = 0.0
    for words, actualType in zip(testWords, testWordsType):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, words)
        print('预测类别: {0} 实际类别: {1}'.format(smsType, actualType))
        if smsType != actualType:
            errorCount += 1

    print('错误个数: {0} 错误率: {1}'.format(
        errorCount, errorCount / testCount))
コード例 #7
0
def testClassifyErrorRate():
    """
    Test the error rate of the classifier.

    Loads the SMS collection, moves 1000 randomly chosen messages (and
    their labels) out of the loaded data, trains on the remainder,
    classifies every held-out message and prints each prediction next to
    the actual label, followed by the error count and the error rate.

    :return: None; all results are printed to stdout.
    """
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = naiveBayes.loadSMSData(filename)

    # One random hold-out split rather than true cross validation: each
    # sampled message is taken out of the training data and put into the
    # test set.
    testWords = []
    testWordsType = []

    testCount = 1000
    for _ in range(testCount):
        randomIndex = int(random.uniform(0, len(smsWords)))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        # Drop the sample from the training data so it cannot be reused.
        del smsWords[randomIndex]
        del classLables[randomIndex]

    # print() is always called with exactly one argument here, which keeps
    # the output identical on Python 2 and Python 3; the former print
    # statements do not compile on Python 3.
    vocabularyList = naiveBayes.createVocabularyList(smsWords)
    print("生成语料库!")
    trainMarkedWords = naiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("数据标记完成!")
    # Convert to an ndarray before training.
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵!")
    pWordsSpamicity, pWordsHealthy, pSpam = naiveBayes.trainingNaiveBayes(
        trainMarkedWords, classLables)

    # errorCount is a float so the error rate is a true fraction even
    # under Python 2 integer-division rules.
    errorCount = 0.0
    for words, actualType in zip(testWords, testWordsType):
        smsType = naiveBayes.classify(vocabularyList, pWordsSpamicity,
                                      pWordsHealthy, pSpam, words)
        print('预测类别: {0} 实际类别: {1}'.format(smsType, actualType))
        if smsType != actualType:
            errorCount += 1

    print('错误个数: {0} 错误率: {1}'.format(
        errorCount, errorCount / testCount))