Python AdaboostNavieBayes.trainingNaiveBayes 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: AdaBoostAndNavieBayes

클래스/타입: AdaboostNavieBayes

메소드/함수: trainingNaiveBayes

hotexamples.com에서의 예제들: 2

Python AdaboostNavieBayes.trainingNaiveBayes - 2개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 AdaBoostAndNavieBayes.AdaboostNavieBayes.trainingNaiveBayes에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

classify(3)

loadSMSData(3)

setOfWordsToVecTor(3)

createVocabularyList(2)

setOfWordsListToVecTor(2)

trainingNaiveBayes(2)

getVocabularyList(1)

예제 #1

파일 보기

파일: training.py 프로젝트: fengchangfight/NaiveBayesSpamFilter

def trainingAdaboostGetDS(iterateNum=40):
    """
    Tune the per-word weight vector DS against a held-out sample.

    Loads the SMS data set, moves 1000 randomly chosen messages (without
    replacement) into a held-out set, fits the naive Bayes parameters on the
    remaining messages, then makes up to ``iterateNum`` passes over the
    held-out set, adjusting DS after every misclassified message.

    :param iterateNum: maximum number of passes over the held-out set
    :return: dict with 'vocabularyList', 'pWordsSpamicity', 'pWordsHealthy'
        and 'pSpam'; when at least one pass ran it also holds 'minErrorRate'
        (lowest error rate seen) and 'DS' (a snapshot of the weights as they
        stood at the end of the pass that reached that error rate)
    """
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = boostNaiveBayes.loadSMSData(filename)

    # Hold out a random sample; each pick is removed from the training lists
    # so no message is used for both fitting and evaluation.
    testWords = []
    testWordsType = []

    testCount = 1000
    for i in range(testCount):
        randomIndex = int(random.uniform(0, len(smsWords)))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        del (smsWords[randomIndex])
        del (classLables[randomIndex])

    vocabularyList = boostNaiveBayes.createVocabularyList(smsWords)
    print("Generating vector")
    trainMarkedWords = boostNaiveBayes.setOfWordsListToVecTor(
        vocabularyList, smsWords)
    print("Data marked")
    # Convert to an ndarray before training.
    trainMarkedWords = np.array(trainMarkedWords)
    print("To matrix")
    pWordsSpamicity, pWordsHealthy, pSpam = \
        boostNaiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)

    # One weight per vocabulary word, all starting at 1.
    DS = np.ones(len(vocabularyList))

    ds_errorRate = {}
    minErrorRate = np.inf
    for i in range(iterateNum):
        errorCount = 0.0
        for j in range(testCount):
            testWordsCount = boostNaiveBayes.setOfWordsToVecTor(
                vocabularyList, testWords[j])
            ps, ph, smsType = boostNaiveBayes.classify(pWordsSpamicity,
                                                       pWordsHealthy, DS,
                                                       pSpam, testWordsCount)

            if smsType != testWordsType[j]:
                errorCount += 1
                # Only the weights of words present in this message change.
                present = testWordsCount != 0
                alpha = ps - ph
                if alpha > 0:
                    # Actual ham, predicted spam (the serious error): make
                    # the weights smaller so ham becomes more likely.
                    DS[present] = np.abs(
                        (DS[present] - np.exp(alpha)) / DS[present])
                else:
                    # Actual spam, predicted ham: make the weights bigger so
                    # spam becomes more likely.
                    DS[present] = (
                        DS[present] + np.exp(alpha)) / DS[present]
        print('DS: %s' % (DS,))
        errorRate = errorCount / testCount
        if errorRate < minErrorRate:
            minErrorRate = errorRate
            ds_errorRate['minErrorRate'] = minErrorRate
            # Store a copy: DS is modified in place by later passes, so
            # keeping a reference would silently replace the best weights
            # with whatever the final pass left behind.
            ds_errorRate['DS'] = DS.copy()
        print('Iteration %d times，number of error prediction %d ，error rate: %f' % (
            i, errorCount, errorRate))
        if errorRate == 0.0:
            break
    ds_errorRate['vocabularyList'] = vocabularyList
    ds_errorRate['pWordsSpamicity'] = pWordsSpamicity
    ds_errorRate['pWordsHealthy'] = pWordsHealthy
    ds_errorRate['pSpam'] = pSpam
    return ds_errorRate

예제 #2

파일 보기

파일: training.py 프로젝트: pzfok/NaiveBayesSpamFilter

def trainingAdaboostGetDS(iterateNum=40):
    """
    Tune the per-word weight vector DS against a held-out sample.

    Loads the SMS data set, moves 1000 randomly chosen messages (without
    replacement) into a held-out set, fits the naive Bayes parameters on the
    remaining messages, then makes up to ``iterateNum`` passes over the
    held-out set, adjusting DS after every misclassified message.

    :param iterateNum: maximum number of passes over the held-out set
    :return: dict with 'vocabularyList', 'pWordsSpamicity', 'pWordsHealthy'
        and 'pSpam'; when at least one pass ran it also holds 'minErrorRate'
        (lowest error rate seen) and 'DS' (a snapshot of the weights as they
        stood at the end of the pass that reached that error rate)
    """
    filename = '../emails/training/SMSCollection.txt'
    smsWords, classLables = boostNaiveBayes.loadSMSData(filename)

    # Cross validation: hold out a random sample; each pick is removed from
    # the training lists so it is not used for fitting.
    testWords = []
    testWordsType = []

    testCount = 1000
    for i in range(testCount):
        randomIndex = int(random.uniform(0, len(smsWords)))
        testWordsType.append(classLables[randomIndex])
        testWords.append(smsWords[randomIndex])
        del (smsWords[randomIndex])
        del (classLables[randomIndex])

    # Training phase. The choice of vocabularyList could also be moved inside
    # the whole loop in order to pick the vocabulary that yields the lowest
    # error rate.
    vocabularyList = boostNaiveBayes.createVocabularyList(smsWords)
    print("生成语料库！")
    trainMarkedWords = boostNaiveBayes.setOfWordsListToVecTor(vocabularyList, smsWords)
    print("数据标记完成！")
    # Convert to an ndarray before training.
    trainMarkedWords = np.array(trainMarkedWords)
    print("数据转成矩阵！")
    pWordsSpamicity, pWordsHealthy, pSpam = \
        boostNaiveBayes.trainingNaiveBayes(trainMarkedWords, classLables)

    # One weight per vocabulary word, all starting at 1.
    DS = np.ones(len(vocabularyList))

    ds_errorRate = {}
    minErrorRate = np.inf
    for i in range(iterateNum):
        errorCount = 0.0
        for j in range(testCount):
            testWordsCount = boostNaiveBayes.setOfWordsToVecTor(vocabularyList, testWords[j])
            ps, ph, smsType = boostNaiveBayes.classify(pWordsSpamicity, pWordsHealthy,
                                                       DS, pSpam, testWordsCount)

            if smsType != testWordsType[j]:
                errorCount += 1
                # Only the weights of words present in this message change.
                present = testWordsCount != 0
                alpha = ps - ph
                # NOTE(review): which update direction belongs to which
                # error type depends on how classify() uses DS - confirm.
                if alpha < 0:  # was spam, predicted as ham
                    DS[present] = np.abs(
                        (DS[present] - np.exp(alpha)) / DS[present])
                else:  # was ham, predicted as spam
                    DS[present] = (DS[present] + np.exp(alpha)) / DS[present]
        print('DS: %s' % (DS,))
        errorRate = errorCount / testCount
        if errorRate < minErrorRate:
            minErrorRate = errorRate
            ds_errorRate['minErrorRate'] = minErrorRate
            # Store a copy: DS is modified in place by later passes, so
            # keeping a reference would silently replace the best weights
            # with whatever the final pass left behind.
            ds_errorRate['DS'] = DS.copy()
        print('第 %d 轮迭代，错误个数 %d ，错误率 %f' % (i, errorCount, errorRate))
        if errorRate == 0.0:
            break
    ds_errorRate['vocabularyList'] = vocabularyList
    ds_errorRate['pWordsSpamicity'] = pWordsSpamicity
    ds_errorRate['pWordsHealthy'] = pWordsHealthy
    ds_errorRate['pSpam'] = pSpam
    return ds_errorRate