def testingNB():
    listOPosts,listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat=[]
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = bayes.trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ',bayes.classifyNB(thisDoc,p0V,p1V,pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ',bayes.classifyNB(thisDoc,p0V,p1V,pAb)
Exemple #2
0
def testingNB():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V,p1V,pAb = bayes.trainNB0(trainMat, listClasses)

    testEntry = ['love', 'my', 'dalmation', 'stupid']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['quit', 'stupid']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry,'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
def testingNB():
    postList, classList = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(postList)
    trainMat = []
    for post in postList:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, post))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, classList)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry)
    print testEntry, 'classified as: ', bayes.classifyNB(
        thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry)
    print testEntry, 'classified as: ', bayes.classifyNB(
        thisDoc, p0V, p1V, pAb)
def spamTest():
    """Hold-out test of the naive Bayes classifier on 50 parsed text files.

    For each index 1..25 one document is loaded with label 1 and one with
    label 0.  Ten randomly chosen documents form the test set; the remaining
    forty are used to train ``bayes.trainNB0``.  The fraction of misclassified
    test documents is printed.

    NOTE(review): both labels read from the same placeholder directory
    ``xxx``; point the two paths at the real per-class folders before use.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # ``with`` closes each file even if parsing raises.
        with open('xxx/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)
        # The original path was 'xxx/%d,txt' (comma instead of dot).
        with open('xxx/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # A real list is required because indices are removed below.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # Move the chosen document index from the training pool to the test set.
        testSet.append(trainingSet.pop(randIndex))
    trainMat = [bayes.setOfWords2Vec(vocabList, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # Same vectoriser as used for training (was misspelled setOfWord2Vec).
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
Exemple #5
0
def spamTest():
    """Hold-out evaluation of the spam classifier on the ``email`` corpus.

    Reads ``email/spam/1..25.txt`` (label 1) and ``email/ham/1..25.txt``
    (label 0), moves 10 random documents into a test set, trains on the other
    40 and prints the fraction of test documents that were misclassified.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        for pattern, label in (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0)):
            # ``with`` guarantees the file handle is released.
            with open(pattern % i) as fh:
                docList.append(textParse(fh.read()))
            classList.append(label)
    vocabList = createVocabList(docList)
    # list(...) so that items can be removed on Python 3 as well as Python 2.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    trainingMat = [setOfWords2Vec(vocabList, docList[d]) for d in trainingSet]
    trainingClasses = [classList[d] for d in trainingSet]
    # NOTE(review): ``trainNBO`` ends in the letter O, not a zero; confirm that
    # this is the name actually defined for the training function.
    p0V, p1V, pSpam = trainNBO(trainingMat, trainingClasses)
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print ("the error rate is: ", float(errorCount)/len(testSet))
def spamTest():
    docList = []; classList=[]; fullText=[]
    for i in range(1, 26):
        wordList = textParse(open('./email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('./email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocalList = bayes.createVocabList(docList)
    trainingSet = range(50); testSet=[]
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat=[];trainClass=[];
    for docIndex in trainingSet:
        trainMat.append(bayes.addWordInList(vocalList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V,p1V,ps = bayes.trainNB0(array(trainMat), array(trainClass))
    errorCount = 0
    for doc in testSet:
        wordV = bayes.addWordInList(vocalList, docList[doc])
        if bayes.classifyNB(array(wordV), p0V, p1V, ps) != classList[doc]:
            errorCount+=1
    print 'err count is %d' % errorCount
def spamTest():
    """Hold-out evaluation of the naive Bayes spam filter.

    Reads ``email/spam/1..25.txt`` (label 1) and ``email/ham/1..25.txt``
    (label 0), holds 10 randomly chosen documents out for testing, trains on
    the remaining 40 and prints the fraction of test documents misclassified.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # ``with`` closes every file, including when textParse raises.
        with open('email/spam/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # A range object does not support item deletion, so materialise a list.
    trainingSet = list(range(50))
    testSet = []
    # Randomly pick 10 documents as the test set.
    for _ in range(10):
        # Index into what is still left in the pool (it shrinks every pass).
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    # The 40 indices that remain form the training set.
    trainMat = [bayes.setOfWords2Vec(vocabList, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]
    # NOTE(review): ``trainNBO`` is spelled with the letter O, not a zero;
    # confirm the bayes module really exposes this name.
    p0V, p1V, pSpam = bayes.trainNBO(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # Convert the held-out document to a word vector.
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount)/len(testSet))
Exemple #8
0
def spamTest():
    """Run one random hold-out trial of the spam classifier.

    Spam (label 1) and ham (label 0) e-mails 1..25 are parsed with
    ``bayes.textParse``.  Ten random documents are held out, the other forty
    train ``bayes.trainNB0`` on ``bayes.bagOfWord2Vec`` vectors.

    Returns:
        float: the fraction of held-out documents that were misclassified.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        with open('email/spam/%d.txt' % i) as fh:
            docList.append(bayes.textParse(fh.read()))
        classList.append(1)
        # errors='ignore' is used for the ham files only, as in the original.
        with open('email/ham/%d.txt' % i, errors='ignore') as fh:
            docList.append(bayes.textParse(fh.read()))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    dataSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(np.random.uniform(0, len(dataSet)))
        # Move the chosen index from the training pool into the test set.
        testSet.append(dataSet.pop(randIndex))
    trainMat = [bayes.bagOfWord2Vec(vocabList, docList[doc]) for doc in dataSet]
    trainClasses = [classList[doc] for doc in dataSet]
    # NOTE(review): unpacking/argument order is taken on trust from the
    # original; confirm it matches this project's trainNB0 / classifyNB.
    pAbusive, p1Vect, p0Vect = bayes.trainNB0(trainMat, trainClasses)
    errorCount = 0.0
    for i in testSet:
        vec2Classify = bayes.bagOfWord2Vec(vocabList, docList[i])
        if bayes.classifyNB(vec2Classify, p1Vect, p0Vect, pAbusive) != classList[i]:
            errorCount += 1
    # Divide by the actual test-set size instead of a hard-coded 10.
    return errorCount / len(testSet)
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open('./email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('./email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocalList = bayes.createVocabList(docList)
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClass = []
    for docIndex in trainingSet:
        trainMat.append(bayes.addWordInList(vocalList, docList[docIndex]))
        trainClass.append(classList[docIndex])
    p0V, p1V, ps = bayes.trainNB0(array(trainMat), array(trainClass))
    errorCount = 0
    for doc in testSet:
        wordV = bayes.addWordInList(vocalList, docList[doc])
        if bayes.classifyNB(array(wordV), p0V, p1V, ps) != classList[doc]:
            errorCount += 1
    print 'err count is %d' % errorCount
Exemple #10
0
def spamTest():
    """Hold-out test of the spam classifier; prints the test error rate.

    E-mails ``email/spam/1..25.txt`` are labelled 1 and
    ``email/ham/1..25.txt`` are labelled 0.  Ten random documents are used for
    testing, the remaining forty for training.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Load and parse the text; ``with`` closes each file afterwards.
        with open('email/spam/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Randomly split off the test set; what remains is the training set.
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    trainMat = [bayes.setOfWords2Vec(vocabList, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    # Classify the test set.
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is:', float(errorCount) / len(testSet)
Exemple #11
0
def testNB():
    """Train on the sample posts and print a readable verdict for two entries.

    The numeric result of ``bayes.classifyNB`` (0 or 1) is mapped to the
    strings ``'Not garbage'`` / ``'Garbage'`` before printing.
    """
    posts, labels = bayes.loadDataSet()  # load the data
    vocab = bayes.createVocabList(posts)
    vectors = [bayes.setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = bayes.trainNB0(vectors, labels)

    resultLabel = {0: 'Not garbage', 1: 'Garbage'}
    for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        entryVec = array(bayes.setOfWords2Vec(vocab, entry))
        verdict = resultLabel[bayes.classifyNB(entryVec, p0V, p1V, pAb)]
        print(entry, 'classified as:', verdict)
Exemple #12
0
def localWords(feed1, feed0):
    """Train and evaluate a classifier that tells two RSS feeds apart.

    Args:
        feed1: mapping with an ``'entries'`` list whose items carry a
            ``'summary'``; its documents are labelled 1.
        feed0: same structure; its documents are labelled 0.

    Returns:
        tuple: ``(vocabList, p0V, p1V)`` - the pruned vocabulary and the two
        vectors returned by ``bayes.trainNB0``.

    The words reported by ``calcMostFreq`` are removed from the vocabulary,
    20 random documents are held out for testing, and the error rate on those
    documents is printed.
    """
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Take one entry from each feed per pass: feed1 first, then feed0.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = bayes.textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    # Everything below was previously nested inside the loop above, so it was
    # recomputed for every entry and never ran at all for empty feeds (leaving
    # ``trainingSet`` undefined).  It only needs to run once, on the full data.
    vocabList = bayes.createVocabList(docList)
    top30words = calcMostFreq(vocabList, fullText)
    # Remove the most frequent words from the vocabulary.
    for pairW in top30words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        randIdx = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIdx))

    trainMat = []
    trainClasses = []
    for docIdx in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIdx]))
        trainClasses.append(classList[docIdx])

    p0V, p1V, pSpam = bayes.trainNB0(numpy.array(trainMat),
                                     numpy.array(trainClasses))

    errorCount = 0
    for docIdx in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIdx])
        if bayes.classifyNB(numpy.array(wordVector), p0V, p1V,
                            pSpam) != classList[docIdx]:
            errorCount += 1
    print('the error rate is: %.2f' % (float(errorCount) / len(testSet)))

    return vocabList, p0V, p1V
def main():
    print '开始测试...'

    listOPosts, listClasses = loadDataSet()

    myVocabList = docTool.createVocabList(listOPosts)

    print '词汇表:\n', myVocabList

    wordsVec = docTool.setOfWords2Vec(myVocabList, listOPosts[0])

    print '将第一句话转换成向量,存在的单词为1,不存在的单词为0\n', wordsVec

    trainMat = []

    for postinDoc in listOPosts:
        trainMat.append(docTool.setOfWords2Vec(myVocabList, postinDoc))

    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)

    testEntry = ['love', 'my', 'dalmation']

    # 将testEntry转化成特征量的组合,也就是一个要求的样本
    thisDoc = np.array(docTool.setOfWords2Vec(myVocabList, testEntry))

    label = bayes.classifyNB(thisDoc, p0V, p1V, pAb)

    print testEntry, '分类是:',label

    testEntry = ['stupid', 'garbage']

    # 将testEntry转化成特征量的组合,也就是一个要求的样本
    thisDoc = np.array(docTool.setOfWords2Vec(myVocabList, testEntry))

    label = bayes.classifyNB(thisDoc, p0V, p1V, pAb)

    print testEntry, '分类是:',label
    print '测试结束...'
Exemple #14
0
def main():
    print '开始测试...'

    listOPosts, listClasses = loadDataSet()

    myVocabList = docTool.createVocabList(listOPosts)

    print '词汇表:\n', myVocabList

    wordsVec = docTool.setOfWords2Vec(myVocabList, listOPosts[0])

    print '将第一句话转换成向量,存在的单词为1,不存在的单词为0\n', wordsVec

    trainMat = []

    for postinDoc in listOPosts:
        trainMat.append(docTool.setOfWords2Vec(myVocabList, postinDoc))

    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)

    testEntry = ['love', 'my', 'dalmation']

    # 将testEntry转化成特征量的组合,也就是一个要求的样本
    thisDoc = np.array(docTool.setOfWords2Vec(myVocabList, testEntry))

    label = bayes.classifyNB(thisDoc, p0V, p1V, pAb)

    print testEntry, '分类是:', label

    testEntry = ['stupid', 'garbage']

    # 将testEntry转化成特征量的组合,也就是一个要求的样本
    thisDoc = np.array(docTool.setOfWords2Vec(myVocabList, testEntry))

    label = bayes.classifyNB(thisDoc, p0V, p1V, pAb)

    print testEntry, '分类是:', label
    print '测试结束...'
def localWords(feed1, feed0):
    """Train a two-feed classifier and print its hold-out error rate.

    Args:
        feed1: mapping with an ``'entries'`` list of items that have a
            ``'summary'``; labelled 1.
        feed0: same structure; labelled 0.

    Returns:
        tuple: ``(vocabList, p0V, p1V)``.
    """
    docList, classList, fullText = [], [], []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # feed1 entry first (label 1), then the feed0 entry (label 0).
        for source, label in ((feed1, 1), (feed0, 0)):
            words = st.textParse(source['entries'][i]['summary'])
            docList.append(words)
            fullText.extend(words)
            classList.append(label)

    vocabList = bayes.createVocabList(docList)

    # Drop the most frequent words from the vocabulary.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        frequent = pairW[0]
        if frequent in vocabList:
            vocabList.remove(frequent)

    # Split: 20 random documents for testing, the rest for training.
    trainingSet = range(2*minLen)
    testSet = []
    for _ in range(20):
        picked = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[picked])
        del trainingSet[picked]

    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]

    # Training.
    p0V, p1V, pLocal = bayes.trainNB0(array(trainMat), array(trainClasses))

    # Testing.
    errorCount = 0
    for docIndex in testSet:
        vec = array(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        predicted = bayes.classifyNB(vec, p0V, p1V, pLocal)
        if predicted != classList[docIndex]:
            errorCount += 1

    print 'the error rate is: ', float(errorCount)/len(testSet)
    return vocabList, p0V, p1V
Exemple #16
0
def spamTest():
    emailList = []
    classes = []

    # 读入数据
    for i in xrange(1, 26):
        # 读取垃圾邮件
        email = textParse(open('../data/email/spam/%d.txt' % i).read())
        emailList.append(email)
        classes.append(1)
        # 读取正常邮件
        email = textParse(open('../data/email/ham/%d.txt' % i).read())
        emailList.append(email)
        classes.append(0)

    # 构建词向量
    vocabularyList = bayes.vocabularyList(emailList)
    # 构建词频矩阵
    dataSet = bayes.wordsMatrix(vocabularyList, emailList)

    # 建立训练集和测试集
    test_num = 20  #测试集数量
    testingData = []
    testingClasses = []
    for i in xrange(test_num):
        testIndex = int(random.uniform(0, len(dataSet)))
        testingData.append(dataSet[testIndex])
        testingClasses.append(classes[testIndex])

        # 在原有数据中删除
        del (dataSet[testIndex])
        del (classes[testIndex])

    # 训练模型
    p0, p1, pc1 = bayes.trainModel(dataSet, classes)

    # 计算测试误差
    errorCount = 0
    i = 0
    for testSample in testingData:
        result = bayes.classifyNB(testSample, p0, p1, pc1)
        if result != testingClasses[i]:
            errorCount += 1
            print "分类错误"
        i += 1
    errorRate = float(errorCount) / test_num
    print "错误率 %f" % errorRate
    return errorRate
def localWord(feed1, feed0):
    """Train on two feeds, print the hold-out error rate, return the model.

    Args:
        feed1: mapping with an ``'entries'`` list of items that have a
            ``'summary'``; labelled 1.
        feed0: same structure; labelled 0.

    Returns:
        tuple: ``(vocabList, p0V, p1V)``.
    """
    docList, classList, fullText = [], [], []
    minLength = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLength):
        # One entry from feed1 (label 1), then one from feed0 (label 0).
        for source, label in ((feed1, 1), (feed0, 0)):
            words = bayes.textParse(source['entries'][i]['summary'])
            docList.append(words)
            fullText.extend(words)
            classList.append(label)

    vocabList = bayes.createVocabList(docList)
    # Remove the most frequent words from the vocabulary.
    for pairW in calMostFeq(vocabList, fullText):
        word = pairW[0]
        if word in vocabList:
            vocabList.remove(word)

    # Hold out 20 random documents for testing.
    trainingSet = range(2 * minLength)
    testSet = []
    for _ in range(20):
        picked = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[picked])
        del trainingSet[picked]

    trainMatrix = [bayes.bagOfWordsToVec(vocabList, docList[d]) for d in trainingSet]
    trainClass = [classList[d] for d in trainingSet]

    p0V, p1V, pClass1 = bayes.trainNB0(trainMatrix, trainClass)

    errorCount = 0
    for docIndex in testSet:
        vec = bayes.bagOfWordsToVec(vocabList, docList[docIndex])
        if bayes.classifyNB(vec, p0V, p1V, pClass1) != classList[docIndex]:
            errorCount += 1

    print "error rate is: ", float(errorCount) / len(testSet)
    return vocabList, p0V, p1V
def spamTest():
    '''
    Automated hold-out test of the naive Bayes spam classifier.

    E-mails email/spam/1..25.txt (label 1) and email/ham/1..25.txt (label 0)
    are parsed; 10 random documents are classified after training on the
    other 40.  Every misclassified document increments the error count.
    Nothing is returned: the vocabulary, training matrix, error count,
    test-set size, error rate and misclassified documents are printed.
    '''
    docList = []
    classList = []
    for i in range(1, 26):
        # Parse the spam e-mail and label it 1; ``with`` closes the file.
        with open('email/spam/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)
        # Parse the ham e-mail and label it 0.
        with open('email/ham/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)

    # Build the vocabulary.
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Pick 10 random e-mails for testing.
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    trainMat = [bayes.setWords2Vec(vocabList, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    errorDoc = []
    for docIndex in testSet:
        wordVector = bayes.setWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
            # Keep the misclassified document for the report below.
            errorDoc.append(docList[docIndex])
    print(vocabList)
    print(trainMat)
    print('the errorCount is: ', errorCount)
    print('the testSet length is: ', len(testSet))
    print('the error rate is; ', float(errorCount) / len(testSet))
    print(errorDoc)
Exemple #19
0
def localWords(feed1, feed0):
    """Train a classifier on two feeds and print its hold-out error rate.

    Args:
        feed1: mapping with an ``'entries'`` list of items that have a
            ``'summary'``; labelled 1.
        feed0: same structure; labelled 0.

    Returns:
        tuple: ``(vocabList, p0V, p1V)``.
    """
    import feedparser
    docList, classList, fullText = [], [], []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Visit one entry of each feed per pass: feed1 (1) then feed0 (0).
        for source, label in ((feed1, 1), (feed0, 0)):
            words = bayes.textParse(source['entries'][i]['summary'])
            docList.append(words)
            fullText.extend(words)
            classList.append(label)
    vocabList = bayes.createVocabList(docList)
    top30Words = calcMostFreq(vocabList, fullText)

    # Remove the most frequent words from the vocabulary.
    for pairW in top30Words:
        frequent = pairW[0]
        if frequent in vocabList:
            vocabList.remove(frequent)

    # Hold out 20 random documents for testing.
    trainingSet = list(range(2 * minLen))
    testSet = []
    for _ in range(20):
        picked = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[picked])
        del trainingSet[picked]

    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]

    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        vec = array(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        predicted = bayes.classifyNB(vec, p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1

    print('the error rate is:', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def spamTest():
    """Hold-out test of the spam filter; prints the test error rate.

    ``./email/spam/N.txt`` documents are labelled 1 and ``./email/ham/N.txt``
    documents are labelled 0 (N = 1..25).  Ten random documents are held out
    for testing and the other forty are used for training.
    """
    docList = []
    classList = []

    # Read the mail.  The original also kept a ``fullText`` list that was
    # extended with the whole of ``docList`` on every pass (quadratic growth)
    # and never read; it has been dropped.
    for i in range(1, 26):
        with open('./email/spam/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)
        with open('./email/ham/%d.txt' % i) as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)

    # Build the vocabulary.
    vablist = bayes.createVocablist(docList)

    # Random test data set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    # Build the training data set from the indices that remain.
    trainMat = [bayes.setOfwords2Vec(vablist, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]

    # NOTE(review): unpacking/argument order is kept exactly as in the
    # original; confirm it matches this project's trainNB0 / classifyNB.
    pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, trainClasses)

    # Test the classifier.
    errorCount = 0
    for docIndex in testSet:
        testVec = bayes.setOfwords2Vec(vablist, docList[docIndex])
        result = bayes.classifyNB(testVec, p1Vec, p0Vec, pa)
        if result != classList[docIndex]:
            errorCount += 1
    errorrate = float(errorCount) / len(testSet)
    print "the filter spam mail error rate is %f" %errorrate
def spamTest():
    """Evaluate the spam filter on a random 40/10 split and print the error rate.

    Spam e-mails (``./email/spam/1..25.txt``) carry label 1, ham e-mails
    (``./email/ham/1..25.txt``) carry label 0.
    """
    docList = []
    classList = []

    # Read the mail.  ``fullText`` from the original is gone: it was extended
    # with the entire ``docList`` each pass and was never used afterwards.
    for i in range(1, 26):
        for pattern, label in (('./email/spam/%d.txt', 1),
                               ('./email/ham/%d.txt', 0)):
            # ``with`` closes the file even if parsing fails.
            with open(pattern % i) as fh:
                docList.append(textParse(fh.read()))
            classList.append(label)

    # Build the vocabulary.
    vablist = bayes.createVocablist(docList)

    # Random test data set.
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []

    # Build the training data set.
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfwords2Vec(vablist, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    # NOTE(review): unpacking/argument order is kept exactly as in the
    # original; confirm it matches this project's trainNB0 / classifyNB.
    pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, trainClasses)

    # Test the classifier.
    errorCount = 0
    for docIndex in testSet:
        testVec = bayes.setOfwords2Vec(vablist, docList[docIndex])
        result = bayes.classifyNB(testVec, p1Vec, p0Vec, pa)
        if result != classList[docIndex]:
            errorCount += 1
    errorrate = float(errorCount) / len(testSet)
    print "the filter spam mail error rate is %f" % errorrate
def spamTest():
    docList = []
    classList = []
    fullText = []

    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)

        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)

    vocabList = docTool.createVocabList(docList)

    trainingSet = range(50)
    testSet = []

    for i in range(10):
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])

    trainMat = []
    trainClasses = []

    for docIndex in trainingSet:
        trainMat.append(docTool.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = bayes.trainNB0(trainMat, trainClasses)

    errorCount = 0

    for docIndex in testSet:
        wordVector = docTool.setOfWords2Vec(vocabList, docList[docIndex])

        if bayes.classifyNB(wordVector, p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1


    print '错误率: ', float(errorCount) / len(testSet)
Exemple #23
0
def localWords(feed1, feed0):  # 两份RSS文件分别经feedparser解析,得到2个字典
    docList = []  # 一条条帖子组成的List, 帖子拆成了单词
    classList = []  # 标签列表
    fullText = []  # 所有帖子的所有单词组成的List
    # entries条目包含多个帖子,miNLen记录帖子数少的数目,怕越界
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = bayes.textParse(feed1['entries'][i]['summary'])  # 取出帖子内容,并拆成词
        docList.append(wordList)  # ['12','34'].append(['56','78']) ==> [ ['12','34'], ['56','78'] ]
        fullText.extend(wordList)  # ['12','34'].extend(['56','78']) ==> ['12','34','56','78']
        classList.append(1)  # 纽约的标签是1
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # 旧金山的标签是0

    vocabList = bayes.createVocabList(docList)  # 创建词汇表
    # 从fulltext中找出最高频的30个单词,并从vocabList中去除它们
    top30Words = calcMostFreq(vocabList, fullText)
    for (word, count) in top30Words:
        if word in vocabList:
            vocabList.remove(word)

    trainingSet = range(2 * minLen);
    testSet = []  # 创建训练集、测试集
    for i in range(minLen / 10):  # 随机选取10%的数据,建立测试集
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = [];
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))  # 将训练集中的每一条数据,转化为词向量
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))  # 开始训练

    # 用测试数据,测试分类器的准确性
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
    return vocabList, p0V, p1V
Exemple #24
0
def spamTest():
    """
    Hold-out cross-validation of the naive Bayes spam classifier.

    The 25 e-mails in each of the ``spam`` and ``ham`` folders are parsed into
    word lists.  Of those 50 documents, 10 are picked at random as the test
    set and the remaining 40 form the training set.  The test error rate is
    printed.

    :return: ``(vocabList, p0V, p1V)`` - the vocabulary and the two vectors
        returned by ``bayes.trainNB0``.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # ``with`` closes the files; spam is labelled 1, ham 0.
        with open('email/spam/%d.txt' % i) as fh:
            docList.append(bayes.textParse(fh.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i) as fh:
            docList.append(bayes.textParse(fh.read()))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for _ in range(10):  # pick 10 documents at random
        # The draw is a float in the range 0..len(trainingSet); int() truncates
        # it to an index into the documents still in the pool.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    # Build one word vector per training document.
    trainMat = [bayes.bagOfWords2VecMN(vocabList, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]
    # Compute the probabilities needed for classification.
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # walk the test set
        # Use the same vectoriser as training.  The original vectorised test
        # documents with setOfWords2Vec although the model was trained on
        # bagOfWords2VecMN vectors.
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is :', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
Exemple #25
0
def spamTest():
    """Hold-out test of the spam classifier; prints errors and the error rate.

    Reads 25 spam (label 1) and 25 ham (label 0) e-mails from below
    ``baseURI``, holds 10 random documents out for testing and trains
    ``bayes.trainNB0`` on the remaining 40.  Each misclassified document is
    printed, followed by the overall error rate.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # Assumes 25 e-mails each for spam and normal mail.  The spam files
        # are still read through codecs.open and the ham files through the
        # builtin open, as before, but both are now closed by ``with``.
        with codecs.open(baseURI + 'email/spam/%d.txt' % i,
                         encoding='ANSI') as fh:
            docList.append(textParse(fh.read()))
        classList.append(1)
        with open(baseURI + 'email/ham/%d.txt' % i, encoding='ANSI') as fh:
            docList.append(textParse(fh.read()))
        classList.append(0)
    vocabList = bayes.create_vocab_list(docList)  # create vocabulary
    trainingSet = list(range(50))
    testSet = []  # create test set
    # Randomly move 10 e-mails out of the training set for testing.
    for _ in range(10):
        # np.random.uniform(low, high) draws from a uniform distribution;
        # int() truncates the draw to an index into the remaining pool.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet.pop(randIndex))
    # Vectorise the training documents and train the classifier.
    trainMat = [bagOfwords2Vec(vocabList, docList[d]) for d in trainingSet]
    trainClasses = [classList[d] for d in trainingSet]
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat),
                                     np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = bagOfwords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
def spamTest():
    docList=[]; classList = []; fullText =[]
    for i in range(1,26):
        str=open('email/spam/%d.txt' % i).read()
        print "str"
        print str
        wordList = bayes.textParse(str)
        print "wordlist"
        print wordList
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = bayes.textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    print "doclist"
    print len(docList)
    print docList
    print "fulllist"
    print len(fullText)
    print fullText
    print classList

    vocabList = bayes.createVocabList(docList)#create vocabulary
    trainingSet = range(50); testSet=[]           #create test set
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat=[]; trainClasses = []
    for docIndex in trainingSet:#train the classifier (get probs) trainNB0
        trainMat.append(bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = bayes.trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:        #classify the remaining items
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
            print "classification error",docList[docIndex]
    print 'the error rate is: ',float(errorCount)/len(testSet)
Exemple #27
0
def spamTest():
    """Measure the naive Bayes spam classifier's error rate on a random hold-out.

    Parses ``email/spam/1.txt`` .. ``25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (class 0), moves 10 randomly chosen
    documents into a test set, trains ``bayes.trainNB0`` on set-of-words
    vectors of the remaining 40 and prints the fraction of test documents
    that were misclassified.

    Python 2 code (print statement; ``range`` returns a list).
    """
    docList = []
    classList = []

    # parse text from email; ``with`` closes each file as soon as it is read
    for i in range(1, 26):
        with open('email/spam/%d.txt' % i) as spamFile:
            docList.append(textParse(spamFile.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i) as hamFile:
            docList.append(textParse(hamFile.read()))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)

    # build training set and test set
    trainingSet = range(50)
    testSet = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        # select a test document and remove it from the training indices
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))

    # classify the held-out documents and count the mistakes
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
Exemple #28
0
def localWords(feed1, feed0):
    """Train and evaluate a naive Bayes classifier that tells two feeds apart.

    Args:
        feed1: parsed feed whose entries are labelled class 1; read as
            ``feed1['entries'][i]['summary']``.
        feed0: parsed feed whose entries are labelled class 0; read the
            same way.

    The same number of entries (the shorter feed's length) is taken from each
    feed. The words returned by ``calcMostFreq`` are dropped from the
    vocabulary, 10 random documents are held out for testing, and the rest
    train ``bayes.trainNB0`` on ``bayes.bagOfWord2VecMN`` vectors.

    Returns:
        tuple: ``(vocabList, p0V, p1V)`` - the pruned vocabulary and the two
        vectors returned by ``bayes.trainNB0``.
    """
    import feedparser
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    # Single-argument print(...) behaves the same on Python 2 and 3. The
    # original ``print x`` statements were a SyntaxError on Python 3, which
    # the rest of this function (list(range(...)), print(...)) targets.
    print(feed1['entries'])
    print(feed0['entries'])
    for i in range(minLen):
        wordList = bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)  # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove top 30 words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2 * minLen))
    testSet = []  # create test set
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bayes.bagOfWord2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = bayes.bagOfWord2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
Exemple #29
0
def spamTest():
    """
    Run an automated hold-out test of the naive Bayes spam classifier.

    Parses the 25 spam (class 1) and 25 ham (class 0) messages under
    ``email/``, moves 10 randomly chosen documents into a test set, trains
    ``bayes.trainNB0`` on bag-of-words vectors of the remaining 40 and prints
    the error rate measured on the held-out documents.

    Python 2 code (print statement; ``range`` returns a list).

    :return: None
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # 1. Load and parse the text files; ``with`` closes each file at once.
        with open('email/spam/%d.txt' % i) as spamFile:
            docList.append(textParse(spamFile.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i) as hamFile:
            docList.append(textParse(hamFile.read()))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50)
    testSet = []
    for _ in range(10):
        # 2. Randomly split the indices into training and test sets.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    # 3. Classify the test set.
    for docIndex in testSet:
        # Vectorise test documents with the same function used for training.
        # The original trained on bagOfWords2VecMN vectors but tested on
        # bayes.setOfWords2Vec vectors, so train and test features differed.
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
def spamTest():
    docList = []
    classList = []
    fullText = []

    # parse text from email
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' %i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        #parse text
        wordList = textParse(open('email/ham/%d.txt' %i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.createVocabList(docList)

    #build training set and test set
    trainingSet = range(50);
    testSet =[]
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        #select testset and remove the testset from all dataset
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))

    #classify and test precision
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount)/len(testSet)
def spamTest():
    """Hold-out test of the naive Bayes spam classifier.

    Tokenises ``email/spam/1.txt`` .. ``25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (class 0), moves 10 random documents
    into a test set, trains ``bayes.trainNB0`` on the remaining 40 and prints
    each misclassified document followed by the error rate.

    Python 2 code (print statements; ``range`` returns a list).
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # ``with`` closes each file as soon as it has been read.
        with open('email/spam/%d.txt' % i) as spamFile:
            docList.append(textParse(spamFile.read()))
        classList.append(1)

        with open('email/ham/%d.txt' % i) as hamFile:
            docList.append(textParse(hamFile.read()))
        classList.append(0)

    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50)
    testSet = []

    # randomly split data set into 2 sets: test set, and training set
    for _ in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))  # random int 0~len
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]  # split

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])

    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0

    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print "error word: %s" % (docList[docIndex])
    # Apply the format with ``%``. The original used a comma, so it printed
    # the literal text "%f" followed by the number.
    print "error rate is: %f" % (float(errorCount) / len(testSet))
def spamTest():
    docList = []
    classList =[]
    fullText = []

    #导人文件夹spam与ham下的文本文件,并将它们解析为词列表
    for i in range(1,26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
        
    vocabList = bayes.createVocabList(docList)
    trainingSet = range(50);#本例中共有50封电子邮件,其中的值从0到49
    
    testSet = []    
    '''选择出的数字所对应的文档被添加到测试集, 同时也将其从训练集中剔除。    '''
    for i in range(10):#10封电子邮件被随机选择为测试集。
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMatrix=[];
    trainClasses = []
    for docIndex in trainingSet:
        trainMatrix.append(bayes.setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = bayes.trainNB(array(trainMatrix),array(trainClasses))
    
    errorCount=0
    for docIndex in testSet:
        #如果邮件分类错误,则错误数加1,最后给出总的错误百分比
        wordVector = bayes.setOfWords2Vec(vocabList,docList[docIndex])
        if bayes.classifyNB(array(wordVector),p0V,p1V,pSpam) !=classList[docIndex]:
            errorCount +=1
    print 'the error rate is :',float(errorCount) /len(testSet)
Exemple #33
0
def spamTest():
    """Hold-out test of the naive Bayes spam classifier.

    Parses ``email/spam/1.txt`` .. ``25.txt`` (label 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (label 0), moves 10 randomly chosen
    documents into a test set, trains ``bayes.trainNB0`` on set-of-words
    vectors of the remaining 40 and prints the error rate on the test set.
    """
    docList = []
    classList = []

    for i in range(1, 26):  # sample files are named 1.txt .. 25.txt
        # ``with`` closes each file as soon as it has been read.
        with open('email/spam/%d.txt' % i) as spamFile:
            wordList = textParse(spamFile.read())  # split the mail into words
        docList.append(wordList)  # store the parsed sample
        classList.append(1)  # documents under spam/ get label 1
        with open('email/ham/%d.txt' % i) as hamFile:
            wordList = textParse(hamFile.read())
        docList.append(wordList)
        classList.append(0)  # documents under ham/ get label 0
    vocabList = bayes.createVocabList(docList)  # vocabulary over all documents
    trainingSet = list(range(50))  # 50 samples, matching len(classList)

    testSet = []  # indices of the held-out test samples
    for _ in range(10):
        # pick a random sample as a test sample ...
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]  # ... and drop it from the training set
    trainMat = []
    trainClasses = []

    for docIndex in trainingSet:  # walk the training samples
        # vector marking which vocabulary words the sample uses
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])  # label of this sample
    # train the model and obtain the probabilities
    p0V, p1V, pSpam = bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0

    for docIndex in testSet:  # walk the test samples
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        resultFlag = bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)
        if resultFlag != classList[docIndex]:  # wrong prediction
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
Exemple #34
0
def spamTest():
    """Hold-out test of the naive Bayes spam classifier.

    Parses ``email/spam/1.txt`` .. ``25.txt`` (class 1) and
    ``email/ham/1.txt`` .. ``25.txt`` (class 0), moves 10 randomly chosen
    documents into a test set, trains on the remaining 40 and prints every
    misclassified document followed by the error rate as a percentage.
    """
    docList = []
    classList = []
    for i in range(1, 26):
        # ``with`` closes each file as soon as it has been read.
        with open('email/spam/%d.txt' % i, 'r') as spamFile:
            docList.append(textParse(spamFile.read()))
        classList.append(1)
        with open('email/ham/%d.txt' % i, 'r') as hamFile:
            docList.append(textParse(hamFile.read()))
        classList.append(0)
    vocabList = bayes.createVocabList(docList)
    # Index lists for the training set and the test set.
    trainingSet = list(range(50))
    testSet = []
    # Of the 50 mails, 40 stay in the training set and 10 become the test set.
    for _ in range(10):
        # random.uniform samples from [low, high): low is included and high is
        # not, so the truncated value is a valid index.
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bayes.setOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # NOTE(review): ``trainNBO`` ends in the letter O, while other callers in
    # this file use ``trainNB0`` (digit zero); confirm which name this bayes
    # module defines.
    p0V, p1V, pSpam = bayes.trainNBO(np.array(trainMat),
                                     np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bayes.setOfWords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
            print('wrong testSet:', docList[docIndex])
    print('wrong rate:%.2f%%' % (float(errorCount) / len(testSet) * 100))
Exemple #35
0
__author__ = 'jack'

import bayes

listOPosts,listClasses = bayes.loadDataSet()

myVocabList = bayes.createVocabList(listOPosts)

print myVocabList

#print bayes.setOfWords2Vec(myVocabList, listOPosts[0])

trainMat = []
for post in listOPosts:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, post))

p0V, p1V, pAb = bayes.trainNB0(array(trainMat), array(listClasses))

print pAb
print p0V
print p1V

testEntry = ['love', 'my', 'dalmation']
thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
print thisDoc
print testEntry, 'classsified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)

testEntry = ['stupid', 'garbage']
thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
print thisDoc
print testEntry, 'classsified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
Exemple #36
0
#!/usr/bin/python
#encoding:utf-8
import bayes
from numpy import *
postingList, classList = bayes.loadDataSet()
myVocabList = bayes.createVocabList(postingList)
trainMat = []
for a in postingList:
    trainMat.append(bayes.setOfWords(myVocabList, a))
pc, p0, p1 = bayes.trainNB0(trainMat, classList)
test = ['stupid', 'garbage']
thisDoc = array(bayes.setOfWords(myVocabList, test))
print bayes.classifyNB(thisDoc, p0, p1, pc)
def spamTest():
    emailList = []
    classes = []
    
    # 读入数据
    for i in xrange(1,26):
        # 读取垃圾邮件
        email = textParse(open('../data/email/spam/%d.txt' %i).read())
        emailList.append(email)
        classes.append(1)
        # 读取正常邮件
        email = textParse(open('../data/email/ham/%d.txt' %i).read())
        emailList.append(email)
        classes.append(0)
    
    # 构建词向量
    vocabularyList = bayes.vocabularyList(emailList)
    # 构建词频矩阵
    dataSet = bayes.wordsMatrix(vocabularyList, emailList)
    
    # 建立训练集和测试集
    test_num = 20 #测试集数量
    testingData = []
    testingClasses = []
    for i in xrange(test_num):
        testIndex = int(random.uniform(0, len(dataSet)))
        testingData.append(dataSet[testIndex])
        testingClasses.append(classes[testIndex])
        
        # 在原有数据中删除
        del(dataSet[testIndex])
        del(classes[testIndex])
    
    # 训练模型
    p0, p1, pc1 = bayes.trainModel(dataSet, classes)
    
    # 计算测试误差
    errorCount = 0
    i = 0
    for testSample in testingData:
        result = bayes.classifyNB(testSample, p0, p1, pc1)
        if result != testingClasses[i]:
            errorCount += 1
            print "分类错误"
        i += 1
    errorRate = float(errorCount) / test_num
    print "错误率 %f" %errorRate
    return errorRate
        
        
    
    
    
    
        
        
    
    
    
    
    
import DividingData
import bayes
from matplotlib import pyplot as plt
import numpy as np

# Step 1: for each of the five runs, generate the test data, the training data
# and the ground-truth label file.
for i in range(5):
    rightCateFile = 'classifyRightCate' + str(i) + '.txt'
    DividingData.dataSeg(i, rightCateFile)

# Step 2: run the naive Bayes classifier on the test data of every run.
for i in range(5):
    trainDir, testDir = 'trainData' + str(i), 'testData' + str(i)
    resultCateFile = 'classifyResultCate' + str(i) + '.txt'
    bayes.classifyNB(trainDir, testDir, resultCateFile)

# Step 3: compute and record the error rate of every run by comparing the
# ground-truth file against the classifier's result file.
errorRateRecord = []
for i in range(5):
    rightCateFile = 'classifyRightCate' + str(i) + '.txt'
    resultCateFile = 'classifyResultCate' + str(i) + '.txt'
    er = bayes.errorRate(rightCateFile, resultCateFile)
    errorRateRecord.append(er)

# Step 4: set up the bar chart that visualises the error rate of each run.
fig = plt.figure(1)
ax1 = plt.subplot(111)
# Error rates rounded to three decimals via their formatted representation.
data = np.array([float(format(rate, '.3f')) for rate in errorRateRecord])
width = 0.5
x_bar = np.arange(5)
Exemple #39
0
#!/usr/bin/python
#encoding:utf-8
import bayes
from numpy import *
postingList , classList = bayes.loadDataSet()
myVocabList = bayes.createVocabList(postingList)
trainMat = []
for a in postingList:
    trainMat.append( bayes.setOfWords( myVocabList, a ) )
pc, p0, p1 = bayes.trainNB0(trainMat, classList)
test = ['stupid', 'garbage']
thisDoc = array( bayes.setOfWords( myVocabList, test ) )
print bayes.classifyNB(thisDoc, p0, p1, pc)
Exemple #40
0

# Script entry point: load the training data, derive the bag-of-words and the
# two sets of classifier parameters from it, then classify the test data.
if __name__ == '__main__':
    # Training data read from XML_TRAIN_PATH.
    weibo_emotion, weibo_content, weibo_content_emotion = get_dataset(
        XML_TRAIN_PATH)
    # l1_* / p_emotion feed the first classifyNB pass below and l2_* /
    # p_positive the second one.
    bow, l1_p0_vect, l1_p1_vect, p_emotion, l2_p0_vect, l2_p1_vect, p_positive = \
      process_training_xml(weibo_emotion, weibo_content, weibo_content_emotion)

    # Test data read from XML_TEST_PATH and converted with the training ``bow``.
    test_weibo_emotion, test_content, test_content_emotion = get_dataset(
        XML_TEST_PATH)
    test_mat, test_cat = convert_to_mat_and_cat(bow, test_content,
                                                test_content_emotion)

    # Classify between the two kinds of emotion: one classifyNB call per
    # element of test_mat, using the level-1 parameters.
    # NOTE(review): l1_pre_cat is indexed below, which needs map() to return a
    # list (Python 2 semantics); on Python 3 map objects are not subscriptable.
    l1_pre_cat = map(
        lambda each: bayes.classifyNB(each, l1_p0_vect, l1_p1_vect, p_emotion),
        test_mat)
    # presumably adder() returns a callable that yields the next index on each
    # call - TODO confirm against its definition.
    l1_pre_cat_cnt = adder()
    # Replace every sentence of every weibo with a (sentence, prediction) pair;
    # the prediction is looked up in l1_pre_cat at the index the counter returns.
    test_content = map(
        lambda weibo: map(
            lambda sentence: (sentence, l1_pre_cat[l1_pre_cat_cnt()]), weibo),
        test_content)

    # Second pass over the same test_mat with the level-2 parameters.
    l2_pre_cat = map(
        lambda each: bayes.classifyNB(each, l2_p0_vect, l2_p1_vect, p_positive
                                      ), test_mat)
    l2_pre_cat_cnt = adder()

    # Items whose second element is 0 are returned unchanged. The snippet is
    # cut off here, so the remaining branches are not visible.
    def _mark_emotion(each):
        if each[1] == 0:
            return each
Exemple #41
0
    return Vec, out

# Interactive demo: train on the positive/negative review files, then classify
# every review typed at the prompt.
if __name__ == '__main__':
    googDataPath = 'C:\\Users\\John\\Desktop\\emotion Analysis\\goods.txt'
    badDataPath = 'C:\\Users\\John\\Desktop\\emotion Analysis\\bad.txt'

    # Label convention: 1 = positive review, 0 = negative review.
    goodVec, goodList = DataHandle(googDataPath, 1)
    badVec, badList = DataHandle(badDataPath, 0)

    # Positive samples first, then negative ones, for labels and documents alike.
    listClasses = goodVec + badVec
    listOPosts = goodList + badList
    print(listClasses)
    print(listOPosts)

    myVocabList = bayes.createVocabList(listOPosts)
    print(myVocabList)

    # 3. Mark which vocabulary words occur in each document and build the
    #    data matrix.
    trainMat = [bayes.setOfWords2Vec(myVocabList, doc) for doc in listOPosts]

    # 4. Train on the data.
    p0V, p1V, pAb = bayes.trainNB0(array(trainMat), array(listClasses))

    # 5. Test: prompt for reviews indefinitely and print the predicted class.
    while True:
        testEntry = wordCut(input(u'请输入您对本商品的评价:'))
        thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
        predicted = bayes.classifyNB(thisDoc, p0V, p1V, pAb)
        print('评价: ', predicted)
Exemple #42
0
def local_words(feed1, feed0):
    """Train and evaluate a naive Bayes classifier that tells two RSS feeds apart.

    feed1: parsed RSS feed 1; its entries are labelled class 1
    feed0: parsed RSS feed 0; its entries are labelled class 0

    Both feeds are read as ``feed['entries'][i]['summary']``, and the same
    number of entries (the shorter feed's length) is taken from each. The
    words returned by ``calc_most_freq`` are removed from the vocabulary,
    20 random documents are held out for testing and the rest are used for
    training. The hold-out error rate is printed once, after all test
    documents have been classified.

    Returns the tuple ``(vocab_list, p0_v, p1_v)``: the pruned vocabulary and
    the two vectors returned by ``trainNB0``.

    NOTE(review): the hold-out size is fixed at 20, so fewer than 20 documents
    in total raises IndexError while drawing the test set.
    """
    import feedparser
    # list of documents
    doc_list = []
    # list of document classes
    class_list = []
    full_text = []
    # number of entries usable from each of the two feeds
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    # walk every training item (document)
    for i in range(min_len):
        # tokenise, dropping punctuation
        word_list = text_parse(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        # tokenise, dropping punctuation
        word_list = text_parse(feed0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    # build the vocabulary
    vocab_list = create_vocab_list(doc_list)
    # find the 30 most frequent words ...
    top30_words = calc_most_freq(vocab_list, full_text)
    # ... and remove them from the vocabulary
    for pair_w in top30_words:
        if pair_w[0] in vocab_list:
            vocab_list.remove(pair_w[0])
    # Two feeds, hence 2 * min_len documents. A real list is required because
    # indices are deleted below; a Python 3 range object does not support
    # item deletion.
    training_set = list(range(2 * min_len))
    # build the test set
    test_set = []
    for _ in range(20):
        # move 20 random indices from the training set into the test set
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del training_set[rand_index]
    train_mat = []
    train_classes = []
    # build the final training vectors and labels from training_set
    for doc_index in training_set:
        # bag-of-words vector of each document
        train_mat.append(bag_of_word2vec(vocab_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    # train the Bayes classifier
    p0_v, p1_v, p_spam = trainNB0(np.array(train_mat), np.array(train_classes))
    error_count = 0
    # evaluate on the held-out documents
    for doc_index in test_set:
        # vector of the test document
        word_vec = bag_of_word2vec(vocab_list, doc_list[doc_index])
        if classifyNB(np.array(word_vec), p0_v, p1_v,
                      p_spam) != class_list[doc_index]:
            error_count += 1
    # Report and return only after every test document has been classified.
    # These two statements used to sit inside the loop, so the function
    # returned after the first test document.
    print('the error rate is: ', float(error_count) / len(test_set))
    return vocab_list, p0_v, p1_v