# 예제 #1 (Example #1)
def spamTest():
    """Train a naive-Bayes spam classifier on the ham/spam corpora and
    print predicted vs. actual labels for 10 randomly held-out emails.

    Relies on module-level helpers defined elsewhere in this file/project:
    loadDataSet, Bayes.createVocabList, Bayes.bagOfWords2Vec,
    Bayes.trainNB0, Bayes.classifyNB, and the random module.
    """
    # Raw strings so the backslashes in the Windows paths can never be
    # read as escape sequences (invalid escapes like "\m" raise a
    # SyntaxWarning on modern Python; the runtime value is unchanged).
    hamEmails = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\ham")
    spamEmails = loadDataSet(r"D:\学习资料\machinelearninginaction\Ch04\email\spam")
    # Parallel lists: documents and their labels (0 = ham, 1 = spam).
    allEmails = hamEmails + spamEmails
    allLabels = [0] * len(hamEmails) + [1] * len(spamEmails)
    vocabList = Bayes.createVocabList(allEmails)
    # Hold out 10 random documents as the test set; delete at the SAME
    # index in both parallel lists so they stay aligned.
    testMat = []
    realclass = []
    for _ in range(10):
        randIndex = int(random.uniform(0, len(allEmails)))
        testMat.append(Bayes.bagOfWords2Vec(vocabList, allEmails[randIndex]))
        realclass.append(allLabels[randIndex])
        del allEmails[randIndex]
        del allLabels[randIndex]
    # Everything that was not held out becomes the training set.
    trainMat = [Bayes.bagOfWords2Vec(vocabList, doc) for doc in allEmails]
    p0vect, p1vect, pA = Bayes.trainNB0(trainMat, allLabels)
    for wordVec, actual in zip(testMat, realclass):
        print("test_result=", Bayes.classifyNB(wordVec, p0vect, p1vect, pA),
              ",real_result=", actual)
# 예제 #2 (Example #2)
def localWords(feed1, feed0):
    """Train a naive-Bayes classifier to distinguish two RSS feeds.

    feed1/feed0 are parsed feeds (feedparser-style dicts with an
    'entries' list whose items carry a 'summary' string — assumption,
    based on the access pattern here). Entries from feed1 are labelled 1,
    entries from feed0 are labelled 0.

    Returns (vocabList, p0V, p1V): the pruned vocabulary and the
    per-class word probability vectors from Bayes.trainNB0.
    """
    docList = []    # one token list per document
    classList = []  # parallel labels: 1 for feed1, 0 for feed0
    fullText = []   # all tokens flattened, used for frequency pruning
    # Only pair up as many entries as the shorter feed provides.
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        # Parse the summary: words longer than 2 chars, lower-cased.
        wordList = Bayes.textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = Bayes.textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = Bayes.createVocabList(docList)
    # Natural language is dominated by a few high-frequency structural
    # words; removing the 30 most frequent (a cheap stand-in for a real
    # stop-word list) keeps the vocabulary discriminative.
    top30Words = calcMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # list(...) because range objects cannot be mutated on Python 3.
    trainingSet = list(range(2 * minLen))
    print('minLen : %d' % minLen)
    if minLen < 20:
        print('the len is too small.')
    # Randomly hold out up to 20 documents as the test set — capped by
    # the corpus size so a tiny feed no longer raises IndexError.
    testSet = []
    for _ in range(min(20, len(trainingSet))):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        # Word-count vector for this training document.
        trainMat.append(Bayes.bagOfWords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    # Learn the per-class word frequencies and the class prior.
    p0V, p1V, pSpam = Bayes.trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = Bayes.bagOfWords2Vec(vocabList, docList[docIndex])
        if Bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
    if testSet:  # guard the division when no documents were held out
        print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V
def localWord(feed0, feed1):
    """Train a naive-Bayes classifier on two RSS feeds and return the
    vocabulary plus the per-class word probability vectors.

    Entries from feed1 are labelled 1, entries from feed0 are labelled 0
    (note the argument order: feed0 first). Mirrors localWords() above.
    """
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    listOfPost = []  # one token list per document
    classVec = []    # parallel labels: 1 for feed1, 0 for feed0
    fullText = []    # flattened tokens for frequency pruning
    for i in range(minLen):
        wordList = Bayes.textParse(feed1['entries'][i]['summary'])
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(1)

        wordList = Bayes.textParse(feed0['entries'][i]['summary'])
        listOfPost.append(wordList)
        fullText.extend(wordList)
        classVec.append(0)

    vocabList = Bayes.creatVocabList(listOfPost)

    # Prune the 30 most frequent words. calMostFreq is assumed to return
    # (word, count) pairs as calcMostFreq does in localWords() above —
    # TODO confirm; the original tested the whole pair for membership,
    # which can never match a bare word, so nothing was ever removed.
    top30Words = calMostFreq(vocabList, fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # list(...) because range objects cannot be mutated on Python 3.
    trainingSet = list(range(2 * minLen))
    dataSet = []  # held-out document ids used as the test set

    for _ in range(min(20, len(trainingSet))):
        # uniform(0, n): the original uniform(n) passed n as *low* with
        # high left at 1.0, sampling the wrong interval and never
        # producing index 0.
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        # Record the document id itself; appending the bare loop index
        # (as the original did) drifts once earlier ids are deleted.
        dataSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]

    trainMat = []
    trainClass = []
    for docIndex in trainingSet:
        trainMat.append(Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex]))
        trainClass.append(classVec[docIndex])

    # Learn per-class word frequencies and the class prior.
    p0V, p1V, pSpam = Bayes.trainNB0(np.array(trainMat), trainClass)

    errorCount = 0.0
    for docIndex in dataSet:
        dataMat = Bayes.bagOfWords2Vec(vocabList, listOfPost[docIndex])
        if Bayes.classifyNB(np.array(dataMat), p0V, p1V, pSpam) != classVec[docIndex]:
            errorCount += 1
    # print("the error rate is :", errorCount / float(len(dataSet)))

    return vocabList, p0V, p1V
# 예제 #4 (Example #4)
def testingNB():
    """Smoke-test the naive-Bayes classifier on the toy post data set.

    Trains on the documents from loadDataSet() and prints the predicted
    class for two hand-picked test sentences.
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = Bayes.createVocabList(listOPosts)
    # Bag-of-words vectors (word counts) rather than set-of-words
    # (presence flags): the original's commented-out setOfWords2Vec
    # alternative is dropped in favour of the variant actually used.
    trainMat = [Bayes.bagOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = Bayes.trainNB0(trainMat, listClasses)
    # print() calls so the block runs on Python 3 like spamTest in this
    # file; the original Python 2 print statements are a syntax error
    # there. Output text is unchanged.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(Bayes.bagOfWords2Vec(myVocabList, testEntry))
        print(testEntry, 'classified as: ',
              Bayes.classifyNB(thisDoc, p0V, p1V, pAb))