def test_word2vec2(self):
     """Print the set-of-words vector for every entry of the sample data.

     The vocabulary is built with ``bayes.createVocabList`` from the first
     element returned by ``bayes.loadDataSet()``; each entry is then encoded
     with ``bayes.setOfWords2Vec``. Output is printed only, nothing is
     asserted.
     """
     entries = bayes.loadDataSet()[0]
     vocabulary = bayes.createVocabList(entries)
     print("THE VOCABLIST IS %s" % vocabulary)
     for entry in entries:
         print("RAW WORD IS %s" % entry)
         # Encode after echoing the raw entry so the output order is kept.
         encoded = bayes.setOfWords2Vec(vocabulary, entry)
         print("OUTPUT VEC IS %s" % encoded)
 def test_freq(self):
     """Print the vocabulary and the result of ``bayes.calcMostFreq``.

     NOTE(review): the second argument is a raw sentence string, not a list
     of tokens -- confirm that this is what ``calcMostFreq`` expects.
     """
     entries, _labels = bayes.loadDataSet()
     vocabulary = bayes.createVocabList(entries)
     print(vocabulary)
     sample_text = 'haha my steak is food, my problems is garbage'
     most_frequent = bayes.calcMostFreq(vocabulary, sample_text)
     print(most_frequent)
 def test_word2vecBag(self):
     """Print the bag-of-words vector for every entry of the sample data.

     Same flow as the set-of-words smoke test, but each entry is encoded
     with ``bayes.bagOfWords2VecMN``. Output is printed only, nothing is
     asserted.
     """
     entries = bayes.loadDataSet()[0]
     vocabulary = bayes.createVocabList(entries)
     print("THE VOCABLIST IS %s" % vocabulary)
     for entry in entries:
         print("RAW WORD IS %s" % entry)
         # Original author's note: in the printed vector the word "how" is
         # expected to show a count of 2 -- TODO confirm against the data.
         bag_vector = bayes.bagOfWords2VecMN(vocabulary, entry)
         print("OUTPUT VEC BAG IS %s" % bag_vector)
 def test_NB(self):
     """Train on the sample data and print the class of two test entries.

     The model parameters come from ``bayes.trainNB0Log``; each test entry
     is encoded with ``bayes.setOfWords2Vec`` and classified with
     ``bayes.classifyNB``. Results are printed, not asserted.
     """
     entries, labels = bayes.loadDataSet()
     vocabulary = bayes.createVocabList(entries)
     train_matrix = [
         bayes.setOfWords2Vec(vocabulary, entry) for entry in entries
     ]
     p0V, p1V, pAb = bayes.trainNB0Log(array(train_matrix), array(labels))

     test_entries = (
         ['love', 'my', 'dalmation'],
         ['stupid', 'garbage'],
     )
     for test_entry in test_entries:
         doc_vector = array(bayes.setOfWords2Vec(vocabulary, test_entry))
         print(doc_vector)
         predicted = bayes.classifyNB(doc_vector, p0V, p1V, pAb)
         print(test_entry, 'classified as: ', predicted)
 def spamTest(self):
     """Hold-out evaluation of the naive Bayes classifier on the email files.

     Reads ``email/spam/1.txt`` .. ``25.txt`` and ``email/ham/1.txt`` ..
     ``25.txt`` (decoded as ISO-8859-1), labelling spam as 1 and ham as 0.
     Ten randomly drawn document indices are moved out of the training
     indices into a test set, the model is trained with ``bayes.trainNB0``
     on the remaining documents, and the error rate measured on the
     held-out documents is printed.

     Raises whatever ``io.open`` raises if one of the email files is missing.
     """
     docList = []
     classList = []
     sources = (('email/spam/%d.txt', 1), ('email/ham/%d.txt', 0))
     for i in range(1, 26):
         for pattern, label in sources:
             # Context manager so every mail file is closed after reading.
             with io.open(pattern % i, encoding="ISO-8859-1") as mail:
                 wordList = bayes.textParse(mail.read())
             docList.append(wordList)
             classList.append(label)
     vocabList = bayes.createVocabList(docList)  # create vocabulary
     print("VACABULIST IS %s \nwith VACABULIST size = %d" %
           (vocabList, len(vocabList)))

     # Must be a real list: held-out indices are removed from it below.
     # The previous `del (list(trainingSet)[randIndex])` only mutated a
     # temporary copy, so test documents stayed in the training data and
     # the same index could be drawn more than once.
     trainingSet = list(range(len(docList)))
     testSet = []  # create test set
     for _ in range(10):
         randIndex = int(random.uniform(0, len(trainingSet)))
         print("RANDOM IS %d" % randIndex)
         testSet.append(trainingSet.pop(randIndex))

     trainMat = []
     trainClasses = []
     for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
         trainMat.append(
             bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
         trainClasses.append(classList[docIndex])
     print(trainMat)
     # Per the original author's note: the trained vectors P(w|Ci) and the
     # prior P(Ci).
     p0V, p1V, pSpam = bayes.trainNB0(
         array(trainMat), array(trainClasses))

     errorCount = 0
     for docIndex in testSet:  # classify the held-out items
         wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
         predicted = bayes.classifyNB(array(wordVector), p0V, p1V, pSpam)
         if predicted != classList[docIndex]:
             errorCount += 1
             print("classification error", docList[docIndex])
     print('the error rate is: ', float(errorCount) / len(testSet))
 def test_trainNB(self):
     """Print the parameters returned by ``trainNB0`` and ``trainNB0Log``.

     Both trainers are fed the set-of-words matrix built from the sample
     data. Output is printed only, nothing is asserted.
     """
     entries, labels = bayes.loadDataSet()
     vocabulary = bayes.createVocabList(entries)
     print(vocabulary)  # all words of the data set, de-duplicated
     train_matrix = [
         bayes.setOfWords2Vec(vocabulary, entry) for entry in entries
     ]
     print(train_matrix)

     p0V, p1V, pAb = bayes.trainNB0(array(train_matrix), array(labels))
     for parameter in (p0V, p1V, pAb):
         print(parameter)
     total0 = sum(p0V)
     total1 = sum(p1V)
     # Original author's note: the sums need not equal 1 because the trainer
     # was modified to guard against division by zero.
     print("sm1=%f, sm2=%f" % (total0, total1))

     print("USING LOG DISP")
     p0V, p1V, pAb = bayes.trainNB0Log(array(train_matrix), array(labels))
     for parameter in (p0V, p1V, pAb):
         print(parameter)
 def test_word2vec(self):
     """Print the set-of-words vector of one whitespace-split sentence."""
     sentence = "my dog ate the food on the garbage"
     entries = bayes.loadDataSet()[0]
     vocabulary = bayes.createVocabList(entries)
     print("THE VOCABLIST IS %s" % vocabulary)
     tokens = sentence.split()
     encoded = bayes.setOfWords2Vec(vocabulary, tokens)
     print(encoded)
 def test_dataUnique(self):
     """Print the vocabulary built from the sample data set."""
     entries = bayes.loadDataSet()[0]
     # Original author's note: the set of all words, de-duplicated.
     vocabulary = bayes.createVocabList(entries)
     print(vocabulary)