def test_word2vec2(self):
    """Print the vocabulary, then the set-of-words vector for each sample post."""
    posts = bayes.loadDataSet()[0]
    vocab = bayes.createVocabList(posts)
    print("THE VOCABLIST IS %s" % vocab)
    for post in posts:
        print("RAW WORD IS %s" % post)
        print("OUTPUT VEC IS %s" % bayes.setOfWords2Vec(vocab, post))
def test_freq(self):
    """Print the vocabulary, then the most-frequent-word result for a sample sentence."""
    posts, _labels = bayes.loadDataSet()
    vocab = bayes.createVocabList(posts)
    print(vocab)
    mostFreq = bayes.calcMostFreq(
        vocab, 'haha my steak is food, my problems is garbage')
    print(mostFreq)
def test_word2vecBag(self):
    """Print the bag-of-words (multinomial) vector for each sample post."""
    posts = bayes.loadDataSet()[0]
    vocab = bayes.createVocabList(posts)
    print("THE VOCABLIST IS %s" % vocab)
    for post in posts:
        print("RAW WORD IS %s" % post)
        # NOTE: unlike the set-of-words vector, the bag vector keeps counts —
        # the output vector holds a 2 for the duplicated word 'how'
        print("OUTPUT VEC BAG IS %s" % bayes.bagOfWords2VecMN(vocab, post))
def test_NB(self):
    """Train the log-space naive-Bayes classifier on the sample posts and
    classify two hand-picked test entries, printing each result."""
    posts, labels = bayes.loadDataSet()
    vocab = bayes.createVocabList(posts)
    trainMat = [bayes.setOfWords2Vec(vocab, post) for post in posts]
    p0V, p1V, pAb = bayes.trainNB0Log(array(trainMat), array(labels))
    # same two probes as before, just driven by one loop instead of
    # two copy-pasted stanzas
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(bayes.setOfWords2Vec(vocab, testEntry))
        print(thisDoc)
        print(testEntry, 'classified as: ',
              bayes.classifyNB(thisDoc, p0V, p1V, pAb))
def spamTest(self):
    """Train a naive-Bayes spam classifier on the email corpus and print its error rate.

    Loads 25 spam and 25 ham messages, holds out 10 randomly chosen
    documents as a test set, trains on the remaining 40, then classifies
    the held-out documents and prints the resulting error rate.
    """
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # use `with` so the file handles are closed (the original leaked them)
        with io.open('email/spam/%d.txt' % i, encoding="ISO-8859-1") as fp:
            wordList = bayes.textParse(fp.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # 1 == spam
        with io.open('email/ham/%d.txt' % i, encoding="ISO-8859-1") as fp:
            wordList = bayes.textParse(fp.read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)  # 0 == ham
    vocabList = bayes.createVocabList(docList)  # create vocabulary
    print("VACABULIST IS %s \nwith VACABULIST size = %d" %
          (vocabList, len(vocabList)))
    # BUG FIX: the original did `trainingSet = range(50)` and then
    # `del (list(trainingSet)[randIndex])`, which deletes from a throwaway
    # copy — the picked documents were never removed, so the classifier was
    # evaluated on data it had trained on (and could pick duplicates).
    # Use a real list and delete in place so train/test stay disjoint.
    trainingSet = list(range(50))
    testSet = []  # create test set
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        print("RANDOM IS %d" % randIndex)
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(
            bayes.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    print(trainMat)
    # trained naive-Bayes vectors P(w|Ci) and prior P(Ci)
    p0V, p1V, pSpam = bayes.trainNB0(
        array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bayes.bagOfWords2VecMN(vocabList, docList[docIndex])
        if bayes.classifyNB(array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
def test_trainNB(self):
    """Train naive Bayes with and without log probabilities and print both results."""
    posts, labels = bayes.loadDataSet()
    vocab = bayes.createVocabList(posts)
    print(vocab)  # every word, de-duplicated
    trainMat = [bayes.setOfWords2Vec(vocab, post) for post in posts]
    print(trainMat)
    p0V, p1V, pAb = bayes.trainNB0(array(trainMat), array(labels))
    print(p0V)
    print(p1V)
    print(pAb)
    sm1 = sum(p0V)
    sm2 = sum(p1V)
    # the sums need not equal 1 because of the divide-by-zero smoothing
    print("sm1=%f, sm2=%f" % (sm1, sm2))
    print("USING LOG DISP")
    p0V, p1V, pAb = bayes.trainNB0Log(array(trainMat), array(labels))
    print(p0V)
    print(p1V)
    print(pAb)
def test_word2vec(self):
    """Convert one sample sentence to a set-of-words vector and print it."""
    sentence = "my dog ate the food on the garbage"
    vocab = bayes.createVocabList(bayes.loadDataSet()[0])
    print("THE VOCABLIST IS %s" % vocab)
    print(bayes.setOfWords2Vec(vocab, sentence.split()))
def test_dataUnique(self):
    """Print the de-duplicated vocabulary built from the sample posts."""
    posts = bayes.loadDataSet()[0]
    print(bayes.createVocabList(posts))  # all words, duplicates removed