def testingNB():
    # 1. Load the data set
    listOPosts, listClasses = bayes.loadDataSet()
    print('listOPosts: ', listOPosts, '\n************************************\nlistClasses: ', listClasses)
    # 2. Build the vocabulary set
    myVocabList = bayes.createVocabList(listOPosts)
    # 3. Record word occurrences and build the training matrix
    trainMat = []
    for postinDoc in listOPosts:
        # Produces an m * len(myVocabList) matrix holding only 0/1 values
        # print('postinDoc:', postinDoc)
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    # 4. Train the classifier
    p0V, p1V, pAb = bayes.trainNB0(np.array(trainMat), np.array(listClasses))
    # 5. Classify test data
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
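# The snippet above ends by calling classifyNB, which is imported from the
# bayes module and not shown in these snippets. As a rough illustration only,
# here is a minimal classifyNB-style sketch; classify_nb_sketch is a
# hypothetical name, and it assumes the log-probability vectors and class
# prior returned by a book-style trainNB0, which may differ from the exact
# bayes.classifyNB used above.
import numpy as np

def classify_nb_sketch(vec2classify, p0Vec, p1Vec, pClass1):
    # The log of a product of word likelihoods becomes a sum of log
    # likelihoods; add the log prior for each class and pick the larger score.
    p1 = np.sum(vec2classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0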
def main():
    postingList, classVec = bayes.loadDataSet()
    vlist = bayes.create_vacabulary_list(postingList)
    tranmat = []
    for row in postingList:
        tranmat.append(bayes.setOfWords2Vec(vlist, row))
    print bayes.trainNB0(tranmat, classVec)
def test_train():
    listOposts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOposts)
    trainMat = []
    for postinDoc in listOposts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0v, p1v, pab = bayes.trainNB0(trainMat, listClasses)
    print p1v
def test_bagOfWords2VecMN(self):
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    features = bayes.bagOfWords2VecMN(myVocabList, listOPosts[0])
    expected = [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
                0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]
    self.assertEqual(features, expected)
def test_setOfWords2Vec(self):
    # listOPosts is actually...
    # listClasses is actually a list of labels for the data in listOPosts
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    features = bayes.setOfWords2Vec(myVocabList, listOPosts[0])
    expected = [0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
                0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1]
    self.assertEqual(features, expected)
def test_createVocablist(self):
    data_set, _ = bayes.loadDataSet()
    vocab_list = bayes.createVocabList(data_set)
    print("\n vocab_list == %s" % (vocab_list))
    # Output the vector corresponding to row 0 of the data set
    # (i.e., the positions in the full vocabulary of the words in row 0 are set to 1)
    vec = bayes.setOfWords2Vec(vocab_list, data_set[0])
    print("\n vec == %s" % (vec))
    vec = bayes.setOfWords2Vec(vocab_list, data_set[3])
    print("\n vec == %s" % (vec))
def test_train_nb(self):
    data_set, listClasses = bayes.loadDataSet()
    vocab_list = bayes.createVocabList(data_set)
    print("\n vocab_list == %s" % (vocab_list))
    trainMat = []
    for postinDoc in data_set:
        trainMat.append(bayes.setOfWords2Vec(vocab_list, postinDoc))
    p0Vect, p1Vect, pAbusive = bayes.trainNB0(trainMat, listClasses)
    print("\n p0Vect == %s\n p1Vect == %s\n pAbusive == %s\n" % (p0Vect, p1Vect, pAbusive))
def testingNB():
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(array(trainMat), array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    testDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(testDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    testDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', classifyNB(testDoc, p0V, p1V, pAb))
def testSimpTrain():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    print "trainMat:", trainMat
    print "listClasses:", listClasses
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    print "pAb:", pAb
    print "p0V:", p0V
    print "p1V:", p1V
def testingNB():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation', 'stupid']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['quit', 'stupid']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
def test_trainNBO(self):
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []  # list of lists, e.g., [[...], ..., [...]]
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    # This is interesting, as the names sent to the function imply
    # different types than the names the function receives them under.
    # Compare sending listClasses to the parameter trainCategory that receives it.
    # There isn't even a hint of shared meaning between those two names
    # from the program's (self-referential) perspective.
    # p0Vect, p1Vect, pAbusive = trainNBO(trainMatrix, trainCategory)
    p0V, p1V, pAb = bayes.trainNBO(trainMat, listClasses)
    # print p0V, p1V, pAb
    self.assertAlmostEqual(pAb, 0.5)
def testingNB():
    postList, classList = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(postList)
    trainMat = []
    for post in postList:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, post))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, classList)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry)
    print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
    testEntry = ['stupid', 'garbage']
    thisDoc = bayes.setOfWords2Vec(myVocabList, testEntry)
    print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
def testingNB():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    print(myVocabList)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb))
def testNB():
    listOPosts, listClasses = bayes.loadDataSet()  # load the data set
    myVocabList = bayes.createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
    resultLabel = {0: 'Not garbage', 1: 'Garbage'}
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', resultLabel[bayes.classifyNB(thisDoc, p0V, p1V, pAb)])
    testEntry = ['stupid', 'garbage']
    thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', resultLabel[bayes.classifyNB(thisDoc, p0V, p1V, pAb)])
def run():
    print 'begin--->run()'
    postingList, classVec = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(postingList)
    # print myVocabList
    # print bayes.words2Vec(myVocabList, postingList[0])
    # trainMat = []
    # for postinDoc in postingList:
    #     trainMat.append(bayes.words2Vec(myVocabList, postinDoc))
    # p0V, p1V, pAb = bayes.trainNB0(trainMat, classVec)
    # bayes.testingNB()
    # bayes.spamTest()
    # print pAb
    # print p0V
    # print p1V
    import feedparser
    ny = feedparser.parse('https://newyork.craigslist.org/search/res?format=rss')
    sf = feedparser.parse('https://sfbay.craigslist.org/search/apa?format=rss')
    # bayes.localWords(ny, sf)
    bayes.getTopWords(ny, sf)
def simptestTest():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    print myVocabList
    print listOPosts[0]
    print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
def test_vocList():
    listOposts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOposts)
    print myVocabList
def test():
    listOPosts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOPosts)
    print(myVocabList)
    return
import bayes

listOPosts, listClasses = bayes.loadDataSet()
print listOPosts
print listClasses
myVocabList = bayes.createVocabList(listOPosts)
print myVocabList
print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
# coding=utf-8
import bayes

dataset, vec = bayes.loadDataSet()
print dataset
print '-- word set: presence only, occurrence counts ignored'
val_set = bayes.createVocabList(dataset)
print val_set
print '-- set-of-words vector for each post'
word_vec = bayes.setOfWords2Vec(val_set, dataset[0])
print word_vec
traindoc = []
for doc in dataset:
    traindoc.append(bayes.setOfWords2Vec(val_set, doc))
p0_v, p1_v, p_ab = bayes.trainNB0(traindoc, vec)
print p_ab
print p0_v
print p1_v
print '-- test'
bayes.testingNB()
print '-- test spam filtering'
bayes.spamTest()
# Set-of-words model: whether or not each word appears is used as a feature
def setOfWords2Vec(vocabList, inputSet):
    # Inputs: the vocabulary list and a document
    returnVec = [0] * len(vocabList)  # create an all-zero vector as long as the vocabulary
    for word in inputSet:  # iterate over the words of the document
        if word in vocabList:  # if a word from the document appears in the vocabulary,
            returnVec[vocabList.index(word)] = 1  # set the entry at that word's index to 1 (words not seen stay 0)
        else:
            print("the word:'%s' is not in my Vocabulary!" % word)
    return returnVec

'''-------------- Test code --------------'''
import bayes

listOPosts, listClasses = bayes.loadDataSet()  # load the documents to be turned into vectors
myVocabList = bayes.createVocabList(listOPosts)  # build the vocabulary list
myVocabList  # the vocabulary
len(myVocabList)
bayes.setOfWords2Vec(myVocabList, listOPosts[0])
bayes.setOfWords2Vec(myVocabList, listOPosts[3])
# The data breaks down into two levels:
# 1. documents, document classes, vectors
# 2. words, word counts, abusive words, normal words, individual words

'''-------------- Training algorithm: computing probabilities from word vectors --------------'''
# Pseudocode (see the book and notes):
'''Count the number of documents in each class
For every training document:
    For each class:
        If a token appears in the document: increment the count for that token
        Increment the total token count
For each class:
    For each token:
        Divide the token count by the total token count to get the conditional probability
Return the conditional probabilities for each class'''
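# The pseudocode above describes the trainNB0-style trainer that these
# snippets import from the bayes module. As a rough illustration only, here
# is a minimal sketch; train_nb0_sketch is a hypothetical name, and it
# assumes 0/1 word vectors plus a 0/1 class vector as built above, with the
# Laplace smoothing and log probabilities of the book's refined version,
# which may differ from the exact bayes.trainNB0 used elsewhere.
import numpy as np

def train_nb0_sketch(trainMatrix, trainCategory):
    numDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numDocs)  # P(class = 1)
    # Start counts at 1 and denominators at 2 (Laplace smoothing)
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs so that later products of many small probabilities do not underflow
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive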
import bayes

p, c = bayes.loadDataSet()
# v = bayes.createVocabList(p)
# h = bayes.setOfWords2Vec(v, ['dog', 'do'])
# print(h)
# trainMat = []
# for Doc in p:
#     trainMat.append(bayes.setOfWords2Vec(v, Doc))
# p0v, p1v, pAb = bayes.trainNB0(trainMat, c)
# print(pAb)
# bayes.testingNB()
# mysent = 'This book is the best book on Python or M.L. I have ever laid eyes upon.'
# mysent.split()
# import re
# regex = re.compile('\\W+')
# ltoken = regex.split(mysent)
# tok = [token.lower() for token in ltoken if len(token) > 0]
# print(tok)
# bayes.spamTest()
import feedparser

ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
# vocablist, psf, pny = bayes.localWords(ny, sf)
bayes.getTopWords(ny, sf)
__author__ = 'wanghao'
"""
naive bayes main function
author : wanghao
email : [email protected]
"""
from numpy import *
import bayes
import FilterMail

postingList, classVec = bayes.loadDataSet()
# get the vocabulary list
vablist = bayes.createVocablist(postingList)
print "Show my vablist\n", vablist
print "-------------------------------"
# get the returnVec
returnVec = bayes.setOfwords2Vec(vablist, ["my", "love", "dog", "happy", "daddy"])
print "the word vec is ", returnVec
print "-------------------------------"
# get the prior probability
trainMat = []
for one in postingList:
    trainMat.append(bayes.setOfwords2Vec(vablist, one))
pa, p1Vec, p0Vec = bayes.trainNB0(trainMat, classVec)
print "the class-1 probability is %f" % pa
print "per-class, per-element probabilities\n", p1Vec, '\n', p0Vec
#!/usr/bin/python
# encoding:utf-8
import bayes
from numpy import *

postingList, classList = bayes.loadDataSet()
myVocabList = bayes.createVocabList(postingList)
trainMat = []
for a in postingList:
    trainMat.append(bayes.setOfWords(myVocabList, a))
pc, p0, p1 = bayes.trainNB0(trainMat, classList)
test = ['stupid', 'garbage']
thisDoc = array(bayes.setOfWords(myVocabList, test))
print bayes.classifyNB(thisDoc, p0, p1, pc)
import bayes
from numpy import array

PostingList, ClassVector = bayes.loadDataSet()
print '*** PostingList ***'
print PostingList
print '*** ClassVector ***'
print ClassVector
myVocabList = bayes.createVocabList(PostingList)
print '*** myVocabList ***'
print myVocabList
trainMat = []
for postinDoc in PostingList:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
print '*** trainMatrix ***'
print trainMat
p0V, p1V, pAb = bayes.trainNB0(array(trainMat), array(ClassVector))
print '*** p(w|c0) ***'
print p0V
print '*** p(w|c1) ***'
print p1V
testEntry = ['love', 'my', 'dalmation']
thisDoc = array(bayes.setOfWords2Vec(myVocabList, testEntry))
if bayes.classifyNB(thisDoc, p0V, p1V, pAb) == 0:
    print testEntry, 'classified as not Abusive :)'
else:
    print testEntry, 'classified as Abusive :('
# print testEntry, 'classified as: ', bayes.classifyNB(thisDoc, p0V, p1V, pAb)
__author__ = 'Kevin'
import bayes

listOPost, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPost)
print myVocabList
print bayes.setOfWords2Vec(myVocabList, listOPost[0])
print bayes.setOfWords2Vec(myVocabList, listOPost[3])

s = "ab,cde,fg"
print s
print s.split(",")

from numpy import *

trainMat = []
for postinDoc in listOPost:
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print "pAb: %s" % pAb
print "p0V: %s" % p0V
print "p1V: %s" % p1V


class TrainingLetter:
    result = "1"
    dataset = []
    measure = 0

    def __init__(self, letter, data, measure):
        self.result = letter
        self.dataset = data
        self.measure = measure
import bayes
from numpy import *

listOposts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOposts)
trainMat = []
print("----- start building trainMat -----")
for postinDoc in listOposts:
    print("postinDoc = ", postinDoc)
    trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
print("trainMat = ", trainMat)
p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
print("p0V = ", p0V)
print("p1V = ", p1V)
print("pAb = ", pAb)
def test_word2vec():
    listOposts, listClasses = bayes.loadDataSet()
    myVocabList = bayes.createVocabList(listOposts)
    print bayes.setOfWords2Vec(myVocabList, listOposts[0])