import random

import bayes
from numpy import array


def local_words(feed1, feed0):
    # step 1: create the document list
    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(min_len):
        # deal with class 1
        word_list = text_parse(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        # deal with class 0
        word_list = text_parse(feed0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    # step 2: create dictionary: unique words
    dictionary = bayes.create_vocab_list(doc_list)
    # step 3: remove the 30 most frequent words
    top_30_words = cal_most_freq(dictionary, full_text)
    for word in top_30_words:
        if word[0] in dictionary:
            dictionary.remove(word[0])
    # step 4: create train set and train label
    # randomly select 20 docs for test, remaining for training
    training_number = list(range(2 * min_len))
    test_number = []
    for i in range(20):
        rand_index = int(random.uniform(0, len(training_number)))
        test_number.append(training_number[rand_index])
        del(training_number[rand_index])
    train_mat = []
    train_label = []
    for doc_index in training_number:
        train_mat.append(bayes.bag_of_vector2words(dictionary, doc_list[doc_index]))
        train_label.append(class_list[doc_index])
    p0, p1, pro_spam = bayes.train_naive_bayes(array(train_mat), array(train_label))
    # step 5: use the previous classifier to classify the test set
    error_count = 0
    for doc_index in test_number:
        word_to_vector = bayes.bag_of_vector2words(dictionary, doc_list[doc_index])
        if bayes.classify_naive_bayes(array(word_to_vector), p0, p1, pro_spam) != \
                class_list[doc_index]:
            error_count += 1
    print 'the error rate is: ', float(error_count) / len(test_number)
    return dictionary, p0, p1
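# local_words() above depends on a cal_most_freq() helper that is not defined
# in this section. A minimal sketch of one plausible implementation, assuming
# it returns the 30 most frequent vocabulary words as (word, count) pairs
# (the caller only reads pair[0]):
import operator

def cal_most_freq(vocab_list, full_text):
    # count how often each vocabulary word occurs in the flat word list
    freq_dict = {}
    for token in vocab_list:
        freq_dict[token] = full_text.count(token)
    # sort by count, descending, and keep the top 30
    sorted_freq = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_freq[:30]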
def test_load():
    word_list, labels = bayes.load_data_set()
    dictionary = bayes.create_vocab_list(word_list)
    print dictionary
    # convert each document into a 0-1 vector
    train_mat = []
    for doc in word_list:
        vec = bayes.word2vec(dictionary, doc)
        train_mat.append(vec)
    p0_vec, p1_vec, prob_insult = bayes.train_naive_bayes(train_mat, labels)
    print p0_vec
    print p1_vec
    print prob_insult
def spam_test():  # naive Bayes spam classifier
    doc_list = []  # parsed word lists, one per file (2-D)
    class_list = []
    full_text = []  # flat list of every word that appears (1-D)
    text_num = 50
    for i in range(1, 26):  # 25 files per class
        try:
            big_string = open('email/spam/%d.txt' % i).read()  # load a spam email
            word_list = text_parse(big_string)  # parse it into a word list
            doc_list.append(word_list)  # add to the list of all documents
            full_text.extend(word_list)  # add to the flat word list
            class_list.append(1)  # label as spam
        except Exception:  # skip files that are missing or unreadable
            text_num -= 1
        try:
            big_string = open('email/ham/%d.txt' % i).read()  # load a ham email
            word_list = text_parse(big_string)
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(0)
        except Exception:
            text_num -= 1
    print(text_num)
    vocab_list = bayes.create_vocab_list(doc_list)  # deduplicated vocabulary (from the 2-D list)
    training_set = list(range(text_num))  # indices of all samples
    test_set = []  # test set
    for i in range(10):  # randomly pick 10 sample indices for the test set
        rand_index = int(random.uniform(0, len(training_set)))  # random integer
        test_set.append(training_set[rand_index])  # add to the test set
        del (training_set[rand_index])  # remove from the training set
    train_mat = []  # training matrix
    train_class = []  # class label of each row in the training matrix
    for doc_index in training_set:
        # mark which vocab_list words appear in doc_list[doc_index]; returns a 0-1 vector
        words = bayes.set_of_words2vec(vocab_list, doc_list[doc_index])
        train_mat.append(words)  # add the row to the matrix
        train_class.append(class_list[doc_index])  # record its class
    p_0v, p_1v, p_spam = bayes.train_nb0(array(train_mat), array(train_class))  # estimate the probabilities
    error_count = 0
    for doc_index in test_set:  # evaluate on the test data
        word_vec = bayes.set_of_words2vec(vocab_list, doc_list[doc_index])
        result = bayes.classify(array(word_vec), p_0v, p_1v, p_spam)  # classify by comparing probabilities
        if result != class_list[doc_index]:
            error_count += 1
    print('The error rate is :', float(error_count) / len(test_set))  # error rate
    return float(error_count) / len(test_set)
def spam_classify():
    # step 1: create the document list
    doc_list = []
    class_list = []
    full_text = []
    for i in range(1, 26):
        # deal with spam email
        word_list = text_parse(open('email/spam/%d.txt' % i).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)  # spam email labeled as: 1
        # deal with ham email
        word_list = text_parse(open('email/ham/%d.txt' % i).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)  # ham email labeled as: 0
    # step 2: create the dictionary: unique words
    dictionary = bayes.create_vocab_list(doc_list)
    # step 3: create train set and train label
    # randomly select 10 txt for test, 40 txt for training
    training_number = list(range(50))
    test_number = []
    for i in range(10):
        rand_index = int(random.uniform(0, len(training_number)))
        test_number.append(training_number[rand_index])
        del(training_number[rand_index])
    train_mat = []
    train_label = []
    for doc_index in training_number:
        # important! use bag of words, not set of words
        train_mat.append(bayes.bag_of_vector2words(dictionary, doc_list[doc_index]))
        train_label.append(class_list[doc_index])
    p0_vec, p1_vec, prob_spam = bayes.train_naive_bayes(array(train_mat), array(train_label))
    # step 4: use the classifier to test the email
    error_count = 0
    for doc_index in test_number:
        word_to_vector = bayes.bag_of_vector2words(dictionary, doc_list[doc_index])
        if bayes.classify_naive_bayes(array(word_to_vector), p0_vec, p1_vec, prob_spam) != \
                class_list[doc_index]:
            error_count += 1
            # print 'Here is the vector', word_to_vector
            print 'Come from: %d: %d %s' % (doc_index, class_list[doc_index], doc_list[doc_index])
    print 'the current error rate is: ', float(error_count) / len(test_number)
    return float(error_count) / len(test_number)
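# spam_test() and spam_classify() above both call text_parse(), which is not
# defined in this section. A minimal sketch, assuming the usual tokenizer that
# splits on non-alphanumeric characters, lowercases, and drops short tokens:
import re

def text_parse(big_string):
    # split on any run of non-alphanumeric characters
    tokens = re.split(r'\W+', big_string)
    # keep only lowercase tokens longer than two characters
    return [tok.lower() for tok in tokens if len(tok) > 2]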
def test_classify0(self):
    print "\ntest_classify0"
    dataset, labels = bayes.create_dataset()
    vocab_list = bayes.create_vocab_list(dataset)
    train_matrix = []
    for doc in dataset:
        train_matrix.append(bayes.get_vector(vocab_list, doc))
    p0_vector, p1_vector, p = bayes.train0(train_matrix, labels)
    test_list = ['love', 'my', 'dalmation']
    docs = numpy.array(bayes.get_vector(vocab_list, test_list))
    result = bayes.classify0(docs, p0_vector, p1_vector, p)
    print test_list, " classified as: ", result
    test_list = ['stupid', 'garbage', 'my']
    docs = numpy.array(bayes.get_vector(vocab_list, test_list))
    result = bayes.classify0(docs, p0_vector, p1_vector, p)
    print test_list, " classified as: ", result
def test_main_spam(self):
    print "\ntest_main_spam"
    doc_list = []
    labels = []
    full_text = []
    for i in range(1, 26):
        word_list = bayes.split_email('email/spam/%d.txt' % i)
        doc_list.append(word_list)
        full_text.extend(word_list)
        labels.append(1)
        word_list = bayes.split_email('email/ham/%d.txt' % i)
        doc_list.append(word_list)
        full_text.extend(word_list)
        labels.append(0)
    vocab_list = bayes.create_vocab_list(doc_list)
    print vocab_list
    train_set = list(range(50))
    test_set = []
    for i in range(10):
        rand_index = int(random.uniform(0, len(train_set)))
        test_set.append(train_set[rand_index])
        del (train_set[rand_index])
    train_matrix = []
    train_labels = []
    for i in train_set:
        train_matrix.append(bayes.get_vector(vocab_list, doc_list[i]))
        train_labels.append(labels[i])
    p0_vector, p1_vector, p = bayes.train0(train_matrix, train_labels)
    err_count = 0
    for i in test_set:
        word_vector = bayes.get_vector(vocab_list, doc_list[i])
        predict = bayes.classify0(word_vector, p0_vector, p1_vector, p)
        print "predict: %s real: %s" % (predict, labels[i])
        if predict != labels[i]:
            err_count += 1
    print 'error rate: %s' % (float(err_count) / len(test_set))
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):  # assume we have 25 ham emails and 25 spam emails
        # 'ANSI' is not a registered Python codec; cp1252 is the usual
        # Windows-1252 equivalent for these files
        wordList = textParse(
            open(baseURI + 'email/spam/%d.txt' % i, encoding='cp1252').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(
            open(baseURI + 'email/ham/%d.txt' % i, encoding='cp1252').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.create_vocab_list(docList)  # create vocabulary
    trainingSet = list(range(50))
    testSet = []  # create test set
    # randomly pop 10 emails out of the training set for testing the algorithm
    for i in range(10):
        # np.random.uniform(low, high) draws samples from a uniform distribution;
        # its probability density function is p(x) = 1 / (high - low)
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfwords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bagOfwords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
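# spamTest() above calls a module-level bagOfwords2Vec() that is not shown.
# A minimal sketch, assuming the standard bag-of-words vectorizer that counts
# word occurrences instead of merely recording presence:
def bagOfwords2Vec(vocabList, inputSet):
    # one counter per vocabulary word
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            # increment rather than set to 1 (bag of words, not set of words)
            returnVec[vocabList.index(word)] += 1
    return returnVec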
def test_naive_bayes():
    word_list, labels = bayes.load_data_set()
    dictionary = bayes.create_vocab_list(word_list)
    train_mat = []
    for doc in word_list:
        vec = bayes.word2vec(dictionary, doc)
        train_mat.append(vec)
    p0_vec, p1_vec, prob_insult = bayes.train_naive_bayes(train_mat, labels)
    # test example 1: mostly non-insulting words plus 'garbage'
    test1 = ['love', 'my', 'dalmation', 'garbage']
    test1_vec = array(bayes.word2vec(dictionary, test1))
    test1_res = bayes.classify_naive_bayes(test1_vec, p0_vec, p1_vec, prob_insult)
    print test1, ': classified as: ', test1_res
    # test example 2: a single non-insulting word
    test2 = ['dalmation']
    test2_vec = array(bayes.word2vec(dictionary, test2))
    test2_res = bayes.classify_naive_bayes(test2_vec, p0_vec, p1_vec, prob_insult)
    print test2, ': classified as: ', test2_res
def local_words(feed1, feed0):  # learn region-specific word usage from personal ads
    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    print(min_len)
    for i in range(min_len):  # visit one RSS entry at a time
        # parse the long string from feed1 into a list of words
        word_list = bayes_2.text_parse(feed1['entries'][i]['summary'])
        doc_list.append(word_list)  # add this entry's word list to the overall list
        full_text.extend(word_list)  # flat list of all words (with duplicates)
        class_list.append(1)  # label as 1 (from feed1)
        # parse the long string from feed0 into a list of words
        word_list = bayes_2.text_parse(feed0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    vocab_list = bayes.create_vocab_list(doc_list)  # deduplicated vocabulary
    top30_words = cal_most_freq(vocab_list, full_text)  # the 30 most frequent words
    for pair in top30_words:  # remove those 30 words from the vocabulary
        if pair[0] in vocab_list:
            vocab_list.remove(pair[0])
    training_set = list(range(2 * min_len))  # training-set indices
    test_set = []  # test set
    for i in range(20):  # randomly pick 20 samples as the test set
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del(training_set[rand_index])
    train_mat = []
    train_class = []
    for doc_index in training_set:  # train the model
        train_mat.append(bayes.bag_of_words2vec(vocab_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])
    p_0v, p_1v, p_spam = bayes.train_nb0(array(train_mat), array(train_class))
    error_count = 0
    for doc_index in test_set:  # evaluate the model and count errors
        word_vec = bayes.bag_of_words2vec(vocab_list, doc_list[doc_index])
        result = bayes.classify(array(word_vec), p_0v, p_1v, p_spam)
        if result != class_list[doc_index]:
            error_count += 1
    print('The error rate is:', float(error_count) / len(test_set))
    return vocab_list, p_0v, p_1v
import bayes

dataset, labels = bayes.load_dataset()
print(dataset)
print(labels)

vocab_list = bayes.create_vocab_list(dataset)
print(vocab_list)

matrix = []
for doc in dataset:
    vec = bayes.words_set_to_vec(vocab_list, doc)
    matrix.append(vec)
print(matrix)

p_0_v, p_1_v, p_ab = bayes.train(matrix, labels)
print(p_0_v)
print(p_1_v)
print(p_ab)

print('<--->')
test = ['love', 'my', 'dalmation']
vec = bayes.words_set_to_vec(vocab_list, test)
classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
print(test)
print(vec)
print(classify)

print('<--->')
test = ['stupid', 'garbage']
vec = bayes.words_set_to_vec(vocab_list, test)
classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
print(test)
print(vec)
print(classify)
# -*- coding:utf-8 -*-
"""
created on July 27, 2016 by Wayne
function: test for creating vectors
"""
import bayes

list_of_posts, list_classes = bayes.load_data_set()
my_vocab_list = bayes.create_vocab_list(list_of_posts)
print list_of_posts
print list_classes
print my_vocab_list
# print bayes.set_of_words_to_vec(my_vocab_list, list_of_posts[0])
# print bayes.set_of_words_to_vec(my_vocab_list, list_of_posts[3])

train_mat = []
for post_in_doc in list_of_posts:
    train_mat.append(bayes.set_of_words_to_vec(my_vocab_list, post_in_doc))

p0v, p1v, pab = bayes.train_bayes0(train_mat, list_classes)
print "pab:", pab
print "p0v:", p0v
print "p1v:", p1v
def local_words(feed1, feed0):
    """
    feed1: RSS feed 1
    feed0: RSS feed 0
    """
    # document list
    doc_list = []
    # document class list
    class_list = []
    full_text = []
    # number of entries from each feed usable for training
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    # iterate over the training documents
    for i in range(min_len):
        # tokenize, dropping punctuation
        word_list = text_parse(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        # tokenize, dropping punctuation
        word_list = text_parse(feed0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    # build the vocabulary
    vocab_list = create_vocab_list(doc_list)
    # find the 30 most frequent words
    top30_words = calc_most_freq(vocab_list, full_text)
    # remove the high-frequency words from the vocabulary
    for pair_w in top30_words:
        if pair_w[0] in vocab_list:
            vocab_list.remove(pair_w[0])
    # data comes from two feeds, hence 2 * min_len;
    # wrap in list() so indices can be deleted below
    training_set = list(range(2 * min_len))
    # print(training_set)
    # build the test set: randomly move 20 samples out of the training set
    test_set = []
    for i in range(20):
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del (training_set[rand_index])
    train_mat = []
    train_classes = []
    # walk training_set to build the final training vectors and labels
    for doc_index in training_set:
        # build each document's vector with the bag-of-words model
        train_mat.append(bag_of_word2vec(vocab_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    # train the Bayes classifier; with equally many samples from each feed, p_spam is 0.5
    p0_v, p1_v, p_spam = trainNB0(np.array(train_mat), np.array(train_classes))
    error_count = 0
    # evaluate on the test set
    for doc_index in test_set:
        # build the test document's vector
        word_vec = bag_of_word2vec(vocab_list, doc_list[doc_index])
        if classifyNB(np.array(word_vec), p0_v, p1_v, p_spam) != class_list[doc_index]:
            error_count += 1
    print('the error rate is: ', float(error_count) / len(test_set))
    return vocab_list, p0_v, p1_v
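# A hypothetical driver for local_words() above. It assumes feedparser is
# installed and uses the Craigslist RSS URLs from the original exercise, which
# may no longer be served; substitute any two regional feeds:
import feedparser

if __name__ == '__main__':
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    vocab_list, p0_v, p1_v = local_words(ny, sf)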
def test_get_vector(self):
    print "\ntest_get_vector"
    dataset, labels = bayes.create_dataset()
    vocab_list = bayes.create_vocab_list(dataset)
    print bayes.get_vector(vocab_list, dataset[0])
    print bayes.get_vector(vocab_list, dataset[1])
def test_create_vocab_list(self):
    print "\ntest_create_vocab_list"
    dataset, labels = bayes.create_dataset()
    vocab_list = bayes.create_vocab_list(dataset)
    print vocab_list
def spam_test():
    """
    Test on the emails under data/email/spam.
    :return: the accuracy
    """
    doc_list = []
    class_list = []
    full_text = []
    for i in range(1, 26):
        # read a spam email under the email directory
        email_content = open('./data/email/spam/%d.txt' % i, encoding='gbk').read()
        # turn the email content into a list of words
        word_list = bayes.text_parse(email_content)
        # doc_list is a 2-D list; each inner list holds one email's words
        doc_list.append(word_list)
        # full_text is a 1-D list holding every word from every email
        full_text.extend(word_list)
        class_list.append(1)
        email_content = open('./data/email/ham/%d.txt' % i, encoding='gbk').read()
        word_list = bayes.text_parse(email_content)
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    trainset = list(range(50))
    testset = []
    for i in range(10):
        # draw a random float in (0, len(trainset)) and truncate to an int
        rand_index = int(np.random.uniform(0, len(trainset)))
        # add the chosen index to the test set
        testset.append(trainset[rand_index])
        # and remove it from the training set
        del trainset[rand_index]
    # turn the words of every email into feature vectors
    voca_list = bayes.create_vocab_list(doc_list)
    train_mat = []
    train_class = []
    for doc_index in trainset:
        train_mat.append(
            bayes.set_of_word_2_vec(voca_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])
    # train on the data
    p0v, p1v, p_spam = bayes.train_nb1(train_mat, train_class)
    # counter for correct classifications
    count = 0
    # verify the results on the test set
    for doc_index in testset:
        word_vec = bayes.set_of_word_2_vec(voca_list, doc_list[doc_index])
        if bayes.classify_nb(np.array(word_vec), p0v, p1v,
                             p_spam) == class_list[doc_index]:
            # count the correct classification
            count += 1
    # compute the accuracy
    rate = 1.0 * count / len(testset)
    return rate
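# Because spam_test() above picks its 10 test emails at random, a single run
# is noisy. A small sketch that averages the returned accuracy over several
# runs (the run count of 10 is arbitrary):
if __name__ == '__main__':
    runs = 10
    total = sum(spam_test() for _ in range(runs))
    print('average accuracy over %d runs: %.3f' % (runs, total / runs))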
import random

import bayes

doc_list = []
class_list = []
full_text = []
for i in range(1, 26):
    text = open('email/spam/%d.txt' % i).read()
    word_list = bayes.text_parse(text)
    doc_list.append(word_list)
    full_text.extend(word_list)
    class_list.append(1)
    text = open('email/ham/%d.txt' % i).read()
    word_list = bayes.text_parse(text)
    doc_list.append(word_list)
    full_text.extend(word_list)
    class_list.append(0)

vocab_list = bayes.create_vocab_list(doc_list)

train_set = list(range(50))
test_set = []
for i in range(10):
    rand_index = int(random.uniform(0, len(train_set)))
    test_set.append(train_set[rand_index])
    del (train_set[rand_index])

train_matrix = []
train_classes = []
for index in train_set:
    vec = bayes.words_set_to_vec(vocab_list, doc_list[index])
    train_matrix.append(vec)
    train_classes.append(class_list[index])

p_0_v, p_1_v, p_ab = bayes.train(train_matrix, train_classes)