def local_words(feed1, feed0):

    # step 1: create the document list
    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(min_len):
        # deal with 1 class
        word_list = text_parse(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)

        # deal with 0 class
        word_list = text_parse(feed0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)

    # step 2: create dictionary: unique words
    dictionary = bayes.create_vocab_list(doc_list)

    # step 3: remove the 30 most frequent words
    top_30_words = cal_most_freq(dictionary, full_text)
    for word in top_30_words:
        if word[0] in dictionary:
            dictionary.remove(word[0])

    # step 4: create train set and train label
    # randomly select 20 documents for testing; the rest are used for training
    training_number = range(2*min_len)
    test_number = []
    for i in range(20):
        rand_index = int(random.uniform(0, len(training_number)))
        test_number.append(training_number[rand_index])
        del(training_number[rand_index])
    train_mat = []
    train_label = []
    for doc_index in training_number:
        train_mat.append(bayes.bag_of_vector2words(dictionary, doc_list[doc_index]))
        train_label.append(class_list[doc_index])
    p0, p1, pro_spam = bayes.train_naive_bayes(array(train_mat), array(train_label))

    # step 5: use the previous classifier to classify the test set
    error_count = 0
    for doc_index in test_number:
        word_to_vector = bayes.bag_of_vector2words(dictionary, doc_list[doc_index])
        if bayes.classify_naive_bayes(array(word_to_vector), p0, p1, pro_spam) != \
           class_list[doc_index]:
            error_count += 1
    print 'the error rate is: ', float(error_count)/len(test_number)
    return dictionary, p0, p1
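
The cal_most_freq helper used in step 3 above is not shown in this listing. A minimal sketch of what it is assumed to do, judging from how local_words calls it (the real bayes module may differ): count how often each vocabulary word occurs in full_text and return the 30 most frequent (word, count) pairs.

import operator

def cal_most_freq(vocab_list, full_text):
    # count occurrences of every vocabulary word in the combined word list
    freq_dict = {}
    for token in vocab_list:
        freq_dict[token] = full_text.count(token)
    # sort by count, largest first, and keep the top 30 (word, count) pairs
    sorted_freq = sorted(freq_dict.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_freq[:30]
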
def test_load():
    word_list, labels = bayes.load_data_set()
    dictionary = bayes.create_vocab_list(word_list)
    print dictionary
    # convert each document into 0-1 vector
    train_mat = []
    for doc in word_list:
        vec = bayes.word2vec(dictionary, doc)
        train_mat.append(vec)
    p0_vec, p1_vec, prob_insult = bayes.train_naive_bayes(train_mat, labels)
    print p0_vec
    print p1_vec
    print prob_insult
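
bayes.train_naive_bayes (called train_nb0 / trainNB0 in other examples below) is assumed to follow the standard Machine Learning in Action recipe: Laplace-style smoothing of the word counts plus log probabilities to avoid underflow. A rough sketch, not necessarily the exact implementation in the bayes module:

from numpy import ones, log

def train_naive_bayes(train_matrix, train_labels):
    num_docs = len(train_matrix)
    num_words = len(train_matrix[0])
    # prior probability of class 1 (abusive / spam, depending on the data set)
    prob_class1 = sum(train_labels) / float(num_docs)
    # start counts at 1 and denominators at 2 so unseen words never give p = 0
    p0_num = ones(num_words)
    p1_num = ones(num_words)
    p0_denom = 2.0
    p1_denom = 2.0
    for i in range(num_docs):
        if train_labels[i] == 1:
            p1_num += train_matrix[i]
            p1_denom += sum(train_matrix[i])
        else:
            p0_num += train_matrix[i]
            p0_denom += sum(train_matrix[i])
    # take logs so that later products of many small probabilities become sums
    p0_vec = log(p0_num / p0_denom)
    p1_vec = log(p1_num / p1_denom)
    return p0_vec, p1_vec, prob_class1
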
Example 3
def spam_test():
    # naive Bayes spam classifier
    doc_list = []  # list of parsed word lists, one per file (2-D)
    class_list = []
    full_text = []  # flat list of every word that appears (1-D)
    text_num = 50
    for i in range(1, 26):  # 25 files per class
        try:
            big_string = open('email/spam/%d.txt' % i).read()  # load a spam email
            word_list = text_parse(big_string)  # parse it into a word list
            doc_list.append(word_list)  # add it to the list of all documents
            full_text.extend(word_list)  # add its words to the flat list of all words
            class_list.append(1)  # label it as class 1 (spam)
        except:  # a file that fails to read is simply skipped
            text_num -= 1
        try:
            big_string = open('email/ham/%d.txt' % i).read()  # load a ham email
            word_list = text_parse(big_string)
            doc_list.append(word_list)
            full_text.extend(word_list)
            class_list.append(0)
        except:
            text_num -= 1
    print(text_num)
    vocab_list = bayes.create_vocab_list(doc_list)  # de-duplicated vocabulary
    training_set = list(range(text_num))  # indices of all samples
    test_set = []  # test set
    for i in range(10):  # randomly pick 10 sample indices for the test set
        rand_index = int(random.uniform(0, len(training_set)))  # random integer
        test_set.append(training_set[rand_index])  # add it to the test set
        del (training_set[rand_index])  # and remove it from the training set
    train_mat = []  # training matrix
    train_class = []  # class label for each row of the training matrix
    for doc_index in training_set:
        words = bayes.set_of_words2vec(
            vocab_list,
            doc_list[doc_index])  # 0-1 vector marking which vocab words appear in doc_list[doc_index]
        train_mat.append(words)  # add the vector to the matrix
        train_class.append(class_list[doc_index])  # record its class
    p_0v, p_1v, p_spam = bayes.train_nb0(array(train_mat),
                                         array(train_class))  # estimate the class-conditional probabilities
    error_count = 0
    for doc_index in test_set:  # evaluate on the test data
        word_vec = bayes.set_of_words2vec(vocab_list, doc_list[doc_index])
        result = bayes.classify(array(word_vec), p_0v, p_1v,
                                p_spam)  # compare the class probabilities to get a prediction
        if result != class_list[doc_index]:
            error_count += 1
    print('The error rate is :', float(error_count) / len(test_set))  # error rate
    return float(error_count) / len(test_set)
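
text_parse is not defined in this listing either. Based on how it is used, it is assumed to be the usual tokenizer from the book: split on non-alphanumeric characters, drop very short tokens, and lower-case the rest.

import re

def text_parse(big_string):
    # split on runs of non-alphanumeric characters
    tokens = re.split(r'\W+', big_string)
    # keep only tokens longer than two characters, lower-cased
    return [tok.lower() for tok in tokens if len(tok) > 2]
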
def spam_classify():
    # step 1: create the document list
    doc_list = []
    class_list = []
    full_text = []
    for i in range(1, 26):
        # deal with spam email
        word_list = text_parse(open('email/spam/%d.txt' % i).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)   # spam email labeled as: 1

        # deal with ham email
        word_list = text_parse(open('email/ham/%d.txt' % i).read())
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)   # ham email labeled as: 0

    # step 2: create the dictionary: unique words
    dictionary = bayes.create_vocab_list(doc_list)

    # step 3: create train set and train label
    # randomly select 10 txt for test, 40 txt for training
    training_number = range(50)
    test_number = []
    for i in range(10):
        rand_index = int(random.uniform(0, len(training_number)))
        test_number.append(training_number[rand_index])
        del(training_number[rand_index])
    train_mat = []
    train_label = []
    for doc_index in training_number:
        # important! use bag of words, not set of words
        train_mat.append(bayes.bag_of_vector2words(dictionary, doc_list[doc_index]))
        train_label.append(class_list[doc_index])
    p0_vec, p1_vec, prob_spam = bayes.train_naive_bayes(array(train_mat), array(train_label))

    # step 4: use the classifier to test the email
    error_count = 0
    for doc_index in test_number:
        word_to_vector = bayes.bag_of_vector2words(dictionary, doc_list[doc_index])
        if bayes.classify_naive_bayes(array(word_to_vector), p0_vec, p1_vec, prob_spam) != \
           class_list[doc_index]:
            error_count += 1
            # print 'Here is the vector', word_to_vector
            print 'Come from: %d: %d %s' % (doc_index, class_list[doc_index], doc_list[doc_index])
    print 'the current error rate is: ', float(error_count)/len(test_number)
    return float(error_count) / len(test_number)
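
spam_classify above stresses using the bag-of-words model rather than the set-of-words model. For reference, a sketch of the two vectorizers the examples in this listing call (the bag-of-words name follows the code above; the set-of-words variant appears elsewhere as set_of_words2vec or word2vec, and the real bayes module may differ):

def set_of_words2vec(vocab_list, input_words):
    # set-of-words model: mark 1 if the word occurs at all, ignoring counts
    vec = [0] * len(vocab_list)
    for word in input_words:
        if word in vocab_list:
            vec[vocab_list.index(word)] = 1
    return vec

def bag_of_vector2words(vocab_list, input_words):
    # bag-of-words model: count every occurrence of each vocabulary word
    vec = [0] * len(vocab_list)
    for word in input_words:
        if word in vocab_list:
            vec[vocab_list.index(word)] += 1
    return vec
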
Example 5
    def test_classify0(self):
        print "\ntest_classify0"
        dataset, labels = bayes.create_dataset()
        vocab_list = bayes.create_vocab_list(dataset)
        train_matrix = []
        for doc in dataset:
            train_matrix.append(bayes.get_vector(vocab_list, doc))
        p0_vector, p1_vector, p = bayes.train0(train_matrix, labels)

        test_list = ['love', 'my', 'dalmation']
        docs = numpy.array(bayes.get_vector(vocab_list, test_list))
        result = bayes.classify0(docs, p0_vector, p1_vector, p)
        print test_list, " classified as: ", result

        test_list = ['stupid', 'garbage', 'my']
        docs = numpy.array(bayes.get_vector(vocab_list, test_list))
        result = bayes.classify0(docs, p0_vector, p1_vector, p)
        print test_list, " classified as: ", result
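
classify0 (elsewhere classify_naive_bayes / classifyNB) is assumed to apply the trained log probabilities roughly as sketched below: because the per-word probability vectors already hold logs, each class score is a sum rather than a product, and the larger score wins.

from numpy import log

def classify_naive_bayes(word_vec, p0_vec, p1_vec, p_class1):
    # word_vec is a numpy array; multiplying selects the log-probabilities of
    # the words that actually occur, and sum() adds them up
    p1 = sum(word_vec * p1_vec) + log(p_class1)
    p0 = sum(word_vec * p0_vec) + log(1.0 - p_class1)
    return 1 if p1 > p0 else 0
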
Example 6
    def test_main_spam(self):
        print "\ntest_main_spam"
        doc_list = []
        labels = []
        full_text = []
        for i in range(1, 26):
            word_list = bayes.split_email('email/spam/%d.txt' % i)
            doc_list.append(word_list)
            full_text.extend(word_list)
            labels.append(1)

            word_list = bayes.split_email('email/ham/%d.txt' % i)
            doc_list.append(word_list)
            full_text.extend(word_list)
            labels.append(0)
        vocab_list = bayes.create_vocab_list(doc_list)
        print vocab_list

        train_set = range(50)
        test_set = []
        for i in range(10):
            rand_index = int(random.uniform(0, len(train_set)))
            test_set.append(train_set[rand_index])
            del (train_set[rand_index])

        train_matrix = []
        train_labels = []
        for i in train_set:
            train_matrix.append(bayes.get_vector(vocab_list, doc_list[i]))
            train_labels.append(labels[i])

        p0_vector, p1_vector, p = bayes.train0(train_matrix, train_labels)

        err_count = 0
        for i in test_set:
            word_vector = bayes.get_vector(vocab_list, doc_list[i])

            predict = bayes.classify0(word_vector, p0_vector, p1_vector, p)
            print "predict: %s real: %s" % (predict, labels[i])
            if predict != labels[i]:
                err_count += 1
        print 'error rate: %s' % (float(err_count) / len(test_set))
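
bayes.split_email used above is assumed to simply read one email file and tokenize it, something along these lines (error handling and encodings in the real module may differ):

import re

def split_email(path):
    # read the raw email text
    with open(path) as fh:
        big_string = fh.read()
    # tokenize the same way text_parse does
    tokens = re.split(r'\W+', big_string)
    return [tok.lower() for tok in tokens if len(tok) > 2]
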
Example 7
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        # assume there are 25 emails in each of the spam and ham folders
        # ('ANSI' is not a registered Python codec; latin-1 is assumed here as a
        # byte-preserving stand-in for the original Windows "ANSI" code page)
        wordList = textParse(
            codecs.open(baseURI + 'email/spam/%d.txt' % i,
                        encoding='latin-1').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(
            open(baseURI + 'email/ham/%d.txt' % i, encoding='latin-1').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = bayes.create_vocab_list(docList)  #create vocabulary
    trainingSet = list(range(50))
    testSet = []  #create test set
    # randomly pull 10 emails out of the training set to use for testing
    for i in range(10):
        # np.random.uniform(low, high) draw samples from a uniform distribution
        # The probability density function of the uniform distribution is p(x)=1/(high-low)
        randIndex = int(np.random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del (trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(bagOfwords2Vec(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = bayes.trainNB0(np.array(trainMat),
                                     np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  #classify the remaining items
        wordVector = bagOfwords2Vec(vocabList, docList[docIndex])
        if bayes.classifyNB(np.array(wordVector), p0V, p1V,
                            pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
def test_naive_bayes():
    word_list, labels = bayes.load_data_set()
    dictionary = bayes.create_vocab_list(word_list)
    train_mat = []
    for doc in word_list:
        vec = bayes.word2vec(dictionary, doc)
        train_mat.append(vec)
    p0_vec, p1_vec, prob_insult = bayes.train_naive_bayes(train_mat, labels)

    # test example 1: expected to come back as class 0 (not abusive)
    test1 = ['love', 'my', 'dalmation', 'garbage']
    test1_vec = array(bayes.word2vec(dictionary, test1))
    test1_res = bayes.classify_naive_bayes(test1_vec, p0_vec, p1_vec, prob_insult)
    print test1, ': classified as: ', test1_res

    # test example 2: a single word drawn from a non-abusive post
    test2 = ['dalmation']
    test2_vec = array(bayes.word2vec(dictionary, test2))
    test2_res = bayes.classify_naive_bayes(test2_vec, p0_vec, p1_vec, prob_insult)
    print test2, ': classified as: ', test2_res
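
Every example in this listing calls bayes.create_vocab_list. It is assumed to build the de-duplicated vocabulary as the union of all words over all documents; a minimal sketch:

def create_vocab_list(data_set):
    vocab_set = set()
    for document in data_set:
        # union of the words seen so far with the words in this document
        vocab_set = vocab_set | set(document)
    return list(vocab_set)
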
Example 9
def local_words(feed1, feed0):
    # learn regional word usage from the two personal-ads feeds
    doc_list = []
    class_list = []
    full_text = []
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    print(min_len)
    for i in range(min_len):    # visit one RSS entry at a time
        word_list = bayes_2.text_parse(feed1['entries'][i]['summary'])    # parse the long string from feed1 into a word list
        doc_list.append(word_list)      # add this entry's word list to the overall document list
        full_text.extend(word_list)     # flat list of all words (with repeats)
        class_list.append(1)        # label as 1 (came from feed1)
        word_list = bayes_2.text_parse(feed0['entries'][i]['summary'])    # parse the long string from feed0 into a word list
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    vocab_list = bayes.create_vocab_list(doc_list)    # de-duplicated vocabulary
    top30_words = cal_most_freq(vocab_list, full_text)  # the 30 most frequent words
    for pair in top30_words:        # remove those 30 words from the vocabulary
        if pair[0] in vocab_list:
            vocab_list.remove(pair[0])
    training_set = list(range(2*min_len))  # training-set indices
    test_set = []  # test set
    for i in range(20):  # randomly pick 20 samples as the test set
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del(training_set[rand_index])
    train_mat = []
    train_class = []
    for doc_index in training_set:  # build the training data
        train_mat.append(bayes.bag_of_words2vec(vocab_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])
    p_0v, p_1v, p_spam = bayes.train_nb0(array(train_mat), array(train_class))
    error_count = 0
    for doc_index in test_set:  # evaluate the model and count errors
        word_vec = bayes.bag_of_words2vec(vocab_list, doc_list[doc_index])
        result = bayes.classify(array(word_vec), p_0v, p_1v, p_spam)
        if result != class_list[doc_index]:
            error_count += 1
    print('The error rate is:', float(error_count) / len(test_set))
    return vocab_list, p_0v, p_1v
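
local_words expects feed1 and feed0 to be parsed RSS feeds with an 'entries' list whose items carry a 'summary' field. A typical way to obtain them is with feedparser; the two URLs below are placeholders, any two RSS feeds with 'summary' fields will do:

import feedparser

feed1 = feedparser.parse('https://example.com/cityA/index.rss')
feed0 = feedparser.parse('https://example.com/cityB/index.rss')
vocab_list, p_0v, p_1v = local_words(feed1, feed0)
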
Example 11
import bayes

dataset, labels = bayes.load_dataset()
print(dataset)
print(labels)
vocab_list = bayes.create_vocab_list(dataset)
print(vocab_list)
matrix = []
for doc in dataset:
    vec = bayes.words_set_to_vec(vocab_list, doc)
    matrix.append(vec)
print(matrix)
p_0_v, p_1_v, p_ab = bayes.train(matrix, labels)
print(p_0_v)
print(p_1_v)
print(p_ab)
print('<--->')
test = ['love', 'my', 'dalmation']
vec = bayes.words_set_to_vec(vocab_list, test)
classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
print(test)
print(vec)
print(classify)
print('<--->')
test = ['stupid', 'garbage']
vec = bayes.words_set_to_vec(vocab_list, test)
classify = bayes.classify(vec, p_0_v, p_1_v, p_ab)
print(test)
print(vec)
print(classify)
Example 12
# -*- coding:utf-8 -*-

"""
created on July 27, 2016 by Wayne
function: test for create vector
"""

import bayes


list_of_posts, list_classes = bayes.load_data_set()
my_vocab_list = bayes.create_vocab_list(list_of_posts)

print list_of_posts
print list_classes
print my_vocab_list

# print bayes.set_of_words_to_vec(my_vocab_list, list_of_posts[0])
# print bayes.set_of_words_to_vec(my_vocab_list, list_of_posts[3])

train_mat = []
for post_in_doc in list_of_posts:
    train_mat.append(bayes.set_of_words_to_vec(my_vocab_list, post_in_doc))

p0v, p1v, pab = bayes.train_bayes0(train_mat, list_classes)
print "pab:",pab
print "p0v:",p0v
print "p1v:",p1v


Example 13
def local_words(feed1, feed0):
    """
    feed1: RSS source 1
    feed0: RSS source 0
    """
    import feedparser
    # list of documents
    doc_list = []
    # class label of each document
    class_list = []
    full_text = []
    # number of entries from the two sources usable for training
    min_len = min(len(feed1['entries']), len(feed0['entries']))
    # walk over every training document
    for i in range(min_len):
        # tokenize and strip punctuation
        word_list = text_parse(feed1['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(1)
        # tokenize and strip punctuation
        word_list = text_parse(feed0['entries'][i]['summary'])
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)
    # build the vocabulary
    vocab_list = create_vocab_list(doc_list)
    # find the 30 most frequent words
    top30_words = calc_most_freq(vocab_list, full_text)
    # remove those high-frequency words from the vocabulary
    for pair_w in top30_words:
        if pair_w[0] in vocab_list:
            vocab_list.remove(pair_w[0])
    # two sources contribute data, hence the factor of 2
    # (wrapped in list() so entries can be deleted below)
    training_set = list(range(2 * min_len))
    #print(training_set)
    # next, build the test set
    test_set = []
    for i in range(20):
        # randomly move 20 documents from the training set to the test set
        rand_index = int(random.uniform(0, len(training_set)))
        test_set.append(training_set[rand_index])
        del (training_set[rand_index])
    train_mat = []
    train_classes = []
    # walk over training_set to build the final training vectors and labels
    for doc_index in training_set:
        # vectorize each document with the bag-of-words model
        train_mat.append(bag_of_word2vec(vocab_list, doc_list[doc_index]))
        train_classes.append(class_list[doc_index])
    # train the Bayes classifier; both sources contribute the same number of
    # documents, so p_spam comes out as 0.5
    p0_v, p1_v, p_spam = trainNB0(np.array(train_mat), np.array(train_classes))
    error_count = 0
    # evaluate on the test set
    for doc_index in test_set:
        # vectorize the test document
        word_vec = bag_of_word2vec(vocab_list, doc_list[doc_index])
        if classifyNB(np.array(word_vec), p0_v, p1_v,
                      p_spam) != class_list[doc_index]:
            error_count += 1
    print('the error rate is: ', float(error_count) / len(test_set))
    return vocab_list, p0_v, p1_v
Example 14
    def test_get_vector(self):
        print "\ntest_get_vector"
        dataset, labels = bayes.create_dataset()
        vocab_list = bayes.create_vocab_list(dataset)
        print bayes.get_vector(vocab_list, dataset[0])
        print bayes.get_vector(vocab_list, dataset[1])
Example 15
    def test_create_vocab_list(self):
        print "\ntest_create_vocab_list"
        dataset, labels = bayes.create_dataset()
        vocab_list = bayes.create_vocab_list(dataset)
        print vocab_list
Example 16
def spam_test():
    """
    Test on the emails under data/email/spam and data/email/ham.
    :return: the accuracy on the held-out test set
    """
    doc_list = []
    class_list = []
    full_text = []

    for i in range(1, 26):
        # read one spam email from the email directory
        email_content = open('./data/email/spam/%d.txt' % i,
                             encoding='gbk').read()
        # turn the email body into a list of words
        word_list = bayes.text_parse(email_content)
        # doc_list is a 2-D list: each inner list holds the words of one email
        doc_list.append(word_list)
        # full_text is a flat list holding every word from every email
        full_text.extend(word_list)
        class_list.append(1)

        email_content = open('./data/email/ham/%d.txt' % i,
                             encoding='gbk').read()
        word_list = bayes.text_parse(email_content)
        doc_list.append(word_list)
        full_text.extend(word_list)
        class_list.append(0)

    trainset = list(range(50))
    testset = []
    for i in range(10):
        # draw a random float in (0, len(trainset)) and truncate it to an int
        rand_index = int(np.random.uniform(0, len(trainset)))
        # add the chosen index to the test set
        testset.append(trainset[rand_index])
        # and remove it from the training set
        del trainset[rand_index]

    # build the vocabulary used to turn each email into a feature vector
    voca_list = bayes.create_vocab_list(doc_list)
    train_mat = []
    train_class = []

    for doc_index in trainset:
        train_mat.append(
            bayes.set_of_word_2_vec(voca_list, doc_list[doc_index]))
        train_class.append(class_list[doc_index])

    # train the classifier
    p0v, p1v, p_spam = bayes.train_nb1(train_mat, train_class)
    # number of correct predictions
    count = 0
    # check the classifier's output against the true labels on the test set
    for doc_index in testset:
        word_vec = bayes.set_of_word_2_vec(voca_list, doc_list[doc_index])
        if bayes.classify_nb(np.array(word_vec), p0v, p1v,
                             p_spam) == class_list[doc_index]:
            # count the correct predictions
            count += 1
    # accuracy
    rate = 1.0 * count / len(testset)
    return rate
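
Because spam_test draws its 10 test emails at random, the returned accuracy varies from run to run. A simple way to get a more stable estimate is to average over several runs:

# average the hold-out accuracy over 10 random splits
runs = 10
total = 0.0
for _ in range(runs):
    total += spam_test()
print('average accuracy over %d runs: %f' % (runs, total / runs))
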
Example 17

doc_list = []
class_list = []
full_text = []

for i in range(1, 26):
    text = open('email/spam/%d.txt' % i).read()
    word_list = bayes.text_parse(text)
    doc_list.append(word_list)
    full_text.extend(word_list)
    class_list.append(1)

    text = open('email/ham/%d.txt' % i).read()
    word_list = bayes.text_parse(text)
    doc_list.append(word_list)
    full_text.extend(word_list)
    class_list.append(0)

vocab_list = bayes.create_vocab_list(doc_list)

train_set = list(range(50))
test_set = []
for i in range(10):
    rand_index = int(random.uniform(0, len(train_set)))
    test_set.append(train_set[rand_index])
    del (train_set[rand_index])
train_matrix = []
train_classes = []
for index in train_set:
    vec = bayes.words_set_to_vec(vocab_list, doc_list[index])
    train_matrix.append(vec)
    train_classes.append(class_list[index])

p_0_v, p_1_v, p_ab = bayes.train(train_matrix, train_classes)
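
The script above stops after training. A possible continuation, mirroring the evaluation loop used by the other spam examples in this listing (assuming bayes.classify accepts the values returned by bayes.train, as in Example 11):

error_count = 0
for index in test_set:
    word_vec = bayes.words_set_to_vec(vocab_list, doc_list[index])
    # compare the predicted class against the true label
    if bayes.classify(word_vec, p_0_v, p_1_v, p_ab) != class_list[index]:
        error_count += 1
print('the error rate is:', float(error_count) / len(test_set))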