# ===== Beispiel #1 =====
import bayes
import feedparser

# Build the toy posting dataset and its vocabulary from the bayes module.
listOPosts, listClasses = bayes.loadDataSet()
myVocabList = bayes.createVocabList(listOPosts)

# Earlier experiments, kept commented out for reference:
# print(myVocabList)

# print(bayes.setOfWords2Vec(myVocabList, listOPosts[0]))

# trainMat =[]
# for postinDoc in listOPosts:
#     trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
# p0v, p1v, pAb = bayes.trainNB0(trainMat, listClasses)
# print(p0v)
# print(p1v)
# print(pAb)

# bayes.testingNB()

# bayes.spamTest()

# Fetch the two regional craigslist personals feeds and classify by region.
# FIX: the original URLs were 'http://newyork/craigslist.org/...' and
# 'http://sfbay/craigslist.org/...' — a '/' instead of '.' after the city
# name, which is not a resolvable host.  Corrected to the canonical hosts
# used by every other example in this file.
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = bayes.localWords(ny, sf)
# ===== Beispiel #2 =====
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Region-classification demo: NY vs SF craigslist personals feeds."""
import bayes
import feedparser

# Earlier experiments, kept commented out for reference:
#   Toy dataset, vocabulary and training:
#     listOPosts, listClasses = bayes.loadDataSet()
#     myVocabList = bayes.createVocabList(listOPosts)
#     print myVocabList
#     print bayes.setOfWords2Vec(myVocabList, listOPosts[0])
#     trainMat = []
#     for postinDoc in listOPosts:
#         trainMat.append(bayes.setOfWords2Vec(myVocabList, postinDoc))
#     p0V, p1V, pAb = bayes.trainNB0(trainMat, listClasses)
#     print pAb
#     print sum(p0V)
#
#   bayes.testingNB()
#
#   Regex tokenisation of one sample e-mail:
#     import re
#     regEx = re.compile('\\W*')
#     emailText = open('email/ham/6.txt').read()
#     listOfTokens = regEx.split(emailText)
#     print listOfTokens
#
#   bayes.spamTest()

# Download both regional RSS feeds, then let bayes.localWords build and
# evaluate a classifier over them.
new_york_feed = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf_bay_feed = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = bayes.localWords(new_york_feed, sf_bay_feed)
# Print the most frequently used words for each region.
bayes.getTopWords(new_york_feed, sf_bay_feed)
#!/usr/bin/python
# -*- coding:utf-8 -*-

'''
Created on Oct 31, 2015

@author: yanruibo

用feedparser数据集测试NBC(朴素贝叶斯)算法
因为前面测试是从所有数据中随机选择20个文档数据做为测试集,剩下的作为训练集
这里测试count次,取精确度的平均值
'''
import feedparser
import bayes
import time
if __name__ == '__main__':
    
    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    errorRateSum = 0.0
    count = 200
    for i in range(count):
        vocabList, p0V, p1V,errorRate = bayes.localWords(ny, sf)
        errorRateSum+=errorRate
    averageAccuracy =(1-errorRateSum/count)
    print "average accuracy is: ", averageAccuracy
    
# ===== Beispiel #4 =====
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import feedparser
import bayes
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
print 'ny download over'
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
#valueOfFeat = secondDict[key]
print 'sf download over'
#由于随机构建测试集,通过多次测试减小有误差
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
bayes.localWords(ny, sf)
# ===== Beispiel #5 =====
#!/usr/bin/python
# -*- coding:utf-8 -*-
'''
Created on Oct 31, 2015

@author: yanruibo

用feedparser数据集测试NBC(朴素贝叶斯)算法
因为前面测试是从所有数据中随机选择20个文档数据做为测试集,剩下的作为训练集
这里测试count次,取精确度的平均值
'''
import feedparser
import bayes
import time
if __name__ == '__main__':

    ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    errorRateSum = 0.0
    count = 200
    for i in range(count):
        vocabList, p0V, p1V, errorRate = bayes.localWords(ny, sf)
        errorRateSum += errorRate
    averageAccuracy = (1 - errorRateSum / count)
    print "average accuracy is: ", averageAccuracy

# Earlier entry points, kept commented out for reference:
#   if __name__ == "__main__":
#       bayes.testingNB()
#   if __name__ == "__main__":
#       bayes.spamTest()


import feedparser

# Previous attempts at fetching/inspecting the feeds, commented out:
#   ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
#   sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
#   print(ny)
#   print(sf)
#   vocabList, p0Vec, p1Vec = bayes.localWords(ny, sf)
#   print(len(ny['feed']))

if __name__ == "__main__":
    # testingNB()
    # Load the two RSS data sources (NY vs SF craigslist personals).
    import operator  # NOTE(review): imported but unused in this snippet

    new_york = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
    san_fran = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
    vocabList, p0Vec, p1Vec = bayes.localWords(new_york, san_fran)
    print(vocabList)
# Train a naive Bayes model on the toy data set shipped with bayes.
from numpy import *
import bayes

postList, classVec = bayes.loadDataSet()
vocabList = bayes.createVocabList(postList)
# One set-of-words vector per post.
mat = [bayes.setOfWords2Vec(vocabList, post) for post in postList]
p0, p1, pAbusive = bayes.trainNB0(mat, classVec)

# Evaluate the classifier on two BBC RSS feeds (science vs education).
import bayes
import feedparser

sci_env = feedparser.parse(
    'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')
edu = feedparser.parse('http://feeds.bbci.co.uk/news/education/rss.xml')

# localWords returns (vocabList, p0V, p1V, errorRate); average the error
# rate over ten random train/test splits.  Start at 0.0 so the division
# below stays a float division under Py2.
rate = sum((bayes.localWords(sci_env, edu)[3] for _ in range(10)), 0.0)

print "error rate: %f" % (rate / 10)
# len(ny['entries'])

# Print the most frequent words seen in each document class.
import bayes
import feedparser

science_feed = feedparser.parse(
    'http://feeds.bbci.co.uk/news/science_and_environment/rss.xml')
education_feed = feedparser.parse(
    'http://feeds.bbci.co.uk/news/education/rss.xml')
bayes.getTopWords(science_feed, education_feed)
# ===== Beispiel #8 =====
print "\n                第一个过滤器例子: 恶意留言区分"
bayes.testingNB()

#第二个例子 垃圾邮件区分
print "\n                第二个过滤器例子: 垃圾邮件区分"
bayes.spamTest()

#第三个例子 个人广告中录取区域倾向
#书中的RSS不能读取到信息,相关参数:书中的例子RSS len=60,将20个作为测试样本,其余40个作为训练样本,去掉的是频数前30个词。
# 本程序中使用的例子len=20,将5个个作为测试样本,其余15个作为训练样本,去掉频数为个位数是效果最好,这里暂时取3。
print "\n                第三个例子:个人广告RSS中录取区域倾向"
nasa = feedparser.parse(
    'http://www.nasa.gov/rss/dyn/image_of_the_day.rss')  #len=60  NASA 航天新闻
ft = feedparser.parse(
    'http://www.ftchinese.com/rss/news')  #len=20,FT中文网(正式官方新闻)政治 经济 全球新闻
#sf = feedparser.parse('http://sports.yahoo.com/nba/teams/hou/rss.xml')   #len=6
#sf = feedparser.parse('http://rss.yule.sohu.vocabSetcom/rss/yuletoutiao.xml')  #搜狐娱乐(娱乐新闻)有时候len=30 有时却异常
'''
print "第一个的长度是:",len(nasa['entries'])
print "第一个的内容是:",nasa['entries']
print "第二个的长度是:",len(ft['entries'])
print "第二个的内容是:",ft['entries']
print "运行第一次的结果:"

'''
#将两个RSS中的数据用来训练和预测
bayes.localWords(nasa, ft)  #程序中已经完成了所有的操作,包括预测错误率的计算。

print "\n运行第二次的结果:"
bayes.getTopWords(nasa, ft)