def test():
    ext = CXExtractor()
    urls = [
        'http://baike.baidu.com/view/25215.htm',
        'http://tieba.baidu.com/p/3069273254',
        'http://hi.baidu.com/handylee/blog/item/6523c4fc35a235fffc037fc5.html',
        'http://xiezuoshi.baijia.baidu.com/article/15330',
        'http://www.techweb.com.cn/news/2010-08-11/659082.shtml',
        'http://www.ifanr.com/15876',
        'http://news.cnhubei.com/xw/yl/201404/t2894467_5.shtml',
    ]
    import sys
    import time
    sys.path.append('../py-crawler')
    from spider_urllib2 import UrlSpider
    spider = UrlSpider()
    for url in urls:
        raw_html, err = spider.download(url)
        if raw_html:
            print 'url:', url
            # Run extract() 10 times on the same page to estimate throughput (QPS).
            start_time = time.time()
            for i in xrange(0, 10):
                title, content, keywords, desc = ext.extract(raw_html)
            end_time = time.time()
            print 'QPS:', 10 / (end_time - start_time)
            print 'title:', title
            print 'content:', content
        else:
            print 'url:', url
            print 'error_msg:', err
def test_file(p_in):
    # Extract title/content for every non-empty URL listed in the file p_in.
    ext = CXExtractor()
    import sys
    sys.path.append('../py-crawler')
    from spider_urllib2 import UrlSpider
    spider = UrlSpider()
    for url in open(p_in):
        url = url.strip()
        if not url:
            continue
        raw_html, err = spider.download(url)
        if raw_html:
            print '\nurl:', url
            title, content, keywords, desc = ext.extract(raw_html)
            print 'title:', title
            print 'content:', content
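
# A minimal entry-point sketch for running these tests from the command line.
# 'urls.txt' is a hypothetical input file (one URL per line), not part of the
# original code; swap in whichever test you want to run.
if __name__ == '__main__':
    test()
    # test_file('urls.txt')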