Example #1
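Downloads a handful of Chinese news and blog pages with UrlSpider, runs the extractor ten times on each page to estimate throughput (QPS), and prints the extracted title and content; download failures are reported with their error message.
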
def test():
    import sys
    import time

    # UrlSpider lives in the sibling py-crawler project.
    sys.path.append('../py-crawler')
    from spider_urllib2 import UrlSpider

    ext = CXExtractor()
    spider = UrlSpider()

    urls = [
        'http://baike.baidu.com/view/25215.htm',
        'http://tieba.baidu.com/p/3069273254',
        'http://hi.baidu.com/handylee/blog/item/6523c4fc35a235fffc037fc5.html',
        'http://xiezuoshi.baijia.baidu.com/article/15330',
        'http://www.techweb.com.cn/news/2010-08-11/659082.shtml',
        'http://www.ifanr.com/15876',
        'http://news.cnhubei.com/xw/yl/201404/t2894467_5.shtml',
    ]

    for url in urls:
        raw_html, err = spider.download(url)
        if raw_html:
            print 'url:', url
            # Run the extractor ten times on the same page to estimate throughput.
            start_time = time.time()
            for i in xrange(10):
                title, content, keywords, desc = ext.extract(raw_html)
            end_time = time.time()
            print 'QPS:', 10 / (end_time - start_time)
            print 'title:', title
            print 'content:', content
        else:
            print 'url:', url
            print 'error_msg:', err
Example #2
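Reads URLs one per line from the file p_in, downloads each page, and prints the extracted title and content.
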
def test_file(p_in):
    import sys

    # UrlSpider lives in the sibling py-crawler project.
    sys.path.append('../py-crawler')
    from spider_urllib2 import UrlSpider

    ext = CXExtractor()
    spider = UrlSpider()

    # p_in is a text file with one URL per line; blank lines are skipped.
    for url in open(p_in):
        url = url.strip()
        if not url:
            continue
        raw_html, err = spider.download(url)
        if raw_html:
            print '\nurl:', url
            title, content, keywords, desc = ext.extract(raw_html)
            print 'title:', title
            print 'content:', content
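
Both examples fetch pages through the external py-crawler project. As a minimal sketch of the same extract() call without that dependency, a page can be fetched with the Python 2 standard library instead; urllib2 and the extract_one helper below are illustrative assumptions, while CXExtractor and its (title, content, keywords, desc) return tuple come from the examples above.

# Minimal sketch, not part of the original examples: fetch one page with
# the standard library and run CXExtractor on it.  urllib2 and the name
# extract_one are assumptions; extract() returning
# (title, content, keywords, desc) matches the examples above.
import urllib2

def extract_one(url):
    raw_html = urllib2.urlopen(url, timeout=10).read()
    title, content, keywords, desc = CXExtractor().extract(raw_html)
    print 'url:', url
    print 'title:', title
    print 'content:', content

extract_one('http://www.ifanr.com/15876')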