def test_proxy_fetch():
    """Smoke-test Page.proxy_fetch() using proxies loaded from a file.

    Fetches one Baike page through the proxy list, transcodes the payload
    UTF-8 -> GBK -> text (silently dropping characters that fail either
    conversion), and prints the result.
    """
    proxy_store = common.proxy.Proxy('proxies.dat')
    available_proxies = proxy_store.get_from_file()
    target = common.page.Page('http://baike.baidu.com/subview/3077/11247674.htm')
    raw = target.proxy_fetch(available_proxies)
    # One transcode chain: bytes -> str -> GBK bytes -> str.
    text = raw.decode('utf-8', 'ignore').encode('gbk', 'ignore').decode('gbk', 'ignore')
    print(text)
def test_proxy_fetch():
    """Smoke-test Page.proxy_fetch() against a fixed Baike page.

    Loads a proxy list from 'proxies.dat', downloads the page through it,
    normalizes the text via a UTF-8 -> GBK -> text round trip (characters
    that do not survive either step are dropped), then prints the result.
    """
    proxies_list = common.proxy.Proxy('proxies.dat').get_from_file()
    page = common.page.Page(
        'http://baike.baidu.com/subview/3077/11247674.htm')
    payload = page.proxy_fetch(proxies_list)
    # Step-by-step transcode so each intermediate is inspectable.
    decoded = payload.decode('utf-8', 'ignore')
    gbk_bytes = decoded.encode('gbk', 'ignore')
    html = gbk_bytes.decode('gbk', 'ignore')
    print(html)
def fetch(self, min_id, max_id):
    """Fetch Baidu Baike pages with ids in [min_id, max_id) through the
    instance's proxy queue and save each page as a local HTML file.

    Args:
        min_id: first page id, inclusive.
        max_id: last page id, exclusive (range() semantics).
    """
    for page_id in range(min_id, max_id):
        # Fetch the page through the proxy pool.
        page_id = str(page_id)
        url = 'http://baike.baidu.com/view/' + page_id + '.htm'
        page = common.page.Page(url)
        html = page.proxy_fetch(self.proxies_queue)
        if html:
            # Transcode: UTF-8 bytes -> text -> GBK -> text, silently
            # dropping characters that fail either conversion.
            html = html.decode('utf-8', 'ignore')
            html = html.encode('gbk', 'ignore')
            html = html.decode('gbk', 'ignore')
            # Write out. 'with' guarantees the handle is closed even if
            # f.write raises (the original leaked it on error).
            outfile = 'd:/test/baikefile/' + page_id + '.html'
            with open(outfile, 'w') as f:
                f.write(html)
            # Lazy %-args: formatting only happens if DEBUG is enabled.
            logging.debug('写入完成 %s', page_id)
    # Summary for the whole id range; placed after the loop — NOTE(review):
    # the collapsed source is ambiguous about its indentation, confirm.
    logging.info('子任务完成 ' + str(min_id) + "~" + str(max_id))
def fetch(self, min_id, max_id):
    """Download Baike pages for ids in [min_id, max_id) via self.proxies_queue
    and write each one to d:/test/baikefile/<id>.html.

    Args:
        min_id: first page id, inclusive.
        max_id: last page id, exclusive (range() semantics).
    """
    for page_id in range(min_id, max_id):
        # Fetch the page through the proxy pool.
        page_id = str(page_id)
        url = 'http://baike.baidu.com/view/' + page_id + '.htm'
        page = common.page.Page(url)
        html = page.proxy_fetch(self.proxies_queue)
        if html:
            # Transcode: UTF-8 bytes -> text -> GBK -> text; characters
            # that fail either conversion are dropped.
            html = html.decode('utf-8', 'ignore')
            html = html.encode('gbk', 'ignore')
            html = html.decode('gbk', 'ignore')
            # Write out. Context manager closes the file even when
            # f.write raises (original leaked the handle on error).
            outfile = 'd:/test/baikefile/' + page_id + '.html'
            with open(outfile, 'w') as f:
                f.write(html)
            # Lazy %-args instead of eager '%' formatting.
            logging.debug('写入完成 %s', page_id)
    # Range-complete summary; placed after the loop — NOTE(review): the
    # collapsed source is ambiguous about its indentation, confirm.
    logging.info('子任务完成 '+str(min_id)+"~"+str(max_id))