Example #1
0
 def test_proxy_fetch():
     """Fetch one Baidu Baike page through the proxies in ``proxies.dat`` and print it.

     The response is decoded as UTF-8 and then round-tripped through GBK with
     ``'ignore'`` error handling, which drops every character GBK cannot
     represent (presumably so the text can be printed on a GBK console --
     confirm against the target environment).
     """
     proxies = common.proxy.Proxy('proxies.dat').get_from_file()
     target = common.page.Page('http://baike.baidu.com/subview/3077/11247674.htm')
     raw = target.proxy_fetch(proxies)
     # UTF-8 -> str -> GBK bytes -> str: keeps only GBK-representable text.
     text = (
         raw.decode('utf-8', 'ignore')
         .encode('gbk', 'ignore')
         .decode('gbk', 'ignore')
     )
     print(text)
Example #2
0
 def test_proxy_fetch():
     """Smoke-check ``Page.proxy_fetch`` against a single Baidu Baike URL.

     Loads the proxy list from ``proxies.dat``, fetches the page through it,
     and prints the body after removing characters that GBK cannot encode.
     """
     proxy_source = common.proxy.Proxy('proxies.dat')
     candidates = proxy_source.get_from_file()
     url = 'http://baike.baidu.com/subview/3077/11247674.htm'
     body = common.page.Page(url).proxy_fetch(candidates)
     # Decode as UTF-8, then round-trip through GBK; 'ignore' silently drops
     # anything undecodable or not representable in GBK.
     as_text = body.decode('utf-8', 'ignore')
     gbk_bytes = as_text.encode('gbk', 'ignore')
     print(gbk_bytes.decode('gbk', 'ignore'))
Example #3
0
 def fetch(self, min_id, max_id):
     """Download Baidu Baike pages with ids in ``[min_id, max_id)`` to disk.

     For each id, ``http://baike.baidu.com/view/<id>.htm`` is requested via
     ``self.proxies_queue``. A non-empty response is decoded as UTF-8,
     stripped of characters GBK cannot represent, and written to
     ``d:/test/baikefile/<id>.html``. Falsy responses are skipped.

     Args:
         min_id: First page id to fetch (inclusive).
         max_id: Page id to stop at (exclusive, as with ``range``).
     """
     for page_num in range(min_id, max_id):
         # Fetch
         page_id = str(page_num)
         url = 'http://baike.baidu.com/view/' + page_id + '.htm'
         page = common.page.Page(url)
         html = page.proxy_fetch(self.proxies_queue)
         if not html:
             continue

         # Transcode: UTF-8 -> str -> GBK bytes -> str. With 'ignore', this
         # drops every character that GBK cannot encode.
         html = html.decode('utf-8', 'ignore')
         html = html.encode('gbk', 'ignore')
         html = html.decode('gbk', 'ignore')

         # Write out. The context manager closes the handle even when
         # write() raises, which the manual open()/close() pair did not.
         # NOTE(review): no explicit encoding is given, so the platform
         # default applies -- presumably GBK on the target machine; confirm.
         outfile = 'd:/test/baikefile/' + page_id + '.html'
         with open(outfile, 'w') as f:
             f.write(html)
         logging.debug('写入完成 %s', page_id)
     logging.info('子任务完成 %s~%s', min_id, max_id)
Example #4
0
 def fetch(self, min_id, max_id):
     """Fetch and save the Baidu Baike pages whose ids lie in ``[min_id, max_id)``.

     Each page ``http://baike.baidu.com/view/<id>.htm`` is requested through
     ``self.proxies_queue``. When the response is truthy, it is decoded as
     UTF-8, reduced to GBK-representable characters, and saved as
     ``d:/test/baikefile/<id>.html``. Falsy responses are skipped.

     Args:
         min_id: Lowest page id to fetch (inclusive).
         max_id: Upper bound of the id range (exclusive).
     """
     for page_num in range(min_id, max_id):
         # Fetch
         page_id = str(page_num)
         url = 'http://baike.baidu.com/view/' + page_id + '.htm'
         page = common.page.Page(url)
         html = page.proxy_fetch(self.proxies_queue)
         if not html:
             continue

         # Transcode: the GBK round trip with 'ignore' discards any
         # character that has no GBK encoding.
         html = html.decode('utf-8', 'ignore')
         html = html.encode('gbk', 'ignore')
         html = html.decode('gbk', 'ignore')

         # Write out; ``with`` releases the file handle on every exit path,
         # including an exception raised by write().
         # NOTE(review): the file is opened with the platform default
         # encoding -- presumably GBK where this runs; confirm.
         outfile = 'd:/test/baikefile/' + page_id + '.html'
         with open(outfile, 'w') as f:
             f.write(html)
         logging.debug('写入完成 %s', page_id)
     logging.info('子任务完成 %s~%s', min_id, max_id)