def check_proxy(proxy):
    """Return True if *proxy* (format ``'protocol=host:port'``) is usable.

    The proxy string is split into protocol and address, installed into a
    ``ProxyHandler``, and a test request is made through it.

    :param proxy: proxy spec such as ``'http=1.2.3.4:8080'``
    :return: ``True`` if the test page could be fetched through the proxy,
             ``False`` on any failure.
    """
    protocol, address = proxy.split('=')
    # BUG FIX: the original passed an empty dict to ProxyHandler, so the
    # request bypassed the proxy entirely and the check succeeded whenever
    # the network was up.  ProxyHandler expects {protocol: 'host:port'}.
    proxy_map = {protocol: address}
    try:
        proxy_support = urllib.request.ProxyHandler(proxy_map)
        opener = urllib.request.build_opener(proxy_support)
        opener.open('http://www.baidu.com').read()
        return True
    except Exception:
        # Any failure (connect error, timeout, HTTP error) means the proxy
        # is considered dead; best-effort check, so swallow and report False.
        return False
def proxy_fetch(self, proxies_queue):
    """Fetch ``self.url`` through a proxy taken from *proxies_queue*.

    A proxy that still works after a failed fetch is put back on the queue;
    a dead proxy is silently dropped. When the queue is empty, a fresh
    batch of proxies is fetched from the web first.

    :param proxies_queue: queue of proxy specs, each ``'protocol=host:port'``
    :return: page body as ``bytes`` on success, ``False`` on failure.
    """
    # NOTE(review): headers are built but never attached to the request —
    # kept as-is to preserve behavior; attach via a Request object if the
    # target site needs them.
    headers = {'Content-type': 'application/x-www-form-urlencoded',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.1 '
                             '(KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1'}
    # Refill the proxy pool when it runs dry (rebinds the local name only;
    # the caller's queue object is not replaced).
    if proxies_queue.qsize() < 1:
        logging.debug('重新获取代理')
        p = common.proxy.Proxy('proxies.dat')
        proxies_queue = p.get_from_web()
    # Current proxy, stored on the queue as 'protocol=host:port'.
    protocol, proxy = proxies_queue.get().split('=')
    # BUG FIX: the original built {host: port} (e.g. {'1.2.3.4': '8080'}),
    # which ProxyHandler silently ignores; it must be {protocol: 'host:port'}.
    proxy_map = {protocol: proxy}
    try:
        # Fetch the page through the proxy.
        logging.debug('正在获取 %s' % self.url)
        proxy_support = urllib.request.ProxyHandler(proxy_map)
        opener = urllib.request.build_opener(proxy_support)
        html_bytes = opener.open(self.url).read()
        logging.info('获取成功 %s' % self.url)
        # Success: return the proxy to the pool for reuse.
        proxies_queue.put(protocol + '=' + proxy)
        return html_bytes
    except Exception as e:
        # Distinguish a bad URL from a dead proxy by re-testing the proxy.
        if check_proxy(protocol + '=' + proxy):
            # Proxy still works, so the URL itself is the problem.
            proxies_queue.put(protocol + '=' + proxy)
            logging.info('网址有误 %s' % self.url)
            logging.debug(e)
            return False
        else:
            # Proxy is dead: drop it (not re-queued) and flag the URL
            # for re-verification.
            logging.info('待验证网址 %s ' % self.url)
            return False