def get_html_2XX_only(url, network=None, response=None):
    """Fetch the HTML for ``url``, or extract it from an existing response.

    Consolidated logic for HTTP requests (adapted from newspaper).

    For reference, HTTP status codes fall into five classes:
    1XX informational, 2XX success, 3XX redirection, 4XX client/request
    error and 5XX server error.

    NOTE(review): despite its name, this function does not inspect the
    status code itself. Encoding detection and any rejection of non-2XX
    responses would have to happen inside ``_get_html_from_response`` --
    TODO confirm against that helper.

    Args:
        url: Address to request. Not used when ``response`` is supplied.
        network: Object exposing ``browser_user_agent``, ``request_timeout``,
            ``proxies`` and ``headers``. A falsy value means a default
            ``NetWork()`` is created.
        response: An already-fetched response object. When given, no request
            is sent and no network configuration is read.

    Returns:
        The value produced by ``_get_html_from_response`` for the response,
        or ``''`` when the request raises
        ``requests.exceptions.RequestException``.
    """
    # A caller-supplied response needs no network settings, so handle it
    # before building a default NetWork or touching its attributes.
    if response is not None:
        return _get_html_from_response(response)

    network = network or NetWork()
    useragent = network.browser_user_agent
    timeout = network.request_timeout
    proxies = network.proxies
    headers = network.headers

    try:
        response = requests.get(
            url=url,
            **get_request_kwargs(timeout, useragent, proxies, headers)
        )
    except requests.exceptions.RequestException as e:
        # Best effort: a failed request is logged and reported to the caller
        # as an empty string rather than as an exception.
        log.error('get_html_2XX_only() error. %s on URL: %s' % (e, url))
        return ''

    return _get_html_from_response(response)
def get_html(url, network=None):
    '''
    @summary: Download a page and return its HTML, converting raw bytes with
              get_unicode_html so the text is not garbled by a wrong encoding
    ---------
    @param url: address of the page to download
    @param network: optional network configuration forwarded to
                    get_html_2XX_only; a default NetWork() is used when it is
                    omitted or falsy
    ---------
    @result: the value returned by get_html_2XX_only, passed through
             get_unicode_html when it is a non-empty bytes object; falsy
             results (e.g. '' after a failed request) are returned unchanged
    '''
    html = get_html_2XX_only(url, network or NetWork())

    # Only non-empty byte payloads need converting; anything else is
    # returned exactly as received.
    if html and isinstance(html, bytes):
        html = get_unicode_html(html)

    return html