コード例 #1
0
ファイル: tools.py プロジェクト: striver-ing/weibo_news
def get_html_2XX_only(url, network=None, response=None):
    """Fetch the HTML for ``url``, or extract it from a pre-fetched response.

    Request settings (user agent, timeout, proxies, headers) are read from
    ``network``; a fresh ``NetWork()`` is created when none is supplied.

    - When ``response`` is given, no HTTP request is made and the result of
      ``_get_html_from_response(response)`` is returned directly.
    - Otherwise a GET request is issued. Any
      ``requests.exceptions.RequestException`` is logged and an empty string
      is returned instead of propagating the error.

    HTTP status codes fall into five classes, for reference:
      1XX informational / provisional responses
      2XX success
      3XX redirection
      4XX client / request errors
      5XX server errors

    NOTE(review): despite the function name, no status-code check is visible
    in this function; confirm whether ``_get_html_from_response`` rejects
    non-2XX responses or handles encoding detection.
    """
    if not network:
        network = NetWork()

    # Settings are resolved up front, even when a response was passed in.
    agent = network.browser_user_agent
    wait_limit = network.request_timeout
    proxy_map = network.proxies
    extra_headers = network.headers

    if response is None:
        try:
            request_kwargs = get_request_kwargs(wait_limit, agent,
                                                proxy_map, extra_headers)
            response = requests.get(url=url, **request_kwargs)
        except requests.exceptions.RequestException as err:
            # Best-effort fetch: report the failure and hand back no content.
            log.error('get_html_2XX_only() error. %s on URL: %s' % (err, url))
            return ''

    return _get_html_from_response(response)
コード例 #2
0
ファイル: tools.py プロジェクト: striver-ing/weibo_news
def get_html(url):
    '''
    @summary: Fetch a page and handle its encoding automatically, to avoid
              garbled text
    ---------
    @param url: address passed to get_html_2XX_only together with a new
                NetWork() instance
    ---------
    @result: the value returned by get_html_2XX_only; when that value is a
             non-empty bytes object it is first passed through
             get_unicode_html
    '''

    page = get_html_2XX_only(url, NetWork())

    # Empty results and anything that is not raw bytes are returned untouched.
    if not page or not isinstance(page, bytes):
        return page

    return get_unicode_html(page)