Beispiel #1
0
def _fetch_url(db: DatabaseHandler,
               url: str,
               network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
               network_down_port: int = DEFAULT_NETWORK_DOWN_PORT,
               network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
               domain_timeout: Optional[int] = None) -> FetchLinkResponse:
    """Fetch a url and return the response.

    URLs with a binary extension are not fetched at all; a dummy "bypassed" response is returned for them.

    URLs that are not HTTP(S) are not fetched either; a dummy 400 response is returned for them right away,
    without probing the network (retrying could never make such a URL succeed).

    If fetching the url results in a 400 error, check whether the network_down_host is accessible.  If so,
    return the errored response.  Otherwise, wait network_down_timeout seconds and try again.  There is no
    upper bound on the number of retries: the function keeps waiting for as long as the probe host is
    unreachable.

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    FetchLinkResponse object (successful, errored, or a dummy one as described above)
    """
    if url_has_binary_extension(url):
        return _make_dummy_bypassed_response(url)

    if not is_http_url(url):
        # The dummy response is the same on every attempt, so return it before entering the retry loop;
        # otherwise a non-HTTP URL would sleep and retry for as long as the network probe fails.
        log.warning(f"URL is not HTTP(s), returning dummy response: {url}")
        return FetchLinkResponse(
            url=url,
            is_success=False,
            code=HTTPStatus.BAD_REQUEST.value,
            message=HTTPStatus.BAD_REQUEST.phrase,
            content='bad url',
            last_requested_url=None,
        )

    while True:
        # A new user agent is created for every attempt.
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)
        ua_response = ua.get_follow_http_html_redirects(url)
        response = FetchLinkResponse.from_useragent_response(url, ua_response)

        if response.is_success:
            return response

        # Only a 400 is treated as a possible symptom of the network being down.
        if response.code != HTTPStatus.BAD_REQUEST.value:
            return response

        # The probe host is reachable, so the 400 is a genuine error for this URL.
        if tcp_port_is_open(port=network_down_port, hostname=network_down_host):
            return response

        log.warning(f"Response failed with {url} and network is down.  Waiting to retry ...")
        time.sleep(network_down_timeout)
def test_url_has_binary_extension():
    """Test url_has_binary_extension() against sample page URLs and sample binary file URLs."""
    # URLs that must NOT be reported as having a binary extension.
    page_urls = (
        'http://google.com',
        'https://www.nytimes.com/trump-khashoggi-dead.html',
        'https://www.washingtonpost.com/war-has-not/_story.html?utm_term=.c6ddfa7f19',
    )

    # URLs that must be reported as having a binary extension; two of them carry a query string.
    binary_urls = (
        'http://uproxx.files.wordpress.com/2017/06/push-up.jpg?quality=100&w=1024',
        'https://cdn.theatlantic.com/assets/media/files/shubeik_lubeik_byna_mohamed.pdf',
        'https://i1.wp.com/7miradas.com/wp-content/uploads8/02/UHJ9OKM.png?resize=62%2C62',
    )

    for page_url in page_urls:
        assert not url_has_binary_extension(page_url)

    for binary_url in binary_urls:
        assert url_has_binary_extension(binary_url)