def _fetch_url(
    db: DatabaseHandler,
    url: str,
    network_down_host: str = DEFAULT_NETWORK_DOWN_HOST,
    network_down_port: int = DEFAULT_NETWORK_DOWN_PORT,
    network_down_timeout: int = DEFAULT_NETWORK_DOWN_TIMEOUT,
    domain_timeout: Optional[int] = None,
) -> FetchLinkResponse:
    """Fetch a URL and return the response, retrying while the network looks down.

    URLs with a binary extension are not fetched; the result of
    _make_dummy_bypassed_response() is returned for them instead.

    URLs that are not HTTP(S) are not fetched either; a dummy unsuccessful
    400 (Bad Request) response is returned right away, without consulting the
    network-down check (the failure has nothing to do with connectivity, so
    waiting and retrying could never help).

    For HTTP(S) URLs, if the fetch results in a 400 response, check whether
    network_down_host:network_down_port is reachable. If it is, return the
    failed response. Otherwise wait network_down_timeout seconds and try
    again, for as long as the network stays down.

    NOTE(review): an earlier version of this docstring stated that
    McGetException is caught and turned into a dummy 400 response. No
    exception handling is visible in this function; presumably that happens
    inside the user agent -- confirm.

    Arguments:
    db - db handle
    url - url to fetch
    network_down_host - host to check if network is down on error
    network_down_port - port to check if network is down on error
    network_down_timeout - seconds to wait if the network is down
    domain_timeout - value to pass to ThrottledUserAgent()

    Returns:
    FetchLinkResponse object
    """
    if url_has_binary_extension(url):
        return _make_dummy_bypassed_response(url)

    # The URL never changes between retries, so decide this once, before the
    # retry loop. Previously the synthetic 400 built here fell through to the
    # network-down check and could make the function sleep and retry a URL
    # that can never be fetched.
    if not is_http_url(url):
        log.warning(f"URL is not HTTP(s), returning dummy response: {url}")
        return FetchLinkResponse(
            url=url,
            is_success=False,
            code=HTTPStatus.BAD_REQUEST.value,
            message=HTTPStatus.BAD_REQUEST.phrase,
            content='bad url',
            last_requested_url=None,
        )

    while True:
        # A fresh user agent is created for every attempt, as before.
        ua = ThrottledUserAgent(db, domain_timeout=domain_timeout)
        ua_response = ua.get_follow_http_html_redirects(url)
        response = FetchLinkResponse.from_useragent_response(url, ua_response)

        if response.is_success:
            return response

        # Only probe the network-down host for 400 responses; `and`
        # short-circuits so other failures skip the TCP check entirely.
        network_is_down = (
            response.code == HTTPStatus.BAD_REQUEST.value
            and not tcp_port_is_open(port=network_down_port, hostname=network_down_host)
        )
        if not network_is_down:
            return response

        log.warning(
            "Response failed with %s and network is down. Waiting to retry ..." % (url, ))
        time.sleep(network_down_timeout)
def test_url_has_binary_extension():
    """Test url_has_binary_extension() on URLs expected to be non-binary and binary."""
    # URLs for which url_has_binary_extension() must return a falsy value.
    expected_non_binary = (
        'http://google.com',
        'https://www.nytimes.com/trump-khashoggi-dead.html',
        'https://www.washingtonpost.com/war-has-not/_story.html?utm_term=.c6ddfa7f19',
    )
    # URLs for which url_has_binary_extension() must return a truthy value;
    # each has a query string after the file extension.
    expected_binary = (
        'http://uproxx.files.wordpress.com/2017/06/push-up.jpg?quality=100&w=1024',
        'https://cdn.theatlantic.com/assets/media/files/shubeik_lubeik_byna_mohamed.pdf',
        'https://i1.wp.com/7miradas.com/wp-content/uploads8/02/UHJ9OKM.png?resize=62%2C62',
    )

    for candidate in expected_non_binary:
        assert not url_has_binary_extension(candidate)

    for candidate in expected_binary:
        assert url_has_binary_extension(candidate)