def test_get_headers_missing_user_agent_in_prod(self, mock_in_gcp, mock_secret):
    """get_headers() must raise when in-GCP is True and the secret is None.

    Both arguments are mocks supplied by the caller of this test
    (presumably via patch decorators outside this view -- confirm).
    """
    expected_message = "No user agent string"

    # The secret lookup yields nothing while the environment check says
    # we are running in GCP.
    mock_secret.return_value = None
    mock_in_gcp.return_value = True

    with pytest.raises(Exception) as raised:
        scraper_utils.get_headers()

    assert str(raised.value) == expected_message
def test_get_headers_local(self, mock_in_gcp):
    """get_headers() returns the fixed contact User-Agent when not in GCP."""
    # The two adjacent literals are concatenated with no separator, so the
    # expected value has no space after "constraints,".
    expected_headers = {
        "User-Agent": (
            "For any issues, concerns, or rate constraints,"
            "e-mail [email protected]"
        )
    }
    mock_in_gcp.return_value = False

    assert scraper_utils.get_headers() == expected_headers
def test_get_headers_local(self, mock_in_gae):
    """get_headers() returns the fixed contact User-Agent when not in GAE."""
    # NOTE(review): this source contains another definition named
    # test_get_headers_local (taking mock_in_gcp). If both live in the same
    # class, the later definition shadows the earlier one -- confirm which
    # variant is meant to run.
    expected_headers = {
        'User-Agent': (
            'For any issues, concerns, or rate constraints,'
            'e-mail [email protected]'
        )
    }
    mock_in_gae.return_value = False

    assert scraper_utils.get_headers() == expected_headers
def test_get_headers(self, mock_in_gcp, mock_secret):
    """get_headers() uses the user agent from the secret lookup in prod."""
    expected_agent = "test_user_agent"
    fake_secrets = {"user_agent": expected_agent}

    # This is prod behaviour: the in-GCP check reports True and secret
    # lookups are answered from the dict above via its .get method.
    mock_in_gcp.return_value = True
    mock_secret.side_effect = fake_secrets.get

    assert scraper_utils.get_headers() == {"User-Agent": expected_agent}
def fetch_page(url, headers=None, cookies=None, params=None, post_data=None,
               json_data=None, should_proxy=True):
    """Fetch content from a URL.

    If both post_data and json_data are None (the default), a GET request
    is performed. If either is set, a POST request is performed instead;
    in that case params must be None.

    Args:
        url: (string) URL to fetch content from
        headers: (dict) any headers to send. The dict is copied, so the
            caller's object is not modified. If the copy has no
            'User-Agent' key it is updated with
            scraper_utils.get_headers().
        cookies: (dict) any cookies to send in the request.
        params: dict of parameters to pass in the url of a GET request
        post_data: dict of parameters to pass into the html POST request
        json_data: dict of parameters in JSON format to pass into the html
            POST request
        should_proxy: (bool) whether or not to use a proxy.

    Returns:
        The response object returned by requests.get / requests.post, after
        raise_for_status() has succeeded.

    Raises:
        ValueError: if params is set together with post_data or json_data.
        FetchPageError: if the request, or raise_for_status(), raises a
            requests.exceptions.RequestException. The original exception is
            attached as the cause.
    """
    proxies = scraper_utils.get_proxies() if should_proxy else None

    headers = headers.copy() if headers else {}
    if 'User-Agent' not in headers:
        headers.update(scraper_utils.get_headers())

    # Reject contradictory arguments before entering the network try-block,
    # so the try body only contains calls that can raise RequestException.
    is_post = post_data is not None or json_data is not None
    if is_post and params is not None:
        raise ValueError(
            "Both params ({}) for a GET request and either post_data "
            "({}) or json_data ({}) for a POST request were set."
            .format(params, post_data, json_data))

    # NOTE(review): verify=False disables TLS certificate verification for
    # every request made here -- presumably needed for the proxy setup;
    # confirm this is still intended.
    try:
        if is_post:
            page = requests.post(
                url, proxies=proxies, headers=headers, cookies=cookies,
                data=post_data, json=json_data, verify=False)
        else:
            page = requests.get(
                url, proxies=proxies, headers=headers, cookies=cookies,
                params=params, verify=False)
        page.raise_for_status()
    except requests.exceptions.RequestException as ce:
        # Chain explicitly so the underlying requests error stays visible
        # as the cause in tracebacks.
        raise FetchPageError(ce.request, ce.response) from ce

    return page