Example #1
0
 def test_get_headers_missing_user_agent_in_prod(self, mock_in_gcp,
                                                 mock_secret):
     """get_headers must raise when in GCP but the user-agent secret is unset."""
     mock_in_gcp.return_value = True
     mock_secret.return_value = None
     with pytest.raises(Exception) as exc_info:
         scraper_utils.get_headers()
     assert str(exc_info.value) == "No user agent string"
Example #2
0
 def test_get_headers_local(self, mock_in_gcp):
     """Outside GCP, get_headers falls back to the static contact User-Agent."""
     mock_in_gcp.return_value = False
     expected = {
         "User-Agent": ("For any issues, concerns, or rate constraints,"
                        "e-mail [email protected]")
     }
     assert scraper_utils.get_headers() == expected
 def test_get_headers_local_gae(self, mock_in_gae):
     """Outside GAE, get_headers falls back to the static contact User-Agent.

     Renamed from test_get_headers_local: a second def with the same name
     in one module silently shadows the first, so pytest would collect and
     run only one of the two tests.
     """
     mock_in_gae.return_value = False
     headers = scraper_utils.get_headers()
     assert headers == {
         'User-Agent': ('For any issues, concerns, or rate constraints,'
                        'e-mail [email protected]')
     }
Example #4
0
    def test_get_headers(self, mock_in_gcp, mock_secret):
        """In prod (GCP), the User-Agent header comes from the secret store."""
        mock_in_gcp.return_value = True
        expected_agent = "test_user_agent"
        mock_secret.side_effect = {"user_agent": expected_agent}.get

        assert scraper_utils.get_headers() == {"User-Agent": expected_agent}
Example #5
0
    def fetch_page(url, headers=None, cookies=None, params=None,
                   post_data=None, json_data=None, should_proxy=True):
        """Fetch content from a URL.

        Performs a GET when neither post_data nor json_data is given;
        otherwise performs a POST with that data. Supplying GET params
        together with POST data is rejected as ambiguous.

        Args:
            url: (string) URL to fetch content from
            headers: (dict) any headers to send in addition to the default
            cookies: (dict) any cookies to send in the request.
            params: dict of parameters to pass in the url of a GET request
            post_data: dict of parameters to pass into the html POST request
            json_data: dict of parameters in JSON format to pass into the html
                       POST request
            should_proxy: (bool) whether or not to use a proxy.

        Returns:
            The requests.Response object for the fetched page.

        Raises:
            ValueError: if both GET params and POST data/json were supplied.
            FetchPageError: if the request fails or returns an error status.
        """
        proxies = scraper_utils.get_proxies() if should_proxy else None

        # Copy so the caller's dict is never mutated; only fill in the
        # default headers when no explicit User-Agent was supplied.
        headers = headers.copy() if headers else {}
        if 'User-Agent' not in headers:
            headers.update(scraper_utils.get_headers())

        # Reject ambiguous calls before issuing any network request.
        # (ValueError is not a RequestException, so raising it here is
        # observably identical to raising it inside the try below.)
        if params is not None and (post_data is not None or
                                   json_data is not None):
            raise ValueError(
                "Both params ({}) for a GET request and either post_data "
                "({}) or json_data ({}) for a POST request were set."
                .format(params, post_data, json_data))

        try:
            # NOTE(review): verify=False disables TLS certificate
            # verification; confirm this is intentional (e.g. required by
            # the proxy) rather than a leftover debugging setting.
            if post_data is None and json_data is None:
                page = requests.get(
                    url, proxies=proxies, headers=headers, cookies=cookies,
                    params=params, verify=False)
            else:
                page = requests.post(
                    url, proxies=proxies, headers=headers, cookies=cookies,
                    data=post_data, json=json_data, verify=False)
            page.raise_for_status()
        except requests.exceptions.RequestException as ce:
            # Chain the underlying requests error so tracebacks keep the
            # original cause.
            raise FetchPageError(ce.request, ce.response) from ce

        return page