def get_binary_content(download_url, cookies, method="GET"):
    """Download an item, covering invalid SSL certificates and empty files.

    Errors are reported through the returned message instead of being raised.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
        item during testing. Any value other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors
        encountered. If blank, that indicates success. The second value is
        the response object containing the downloaded file, or None when an
        error was encountered.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        # No exception is being handled at this point, so format_exc() adds
        # no stack information; it is kept so the message layout matches the
        # other error paths.
        msg = "NoDownloadUrlError: %s\n%s" % (
            download_url,
            traceback.format_exc(),
        )
        return msg, None

    # Deliberately broad: any failure while fetching becomes an error message.
    # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt and
    # SystemExit propagate so the process can still be stopped.
    # noinspection PyBroadException
    try:
        if method == "LOCAL":
            url = os.path.join(settings.MEDIA_ROOT, download_url)
            mr = MockRequest(url=url)
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            headers = {"User-Agent": "CourtListener"}
            # The session is only needed while fetching, so release its
            # connections as soon as the response has been obtained.
            # NOTE(review): assumes follow_redirections returns a response
            # whose body has already been read (no streaming) - confirm.
            with requests.session() as s:
                r = s.get(
                    download_url,
                    verify=False,  # WA has a certificate we don't understand
                    headers=headers,
                    cookies=cookies,
                    timeout=300,
                )

                # test for empty files (thank you CA1)
                if len(r.content) == 0:
                    msg = "EmptyFileError: %s\n%s" % (
                        download_url,
                        traceback.format_exc(),
                    )
                    return msg, None

                # test for and follow meta redirects
                r = follow_redirections(r, s)
                r.raise_for_status()
    except Exception:
        msg = "DownloadingError: %s\n%s" % (
            download_url,
            traceback.format_exc(),
        )
        return msg, None

    # Success!
    return "", r
def get_binary_content(download_url, cookies, adapter, method='GET'):
    """Download an item, covering invalid SSL certificates and empty files.

    Errors are reported through the returned message instead of being raised.

    :param download_url: The URL for the item you wish to download.
    :param cookies: Cookies that might be necessary to download the item.
    :param adapter: An HTTPAdapter for use when getting content. It is
        mounted on the session for all 'https://' URLs.
    :param method: The HTTP method used to get the item, or "LOCAL" to get an
        item during testing. Any value other than "LOCAL" results in a GET.
    :return: Two values. The first is a msg indicating any errors
        encountered. If blank, that indicates success. The second value is
        the response object containing the downloaded file, or None when an
        error was encountered.
    """
    if not download_url:
        # Occurs when a DeferredList fetcher fails.
        # No exception is being handled at this point, so format_exc() adds
        # no stack information; it is kept so the message layout matches the
        # other error paths.
        msg = 'NoDownloadUrlError: %s\n%s' % (download_url,
                                              traceback.format_exc())
        return msg, None

    # Deliberately broad: any failure while fetching becomes an error message.
    # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt and
    # SystemExit propagate so the process can still be stopped.
    # noinspection PyBroadException
    try:
        if method == 'LOCAL':
            url = os.path.join(settings.MEDIA_ROOT, download_url)
            mr = MockRequest(url=url)
            r = mr.get()
        else:
            # Note that we do a GET even if site.method is POST. This is
            # deliberate.
            # NOTE(review): the session is not closed here; closing it would
            # presumably also close the caller-supplied adapter - confirm
            # adapter ownership before adding cleanup.
            s = requests.session()
            s.mount('https://', adapter)
            headers = {'User-Agent': 'CourtListener'}

            r = s.get(
                download_url,
                verify=False,  # WA has a certificate we don't understand
                headers=headers,
                cookies=cookies,
                timeout=300,
            )

            # test for empty files (thank you CA1)
            if len(r.content) == 0:
                msg = 'EmptyFileError: %s\n%s' % (download_url,
                                                  traceback.format_exc())
                return msg, None

            # test for and follow meta redirects
            r = follow_redirections(r, s)
            r.raise_for_status()
    except Exception:
        msg = 'DownloadingError: %s\n%s' % (download_url,
                                            traceback.format_exc())
        return msg, None

    # Success!
    return '', r
def _download(self, request_dict=None):
    """Download the latest version of the Site and parse the response.

    The request is made according to ``self.method``: "GET" and "POST" go
    through a requests session, while "LOCAL" loads the page through
    ``MockRequest``.

    :param request_dict: Optional extra keyword arguments forwarded to the
        session's ``get``/``post`` call. A "verify" entry that is not None
        replaces the default certifi bundle for certificate verification.
        The caller's dict is never modified.
    :return: The decoded JSON when the response content-type contains
        "json"; otherwise the HTML tree built from the cleaned response
        text, with its links rewritten through ``self._link_repl``.
    :raises: Whatever ``r.raise_for_status()`` raises for a bad status code.
    """
    if self.method == "POST":
        # Long parameter values are shortened so the log line stays readable.
        truncated_params = {
            k: trunc(v, 50, ellipsis="...[truncated]")
            for k, v in self.parameters.items()
        }
        logger.info(
            "Now downloading case page at: %s (params: %s)"
            % (self.url, truncated_params)
        )
    else:
        logger.info("Now downloading case page at: %s" % self.url)

    # Work on a private copy so neither the caller's dict nor a shared
    # default is mutated.
    request_kwargs = dict(request_dict or {})

    # Pull verify out of the kwargs so it is never sent to s.get or s.post
    # twice. pop() removes the key even when its value is None, which would
    # otherwise collide with the explicit verify= argument below.
    verify = request_kwargs.pop("verify", None)
    if verify is None:
        verify = certifi.where()

    # Get the response. Redirects are not disabled here unless request_dict
    # asks for it.
    s = requests.session()
    s.mount("https://", self._get_adapter_instance())
    headers = {"User-Agent": "Juriscraper"}
    if self.method == "GET":
        r = s.get(self.url, headers=headers, verify=verify, **request_kwargs)
    elif self.method == "POST":
        r = s.post(
            self.url,
            headers=headers,
            verify=verify,
            data=self.parameters,
            **request_kwargs
        )
    elif self.method == "LOCAL":
        mr = MockRequest(url=self.url)
        r = mr.get()

    # Provides a hook for inheriting objects to tweak the request object.
    self.tweak_request_object(r)

    # Throw an error if a bad status code is returned.
    r.raise_for_status()

    # Tweak or set the encoding if needed
    r = self._set_encoding(r)

    # Provide the response in the Site object
    self.r = r
    self.status = r.status_code

    # Grab the content
    if "json" in r.headers.get("content-type", ""):
        return r.json()

    text = self._clean_text(r.text)
    html_tree = self._make_html_tree(text)
    html_tree.rewrite_links(self._link_repl)
    return html_tree
def _request_url_mock(self, url):
    """Record ``url`` and a mocked response on ``self.request`` (testing only).

    :param url: Stored under ``self.request["url"]``.

    NOTE(review): the mocked response is loaded from ``self.url``, not from
    the ``url`` argument - confirm this is intentional.
    """
    self.request["url"] = url
    mock_source = MockRequest(url=self.url)
    mock_response = mock_source.get()
    self.request["response"] = mock_response
def _request_url_mock(self, url):
    """Record ``url`` and a mocked result on ``self.request`` (testing only).

    :param url: Stored under ``self.request['url']``.

    The object returned by ``MockRequest(...).get()`` is stored under the
    'request' key.

    NOTE(review): the mock is loaded from ``self.url``, not from the ``url``
    argument - confirm this is intentional.
    """
    self.request['url'] = url
    mock_source = MockRequest(url=self.url)
    mock_result = mock_source.get()
    self.request['request'] = mock_result