def _fallback_download_handler(url: str) -> bytes:
    """Download ``url`` and return its payload, unpacking ZIP archives.

    Historical note: this used to be a fallback download handler. The name is
    now redundant, since it is the primary handler and takes precedence over
    the legacy downloader.

    The MIME type is detected from the downloaded content first and from the
    URL only when content detection yields nothing.

    Args:
        url: Address to fetch.

    Returns:
        For non-ZIP responses, the raw response bytes. For a ZIP with exactly
        one member, that member's bytes. Otherwise whatever
        ``chain_streams(...).read()`` returns for all members, with members
        whose name ends in ``.zip`` first passed through ``_handle_zip``.

    Raises:
        Exception: If the HTTP response is not OK.
    """
    r = http.get(url)

    if not r.ok:
        raise Exception("Bad link returned {}: {}".format(r.status_code, url))

    content = BytesIO(r.content)

    file_mime = mime_from_content(content) or mime_from_url(url)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return content.getvalue()

    with ZipFile(content) as zf:
        # Read the member list once instead of re-querying the archive.
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read() opens, reads and closes the member in one step.
            return zf.read(names[0])

        # Nested archives go through _handle_zip; everything else is
        # streamed as-is. The streams must be consumed while the archive
        # is still open, hence the read inside the ``with`` block.
        streams = [
            _handle_zip(zf.open(name), "r") if name.endswith(".zip") else zf.open(name)
            for name in names
        ]

        return chain_streams(streams).read()
def parse(self, response) -> Generator[Dict, None, None]:
    """Turn a downloaded response into a single content item.

    The MIME type is detected from the response body first and from the
    response URL only when body detection yields nothing. ZIP bodies are
    expanded: every member is opened (members whose name ends in ``.zip``
    are first passed through ``_handle_zip``) and the members are read back
    through ``chain_streams``. Any other body is used as-is.

    Args:
        response: Object exposing ``body`` and ``url``.
            NOTE(review): presumably a scrapy response whose ``body`` is
            ``bytes`` - confirm against the spider that uses this.

    Yields:
        One dict with the keys ``content`` (the payload after
        ``decode_bytes``), ``extension`` (always ``".csv"``) and
        ``mime_type``. Nothing is yielded when the payload is empty.
    """
    body = response.body

    file_mime = mime_from_content(body) or mime_from_url(response.url)

    # The ZIP path wraps the body in BytesIO (so it must be bytes-like) while
    # the non-ZIP path used to call ``.getvalue()`` on it, which bytes does
    # not have. Normalise once so both paths accept either bytes or a
    # BytesIO-style buffer.
    raw = body.getvalue() if hasattr(body, "getvalue") else body

    if file_mime == "application/zip":
        with ZipFile(BytesIO(raw)) as zf:
            # A single-member archive needs no special case here: its lone
            # stream goes through the same chain as any other member list.
            streams = [
                _handle_zip(zf.open(name), "r") if name.endswith(".zip") else zf.open(name)
                for name in zf.namelist()
            ]

            # Consume the streams while the archive is still open.
            content = chain_streams(streams).read()
    else:
        content = raw

    if not content:
        logger.info("No content from scrapy request")
        return

    yield {
        "content": decode_bytes(content),
        "extension": ".csv",
        "mime_type": file_mime,
    }
def url_downloader(url: str) -> bytes:
    """Download a URL and return its content, handling embedded ZIPs and other MIME types.

    The request honours ``settings.http_verify_ssl`` for certificate
    verification. The MIME type is detected from the downloaded content first
    and from the URL only when content detection yields nothing.

    Args:
        url: Address to fetch.

    Returns:
        For non-ZIP responses, the raw response bytes. For a ZIP with exactly
        one member, that member's bytes. Otherwise whatever
        ``chain_streams(...).read()`` returns for all members, with members
        whose name ends in ``.zip`` first passed through ``_handle_zip``.

    Raises:
        Exception: If the HTTP response is not OK.
    """
    logger.debug("Downloading: {}".format(url))

    r = http.get(url, verify=settings.http_verify_ssl)

    if not r.ok:
        raise Exception("Bad link returned {}: {}".format(r.status_code, url))

    content = BytesIO(r.content)

    file_mime = mime_from_content(content) or mime_from_url(url)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return content.getvalue()

    with ZipFile(content) as zf:
        # Read the member list once instead of re-querying the archive.
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read() opens, reads and closes the member in one step.
            return zf.read(names[0])

        # Nested archives go through _handle_zip; everything else is
        # streamed as-is. The streams must be consumed while the archive
        # is still open, hence the read inside the ``with`` block.
        streams = [
            _handle_zip(zf.open(name), "r") if name.endswith(".zip") else zf.open(name)
            for name in names
        ]

        return chain_streams(streams).read()