def _unzip_content(content: Any) -> bytes:
    """Return the payload of ``content``, unpacking it first if it is a zip.

    The input is wrapped in a ``BytesIO`` and its MIME type is detected with
    ``mime_from_content``. Anything that is not ``application/zip`` is
    returned unchanged.

    For a zip archive:

    * an archive with exactly one entry yields that entry's bytes as-is;
    * otherwise every entry is opened in archive order, entries whose name
      ends in ``.zip`` are passed to ``_handle_zip`` (presumably to unpack
      the nested archive -- confirm against that helper), and all resulting
      streams are combined with ``chain_streams`` and read in full.

    Args:
        content: Payload accepted by the ``BytesIO`` constructor.

    Returns:
        The (possibly unpacked) bytes.
    """
    buffer = BytesIO(content)
    file_mime = mime_from_content(buffer)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return buffer.getvalue()

    with ZipFile(buffer) as zf:
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read opens, reads and closes the member in one call,
            # so no member handle is left dangling.
            return zf.read(names[0])

        streams = []
        for name in names:
            member = zf.open(name)
            streams.append(_handle_zip(member, "r") if name.endswith(".zip") else member)

        # Must be read while the archive is still open.
        return chain_streams(streams).read()
def parse_workbook(
    fh: BinaryIO,
    convert_xls: bool = True,
    worksheet: Optional[str] = None,
) -> Union[Workbook, Worksheet]:
    """Load an excel file handle into a workbook, or one of its worksheets.

    The handle is rewound and its MIME type detected with
    ``mime_from_content``:

    * ``application/CDFV2`` content is run through ``convert_to_xlxs`` when
      ``convert_xls`` is true; when it is false nothing is loaded and the
      call fails with "Could not parse workbook";
    * any other content is handed directly to ``load_workbook``.

    Args:
        fh: Binary file handle positioned anywhere; it is rewound first.
        convert_xls: Whether ``application/CDFV2`` content should be converted.
        worksheet: Optional worksheet name to return instead of the workbook.

    Returns:
        The whole workbook, or ``wb[worksheet]`` when ``worksheet`` is given.

    Raises:
        Exception: If no workbook could be produced, or if ``worksheet`` is
            given but is not contained in the workbook.
    """
    fh.seek(0)
    detected_mime = mime_from_content(fh)

    workbook: Optional[Workbook] = None

    if detected_mime != "application/CDFV2":
        workbook = load_workbook(fh)
    elif convert_xls:
        workbook = convert_to_xlxs(fh)

    if not workbook:
        raise Exception("Could not parse workbook")

    # No specific sheet requested: hand back the whole workbook.
    if not worksheet:
        return workbook

    if worksheet not in workbook:
        raise Exception(f"Could not find worksheet {worksheet} in workbook")

    return workbook[worksheet]
def _fallback_download_handler(url: str) -> bytes:
    """Download ``url`` and return its content, unpacking zip archives.

    This was previously a fallback download handler but the name is
    redundant as it's now the primary and takes precedence over the
    legacy downloader.

    The MIME type is detected from the downloaded content with
    ``mime_from_content``, falling back to ``mime_from_url`` when that
    yields nothing. Non-zip content is returned unchanged.

    For a zip archive:

    * an archive with exactly one entry yields that entry's bytes as-is;
    * otherwise every entry is opened in archive order, entries whose name
      ends in ``.zip`` are passed to ``_handle_zip`` (presumably to unpack
      the nested archive -- confirm against that helper), and all resulting
      streams are combined with ``chain_streams`` and read in full.

    Args:
        url: Address to fetch with the module's ``http`` client.

    Returns:
        The (possibly unpacked) response bytes.

    Raises:
        Exception: If the response is not ``ok``.
    """
    r = http.get(url)

    if not r.ok:
        raise Exception("Bad link returned {}: {}".format(r.status_code, url))

    buffer = BytesIO(r.content)

    file_mime = mime_from_content(buffer)

    if not file_mime:
        file_mime = mime_from_url(url)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return buffer.getvalue()

    with ZipFile(buffer) as zf:
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read opens, reads and closes the member in one call,
            # so no member handle is left dangling.
            return zf.read(names[0])

        streams = []
        for name in names:
            member = zf.open(name)
            streams.append(_handle_zip(member, "r") if name.endswith(".zip") else member)

        # Must be read while the archive is still open.
        return chain_streams(streams).read()
def parse(self, response) -> Generator[Dict, None, None]:
    """Yield one item carrying the decoded body of ``response``.

    The MIME type is detected from ``response.body`` with
    ``mime_from_content``, falling back to ``mime_from_url`` on
    ``response.url`` when that yields nothing.

    For ``application/zip`` bodies every archive entry is opened in archive
    order, entries whose name ends in ``.zip`` are passed to ``_handle_zip``
    (presumably to unpack the nested archive -- confirm against that
    helper), and all streams are combined with ``chain_streams`` and read
    in full. Any other body is used as-is.

    The resulting bytes are passed through ``decode_bytes`` and yielded as a
    dict with the keys ``content``, ``extension`` (always ``".csv"``) and
    ``mime_type``. Nothing is yielded when there is no content.

    Args:
        response: Response object exposing ``body`` and ``url``.

    Yields:
        A single item dict, or nothing if the body is empty.
    """
    body = response.body

    # NOTE(review): sibling helpers pass a BytesIO to mime_from_content while
    # this passes the raw body -- confirm the helper accepts both.
    file_mime = mime_from_content(body)

    if not file_mime:
        file_mime = mime_from_url(response.url)

    if file_mime == "application/zip":
        with ZipFile(BytesIO(body)) as zf:
            # The previous version also read a single-entry archive directly
            # here, but that result was always overwritten by the chained
            # read below, so the redundant read has been dropped.
            streams = []
            for name in zf.namelist():
                member = zf.open(name)
                streams.append(_handle_zip(member, "r") if name.endswith(".zip") else member)

            # Must be read while the archive is still open.
            content = chain_streams(streams).read()
    else:
        # The body is wrapped in BytesIO above, so it is bytes-like and has no
        # getvalue(); calling it unconditionally raised AttributeError for
        # every non-zip response. Buffer-like bodies are still tolerated.
        content = body.getvalue() if hasattr(body, "getvalue") else body

    if not content:
        logger.info("No content from scrapy request")
        return

    yield {
        "content": decode_bytes(content),
        "extension": ".csv",
        "mime_type": file_mime,
    }
def url_downloader(url: str) -> bytes:
    """Downloads a URL and returns content, handling embedded zips and other MIME's

    The request is made with ``verify=settings.http_verify_ssl``. The MIME
    type is detected from the downloaded content with ``mime_from_content``,
    falling back to ``mime_from_url`` when that yields nothing. Non-zip
    content is returned unchanged.

    For a zip archive:

    * an archive with exactly one entry yields that entry's bytes as-is;
    * otherwise every entry is opened in archive order, entries whose name
      ends in ``.zip`` are passed to ``_handle_zip`` (presumably to unpack
      the nested archive -- confirm against that helper), and all resulting
      streams are combined with ``chain_streams`` and read in full.

    Args:
        url: Address to fetch with the module's ``http`` client.

    Returns:
        The (possibly unpacked) response bytes.

    Raises:
        Exception: If the response is not ``ok``.
    """
    logger.debug("Downloading: {}".format(url))

    r = http.get(url, verify=settings.http_verify_ssl)

    if not r.ok:
        raise Exception("Bad link returned {}: {}".format(r.status_code, url))

    buffer = BytesIO(r.content)

    file_mime = mime_from_content(buffer)

    if not file_mime:
        file_mime = mime_from_url(url)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return buffer.getvalue()

    with ZipFile(buffer) as zf:
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read opens, reads and closes the member in one call,
            # so no member handle is left dangling.
            return zf.read(names[0])

        streams = []
        for name in names:
            member = zf.open(name)
            streams.append(_handle_zip(member, "r") if name.endswith(".zip") else member)

        # Must be read while the archive is still open.
        return chain_streams(streams).read()
def file_opener(path: Path) -> bytes:
    """Opens a local file, handling embedded zips and other MIME's

    The file is read fully into memory and its MIME type detected with
    ``mime_from_content``. Non-zip content is returned unchanged.

    For a zip archive:

    * an archive with exactly one entry yields that entry's bytes as-is;
    * otherwise every entry is opened in archive order, entries whose name
      ends in ``.zip`` are passed to ``_handle_zip`` (presumably to unpack
      the nested archive -- confirm against that helper), and all resulting
      streams are combined with ``chain_streams`` and read in full.

    Args:
        path: Location of the file to open.

    Returns:
        The (possibly unpacked) file bytes.

    Raises:
        Exception: If ``path`` is not an existing regular file.
    """
    logger.debug("Opening file: {}".format(path))

    if not path.is_file():
        raise Exception("File not found: {}".format(path))

    with path.open("rb") as fh:
        buffer = BytesIO(fh.read())

    file_mime = mime_from_content(buffer)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return buffer.getvalue()

    with ZipFile(buffer) as zf:
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read opens, reads and closes the member in one call,
            # so no member handle is left dangling.
            return zf.read(names[0])

        streams = []
        for name in names:
            member = zf.open(name)
            streams.append(_handle_zip(member, "r") if name.endswith(".zip") else member)

        # Must be read while the archive is still open.
        return chain_streams(streams).read()