Exemple #1
0
def _unzip_content(content: Any) -> bytes:
    """Return the payload of ``content``, unpacking it if it is a zip archive.

    ``content`` is wrapped in a ``BytesIO`` (so it must be bytes-like) and its
    MIME type is sniffed with ``mime_from_content``.

    * Anything that is not ``application/zip`` is returned unchanged.
    * A zip with exactly one member returns that member's raw bytes.
    * Any other zip has each member opened as a stream (members whose name
      ends in ``.zip`` are passed through ``_handle_zip``); the streams are
      handed to ``chain_streams`` and the combined stream is read in full.
    """
    buffer = BytesIO(content)

    file_mime = mime_from_content(buffer)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return buffer.getvalue()

    with ZipFile(buffer) as zf:
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read opens, reads and closes the member in one call,
            # so no member handle is left dangling.
            return zf.read(names[0])

        streams = [
            _handle_zip(zf.open(name), "r") if name.endswith(".zip") else zf.open(name)
            for name in names
        ]

        return chain_streams(streams).read()
Exemple #2
0
def parse_workbook(
    fh: BinaryIO,
    convert_xls: bool = True,
    worksheet: Optional[str] = None,
) -> Union[Workbook, Worksheet]:
    """Load a spreadsheet from ``fh`` and return the workbook or one sheet.

    The stream is rewound and its MIME type sniffed. Content detected as
    ``application/CDFV2`` (presumably a legacy ``.xls`` file -- confirm) is
    only loaded when ``convert_xls`` is true, via ``convert_to_xlxs``; every
    other type goes straight to ``load_workbook``.

    Args:
        fh: Binary stream containing the spreadsheet.
        convert_xls: Whether CDFV2 content should be converted and loaded.
        worksheet: Optional sheet name; when given, only that sheet is
            returned.

    Returns:
        The whole workbook, or the named worksheet when ``worksheet`` is set.

    Raises:
        Exception: If no workbook could be produced, or the requested
            worksheet is not in the workbook.
    """
    fh.seek(0)
    detected_mime = mime_from_content(fh)

    workbook: Optional[Workbook]
    if detected_mime != "application/CDFV2":
        workbook = load_workbook(fh)
    elif convert_xls:
        workbook = convert_to_xlxs(fh)
    else:
        # CDFV2 content with conversion disabled: nothing can be loaded
        workbook = None

    if not workbook:
        raise Exception("Could not parse workbook")

    if not worksheet:
        return workbook

    if worksheet in workbook:
        return workbook[worksheet]

    raise Exception(f"Could not find worksheet {worksheet} in workbook")
Exemple #3
0
def _fallback_download_handler(url: str) -> bytes:
    """Download ``url`` and return its content, unpacking zip archives.

    Despite the name this is now the primary download handler; it takes
    precedence over the legacy downloader.

    The MIME type is sniffed from the downloaded content, falling back to
    ``mime_from_url`` when content sniffing yields nothing.

    * Anything that is not ``application/zip`` is returned unchanged.
    * A zip with exactly one member returns that member's raw bytes.
    * Any other zip has each member opened as a stream (members whose name
      ends in ``.zip`` are passed through ``_handle_zip``); the streams are
      handed to ``chain_streams`` and the combined stream is read in full.

    Raises:
        Exception: If the HTTP response is not OK.
    """
    r = http.get(url)

    if not r.ok:
        raise Exception("Bad link returned {}: {}".format(r.status_code, url))

    content = BytesIO(r.content)

    file_mime = mime_from_content(content) or mime_from_url(url)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return content.getvalue()

    with ZipFile(content) as zf:
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read opens, reads and closes the member in one call,
            # so no member handle is left dangling.
            return zf.read(names[0])

        streams = [
            _handle_zip(zf.open(name), "r") if name.endswith(".zip") else zf.open(name)
            for name in names
        ]

        return chain_streams(streams).read()
Exemple #4
0
    def parse(self, response) -> Generator[Dict, None, None]:
        """Yield one item containing the decoded content of ``response``.

        The MIME type is sniffed from the response body, falling back to
        ``mime_from_url`` on the response URL when sniffing yields nothing.

        * A non-zip body is used as-is.
        * A zip with exactly one member uses that member's raw bytes.
        * Any other zip has each member opened as a stream (members whose
          name ends in ``.zip`` are passed through ``_handle_zip``); the
          streams are handed to ``chain_streams`` and read in full.

        The resulting bytes are passed through ``decode_bytes`` and yielded
        in a dict with keys ``content``, ``extension`` (always ``".csv"``)
        and ``mime_type``. Nothing is yielded when there is no content.
        """
        # response.body is treated as raw bytes (the zip branch has always
        # wrapped it in BytesIO); wrap it once so MIME sniffing, ZipFile and
        # the plain-content path all work on the same stream object.
        body = BytesIO(response.body)

        file_mime = mime_from_content(body)

        if not file_mime:
            file_mime = mime_from_url(response.url)

        if file_mime == "application/zip":
            with ZipFile(body) as zf:
                names = zf.namelist()

                if len(names) == 1:
                    # Single member: take it verbatim. Previously this value
                    # was unconditionally overwritten by the chained read.
                    content = zf.read(names[0])
                else:
                    streams = [
                        _handle_zip(zf.open(name), "r")
                        if name.endswith(".zip")
                        else zf.open(name)
                        for name in names
                    ]
                    content = chain_streams(streams).read()
        else:
            # bytes has no getvalue(); read from the BytesIO wrapper instead
            content = body.getvalue()

        if not content:
            logger.info("No content from scrapy request")
            return

        yield {
            "content": decode_bytes(content),
            "extension": ".csv",
            "mime_type": file_mime,
        }
Exemple #5
0
def url_downloader(url: str) -> bytes:
    """Download ``url`` and return its content, unpacking zip archives.

    The request is made with ``verify=settings.http_verify_ssl``. The MIME
    type is sniffed from the downloaded content, falling back to
    ``mime_from_url`` when content sniffing yields nothing.

    * Anything that is not ``application/zip`` is returned unchanged.
    * A zip with exactly one member returns that member's raw bytes.
    * Any other zip has each member opened as a stream (members whose name
      ends in ``.zip`` are passed through ``_handle_zip``); the streams are
      handed to ``chain_streams`` and the combined stream is read in full.

    Raises:
        Exception: If the HTTP response is not OK.
    """
    # Lazy %-style args: the message is only built if DEBUG is enabled
    logger.debug("Downloading: %s", url)

    r = http.get(url, verify=settings.http_verify_ssl)

    if not r.ok:
        raise Exception("Bad link returned {}: {}".format(r.status_code, url))

    content = BytesIO(r.content)

    file_mime = mime_from_content(content) or mime_from_url(url)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return content.getvalue()

    with ZipFile(content) as zf:
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read opens, reads and closes the member in one call,
            # so no member handle is left dangling.
            return zf.read(names[0])

        streams = [
            _handle_zip(zf.open(name), "r") if name.endswith(".zip") else zf.open(name)
            for name in names
        ]

        return chain_streams(streams).read()
Exemple #6
0
def file_opener(path: Path) -> bytes:
    """Read a local file and return its content, unpacking zip archives.

    The MIME type is sniffed from the file content with
    ``mime_from_content``.

    * Anything that is not ``application/zip`` is returned unchanged.
    * A zip with exactly one member returns that member's raw bytes.
    * Any other zip has each member opened as a stream (members whose name
      ends in ``.zip`` are passed through ``_handle_zip``); the streams are
      handed to ``chain_streams`` and the combined stream is read in full.

    Raises:
        Exception: If ``path`` is not an existing regular file.
    """
    # Lazy %-style args: the message is only built if DEBUG is enabled
    logger.debug("Opening file: %s", path)

    if not path.is_file():
        raise Exception("File not found: {}".format(path))

    # read_bytes opens, reads and closes the file in one call
    content = BytesIO(path.read_bytes())

    file_mime = mime_from_content(content)

    # @TODO handle all this in utils/archive.py
    # and make it all generic to handle other
    # mime types
    if file_mime != "application/zip":
        return content.getvalue()

    with ZipFile(content) as zf:
        names = zf.namelist()

        if len(names) == 1:
            # ZipFile.read opens, reads and closes the member in one call,
            # so no member handle is left dangling.
            return zf.read(names[0])

        streams = [
            _handle_zip(zf.open(name), "r") if name.endswith(".zip") else zf.open(name)
            for name in names
        ]

        return chain_streams(streams).read()