Example #1
async def download_data_frame(
    sheet_id: str, sheet_mime_type: str, oauth2_client: oauth2.Client
) -> Union[pd.DataFrame, str, Tuple[pd.DataFrame, str]]:
    """Download spreadsheet from Google, or return a str error message.

    Arguments decide how the download and parse will occur:

    * If `secret` is not set, return an error.
    * If `sheet_mime_type` is 'application/vnd.google-apps.spreadsheet', use
      GDrive API to _export_ a text/csv, then parse it. Otherwise, use GDrive
      API to _download_ the file, and parse it according to its mime type.
    """
    if sheet_mime_type == "application/vnd.google-apps.spreadsheet":
        url = _generate_google_sheet_url(sheet_id)
        sheet_mime_type = "text/csv"
    else:
        url = _generate_gdrive_file_url(sheet_id)
        # and use the passed sheet_mime_type

    url, headers, _ = oauth2_client.add_token(url, headers={})

    try:
        async with spooled_data_from_url(url, headers) as (blobio, _, __):
            # TODO store raw bytes and then parse in render(), like in
            # [2019-10-31] loadurl module.
            #
            # For now, we hard-code questionable params:
            #
            # * encoding=None: because GDrive doesn't know the charset, and it
            #                  returns the wrong charset sometimes.
            # * has_header=True: legacy (and buggy). When we store raw bytes,
            #                    we'll use the user's preference.
            return parse_bytesio(blobio,
                                 encoding=None,
                                 content_type=sheet_mime_type,
                                 has_header=True)
    except aiohttp.ClientResponseError as err:
        if err.status == 401:
            return "Invalid credentials. Please reconnect to Google Drive."
        elif err.status == 403:
            return "You chose a file your logged-in user cannot access. Please reconnect to Google Drive or choose a different file."
        elif err.status == 404:
            return "File not found. Please choose a different file."
        else:
            return "GDrive responded with HTTP %d %s" % (err.status,
                                                         err.message)
    except aiohttp.ClientError as err:
        return "Error during GDrive request: %s" % str(err)
    except asyncio.TimeoutError:
        return "Timeout during GDrive request"
Example #2
async def fetch(params):
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    table = None
    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error="Table number must be at least 1")

    result = None

    try:
        async with moduleutils.spooled_data_from_url(url) as (spool, headers,
                                                              charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with moduleutils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f"Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult(error="Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=("Error from server: %d %s" %
                                    (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error="Did not find any <table> tags on that page")
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error="Table has no columns")

    if not tables:
        return ProcessResult(
            error="Did not find any <table> tags on that page")

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f"The maximum table number on this page is {len(tables)}"))

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    moduleutils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
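
For reference, `fetch` expects a params dict with a `url` string and a 1-based `tablenum`. A minimal invocation sketch (assuming the surrounding project supplies `ProcessResult` and `moduleutils`, and that the result object exposes the `dataframe` and `error` values passed to its constructor):

import asyncio

params = {"url": "https://example.com/page-with-tables", "tablenum": 1}
result = asyncio.run(fetch(params))
# On success, result wraps the parsed table; on failure it carries the
# human-readable error string built above.
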
Example #3
async def inner():
    async with spooled_data_from_url("mailto:[email protected]"):
        pass
Example #4
async def inner():
    async with spooled_data_from_url("//a/b"):
        pass
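
Examples #3 and #4 read like test-case bodies: each builds an `inner()` coroutine that should fail when `spooled_data_from_url` is handed a mailto: or scheme-relative URL. A plausible surrounding test, assuming such URLs surface as `aiohttp.InvalidURL` (an assumption; the original assertion is not shown):

import asyncio
import aiohttp
import pytest

def test_rejects_mailto_url():
    async def inner():
        async with spooled_data_from_url("mailto:[email protected]"):
            pass

    # Assumption: unsupported schemes raise aiohttp.InvalidURL.
    with pytest.raises(aiohttp.InvalidURL):
        asyncio.run(inner())
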
Example #5
async def fetch(params, *, output_path: Path) -> Union[Path, str]:
    url: str = params["url"].strip()

    mimetypes = ",".join(AllowedMimeTypes)
    headers = {"Accept": mimetypes}
    timeout = aiohttp.ClientTimeout(total=5 * 60, connect=30)

    try:
        async with moduleutils.spooled_data_from_url(url, headers,
                                                     timeout) as (
                                                         bytesio,
                                                         headers,
                                                         charset,
                                                     ):
            # This shouldn't be a context manager. Oh well. Ignore the fact
            # that bytesio is backed by a file. It's safe to read the file
            # after we exit the context and the file is deleted.
            pass
    except asyncio.TimeoutError:
        output_path.write_bytes(b"")  # truncate file
        return f"Timeout fetching {url}"
    except aiohttp.InvalidURL:
        return "Invalid URL"
    except aiohttp.TooManyRedirects:
        return "The server redirected us too many times. Please try a different URL."
    except aiohttp.ClientResponseError as err:
        return "Error from server: %d %s" % (err.status, err.message)
    except aiohttp.ClientError as err:
        return str(err)

    # The following shouldn't ever error.
    with output_path.open("wb") as f:
        # set gzip mtime=0 so we can write the exact same file given the exact
        # same data. (This helps with testing and versioning.)
        with gzip.GzipFile(mode="wb", filename="", fileobj=f, mtime=0) as zf:
            # Write URL -- original URL, not redirected URL
            zf.write(
                json.dumps(
                    {
                        "url": params["url"]
                    },
                    ensure_ascii=False,
                    allow_nan=False,
                    separators=(",", ":"),
                    sort_keys=True,
                ).encode("utf-8") + b"\r\n")
            # Write status line -- INCORRECT but oh well
            zf.write(b"200 OK\r\n")
            # Write response headers.
            #
            # Ideally we'd be using raw headers. But moduleutils gives
            # parsed headers. Let's not bother with purity: just
            # re-encode the parsed headers.
            for k, v in headers.items():
                # bytesio is already dechunked and decompressed. Mangle
                # these headers to make file consistent with itself.
                if k.lower() in {
                        "transfer-encoding",
                        "content-encoding",
                        "content-length",
                }:
                    k = "Cjw-Original-" + k
                elif k.lower() not in {
                        "content-type", "content-disposition", "server"
                }:
                    # Skip writing most headers. This is a HACK: we skip the
                    # `Date` header so fetcher will see a byte-for-byte
                    # identical output file given byte-for-byte identical
                    # input. That will convince fetcher to ignore the result.
                    # See `fetcher.versions`. TODO redefine "versions" and
                    # revisit this logic: the user probably _expects_ us to
                    # store headers every fetch, though body may not change.
                    continue
                # There's no way to put \r\n in an HTTP header name or value.
                # Good thing: if a server could do that, this file format would
                # be unreadable.
                assert "\n" not in k and "\n" not in v
                zf.write(f"{k}: {v}\r\n".encode("latin1"))
            zf.write(b"\r\n")

            # Write body
            shutil.copyfileobj(bytesio, zf)
    return output_path
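
The archive written above has a simple layout: one JSON line holding the original URL, a status line, the re-encoded headers, a blank line, then the already-dechunked, already-decompressed body, all gzip-compressed with mtime=0. A reader sketch for that layout (the helper name is illustrative, not part of the project):

import gzip
import json

def read_fetch_archive(path):
    # Parse the file format produced by fetch() above.
    with gzip.open(path, "rb") as zf:
        url = json.loads(zf.readline())["url"]   # JSON line: {"url": ...}
        status = zf.readline().rstrip(b"\r\n")   # e.g. b"200 OK"
        headers = {}
        for line in iter(zf.readline, b"\r\n"):  # headers end at a blank line
            k, v = line.decode("latin1").rstrip("\r\n").split(": ", 1)
            headers[k] = v
        body = zf.read()                         # remainder is the raw body
    return url, status, headers, body
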