async def download_data_frame(
    sheet_id: str, sheet_mime_type: str, oauth2_client: oauth2.Client
) -> Union[pd.DataFrame, str, Tuple[pd.DataFrame, str]]:
    """Download a spreadsheet from Google, or return a str error message.

    The arguments determine how the download and parse will occur:

    * If `sheet_mime_type` is 'application/vnd.google-apps.spreadsheet',
      use the GDrive API to _export_ a text/csv, then parse it.
    * Otherwise, use the GDrive API to _download_ the file, and parse it
      according to its mime type.
    """
    if sheet_mime_type == "application/vnd.google-apps.spreadsheet":
        url = _generate_google_sheet_url(sheet_id)
        sheet_mime_type = "text/csv"
    else:
        url = _generate_gdrive_file_url(sheet_id)
        # ... and use the passed sheet_mime_type

    url, headers, _ = oauth2_client.add_token(url, headers={})

    try:
        async with spooled_data_from_url(url, headers) as (blobio, _, __):
            # TODO store raw bytes and then parse in render(), like in the
            # [2019-10-31] loadurl module.
            #
            # For now, we hard-code questionable params:
            #
            # * encoding=None: because GDrive doesn't know the charset, and it
            #   returns the wrong charset sometimes.
            # * has_header=True: legacy (and buggy). When we store raw bytes,
            #   we'll use the user's preference.
            return parse_bytesio(
                blobio, encoding=None, content_type=sheet_mime_type, has_header=True
            )
    except aiohttp.ClientResponseError as err:
        if err.status == 401:
            return "Invalid credentials. Please reconnect to Google Drive."
        elif err.status == 403:
            return (
                "You chose a file your logged-in user cannot access. "
                "Please reconnect to Google Drive or choose a different file."
            )
        elif err.status == 404:
            return "File not found. Please choose a different file."
        else:
            return "GDrive responded with HTTP %d %s" % (err.status, err.message)
    except aiohttp.ClientError as err:
        return "Error during GDrive request: %s" % str(err)
    except asyncio.TimeoutError:
        return "Timeout during GDrive request"
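# The two URL helpers above are not shown in this excerpt. A minimal sketch of
# what they might look like, assuming the standard Google Drive v3 REST API
# (the real helpers may differ):
from urllib.parse import quote


def _generate_google_sheet_url(sheet_id: str) -> str:
    # Native Google Sheets have no bytes of their own; they must be
    # *exported* to a concrete format such as text/csv.
    return (
        "https://www.googleapis.com/drive/v3/files/%s/export?mimeType=text/csv"
        % quote(sheet_id, safe="")
    )


def _generate_gdrive_file_url(sheet_id: str) -> str:
    # Uploaded files (CSV, XLSX, ...) are *downloaded* verbatim with alt=media.
    return "https://www.googleapis.com/drive/v3/files/%s?alt=media" % quote(
        sheet_id, safe=""
    )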
async def fetch(params):
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error="Table number must be at least 1")

    try:
        async with moduleutils.spooled_data_from_url(url) as (spool, headers, charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with moduleutils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now, the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f"Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult(error="Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult(
            error="Error from server: %d %s" % (err.status, err.message)
        )
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(error="Did not find any <table> tags on that page")
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error="Table has no columns")

    if not tables:
        return ProcessResult(error="Did not find any <table> tags on that page")

    if tablenum >= len(tables):
        return ProcessResult(
            error=f"The maximum table number on this page is {len(tables)}"
        )

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    moduleutils.autocast_dtypes_in_place(table)

    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)

    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
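# Why reach into pd.io.html._parse() at all? The public pd.read_html() can
# only opt out of autoconversion per column (converters=) and per NA token
# (keep_default_na=); there is no blanket dtype=str. A small, self-contained
# illustration of the difference (assumes pandas is installed; the HTML and
# column names here are made up for the demo):
from io import StringIO

import pandas as pd

HTML = StringIO(
    "<table>"
    "<tr><th>code</th><th>value</th></tr>"
    "<tr><td>007</td><td>NA</td></tr>"
    "</table>"
)

# Default: pandas coerces "007" to the integer 7 and "NA" to NaN.
[autoconverted] = pd.read_html(HTML)

HTML.seek(0)
# Opting out column-by-column keeps the strings the page displayed, but we'd
# have to enumerate every column -- hence the blanket dtype=str hack above.
[verbatim] = pd.read_html(
    HTML, converters={"code": str, "value": str}, keep_default_na=False
)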
async def inner():
    async with spooled_data_from_url("mailto:[email protected]"):
        pass
async def inner():
    async with spooled_data_from_url("//a/b"):
        pass
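# These two coroutines read like the inner halves of unit tests for
# spooled_data_from_url(). A sketch of how the surrounding harness might look
# (assumptions: the import path and the expected exception are guesses -- the
# excerpt shows neither, and the real test may expect a different error):
import asyncio
import unittest

import aiohttp

from moduleutils import spooled_data_from_url  # assumed import path


class SpooledDataFromUrlTest(unittest.TestCase):
    def test_rejects_mailto_url(self):
        async def inner():
            async with spooled_data_from_url("mailto:[email protected]"):
                pass

        # Assumed assertion: aiohttp.InvalidURL is a plausible guess.
        with self.assertRaises(aiohttp.InvalidURL):
            asyncio.run(inner())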
async def fetch(params, *, output_path: Path) -> Union[Path, str]:
    url: str = params["url"].strip()
    mimetypes = ",".join(AllowedMimeTypes)
    headers = {"Accept": mimetypes}
    timeout = aiohttp.ClientTimeout(total=5 * 60, connect=30)

    try:
        async with moduleutils.spooled_data_from_url(url, headers, timeout) as (
            bytesio,
            headers,
            charset,
        ):
            # This shouldn't be a context manager. Oh well. Ignore the fact
            # that bytesio is backed by a file. It's safe to read the file
            # after we exit the context and the file is deleted.
            pass
    except asyncio.TimeoutError:
        output_path.write_bytes(b"")  # truncate file
        return f"Timeout fetching {url}"
    except aiohttp.InvalidURL:
        return "Invalid URL"
    except aiohttp.TooManyRedirects:
        return "The server redirected us too many times. Please try a different URL."
    except aiohttp.ClientResponseError as err:
        return "Error from server: %d %s" % (err.status, err.message)
    except aiohttp.ClientError as err:
        return str(err)

    # The following shouldn't ever error.
    with output_path.open("wb") as f:
        # Set gzip mtime=0 so we can write the exact same file given the exact
        # same data. (This helps with testing and versioning.)
        with gzip.GzipFile(mode="wb", filename="", fileobj=f, mtime=0) as zf:
            # Write URL -- original URL, not redirected URL
            zf.write(
                json.dumps(
                    {"url": params["url"]},
                    ensure_ascii=False,
                    allow_nan=False,
                    separators=(",", ":"),
                    sort_keys=True,
                ).encode("utf-8")
                + b"\r\n"
            )
            # Write status line -- INCORRECT but oh well
            zf.write(b"200 OK\r\n")
            # Write response headers.
            #
            # Ideally we'd be using raw headers. But moduleutils gives
            # parsed headers. Let's not bother with purity: just
            # re-encode the parsed headers.
            for k, v in headers.items():
                # bytesio is already dechunked and decompressed. Mangle
                # these headers to make the file consistent with itself.
                if k.lower() in {
                    "transfer-encoding",
                    "content-encoding",
                    "content-length",
                }:
                    k = "Cjw-Original-" + k
                elif k.lower() not in {"content-type", "content-disposition", "server"}:
                    # Skip writing most headers. This is a HACK: we skip the
                    # `Date` header so fetcher will see a byte-for-byte
                    # identical output file given byte-for-byte identical
                    # input. That will convince fetcher to ignore the result.
                    # See `fetcher.versions`. TODO redefine "versions" and
                    # revisit this logic: the user probably _expects_ us to
                    # store headers every fetch, though the body may not change.
                    continue
                # There's no way to put \r\n in an HTTP header name or value.
                # Good thing: if a server could do that, this file format would
                # be unreadable.
                assert "\n" not in k and "\n" not in v
                zf.write(f"{k}: {v}\r\n".encode("latin1"))
            zf.write(b"\r\n")
            # Write body
            shutil.copyfileobj(bytesio, zf)

    return output_path
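# For illustration, a sketch of a reader for the file format fetch() writes:
# a gzipped stream of one JSON line ({"url": ...}), a status line, re-encoded
# headers, a blank line, then the raw body. The helper name is hypothetical;
# the real project may parse this format elsewhere.
import gzip
import json
from pathlib import Path
from typing import Dict, Tuple


def read_fetch_result(path: Path) -> Tuple[str, str, Dict[str, str], bytes]:
    with gzip.open(path, "rb") as zf:
        url = json.loads(zf.readline().rstrip(b"\r\n"))["url"]
        status = zf.readline().rstrip(b"\r\n").decode("latin1")  # always "200 OK"
        headers = {}
        while True:
            line = zf.readline().rstrip(b"\r\n")
            if not line:
                break  # the blank line separates headers from the body
            k, _, v = line.decode("latin1").partition(": ")
            headers[k] = v
        body = zf.read()  # already dechunked and decompressed by the writer
    return url, status, headers, body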