async def fetch(params, **kwargs):
    url: str = params['url'].strip()
    mimetypes = ','.join(AllowedMimeTypes)
    headers = {'Accept': mimetypes}
    timeout = aiohttp.ClientTimeout(total=5 * 60, connect=30)
    try:
        async with utils.spooled_data_from_url(url, headers, timeout) \
                as (bytes_io, headers, charset):
            content_type = headers.get('Content-Type', '') \
                    .split(';')[0] \
                    .strip()
            mime_type = guess_mime_type_or_none(content_type, url)
            if mime_type:
                result = parse_bytesio(bytes_io, mime_type, charset)
                result.truncate_in_place_if_too_big()
                return result
            else:
                return ProcessResult(
                    error=(f'Error fetching {url}: '
                           f'unknown content type {content_type}')
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s'
                                    % (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
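# The fetch() above leans on guess_mime_type_or_none(), which is defined
# elsewhere in the module. A minimal sketch of what it might do, assuming
# AllowedMimeTypes (a name taken from the code above) is an iterable of
# MIME-type strings: trust the server-declared Content-Type when it is
# allowed, otherwise fall back to the URL's file extension. Illustrative
# only -- not the project's actual implementation.
import mimetypes as _mimetypes_module


def guess_mime_type_or_none(content_type: str, url: str):
    if content_type in AllowedMimeTypes:
        return content_type
    # mimetypes.guess_type() accepts URLs and keys off the path extension
    guessed, _encoding = _mimetypes_module.guess_type(url)
    if guessed in AllowedMimeTypes:
        return guessed
    return None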
async def fetch(wfm):
    async def fail(error: str) -> None:
        result = ProcessResult(error=error)
        await ModuleImpl.commit_result(wfm, result)

    params = wfm.get_params()

    table = None
    url = params.get_param_string('url').strip()
    tablenum = params.get_param_integer('tablenum') - 1  # 1 based for user

    if tablenum < 0:
        return await fail(_('Table number must be at least 1'))

    result = None

    try:
        async with utils.spooled_data_from_url(url) as (spool, headers,
                                                        charset):
            # TODO use charset for encoding detection
            tables = pd.read_html(spool, encoding=charset,
                                  flavor='html5lib')
    except asyncio.TimeoutError:
        result = ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        result = ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        result = ProcessResult(error=('Error from server: %d %s'
                                      % (err.status, err.message)))
    except aiohttp.ClientError as err:
        result = ProcessResult(error=str(err))
    except ValueError:
        result = ProcessResult(
            error=_('Did not find any <table> tags on that page'))

    if not result:
        if not tables:
            result = ProcessResult(
                error=_('Did not find any <table> tags on that page'))
        elif tablenum >= len(tables):
            result = ProcessResult(
                error=(_('The maximum table number on this page is %d')
                       % len(tables)))
        else:
            table = tables[tablenum]
            merge_colspan_headers_in_place(table)
            result = ProcessResult(dataframe=table)

    result.truncate_in_place_if_too_big()
    result.sanitize_in_place()

    await ModuleImpl.commit_result(wfm, result)
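# merge_colspan_headers_in_place() is called but not shown. A hedged sketch of
# its likely job, judging by the name: pd.read_html() turns colspan/rowspan
# header cells into a MultiIndex, and downstream code wants one flat header
# row. This is an assumption about intent, not the project's real code.
import pandas as pd


def merge_colspan_headers_in_place(table: pd.DataFrame) -> None:
    if isinstance(table.columns, pd.MultiIndex):
        table.columns = [
            # join the non-empty levels, deduplicated in order
            ' - '.join(dict.fromkeys(str(level) for level in levels
                                     if str(level).strip()))
            for levels in table.columns.values
        ]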
async def fetch(params, **kwargs):
    table = None
    url = params.get_param_string('url').strip()
    tablenum = params.get_param_integer('tablenum') - 1  # 1 based for user

    if tablenum < 0:
        return ProcessResult(error='Table number must be at least 1')

    result = None

    try:
        async with utils.spooled_data_from_url(url) as (spool, headers,
                                                        charset):
            # TODO use charset for encoding detection
            tables = pd.read_html(spool, encoding=charset,
                                  flavor='html5lib')
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s' % (
            err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error=_('Did not find any <table> tags on that page')
        )
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error='Table has no columns')

    if not tables:
        return ProcessResult(
            error=_('Did not find any <table> tags on that page')
        )

    if tablenum >= len(tables):
        return ProcessResult(error=(
            f'The maximum table number on this page is {len(tables)}'
        ))

    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    result.sanitize_in_place()
    return result
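# Hypothetical usage of the fetch() above, with a stand-in params object
# exposing the two accessors the code calls. FakeParams is invented here
# purely for illustration; real callers pass the framework's params object.
import asyncio


class FakeParams:
    def __init__(self, **kwargs):
        self._values = kwargs

    def get_param_string(self, name: str) -> str:
        return self._values[name]

    def get_param_integer(self, name: str) -> int:
        return self._values[name]


result = asyncio.run(fetch(FakeParams(url='http://example.com/stats',
                                      tablenum=1)))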
async def fetch(params, **kwargs):
    url: str = params['url'].strip()
    mimetypes = ','.join(AllowedMimeTypes)
    headers = {'Accept': mimetypes}
    timeout = aiohttp.ClientTimeout(total=5 * 60, connect=30)
    try:
        async with utils.spooled_data_from_url(url, headers, timeout) \
                as (bytesio, headers, charset):
            content_type = headers.get('Content-Type', '') \
                    .split(';')[0] \
                    .strip()
            mime_type = guess_mime_type_or_none(content_type, url)

            if mime_type:
                # FIXME has_header=True always, because of a stupid decision
                # ages ago that we can't fix because everything we've stored
                # was stored with has_header=True (which is lossy).
                #
                # In https://www.pivotaltracker.com/story/show/166712967 we'll
                # store the input file instead of parsed file; then we'll be
                # able to parse correctly moving forward.
                #
                # FIXME move this to render(). In the meantime, we need to
                # run_in_executor() so we continue to send AMQP heartbeats and
                # handle other HTTP connections, even when parsing a big file.
                return await asyncio.get_event_loop().run_in_executor(
                    None,
                    parse_bytesio,
                    bytesio,
                    charset,
                    mime_type,
                    True  # has_header
                )
            else:
                return ProcessResult(
                    error=(f'Error fetching {url}: '
                           f'unknown content type {content_type}')
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s'
                                    % (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
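# The FIXME comment above explains why parsing goes through
# run_in_executor(): a big CPU-bound parse on the event loop would stall AMQP
# heartbeats and other HTTP connections. A self-contained illustration of
# that pattern; parse_slowly() is a stand-in for parse_bytesio().
import asyncio
import time


def parse_slowly(data: bytes) -> str:
    time.sleep(1)  # pretend this is a long, CPU-bound parse
    return data.decode('utf-8', errors='replace')


async def main():
    loop = asyncio.get_event_loop()
    # executor=None uses the default ThreadPoolExecutor; the event loop stays
    # free to run other tasks while a worker thread does the parsing.
    text = await loop.run_in_executor(None, parse_slowly, b'a,b\n1,2\n')
    print(text)


asyncio.run(main())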
async def inner():
    async with spooled_data_from_url("mailto:[email protected]"):
        pass
async def inner():
    async with spooled_data_from_url("//a/b"):
        pass
async def fetch(params):
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    table = None
    url: str = params['url'].strip()
    tablenum: int = params['tablenum'] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error='Table number must be at least 1')

    result = None

    try:
        async with utils.spooled_data_from_url(url) as (spool, headers,
                                                        charset):
            # pandas.read_html() does automatic type conversion, but we
            # prefer our own. Delve into its innards so we can pass all the
            # conversion kwargs we want.
            with utils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor='html5lib',  # force algorithm, for reproducibility
                    io=textio,
                    match='.+',
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS

                    # Required kwargs that pd.read_html() would set by
                    # default:
                    header=None,
                    skiprows=None,

                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want
                    # to TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s'
                                    % (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error='Did not find any <table> tags on that page')
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error='Table has no columns')

    if not tables:
        return ProcessResult(
            error='Did not find any <table> tags on that page')

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f'The maximum table number on this page is {len(tables)}'))

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    utils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
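# utils.wrap_text() is used above to hand pd.io.html._parse() decoded text. A
# hedged guess at its contract: wrap the binary spool in a TextIOWrapper with
# the detected charset (defaulting to UTF-8), and detach on exit so the
# underlying buffer stays open for the caller. The real helper may differ.
import contextlib
import io


@contextlib.contextmanager
def wrap_text(bytesio, charset):
    textio = io.TextIOWrapper(bytesio, encoding=(charset or 'utf-8'))
    try:
        yield textio
    finally:
        textio.detach()  # do not close the spool the caller still owns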
async def inner():
    async with spooled_data_from_url('/foo'):
        pass
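# The three inner() coroutines above read like test bodies: each feeds
# spooled_data_from_url() an invalid URL (a mailto: scheme, a schemeless
# //a/b, a relative /foo), presumably expecting aiohttp.InvalidURL. A hedged
# sketch of how one such test might assert that, using pytest:
import asyncio

import aiohttp
import pytest


def test_relative_url_raises_invalid_url():
    async def inner():
        async with spooled_data_from_url('/foo'):
            pass

    with pytest.raises(aiohttp.InvalidURL):
        asyncio.run(inner())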