def test_to_arrow_empty_dataframe(self):
    # Reserve a unique path, then delete the file right away so we can check
    # that ProcessResult.to_arrow() does not write it (the result is an
    # error).
    handle, arrow_path = tempfile.mkstemp()
    os.close(handle)
    os.unlink(arrow_path)

    try:
        error_result = ProcessResult.coerce("bad, bad error")
        arrow_result = error_result.to_arrow(Path(arrow_path))
        expected = atypes.RenderResult(
            atypes.ArrowTable(None, None, TableMetadata(0, [])),
            [RenderError(TODO_i18n("bad, bad error"), [])],
            {},
        )
        self.assertEqual(arrow_result, expected)
        # Opening the path must fail: nothing may have been written there.
        with self.assertRaises(FileNotFoundError):
            open(arrow_path)
    finally:
        # Best-effort cleanup in case to_arrow() did create the file.
        try:
            os.unlink(arrow_path)
        except FileNotFoundError:
            pass
def test_coerce_processresult(self):
    # coerce() given a ProcessResult must hand back that very same object.
    original = ProcessResult()
    self.assertIs(ProcessResult.coerce(original), original)
def test_bad_server(self):
    # fetch() is expected to report the server's 500 status as an error.
    # NOTE(review): how the 500 response is simulated is not visible here.
    actual = fetch(url="http://example.org")
    expected = ProcessResult(error="Error from server: 500 Server Error")
    self.assertEqual(actual, expected)
def test_table_index_under(self):
    # The URL is deliberately malformed: with tablenum=0 we should never
    # even validate the URL.
    bad_url = "http:INVALID:URL"
    actual = fetch(url=bad_url, tablenum=0)
    expected = ProcessResult(error="Table number must be at least 1")
    self.assertEqual(actual, expected)
def test_render_error_process_result(self):
    # render() given an errored fetch_result should yield that error, "hi".
    input_table = pd.DataFrame()
    params = P()
    errored_fetch = ProcessResult(error="hi")
    rendered = render(input_table, params, fetch_result=errored_fetch)
    assert_process_result_equal(rendered, "hi")
def test_empty_table_shape(self):
    # A default-constructed ProcessResult has zero rows and no columns.
    empty = ProcessResult()
    self.assertEqual(empty.table_shape, TableShape(0, []))
def test_empty_columns(self):
    # A default-constructed ProcessResult exposes empty column lists.
    empty = ProcessResult()
    self.assertEqual(empty.column_names, [])
    self.assertEqual(empty.columns, [])
def test_status_error(self):
    # An empty DataFrame paired with a message reports status "error".
    errored = ProcessResult(pd.DataFrame(), "error")
    self.assertEqual(errored.status, "error")
def test_coerce_tuple_none_str_none(self):
    # (None, message, None) coerces to an error-only ProcessResult.
    want = ProcessResult(error="hi")
    self.assertEqual(ProcessResult.coerce((None, "hi", None)), want)
def test_coerce_tuple_none_str_dict(self):
    # (None, message, dict) coerces to a ProcessResult with error and json.
    want = ProcessResult(error="hi", json={"a": "b"})
    got = ProcessResult.coerce((None, "hi", {"a": "b"}))
    self.assertEqual(got, want)
def test_coerce_tuple_dataframe_none_none(self):
    # (DataFrame, None, None) coerces to a ProcessResult holding the table.
    table = pd.DataFrame({"foo": ["bar"]})
    want = ProcessResult(table)
    self.assertEqual(ProcessResult.coerce((table, None, None)), want)
def test_coerce_tuple_dataframe_none_dict(self):
    # (DataFrame, None, dict) is expected to equal a ProcessResult built
    # with an empty error string and the same json.
    table = pd.DataFrame({"foo": ["bar"]})
    want = ProcessResult(table, "", json={"a": "b"})
    got = ProcessResult.coerce((table, None, {"a": "b"}))
    self.assertEqual(got, want)
def test_coerce_tuple_dataframe_str(self):
    # A 2-tuple (DataFrame, message) coerces to table plus error.
    table = pd.DataFrame({"foo": ["bar"]})
    want = ProcessResult(dataframe=table, error="hi")
    self.assertEqual(ProcessResult.coerce((table, "hi")), want)
def test_coerce_str(self):
    # A bare string coerces to an error-only ProcessResult.
    want = ProcessResult(error="yay")
    self.assertEqual(ProcessResult.coerce("yay"), want)
def test_coerce_dataframe(self):
    # A bare DataFrame coerces to a ProcessResult wrapping that table.
    table = pd.DataFrame({"foo": ["bar"]})
    want = ProcessResult(dataframe=table)
    self.assertEqual(ProcessResult.coerce(table), want)
def test_status_ok_with_warning(self):
    # A table with data plus a message still reports status "ok".
    warned = ProcessResult(pd.DataFrame({"A": [1]}), "warning")
    self.assertEqual(warned.status, "ok")
def test_status_ok_with_no_rows(self):
    # A table that has a column but zero rows, with no message, is "ok".
    rowless = ProcessResult(pd.DataFrame({"A": []}), "")
    self.assertEqual(rowless.status, "ok")
def test_coerce_tuple_none_none_dict(self):
    # (None, None, dict) coerces to a json-only ProcessResult.
    want = ProcessResult(json={"a": "b"})
    got = ProcessResult.coerce((None, None, {"a": "b"}))
    self.assertEqual(got, want)
def test_status_unreachable(self):
    # An empty DataFrame with an empty message reports "unreachable".
    blank = ProcessResult(pd.DataFrame(), "")
    self.assertEqual(blank.status, "unreachable")
def test_coerce_tuple_none_none_none(self):
    # An all-None 3-tuple coerces to a default ProcessResult.
    want = ProcessResult()
    self.assertEqual(ProcessResult.coerce((None, None, None)), want)
def test_table_shape(self):
    # Three rows in a single numeric column "A".
    table = pd.DataFrame({"A": [1, 2, 3]})
    shape = ProcessResult(table).table_shape
    self.assertEqual(shape, TableShape(3, [Column("A", ColumnType.NUMBER())]))
def test_coerce_bad_tuple(self):
    # A 4-tuple of strings must yield a result whose error is set.
    coerced = ProcessResult.coerce(("foo", "bar", "baz", "moo"))
    self.assertIsNotNone(coerced.error)
async def fetch(params):
    """Download the page at ``params["url"]`` and return one of its tables.

    Args:
        params: mapping with ``"url"`` (a string; surrounding whitespace is
            stripped) and ``"tablenum"`` (1-based position of the wanted
            table on the page).

    Returns:
        A ProcessResult wrapping the selected table (after
        ``truncate_in_place_if_too_big()``), or a ProcessResult whose
        ``error`` explains the failure: table number below 1, timeout,
        invalid URL, server/client error, no ``<table>`` tags, table with no
        columns, or table number beyond the tables found.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error="Table number must be at least 1")

    try:
        async with moduleutils.spooled_data_from_url(url) as (spool, headers, charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with moduleutils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f"Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult(error="Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult(
            error=("Error from server: %d %s" % (err.status, err.message))
        )
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(error="Did not find any <table> tags on that page")
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error="Table has no columns")

    if not tables:
        return ProcessResult(error="Did not find any <table> tags on that page")

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f"The maximum table number on this page is {len(tables)}")
        )

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    moduleutils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
def test_coerce_3tuple_no_dataframe(self):
    # A 3-tuple whose first item is a string (not a DataFrame) must yield a
    # result whose error is set.
    coerced = ProcessResult.coerce(("foo", "bar", {"a": "b"}))
    self.assertIsNotNone(coerced.error)
def test_render_empty_process_result(self):
    # render() given a default fetch_result should yield an empty DataFrame.
    input_table = pd.DataFrame()
    params = P(has_header=False)
    empty_fetch = ProcessResult()
    rendered = render(input_table, params, fetch_result=empty_fetch)
    assert_process_result_equal(rendered, pd.DataFrame())
def test_coerce_dict_wrong_key(self):
    # A dict keyed by "table" must make coerce() raise ValueError.
    with self.assertRaises(ValueError):
        ProcessResult.coerce({"table": pd.DataFrame({"A": [1]})})
def test_table_index_over(self):
    # Asking for table 2 should produce the "maximum table number" error.
    # NOTE(review): the page served for this URL is not visible here.
    actual = fetch(url="http://example.org", tablenum=2)
    expected = ProcessResult(error="The maximum table number on this page is 1")
    self.assertEqual(actual, expected)
def test_coerce_empty_dict(self):
    # An empty dict coerces to a default ProcessResult.
    got = ProcessResult.coerce({})
    want = ProcessResult()
    self.assertEqual(got, want)
def test_404(self):
    # fetch() is expected to report the server's 404 status as an error.
    # NOTE(review): how the 404 response is simulated is not visible here.
    actual = fetch(url="http://example.org")
    expected = ProcessResult(error="Error from server: 404 Not Found")
    self.assertEqual(actual, expected)
def test_coerce_invalid_value(self):
    # A list (rather than a tuple) must yield a result whose error is set.
    coerced = ProcessResult.coerce([None, "foo"])
    self.assertIsNotNone(coerced.error)