def test_no_tables(self):
    """Fetching a page with zero <table> tags reports a coerced error."""
    with mock.patch("pandas.read_html") as read_html_mock:
        read_html_mock.return_value = []
        fetch_result = fetch(url="http://example.org")
    expected = ProcessResult.coerce(
        "Did not find any <table> tags on that page"
    )
    self.assertEqual(fetch_result, expected)
def test_coerce_tuple_dataframe_i18n_dict(self):
    """(dataframe, i18n-tuple, json-dict) coerces to a full ProcessResult."""
    table = pd.DataFrame({"foo": ["bar"]})
    coerced = ProcessResult.coerce(
        (table, ("message.id", {"param1": "a"}), {"a": "b"})
    )
    self.assertEqual(
        coerced,
        ProcessResult(
            table,
            [RenderError(I18nMessage("message.id", {"param1": "a"}, None))],
            json={"a": "b"},
        ),
    )
def test_coerce_3tuple_i18n(self):
    """An (id, args, source) 3-tuple coerces to one i18n RenderError."""
    coerced = ProcessResult.coerce(("my_id", {"hello": "there"}, "cjwmodule"))
    message = I18nMessage("my_id", {"hello": "there"}, "cjwmodule")
    self.assertEqual(coerced, ProcessResult(errors=[RenderError(message)]))
def test_coerce_validate_processresult(self):
    """ProcessResult.coerce(<ProcessResult>) should raise on error.

    render() gets access to a fetch_result. Imagine this module:

        def render(table, params, *, fetch_result):
            fetch_result.dataframe.drop(0, inplace=True)
            return fetch_result  # invalid index

    We could (and maybe should) avoid this by banning ProcessResult
    retvals from `render()`. But to be consistent we'd need to ban
    ProcessResult retvals from `fetch()`; and that'd take a few hours.

    TODO ban `ProcessResult` retvals from `fetch()`, then raise
    ValueError on ProcessResult.coerce(<ProcessResult>).
    """
    fetch_result = ProcessResult(pd.DataFrame({"A": [1, 2, 3]}))
    fetch_result.dataframe.drop(0, inplace=True)  # break the RangeIndex
    with self.assertRaisesRegex(ValueError, "must use the default RangeIndex"):
        ProcessResult.coerce(fetch_result)
def test_coerce_tuple_none_i18n_dict(self):
    """(None, i18n-tuple, json-dict) coerces to errors + json, no table."""
    coerced = ProcessResult.coerce(
        (None, ("message.id", {"param1": "a"}), {"a": "b"})
    )
    self.assertEqual(
        coerced,
        ProcessResult(
            errors=[
                RenderError(I18nMessage("message.id", {"param1": "a"}, None))
            ],
            json={"a": "b"},
        ),
    )
def test_coerce_dict_quickfix_multiple(self):
    """A dict error entry may carry several quick fixes; all are kept."""
    dataframe = pd.DataFrame({"A": [1, 2]})
    value = {
        "dataframe": dataframe,
        "errors": [
            {
                "message": "an error",
                "quickFixes": [
                    {
                        "text": "Hi",
                        "action": "prependModule",
                        "args": ["texttodate", {"column": "created_at"}],
                    },
                    {
                        "text": ("message.id", {}),
                        "action": "prependModule",
                        "args": ["texttodate", {"column": "created_at"}],
                    },
                ],
            },
            "other error",
        ],
        "json": {"foo": "bar"},
    }
    result = ProcessResult.coerce(value)
    expected = ProcessResult(
        dataframe,
        errors=[
            RenderError(
                TODO_i18n("an error"),
                [
                    QuickFix(
                        TODO_i18n("Hi"),
                        QuickFixAction.PrependStep(
                            "texttodate", {"column": "created_at"}
                        ),
                    ),
                    QuickFix(
                        I18nMessage("message.id", {}, None),
                        QuickFixAction.PrependStep(
                            "texttodate", {"column": "created_at"}
                        ),
                    ),
                ],
            ),
            RenderError(TODO_i18n("other error")),
        ],
        json={"foo": "bar"},
    )
    self.assertEqual(result, expected)
def test_coerce_dict_quickfix_dict_not_json_serializable(self):
    """coerce() raises when a quick-fix arg can't be JSON-encoded."""
    value = {
        "errors": [
            {
                "message": "an error",
                "quickFixes": [
                    {
                        "text": "Hi",
                        "action": "prependModule",
                        "args": [
                            "texttodate",
                            {"columns": pd.Index(["created_at"])},
                        ],
                    }
                ],
            }
        ]
    }
    with self.assertRaises(ValueError):
        ProcessResult.coerce(value)
def test_coerce_dict_bad_quickfix_dict(self):
    """coerce() raises on a quick-fix dict with a wrong key ("arguments")."""
    value = {
        "errors": [
            {
                "message": "an error",
                "quickFixes": [
                    {
                        "text": "Hi",
                        "action": "prependModule",
                        "arguments": [
                            "texttodate",
                            {"column": "created_at"},
                        ],
                    }
                ],
            }
        ]
    }
    with self.assertRaises(ValueError):
        ProcessResult.coerce(value)
def _test(
    self,
    table: pd.DataFrame,
    params: Dict[str, Any] = None,
    expected_table: pd.DataFrame = None,
    expected_error: str = "",
):
    """Render `formula` on `table` and check its error and dataframe.

    Fix: the previous defaults (`params={}`,
    `expected_table=pd.DataFrame()`) were mutable objects evaluated once
    at definition time and shared across every call — the classic
    mutable-default-argument pitfall. None sentinels preserve the exact
    same default behavior without the shared state.
    """
    if params is None:
        params = {}
    if expected_table is None:
        expected_table = pd.DataFrame()
    result = ProcessResult.coerce(formula.render(table, P(**params)))
    expected = ProcessResult(expected_table, expected_error)
    self.assertEqual(result.error, expected.error)
    assert_frame_equal(result.dataframe, expected.dataframe)
def test_coerce_infer_columns_with_format(self):
    """A column_formats entry overrides the inferred number format."""
    result = ProcessResult.coerce(
        {
            "dataframe": pd.DataFrame({"A": [1, 2], "B": ["x", "y"]}),
            "column_formats": {"A": "{:,d}"},
        }
    )
    expected_columns = [
        Column("A", ColumnType.NUMBER(format="{:,d}")),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(result.columns, expected_columns)
def test_coerce_infer_columns_format_supercedes_try_fallback_columns(self):
    """An explicit column_formats entry wins over a fallback column."""
    fallbacks = [Column("A", ColumnType.Number("{:,.2f}"))]
    result = ProcessResult.coerce(
        {
            "dataframe": pd.DataFrame({"A": [1, 2]}),
            "column_formats": {"A": "{:,d}"},
        },
        try_fallback_columns=fallbacks,
    )
    self.assertEqual(result.columns, [Column("A", ColumnType.Number("{:,d}"))])
def test_coerce_infer_columns_try_fallback_columns_ignore_wrong_type(self):
    """Fallback columns whose types don't match the data are ignored."""
    fallbacks = [
        Column("A", ColumnType.TEXT()),
        Column("B", ColumnType.NUMBER()),
    ]
    result = ProcessResult.coerce(
        pd.DataFrame({"A": [1, 2], "B": ["x", "y"]}),
        try_fallback_columns=fallbacks,
    )
    self.assertEqual(
        result.columns,
        [Column("A", ColumnType.NUMBER()), Column("B", ColumnType.TEXT())],
    )
def test_coerce_infer_columns_try_fallback_columns(self):
    """Fallback columns with matching types are adopted as-is."""
    fallbacks = [
        Column("A", ColumnType.Number("{:,d}")),
        Column("B", ColumnType.Text()),
    ]
    result = ProcessResult.coerce(
        pd.DataFrame({"A": [1, 2], "B": ["x", "y"]}),
        try_fallback_columns=fallbacks,
    )
    self.assertEqual(
        result.columns,
        [Column("A", ColumnType.Number("{:,d}")), Column("B", ColumnType.Text())],
    )
def test_coerce_infer_columns_with_unit(self):
    """column_formats may name a date unit for a period column."""
    table = pd.DataFrame(
        {"A": [pd.Period("2021-01-01", freq="D"), None], "B": ["x", "y"]}
    )
    result = ProcessResult.coerce(
        {"dataframe": table, "column_formats": {"A": "year"}}
    )
    expected_columns = [
        Column("A", ColumnType.Date(unit="year")),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(result.columns, expected_columns)
def test_coerce_dict_with_quickfix_not_json_serializable(self):
    """coerce() raises when a quick-fix arg can't be JSON-encoded."""
    value = {
        "dataframe": pd.DataFrame({"A": [1, 2]}),
        "errors": [
            {
                "message": "an error",
                "quickFixes": [
                    {
                        "text": "Hi",
                        "action": "prependModule",
                        "args": [
                            "texttodate",
                            {"columns": pd.Index(["created_at"])},
                        ],
                    }
                ],
            }
        ],
        "json": {"foo": "bar"},
    }
    with self.assertRaises(ValueError):
        ProcessResult.coerce(value)
def test_coerce_dict_legacy(self):
    """Legacy dict keys (error/json/quick_fixes) still coerce."""
    dataframe = pd.DataFrame({"A": [1, 2]})
    legacy = {
        "dataframe": dataframe,
        "error": "an error",
        "json": {"foo": "bar"},
        "quick_fixes": [],
    }
    self.assertEqual(
        ProcessResult.coerce(legacy),
        ProcessResult(
            dataframe,
            [RenderError(TODO_i18n("an error"), [])],
            json={"foo": "bar"},
        ),
    )
def _safe_parse(
    bytesio: io.BytesIO, parser: Callable[[bytes], pd.DataFrame]
) -> ProcessResult:
    """Run the given parser, or return the error as a string.

    Empty dataset is not an error: it is just an empty dataset.
    """
    try:
        return ProcessResult.coerce(parser(bytesio))
    except pd.errors.EmptyDataError:
        # No rows at all: a legitimately empty result, not a failure.
        return ProcessResult()
    except (BadInput, json.decoder.JSONDecodeError, pd.errors.ParserError) as err:
        # All three mean "malformed input"; the previous code had three
        # identical handlers — consolidated into one, same behavior.
        return ProcessResult(error=str(err))
def test_to_arrow_empty_dataframe(self):
    """On error, to_arrow() writes an empty table over existing bytes."""
    fd, filename = tempfile.mkstemp()
    # Pre-fill the file so we can verify to_arrow() replaces its contents.
    os.write(fd, b"to-remove")
    os.close(fd)
    try:
        path = Path(filename)
        result = ProcessResult.coerce("bad, bad error").to_arrow(path)
        self.assertEqual(
            result,
            atypes.RenderResult(
                [RenderError(TODO_i18n("bad, bad error"), [])],
                {},
            ),
        )
        loaded_table, _ = load_untrusted_arrow_file_with_columns(path)
        assert_arrow_table_equals(loaded_table, make_table())
    finally:
        os.unlink(filename)
def test_coerce_dict_with_quickfix_tuple(self):
    """Legacy quick_fixes tuples coerce to QuickFix objects."""
    dataframe = pd.DataFrame({"A": [1, 2]})
    result = ProcessResult.coerce(
        {
            "dataframe": dataframe,
            "error": "an error",
            "json": {"foo": "bar"},
            "quick_fixes": [
                ("Hi", "prependModule", "texttodate", {"column": "created_at"})
            ],
        }
    )
    quick_fix = QuickFix(
        "Hi", "prependModule", ["texttodate", {"column": "created_at"}]
    )
    expected = ProcessResult(
        dataframe, "an error", json={"foo": "bar"}, quick_fixes=[quick_fix]
    )
    self.assertEqual(result, expected)
def test_to_arrow_empty_dataframe(self):
    """On error, to_arrow() must not create the output file."""
    fd, filename = tempfile.mkstemp()
    os.close(fd)
    # Remove the file up front so we can check it is never (re)created.
    os.unlink(filename)
    try:
        result = ProcessResult.coerce("bad, bad error").to_arrow(Path(filename))
        expected = atypes.RenderResult(
            atypes.ArrowTable(None, None, TableMetadata(0, [])),
            [RenderError(TODO_i18n("bad, bad error"), [])],
            {},
        )
        self.assertEqual(result, expected)
        with self.assertRaises(FileNotFoundError):
            open(filename)
    finally:
        try:
            os.unlink(filename)
        except FileNotFoundError:
            pass
def test_to_arrow_normal_dataframe(self):
    """A successful result writes an Arrow file with matching metadata."""
    fd, filename = tempfile.mkstemp()
    os.close(fd)
    # Remove the file up front; to_arrow() is responsible for creating it.
    os.unlink(filename)
    try:
        process_result = ProcessResult.coerce(pd.DataFrame({"A": [1, 2]}))
        result = process_result.to_arrow(Path(filename))
        # Whatever .format ProcessResult.coerce() gave column A
        column_format = process_result.columns[0].type.format
        expected = atypes.RenderResult(
            atypes.ArrowTable(
                Path(filename),
                atypes.TableMetadata(
                    2,
                    [
                        atypes.Column(
                            "A", atypes.ColumnType.Number(column_format)
                        )
                    ],
                ),
            ),
            [],
            {},
        )
        self.assertEqual(result, expected)
        arrow_table = result.table.table
        self.assertEqual(arrow_table.to_pydict(), {"A": [1, 2]})
    finally:
        os.unlink(filename)
async def fetch(params):
    """Fetch an HTML page and extract its `tablenum`th <table>.

    Returns a ProcessResult: either the parsed, truncated table or a
    user-facing error message describing what went wrong.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user
    if tablenum < 0:
        return ProcessResult.coerce("Table number must be at least 1")

    try:
        async with di.spooled_data_from_url(url) as (spool, headers, charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        # BUGFIX: the message was missing its f-prefix, so users saw the
        # literal text "{url}" instead of the requested URL.
        return ProcessResult.coerce(f"Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult.coerce("Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult.coerce(
            "Error from server: %d %s" % (err.status, err.message)
        )
    except aiohttp.ClientError as err:
        return ProcessResult.coerce(str(err))
    except ValueError:
        return ProcessResult.coerce("Did not find any <table> tags on that page")
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult.coerce("Table has no columns")

    if not tables:
        return ProcessResult.coerce("Did not find any <table> tags on that page")
    if tablenum >= len(tables):
        return ProcessResult.coerce(
            f"The maximum table number on this page is {len(tables)}"
        )

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
def test_coerce_tuple_none_str_none(self):
    """(None, message, None) coerces to an error-only ProcessResult."""
    self.assertEqual(
        ProcessResult.coerce((None, "hi", None)),
        ProcessResult(error="hi"),
    )
def test_coerce_empty_dict(self):
    """An empty dict coerces to an empty ProcessResult."""
    self.assertEqual(ProcessResult.coerce({}), ProcessResult())
def test_coerce_invalid_value(self):
    """A value coerce() doesn't understand becomes an error result."""
    coerced = ProcessResult.coerce([None, "foo"])
    self.assertIsNotNone(coerced.error)
def test_coerce_dict_wrong_key(self):
    """A dict with an unrecognized key makes coerce() raise."""
    with self.assertRaises(ValueError):
        ProcessResult.coerce({"table": pd.DataFrame({"A": [1]})})
def test_coerce_tuple_none_none_dict(self):
    """(None, None, json-dict) coerces to a json-only ProcessResult."""
    self.assertEqual(
        ProcessResult.coerce((None, None, {"a": "b"})),
        ProcessResult(json={"a": "b"}),
    )
def test_coerce_3tuple_no_dataframe(self):
    """A 3-tuple whose first element isn't a dataframe yields an error."""
    coerced = ProcessResult.coerce(("foo", "bar", {"a": "b"}))
    self.assertIsNotNone(coerced.error)
def test_coerce_bad_tuple(self):
    """A tuple of the wrong length yields an error result."""
    coerced = ProcessResult.coerce(("foo", "bar", "baz", "moo"))
    self.assertIsNotNone(coerced.error)
def test_coerce_tuple_none_none_none(self):
    """(None, None, None) coerces to an empty ProcessResult."""
    self.assertEqual(
        ProcessResult.coerce((None, None, None)),
        ProcessResult(),
    )