def test_no_tables(self):
        with mock.patch("pandas.read_html") as readmock:
            readmock.return_value = []
            fetch_result = fetch(url="http://example.org")

        self.assertEqual(
            fetch_result,
            ProcessResult.coerce("Did not find any <table> tags on that page"),
        )
 def test_coerce_tuple_dataframe_i18n_dict(self):
     df = pd.DataFrame({"foo": ["bar"]})
     expected = ProcessResult(
         df,
         [RenderError(I18nMessage("message.id", {"param1": "a"}, None))],
         json={"a": "b"},
     )
     result = ProcessResult.coerce((df, ("message.id", {"param1": "a"}), {"a": "b"}))
     self.assertEqual(result, expected)
 def test_coerce_3tuple_i18n(self):
     self.assertEqual(
         ProcessResult.coerce(("my_id", {"hello": "there"}, "cjwmodule")),
         ProcessResult(
             errors=[
                 RenderError(I18nMessage("my_id", {"hello": "there"}, "cjwmodule"))
             ]
         ),
     )
Example #4
 def test_coerce_validate_processresult(self):
     """ProcessResult.coerce(<ProcessResult>) should raise on error."""
     # render() gets access to a fetch_result. Imagine this module:
     #
     # def render(table, params, *, fetch_result):
     #     fetch_result.dataframe.drop(0, inplace=True)
     #     return fetch_result  # invalid index
     #
     # We could (and maybe should) avoid this by banning ProcessResult
     # retvals from `render()`. But to be consistent we'd need to ban
     # ProcessResult retvals from `fetch()`; and that'd take a few hours.
     #
     # TODO ban `ProcessResult` retvals from `fetch()`, then raise
     # ValueError on ProcessResult.coerce(<ProcessResult>).
     fetch_result = ProcessResult(pd.DataFrame({"A": [1, 2, 3]}))
     fetch_result.dataframe.drop(0, inplace=True)  # bad index
     with self.assertRaisesRegex(ValueError, "must use the default RangeIndex"):
         ProcessResult.coerce(fetch_result)
 def test_coerce_tuple_none_i18n_dict(self):
     expected = ProcessResult(
         errors=[RenderError(I18nMessage("message.id", {"param1": "a"}, None))],
         json={"a": "b"},
     )
     result = ProcessResult.coerce(
         (None, ("message.id", {"param1": "a"}), {"a": "b"})
     )
     self.assertEqual(result, expected)
Example #6
 def test_coerce_dict_quickfix_multiple(self):
     dataframe = pd.DataFrame({"A": [1, 2]})
     result = ProcessResult.coerce(
         {
             "dataframe": dataframe,
             "errors": [
                 {
                     "message": "an error",
                     "quickFixes": [
                         dict(
                             text="Hi",
                             action="prependModule",
                             args=["texttodate", {"column": "created_at"}],
                         ),
                         dict(
                             text=("message.id", {}),
                             action="prependModule",
                             args=["texttodate", {"column": "created_at"}],
                         ),
                     ],
                 },
                 "other error",
             ],
             "json": {"foo": "bar"},
         }
     )
     expected = ProcessResult(
         dataframe,
         errors=[
             RenderError(
                 TODO_i18n("an error"),
                 [
                     QuickFix(
                         TODO_i18n("Hi"),
                         QuickFixAction.PrependStep(
                             "texttodate", {"column": "created_at"}),
                     ),
                     QuickFix(
                         I18nMessage("message.id", {}, None),
                         QuickFixAction.PrependStep(
                             "texttodate", {"column": "created_at"}),
                     ),
                 ],
             ),
             RenderError(TODO_i18n("other error")),
         ],
         json={"foo": "bar"},
     )
     self.assertEqual(result, expected)
Example #7
 def test_coerce_dict_quickfix_dict_not_json_serializable(self):
     with self.assertRaises(ValueError):
         ProcessResult.coerce(
             {
                 "errors": [
                     {
                         "message": "an error",
                         "quickFixes": [
                             {
                                 "text": "Hi",
                                 "action": "prependModule",
                                 "args": [
                                     "texttodate",
                                     {"columns": pd.Index(["created_at"])},
                                 ],
                             }
                         ],
                     }
                 ]
             }
         )
Example #8
 def test_coerce_dict_bad_quickfix_dict(self):
     with self.assertRaises(ValueError):
         ProcessResult.coerce(
             {
                 "errors": [
                     {
                         "message": "an error",
                         "quickFixes": [
                             {
                                 "text": "Hi",
                                 "action": "prependModule",
                                 "arguments": [
                                     "texttodate",
                                     {"column": "created_at"},
                                 ],
                             }
                         ],
                     }
                 ]
             }
         )
Example #9
    def _test(
        self,
        table: pd.DataFrame,
        params: Dict[str, Any] = {},
        expected_table: pd.DataFrame = pd.DataFrame(),
        expected_error: str = "",
    ):
        result = ProcessResult.coerce(formula.render(table, P(**params)))
        expected = ProcessResult(expected_table, expected_error)

        self.assertEqual(result.error, expected.error)
        assert_frame_equal(result.dataframe, expected.dataframe)
Example #10
 def test_coerce_infer_columns_with_format(self):
     table = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
     result = ProcessResult.coerce(
         {"dataframe": table, "column_formats": {"A": "{:,d}"}}
     )
     self.assertEqual(
         result.columns,
         [
             Column("A", ColumnType.NUMBER(format="{:,d}")),
             Column("B", ColumnType.TEXT()),
         ],
     )
Example #11
 def test_coerce_infer_columns_format_supercedes_try_fallback_columns(self):
     table = pd.DataFrame({"A": [1, 2]})
     result = ProcessResult.coerce(
         {
             "dataframe": table,
             "column_formats": {
                 "A": "{:,d}"
             }
         },
         try_fallback_columns=[Column("A", ColumnType.Number("{:,.2f}"))],
     )
     self.assertEqual(result.columns,
                      [Column("A", ColumnType.Number("{:,d}"))])
Example #12
 def test_coerce_infer_columns_try_fallback_columns_ignore_wrong_type(self):
     table = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
     result = ProcessResult.coerce(
         table,
         try_fallback_columns=[
             Column("A", ColumnType.TEXT()),
             Column("B", ColumnType.NUMBER()),
         ],
     )
     self.assertEqual(
         result.columns,
         [Column("A", ColumnType.NUMBER()), Column("B", ColumnType.TEXT())],
     )
 def test_coerce_infer_columns_try_fallback_columns(self):
     table = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
     result = ProcessResult.coerce(
         table,
         try_fallback_columns=[
             Column("A", ColumnType.Number("{:,d}")),
             Column("B", ColumnType.Text()),
         ],
     )
     self.assertEqual(
         result.columns,
         [Column("A", ColumnType.Number("{:,d}")), Column("B", ColumnType.Text())],
     )
 def test_coerce_infer_columns_with_unit(self):
     table = pd.DataFrame(
         {"A": [pd.Period("2021-01-01", freq="D"), None], "B": ["x", "y"]}
     )
     result = ProcessResult.coerce(
         {"dataframe": table, "column_formats": {"A": "year"}}
     )
     self.assertEqual(
         result.columns,
         [
             Column("A", ColumnType.Date(unit="year")),
             Column("B", ColumnType.Text()),
         ],
     )
 def test_coerce_dict_with_quickfix_not_json_serializable(self):
     dataframe = pd.DataFrame({"A": [1, 2]})
     with self.assertRaises(ValueError):
         ProcessResult.coerce(
             {
                 "dataframe": dataframe,
                 "errors": [
                     {
                         "message": "an error",
                         "quickFixes": [
                             dict(
                                 text="Hi",
                                 action="prependModule",
                                 args=[
                                     "texttodate",
                                     {"columns": pd.Index(["created_at"])},
                                 ],
                             )
                         ],
                     }
                 ],
                 "json": {"foo": "bar"},
             }
         )
 def test_coerce_dict_legacy(self):
     dataframe = pd.DataFrame({"A": [1, 2]})
     result = ProcessResult.coerce(
         {
             "dataframe": dataframe,
             "error": "an error",
             "json": {"foo": "bar"},
             "quick_fixes": [],
         }
     )
     expected = ProcessResult(
         dataframe,
         [RenderError(TODO_i18n("an error"), [])],
         json={"foo": "bar"},
     )
     self.assertEqual(result, expected)
Example #17
def _safe_parse(bytesio: io.BytesIO,
                parser: Callable[[io.BytesIO], pd.DataFrame]) -> ProcessResult:
    """
    Run the given parser, or return the error as a string.

    Empty dataset is not an error: it is just an empty dataset.
    """
    try:
        return ProcessResult.coerce(parser(bytesio))
    except BadInput as err:
        return ProcessResult(error=str(err))
    except json.decoder.JSONDecodeError as err:
        return ProcessResult(error=str(err))
    except pd.errors.EmptyDataError:
        return ProcessResult()
    except pd.errors.ParserError as err:
        return ProcessResult(error=str(err))
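
For context, here is a minimal usage sketch of _safe_parse(). The _example_parse_json callable is a hypothetical illustration, not part of the original module; it only shows the shape of a parser that _safe_parse() can wrap.

import io
import json

import pandas as pd


def _example_parse_json(bytesio: io.BytesIO) -> pd.DataFrame:
    # Hypothetical parser: a json.decoder.JSONDecodeError raised here is
    # caught by _safe_parse() and converted into an error ProcessResult.
    return pd.DataFrame.from_records(json.load(bytesio))


# A well-formed payload coerces into a ProcessResult wrapping the DataFrame;
# a malformed payload becomes an error result; an empty one an empty result.
result = _safe_parse(io.BytesIO(b'[{"A": 1}, {"A": 2}]'), _example_parse_json)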
 def test_to_arrow_empty_dataframe(self):
     fd, filename = tempfile.mkstemp()
     # Write junk to the file; then we'll test that ProcessResult.to_arrow()
     # overwrites it with an empty Arrow table on error
     os.write(fd, b"to-remove")
     os.close(fd)
     try:
         result = ProcessResult.coerce("bad, bad error").to_arrow(Path(filename))
         self.assertEqual(
             result,
             atypes.RenderResult(
                 [RenderError(TODO_i18n("bad, bad error"), [])],
                 {},
             ),
         )
         assert_arrow_table_equals(
             load_untrusted_arrow_file_with_columns(Path(filename))[0], make_table()
         )
     finally:
         os.unlink(filename)
Example #19
 def test_coerce_dict_with_quickfix_tuple(self):
     dataframe = pd.DataFrame({"A": [1, 2]})
     quick_fix = QuickFix(
         "Hi", "prependModule", ["texttodate", {"column": "created_at"}]
     )
     result = ProcessResult.coerce(
         {
             "dataframe": dataframe,
             "error": "an error",
             "json": {"foo": "bar"},
             "quick_fixes": [
                 ("Hi", "prependModule", "texttodate", {"column": "created_at"})
             ],
         }
     )
     expected = ProcessResult(
         dataframe, "an error", json={"foo": "bar"}, quick_fixes=[quick_fix]
     )
     self.assertEqual(result, expected)
Example #20
 def test_to_arrow_empty_dataframe(self):
     fd, filename = tempfile.mkstemp()
     os.close(fd)
     # Remove the file. Then we'll test that ProcessResult.to_arrow() does
     # not write it (because the result is an error)
     os.unlink(filename)
     try:
         result = ProcessResult.coerce("bad, bad error").to_arrow(
             Path(filename))
         self.assertEqual(
             result,
             atypes.RenderResult(
                 atypes.ArrowTable(None, None, TableMetadata(0, [])),
                 [RenderError(TODO_i18n("bad, bad error"), [])],
                 {},
             ),
         )
         with self.assertRaises(FileNotFoundError):
             open(filename)
     finally:
         try:
             os.unlink(filename)
         except FileNotFoundError:
             pass
Example #21
 def test_to_arrow_normal_dataframe(self):
     fd, filename = tempfile.mkstemp()
     os.close(fd)
     # Remove the file. Then we'll test that ProcessResult.to_arrow() does
     # not write it (because the result is an error)
     os.unlink(filename)
     try:
         process_result = ProcessResult.coerce(pd.DataFrame({"A": [1, 2]}))
         result = process_result.to_arrow(Path(filename))
         self.assertEqual(
             result,
             atypes.RenderResult(
                 atypes.ArrowTable(
                     Path(filename),
                     atypes.TableMetadata(
                         2,
                         [
                             atypes.Column(
                                 "A",
                                 atypes.ColumnType.Number(
                                     # Whatever .format
                                     # ProcessResult.coerce() gave
                                     process_result.columns[0].type.format
                                 ),
                             )
                         ],
                     ),
                 ),
                 [],
                 {},
             ),
         )
         arrow_table = result.table.table
         self.assertEqual(arrow_table.to_pydict(), {"A": [1, 2]})
     finally:
         os.unlink(filename)
Example #22
async def fetch(params):
    # We delve into pd.read_html()'s innards below; that requires running
    # pandas' first-use parser-importer initialization up front.
    pd.io.html._importers()

    table = None
    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult.coerce("Table number must be at least 1")

    result = None

    try:
        async with di.spooled_data_from_url(url) as (spool, headers, charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult.coerce("Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult.coerce("Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult.coerce("Error from server: %d %s" %
                                    (err.status, err.message))
    except aiohttp.ClientError as err:
        return ProcessResult.coerce(str(err))
    except ValueError:
        return ProcessResult.coerce(
            "Did not find any <table> tags on that page")
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult.coerce("Table has no columns")

    if not tables:
        return ProcessResult.coerce(
            "Did not find any <table> tags on that page")

    if tablenum >= len(tables):
        return ProcessResult.coerce(
            f"The maximum table number on this page is {len(tables)}")

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
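
For comparison, here is a rough sketch (not part of the module) of the same fetch through the public pandas API. pd.read_html() accepts flavor, match and displayed_only, but it has no na_filter/dtype hooks, so pandas would autoconvert cell types; that is exactly what the pd.io.html._parse() call above avoids. The URL is a placeholder.

import pandas as pd

# Sketch only: values would be type-converted by pandas here, unlike in
# fetch(), which keeps every cell as text and autocasts dtypes itself.
tables = pd.read_html(
    "http://example.org/page-with-tables",
    flavor="html5lib",  # same parser the module forces, for reproducibility
    match=".+",
    displayed_only=False,
)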
Example #23
 def test_coerce_tuple_none_str_none(self):
     expected = ProcessResult(error="hi")
     result = ProcessResult.coerce((None, "hi", None))
     self.assertEqual(result, expected)
Example #24
 def test_coerce_empty_dict(self):
     result = ProcessResult.coerce({})
     expected = ProcessResult()
     self.assertEqual(result, expected)
Example #25
 def test_coerce_invalid_value(self):
     result = ProcessResult.coerce([None, "foo"])
     self.assertIsNotNone(result.error)
Example #26
 def test_coerce_dict_wrong_key(self):
     with self.assertRaises(ValueError):
         ProcessResult.coerce({"table": pd.DataFrame({"A": [1]})})
Example #27
 def test_coerce_tuple_none_none_dict(self):
     expected = ProcessResult(json={"a": "b"})
     result = ProcessResult.coerce((None, None, {"a": "b"}))
     self.assertEqual(result, expected)
Example #28
 def test_coerce_3tuple_no_dataframe(self):
     result = ProcessResult.coerce(("foo", "bar", {"a": "b"}))
     self.assertIsNotNone(result.error)
Example #29
 def test_coerce_bad_tuple(self):
     result = ProcessResult.coerce(("foo", "bar", "baz", "moo"))
     self.assertIsNotNone(result.error)
Example #30
 def test_coerce_tuple_none_none_none(self):
     expected = ProcessResult()
     result = ProcessResult.coerce((None, None, None))
     self.assertEqual(result, expected)