Beispiel #1
0
 def test_to_arrow_empty_dataframe(self):
     """to_arrow() on an error-only result reports the error, writes no file.

     The temporary path is deleted up front, so opening it afterwards must
     raise FileNotFoundError unless to_arrow() created the file.
     """
     handle, arrow_path = tempfile.mkstemp()
     os.close(handle)
     # Only the unique name is wanted: the file itself must not exist when
     # to_arrow() runs.
     os.unlink(arrow_path)
     try:
         actual = ProcessResult.coerce("bad, bad error").to_arrow(
             Path(arrow_path)
         )
         expected = atypes.RenderResult(
             atypes.ArrowTable(None, None, TableMetadata(0, [])),
             [RenderError(TODO_i18n("bad, bad error"), [])],
             {},
         )
         self.assertEqual(actual, expected)
         with self.assertRaises(FileNotFoundError):
             open(arrow_path)
     finally:
         # Best-effort cleanup in case the file was written after all.
         try:
             os.unlink(arrow_path)
         except FileNotFoundError:
             pass
Beispiel #2
0
 def test_coerce_processresult(self):
     """coerce() hands back the very same ProcessResult object it was given."""
     original = ProcessResult()
     self.assertIs(ProcessResult.coerce(original), original)
Beispiel #3
0
    def test_bad_server(self):
        """A 500 response is reported as an "Error from server" result."""
        expected = ProcessResult(error="Error from server: 500 Server Error")
        self.assertEqual(fetch(url="http://example.org"), expected)
Beispiel #4
0
 def test_table_index_under(self):
     """A table number of 0 is rejected with a "must be at least 1" error."""
     # we should never even validate the URL
     bogus_url = "http:INVALID:URL"
     outcome = fetch(url=bogus_url, tablenum=0)
     expected = ProcessResult(error="Table number must be at least 1")
     self.assertEqual(outcome, expected)
Beispiel #5
0
 def test_render_error_process_result(self):
     """render() surfaces the error carried by the fetch result."""
     failed_fetch = ProcessResult(error="hi")
     rendered = render(pd.DataFrame(), P(), fetch_result=failed_fetch)
     assert_process_result_equal(rendered, "hi")
Beispiel #6
0
 def test_empty_table_shape(self):
     """A default ProcessResult has a table shape of zero rows, no columns."""
     self.assertEqual(ProcessResult().table_shape, TableShape(0, []))
Beispiel #7
0
 def test_empty_columns(self):
     """A default ProcessResult exposes empty column_names and columns."""
     blank = ProcessResult()
     self.assertEqual(blank.column_names, [])
     self.assertEqual(blank.columns, [])
Beispiel #8
0
 def test_status_error(self):
     """An empty DataFrame plus a non-empty message yields status "error"."""
     status = ProcessResult(pd.DataFrame(), "error").status
     self.assertEqual(status, "error")
Beispiel #9
0
 def test_coerce_tuple_none_str_none(self):
     """coerce((None, str, None)) equals ProcessResult(error=str)."""
     want = ProcessResult(error="hi")
     self.assertEqual(ProcessResult.coerce((None, "hi", None)), want)
Beispiel #10
0
 def test_coerce_tuple_none_str_dict(self):
     """coerce((None, str, dict)) keeps both the error and the JSON."""
     payload = {"a": "b"}
     want = ProcessResult(error="hi", json={"a": "b"})
     self.assertEqual(ProcessResult.coerce((None, "hi", payload)), want)
Beispiel #11
0
 def test_coerce_tuple_dataframe_none_none(self):
     """coerce((DataFrame, None, None)) equals ProcessResult(DataFrame)."""
     frame = pd.DataFrame({"foo": ["bar"]})
     want = ProcessResult(frame)
     self.assertEqual(ProcessResult.coerce((frame, None, None)), want)
Beispiel #12
0
 def test_coerce_tuple_dataframe_none_dict(self):
     """coerce((DataFrame, None, dict)) keeps the table and the JSON."""
     frame = pd.DataFrame({"foo": ["bar"]})
     want = ProcessResult(frame, "", json={"a": "b"})
     got = ProcessResult.coerce((frame, None, {"a": "b"}))
     self.assertEqual(got, want)
Beispiel #13
0
 def test_coerce_tuple_dataframe_str(self):
     """coerce((DataFrame, str)) keeps both the table and the error."""
     frame = pd.DataFrame({"foo": ["bar"]})
     want = ProcessResult(dataframe=frame, error="hi")
     self.assertEqual(ProcessResult.coerce((frame, "hi")), want)
Beispiel #14
0
 def test_coerce_str(self):
     """coerce(str) equals ProcessResult(error=str)."""
     want = ProcessResult(error="yay")
     self.assertEqual(ProcessResult.coerce("yay"), want)
Beispiel #15
0
 def test_coerce_dataframe(self):
     """coerce(DataFrame) equals ProcessResult(dataframe=DataFrame)."""
     frame = pd.DataFrame({"foo": ["bar"]})
     want = ProcessResult(dataframe=frame)
     self.assertEqual(ProcessResult.coerce(frame), want)
Beispiel #16
0
 def test_status_ok_with_warning(self):
     """A one-row table passed together with a message still has status "ok"."""
     one_row = pd.DataFrame({"A": [1]})
     self.assertEqual(ProcessResult(one_row, "warning").status, "ok")
Beispiel #17
0
 def test_status_ok_with_no_rows(self):
     """A table with a column but zero rows, and no message, has status "ok"."""
     zero_rows = pd.DataFrame({"A": []})
     self.assertEqual(ProcessResult(zero_rows, "").status, "ok")
Beispiel #18
0
 def test_coerce_tuple_none_none_dict(self):
     """coerce((None, None, dict)) equals ProcessResult(json=dict)."""
     want = ProcessResult(json={"a": "b"})
     self.assertEqual(ProcessResult.coerce((None, None, {"a": "b"})), want)
Beispiel #19
0
 def test_status_unreachable(self):
     """An empty DataFrame with an empty message has status "unreachable"."""
     status = ProcessResult(pd.DataFrame(), "").status
     self.assertEqual(status, "unreachable")
Beispiel #20
0
 def test_coerce_tuple_none_none_none(self):
     """coerce((None, None, None)) equals a default ProcessResult."""
     want = ProcessResult()
     self.assertEqual(ProcessResult.coerce((None, None, None)), want)
Beispiel #21
0
 def test_table_shape(self):
     """table_shape reports the row count and one NUMBER column "A"."""
     frame = pd.DataFrame({"A": [1, 2, 3]})
     shape = ProcessResult(frame).table_shape
     want = TableShape(3, [Column("A", ColumnType.NUMBER())])
     self.assertEqual(shape, want)
Beispiel #22
0
 def test_coerce_bad_tuple(self):
     """coerce() of a 4-tuple must produce a result that carries an error."""
     result = ProcessResult.coerce(("foo", "bar", "baz", "moo"))
     # assertTrue rather than assertIsNotNone: an empty error string is not
     # None, so the weaker check would pass even if no error was reported.
     self.assertTrue(result.error)
Beispiel #23
0
async def fetch(params):
    """Download an HTML page and return one of its tables.

    Reads ``params["url"]`` (surrounding whitespace is stripped) and
    ``params["tablenum"]``, the 1-based number of the wanted table.

    Returns a ProcessResult wrapping the selected table, or a ProcessResult
    whose ``error`` says what went wrong: table number below 1, timeout,
    invalid URL, server/client error, no tables on the page, a table without
    columns, or a table number beyond the last table.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user

    # Checked before any network access, so a bad table number is reported
    # without the URL ever being used.
    if tablenum < 0:
        return ProcessResult(error="Table number must be at least 1")

    try:
        async with moduleutils.spooled_data_from_url(url) as (spool, headers,
                                                              charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with moduleutils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f"Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult(error="Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=("Error from server: %d %s" %
                                    (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error="Did not find any <table> tags on that page")
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error="Table has no columns")

    if not tables:
        return ProcessResult(
            error="Did not find any <table> tags on that page")

    if tablenum >= len(tables):
        return ProcessResult(
            error=f"The maximum table number on this page is {len(tables)}")

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    moduleutils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
Beispiel #24
0
 def test_coerce_3tuple_no_dataframe(self):
     """coerce() of a 3-tuple whose first item is a str must carry an error."""
     result = ProcessResult.coerce(("foo", "bar", {"a": "b"}))
     # assertTrue rather than assertIsNotNone: an empty error string is not
     # None, so the weaker check would pass even if no error was reported.
     self.assertTrue(result.error)
Beispiel #25
0
 def test_render_empty_process_result(self):
     """render() with a default fetch result yields an empty DataFrame."""
     empty_fetch = ProcessResult()
     rendered = render(
         pd.DataFrame(), P(has_header=False), fetch_result=empty_fetch
     )
     assert_process_result_equal(rendered, pd.DataFrame())
Beispiel #26
0
 def test_coerce_dict_wrong_key(self):
     """coerce() raises ValueError for a dict keyed by "table"."""
     bogus = {"table": pd.DataFrame({"A": [1]})}
     with self.assertRaises(ValueError):
         ProcessResult.coerce(bogus)
Beispiel #27
0
 def test_table_index_over(self):
     """Asking for table 2 yields the "maximum table number ... is 1" error."""
     expected = ProcessResult(
         error="The maximum table number on this page is 1"
     )
     outcome = fetch(url="http://example.org", tablenum=2)
     self.assertEqual(outcome, expected)
Beispiel #28
0
 def test_coerce_empty_dict(self):
     """coerce({}) equals a default ProcessResult."""
     coerced = ProcessResult.coerce({})
     self.assertEqual(coerced, ProcessResult())
Beispiel #29
0
 def test_404(self):
     """A 404 response is reported as an "Error from server" result."""
     expected = ProcessResult(error="Error from server: 404 Not Found")
     self.assertEqual(fetch(url="http://example.org"), expected)
Beispiel #30
0
 def test_coerce_invalid_value(self):
     """coerce() of a list must produce a result that carries an error."""
     result = ProcessResult.coerce([None, "foo"])
     # assertTrue rather than assertIsNotNone: an empty error string is not
     # None, so the weaker check would pass even if no error was reported.
     self.assertTrue(result.error)