예제 #1
0
    def test_truncate_not_too_big(self):
        """A three-row result is expected to survive truncation unchanged."""
        frame = pd.DataFrame({"foo": ["foo", "bar", "baz"]})
        # Snapshot taken before the call so an in-place edit would be caught.
        untouched = ProcessResult(frame.copy())

        actual = ProcessResult(frame)
        actual.truncate_in_place_if_too_big()

        self.assertEqual(actual, untouched)
예제 #2
0
 def test_truncate_too_big_remove_unused_categories(self):
     """Truncating a categorical column should also drop unused categories.

     NOTE(review): the row limit is not set inside this method; the
     two-row expectation presumably relies on configuration applied
     elsewhere -- confirm.
     """
     oversized = pd.DataFrame({"A": ["x", "y", "z", "z"]}, dtype="category")
     result = ProcessResult(oversized)
     result.truncate_in_place_if_too_big()

     actual_frame = result.dataframe
     # The expected frame is built from "x" and "y" only, so "z" is not
     # among its categories.
     trimmed = pd.DataFrame({"A": ["x", "y"]}, dtype="category")
     assert_frame_equal(actual_frame, trimmed)
예제 #3
0
    def test_truncate_too_big_and_error(self):
        """Truncation should append its notice to an already-present error.

        NOTE(review): the row limit of 2 is not set inside this method;
        presumably it is configured elsewhere -- confirm.
        """
        wanted = ProcessResult(
            dataframe=pd.DataFrame({"foo": ["bar", "baz"]}),
            error="Some error\nTruncated output from 3 rows to 2",
        )

        actual = ProcessResult(
            pd.DataFrame({"foo": ["bar", "baz", "moo"]}),
            error="Some error",
        )
        actual.truncate_in_place_if_too_big()

        self.assertEqual(actual, wanted)
예제 #4
0
    def test_truncate_too_big_no_error(self):
        """Truncating an error-free result should add exactly one warning.

        NOTE(review): the row limit of 2 is not set inside this method;
        presumably it is configured elsewhere -- confirm.
        """
        # The single warning expected after truncation: 3 rows cut to 2.
        truncation_warning = RenderError(
            I18nMessage(
                "py.cjwkernel.pandas.types.ProcessResult.truncate_in_place_if_too_big.warning",
                {"old_number": 3, "new_number": 2},
                None,
            )
        )
        wanted = ProcessResult(
            dataframe=pd.DataFrame({"foo": ["bar", "baz"]}),
            errors=[truncation_warning],
        )

        actual = ProcessResult(
            pd.DataFrame({"foo": ["bar", "baz", "moo"]}),
            errors=[],
        )
        actual.truncate_in_place_if_too_big()

        self.assertEqual(actual, wanted)
예제 #5
0
async def fetch(params):
    """Fetch an HTML page and return one of its tables as a ProcessResult.

    Reads two entries from ``params``:

    * ``params["url"]``: address of the page; surrounding whitespace is
      stripped before use.
    * ``params["tablenum"]``: which table to return, counted from 1.

    Returns a ProcessResult wrapping the chosen table (after header merging,
    dtype autocasting and truncation), or a ProcessResult carrying only an
    error message when the table number is out of range, the download fails,
    or no usable table is parsed.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error="Table number must be at least 1")

    try:
        async with moduleutils.spooled_data_from_url(url) as (
            spool,
            headers,
            charset,
        ):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with moduleutils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Arguments pd.read_html() would normally forward:
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    # NOTE(review): keep the handlers in this order. The specific aiohttp
    # errors must be tried before the generic ClientError, and InvalidURL
    # before ValueError -- confirm against the aiohttp exception hierarchy
    # before reordering.
    except asyncio.TimeoutError:
        return ProcessResult(error=f"Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult(error="Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult(
            error=("Error from server: %d %s" % (err.status, err.message))
        )
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error="Did not find any <table> tags on that page")
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error="Table has no columns")

    if not tables:
        return ProcessResult(
            error="Did not find any <table> tags on that page")

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f"The maximum table number on this page is {len(tables)}"))

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    moduleutils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result