예제 #1
0
 def test_autocast_mixed_types_to_str(self):
     # Columns that mix int and str are common in Excel data, so this case
     # matters: the int is expected to come out as its str form.
     frame = pd.DataFrame({"A": ["1A", 2]})
     autocast_dtypes_in_place(frame)
     assert_frame_equal(frame, pd.DataFrame({"A": ["1A", "2"]}))
예제 #2
0
    def test_autocast_cast_crazy_types(self):
        # Arbitrary objects in a column are expected to be replaced by their
        # str() representation.
        class Stringable:
            def __init__(self, text):
                self.text = text

            def __str__(self):
                return self.text

        frame = pd.DataFrame({"A": [Stringable("o1"), Stringable("o2")]})
        autocast_dtypes_in_place(frame)
        assert_frame_equal(frame, pd.DataFrame({"A": ["o1", "o2"]}))
예제 #3
0
def render(table, params, *, fetch_result):
    """
    Produce the module output from a previous fetch.

    Without a fetch result, the input `table` is passed through. A fetch
    result whose status is "error" is returned as-is. Otherwise the fetched
    dataframe is returned, paired with the fetch result's error message when
    one is set.

    When params["first_row_is_header"] is truthy and the fetched table has at
    least one row, the first row is turned into the column names, removed
    from the data, and the remaining columns are re-typed. All of this
    happens in place on the fetched dataframe.
    """
    if not fetch_result:
        return table
    if fetch_result.status == "error":
        return fetch_result

    table = fetch_result.dataframe

    promote_first_row: bool = params["first_row_is_header"]
    if promote_first_row and len(table) > 0:  # empty table: nothing to promote
        # An empty cell falls back to a 1-based positional name.
        candidate_names = (
            str(cell) or ("Column %d" % position)
            for position, cell in enumerate(table.iloc[0, :], start=1)
        )
        table.columns = list(moduleutils.uniquize_colnames(candidate_names))
        # The header row is no longer data: drop it and renumber from zero.
        table.drop(index=0, inplace=True)
        table.reset_index(drop=True, inplace=True)
        moduleutils.autocast_dtypes_in_place(table)

    warning = fetch_result.error
    return (table, warning) if warning else table
예제 #4
0
def render(table, params, *, fetch_result):
    """
    Produce the module output from a previous fetch.

    Without a fetch result, the input `table` is passed through. A fetch
    result whose status is "error" is returned as-is. Otherwise the fetched
    dataframe is returned, paired with the fetch result's error message when
    one is set.

    When params["first_row_is_header"] is truthy and the fetched table has at
    least one row, the stringified first row is run through
    gen_unique_clean_colnames() to obtain the column names; the row is then
    removed from the data and the remaining columns are re-typed. All of
    this happens in place on the fetched dataframe.
    """
    if not fetch_result:
        return table
    if fetch_result.status == "error":
        return fetch_result

    table = fetch_result.dataframe

    promote_first_row: bool = params["first_row_is_header"]
    if promote_first_row and len(table) > 0:  # empty table: nothing to promote
        # TODO inform user of column-rename warnings
        raw_names = [str(cell) for cell in table.iloc[0, :]]
        table.columns = [
            cleaned.name for cleaned in gen_unique_clean_colnames(raw_names)
        ]
        # The header row is no longer data: drop it and renumber from zero.
        table.drop(index=0, inplace=True)
        table.reset_index(drop=True, inplace=True)
        moduleutils.autocast_dtypes_in_place(table)

    warning = fetch_result.error
    return (table, warning) if warning else table
예제 #5
0
async def fetch(params):
    """
    Download the page at params["url"] and extract one of its HTML tables.

    params["tablenum"] is the 1-based number of the table to extract.

    Returns a ProcessResult. On success it wraps the selected table as a
    dataframe (after merging colspan headers, autocasting column types and
    truncating if too big). Otherwise it carries an error message describing
    what went wrong: bad table number, timeout, invalid URL, HTTP/client
    error, or no usable table on the page.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error="Table number must be at least 1")

    # Same message whether parsing raises or just yields nothing.
    no_tables_error = "Did not find any <table> tags on that page"

    try:
        async with moduleutils.spooled_data_from_url(url) as (spool, headers,
                                                              charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with moduleutils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f"Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult(error="Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=("Error from server: %d %s" %
                                    (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        # NOTE(review): presumably what the parser raises when nothing
        # matches; any other ValueError in the block above is reported the
        # same way.
        return ProcessResult(error=no_tables_error)
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error="Table has no columns")

    if not tables:
        return ProcessResult(error=no_tables_error)

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f"The maximum table number on this page is {len(tables)}"))

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    moduleutils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
예제 #6
0
 def test_autocast_float_from_str_categories_with_empty_str(self):
     # Scenario: data was loaded with read_csv(dtype='category') and should
     # end up as floats; the empty string is expected to become NaN.
     frame = pd.DataFrame({"A": ["1", "2.1", ""]}, dtype="category")
     autocast_dtypes_in_place(frame)
     assert_frame_equal(
         frame,
         pd.DataFrame({"A": [1.0, 2.1, np.nan]}, dtype=np.float64),
     )
예제 #7
0
 def test_autocast_int_from_str_categories(self):
     # Scenario: data was loaded with read_csv(dtype='category') and should
     # end up as ints.
     frame = pd.DataFrame({"A": ["1", "2"]}, dtype="category")
     autocast_dtypes_in_place(frame)
     assert_frame_equal(frame, pd.DataFrame({"A": [1, 2]}))
예제 #8
0
 def test_autocast_int_from_str(self):
     # Integer-looking strings are expected to become ints.
     frame = pd.DataFrame({"A": ["1", "2"]})
     autocast_dtypes_in_place(frame)
     assert_frame_equal(frame, pd.DataFrame({"A": [1, 2]}))
예제 #9
0
 def test_autocast_all_empty_or_null_categories_is_text(self):
     # A categorical column holding only empty strings and nulls is expected
     # to be left unchanged.
     frame = pd.DataFrame({"A": ["", np.nan, ""]}, dtype="category")
     autocast_dtypes_in_place(frame)
     assert_frame_equal(
         frame,
         pd.DataFrame({"A": ["", np.nan, ""]}, dtype="category"),
     )
예제 #10
0
 def test_autocast_all_empty_str_is_text(self):
     # A column of only empty strings is expected to be left unchanged.
     frame = pd.DataFrame({"A": ["", ""]})
     autocast_dtypes_in_place(frame)
     unchanged = pd.DataFrame({"A": ["", ""]})
     assert_frame_equal(frame, unchanged)
예제 #11
0
 def test_autocast_all_null_is_text(self):
     # An all-null object column is expected to keep its object dtype.
     frame = pd.DataFrame({"A": [np.nan, np.nan]}, dtype=object)
     autocast_dtypes_in_place(frame)
     assert_frame_equal(
         frame,
         pd.DataFrame({"A": [np.nan, np.nan]}, dtype=object),
     )
예제 #12
0
 def test_autocast_str_categories_from_str_categories(self):
     # One non-numeric value ("Yay") means the categorical column is expected
     # to stay as it is: the autocast should be a no-op.
     frame = pd.DataFrame({"A": ["1", "2.1", "Yay"]}, dtype="category")
     autocast_dtypes_in_place(frame)
     assert_frame_equal(
         frame,
         pd.DataFrame({"A": ["1", "2.1", "Yay"]}, dtype="category"),
     )
예제 #13
0
 def test_autocast_int_from_str_categories_with_empty_str(self):
     # Empty strings are expected to become NaN, so the integer-looking "1"
     # ends up in a float64 column.
     frame = pd.DataFrame({"A": ["", "", "1"]}, dtype="category")
     autocast_dtypes_in_place(frame)
     assert_frame_equal(
         frame,
         pd.DataFrame({"A": [np.nan, np.nan, 1.0]}, dtype=np.float64),
     )
예제 #14
0
 def test_autocast_float_from_str_categories_with_dup_floats(self):
     # "1" and "1.0" are distinct categories that parse to the same float;
     # both are expected to come out as 1.0.
     frame = pd.DataFrame({"A": ["1", "1.0"]}, dtype="category")
     autocast_dtypes_in_place(frame)
     assert_frame_equal(
         frame,
         pd.DataFrame({"A": [1.0, 1.0]}, dtype=np.float64),
     )