Beispiel #1
0
 def test_autocast_mixed_types_to_str(self):
     # A column mixing str and int values is expected to come out all-str.
     # This matters in particular for Excel data, which often mixes the two.
     frame = pd.DataFrame({"A": ["1A", 2]})
     autocast_dtypes_in_place(frame)
     assert_frame_equal(frame, pd.DataFrame({"A": ["1A", "2"]}))
Beispiel #2
0
    def test_autocast_cast_crazy_types(self):
        # Arbitrary Python objects in a column are expected to be replaced
        # by their str() representation.
        class Obj:
            def __init__(self, text):
                self.text = text

            def __str__(self):
                return self.text

        frame = pd.DataFrame({"A": [Obj("o1"), Obj("o2")]})
        autocast_dtypes_in_place(frame)
        assert_frame_equal(frame, pd.DataFrame({"A": ["o1", "o2"]}))
Beispiel #3
0
    def test_autocast_cast_crazy_types(self):
        # Arbitrary Python objects in a column are expected to be replaced
        # by their str() representation.
        class Obj:
            def __init__(self, text):
                self.text = text

            def __str__(self):
                return self.text

        frame = pd.DataFrame({'A': [Obj('o1'), Obj('o2')]})
        autocast_dtypes_in_place(frame)
        assert_frame_equal(frame, pd.DataFrame({'A': ['o1', 'o2']}))
Beispiel #4
0
def render(table, params, *, fetch_result):
    """Turn a stored fetch result into this step's output.

    Returns, in order of precedence:

    * ``table`` unchanged when ``fetch_result`` is falsy;
    * ``fetch_result`` itself when its ``status`` is ``'error'``;
    * otherwise the fetched dataframe, paired with ``fetch_result.error``
      in a tuple when that error is truthy.

    When ``params['first_row_is_header']`` is set and the fetched dataframe
    has at least one row, the first row is promoted to column names (blank
    names become ``'Column N'``), dropped from the data, and the remaining
    columns are re-cast. All of this mutates the fetched dataframe in place.
    """
    if not fetch_result:
        return table
    if fetch_result.status == 'error':
        return fetch_result

    fetched = fetch_result.dataframe
    promote_header: bool = params['first_row_is_header']

    # With zero rows there is no header row to promote, so this is a no-op.
    if promote_header and len(fetched) >= 1:
        header_row = fetched.iloc[0, :]
        candidate_names = (
            str(cell) or ('Column %d' % (position + 1))
            for position, cell in enumerate(header_row)
        )
        fetched.columns = list(utils.uniquize_colnames(candidate_names))
        fetched.drop(index=0, inplace=True)
        fetched.reset_index(drop=True, inplace=True)
        utils.autocast_dtypes_in_place(fetched)

    warning = fetch_result.error
    return (fetched, warning) if warning else fetched
Beispiel #5
0
 def test_autocast_float_from_str_categories_with_dup_floats(self):
     # Two distinct categories ("1" and "1.0") that parse to the same float
     # are expected to yield a float64 column.
     frame = pd.DataFrame({"A": ["1", "1.0"]}, dtype="category")
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({"A": [1.0, 1.0]}, dtype=np.float64)
     assert_frame_equal(frame, want)
Beispiel #6
0
 def test_autocast_float_from_str_categories_with_empty_str(self):
     # Example: data was read with read_csv(dtype='category') and floats are
     # wanted now. The empty string is expected to become NaN.
     frame = pd.DataFrame({"A": ["1", "2.1", ""]}, dtype="category")
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({"A": [1.0, 2.1, np.nan]}, dtype=np.float64)
     assert_frame_equal(frame, want)
Beispiel #7
0
 def test_autocast_int_from_str_categories(self):
     # Example: data was read with read_csv(dtype='category') and ints are
     # wanted now.
     frame = pd.DataFrame({"A": ["1", "2"]}, dtype="category")
     autocast_dtypes_in_place(frame)
     assert_frame_equal(frame, pd.DataFrame({"A": [1, 2]}))
Beispiel #8
0
 def test_autocast_int_from_str(self):
     # Strings that all look like integers are expected to become ints.
     frame = pd.DataFrame({"A": ["1", "2"]})
     autocast_dtypes_in_place(frame)
     assert_frame_equal(frame, pd.DataFrame({"A": [1, 2]}))
Beispiel #9
0
 def test_autocast_all_empty_or_null_categories_is_text(self):
     # A categorical column holding only "" and NaN is expected to be left
     # exactly as it was.
     frame = pd.DataFrame({"A": ["", np.nan, ""]}, dtype="category")
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({"A": ["", np.nan, ""]}, dtype="category")
     assert_frame_equal(frame, want)
Beispiel #10
0
 def test_autocast_int_from_str(self):
     # Strings that all look like integers are expected to become ints.
     frame = pd.DataFrame({'A': ['1', '2']})
     autocast_dtypes_in_place(frame)
     assert_frame_equal(frame, pd.DataFrame({'A': [1, 2]}))
Beispiel #11
0
async def fetch(params):
    """Download the page at ``params['url']`` and extract one HTML table.

    ``params['url']`` is stripped of surrounding whitespace before use.
    ``params['tablenum']`` is 1-based, as presented to the user.

    Returns a ``ProcessResult``. Its ``error`` is set when the table number
    is below 1 or beyond the number of tables found, when the request times
    out or fails, or when no usable table can be parsed. Otherwise its
    ``dataframe`` holds the selected table after colspan-header merging and
    dtype autocasting, truncated in place if it is too big.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params['url'].strip()
    tablenum: int = params['tablenum'] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error='Table number must be at least 1')

    try:
        async with utils.spooled_data_from_url(url) as (spool, headers,
                                                        charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with utils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor='html5lib',  # force algorithm, for reproducibility
                    io=textio,
                    match='.+',
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s' %
                                    (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error='Did not find any <table> tags on that page')
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error='Table has no columns')

    if not tables:
        return ProcessResult(
            error='Did not find any <table> tags on that page')

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f'The maximum table number on this page is {len(tables)}'))

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    utils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
Beispiel #12
0
 def test_autocast_str_categories_from_str_categories(self):
     # One non-numeric category ('Yay') means the whole column is expected
     # to stay categorical text.
     frame = pd.DataFrame({'A': ['1', '2.1', 'Yay']}, dtype='category')
     autocast_dtypes_in_place(frame)  # should be no-op
     want = pd.DataFrame({'A': ['1', '2.1', 'Yay']}, dtype='category')
     assert_frame_equal(frame, want)
Beispiel #13
0
 def test_autocast_int_from_str_categories_with_empty_str(self):
     # Empty strings are expected to become NaN, so the integer-looking
     # value ends up in a float64 column.
     frame = pd.DataFrame({'A': ['', '', '1']}, dtype='category')
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({'A': [np.nan, np.nan, 1.0]}, dtype=np.float64)
     assert_frame_equal(frame, want)
Beispiel #14
0
 def test_autocast_float_from_str_categories_with_dup_floats(self):
     # Two distinct categories ('1' and '1.0') that parse to the same float
     # are expected to yield a float64 column.
     frame = pd.DataFrame({'A': ['1', '1.0']}, dtype='category')
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({'A': [1.0, 1.0]}, dtype=np.float64)
     assert_frame_equal(frame, want)
Beispiel #15
0
 def test_autocast_float_from_str_categories(self):
     # Example: data was read with read_csv(dtype='category') and floats are
     # wanted now.
     frame = pd.DataFrame({'A': ['1', '2.1']}, dtype='category')
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({'A': [1.0, 2.1]}, dtype=np.float64)
     assert_frame_equal(frame, want)
Beispiel #16
0
 def test_autocast_int_from_str_categories_with_empty_str(self):
     # Empty strings are expected to become NaN, so the integer-looking
     # value ends up in a float64 column.
     frame = pd.DataFrame({"A": ["", "", "1"]}, dtype="category")
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({"A": [np.nan, np.nan, 1.0]}, dtype=np.float64)
     assert_frame_equal(frame, want)
Beispiel #17
0
 def test_autocast_str_categories_from_str_categories(self):
     # One non-numeric category ("Yay") means the whole column is expected
     # to stay categorical text.
     frame = pd.DataFrame({"A": ["1", "2.1", "Yay"]}, dtype="category")
     autocast_dtypes_in_place(frame)  # should be no-op
     want = pd.DataFrame({"A": ["1", "2.1", "Yay"]}, dtype="category")
     assert_frame_equal(frame, want)
Beispiel #18
0
 def test_autocast_all_null_is_text(self):
     # An object column containing only NaN is expected to stay an object
     # column rather than being cast to a numeric dtype.
     frame = pd.DataFrame({"A": [np.nan, np.nan]}, dtype=object)
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({"A": [np.nan, np.nan]}, dtype=object)
     assert_frame_equal(frame, want)
Beispiel #19
0
 def test_autocast_all_empty_str_is_text(self):
     # A column of only empty strings is expected to be left as text.
     frame = pd.DataFrame({"A": ["", ""]})
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({"A": ["", ""]})
     assert_frame_equal(frame, want)
Beispiel #20
0
 def test_autocast_all_empty_or_null_categories_is_text(self):
     # A categorical column holding only '' and NaN is expected to be left
     # exactly as it was.
     frame = pd.DataFrame({'A': ['', np.nan, '']}, dtype='category')
     autocast_dtypes_in_place(frame)
     want = pd.DataFrame({'A': ['', np.nan, '']}, dtype='category')
     assert_frame_equal(frame, want)