def test_autocast_mixed_types_to_str(self):
    """A column mixing str and int values is expected to become all-str."""
    # This matters in particular for Excel data, which is often a mix of
    # int and str.
    want = pd.DataFrame({"A": ["1A", "2"]})
    frame = pd.DataFrame({"A": ["1A", 2]})

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_cast_crazy_types(self):
    """Arbitrary objects are expected to come out as their ``__str__`` text."""

    class Wrapped:
        # Minimal custom type whose str() is the text it was built with.
        def __init__(self, text):
            self.text = text

        def __str__(self):
            return self.text

    first, second = Wrapped("o1"), Wrapped("o2")
    frame = pd.DataFrame({"A": [first, second]})

    autocast_dtypes_in_place(frame)

    want = pd.DataFrame({"A": ["o1", "o2"]})
    assert_frame_equal(frame, want)
def test_autocast_cast_crazy_types(self):
    """Arbitrary objects are expected to come out as their ``__str__`` text."""

    class Wrapped:
        # Minimal custom type whose str() is the text it was built with.
        def __init__(self, text):
            self.text = text

        def __str__(self):
            return self.text

    first, second = Wrapped('o1'), Wrapped('o2')
    frame = pd.DataFrame({'A': [first, second]})

    autocast_dtypes_in_place(frame)

    want = pd.DataFrame({'A': ['o1', 'o2']})
    assert_frame_equal(frame, want)
def render(table, params, *, fetch_result):
    """Turn a previously fetched result into this step's output.

    Args:
        table: the incoming table; returned untouched when there is no
            fetch result yet.
        params: step parameters; ``params['first_row_is_header']`` decides
            whether the first data row is promoted to column names.
        fetch_result: the stored fetch outcome (keyword-only). It is read
            through its ``status``, ``dataframe`` and ``error`` attributes.

    Returns:
        ``table`` if ``fetch_result`` is falsy; ``fetch_result`` itself if
        its status is ``'error'``; otherwise the fetched dataframe, paired
        with ``fetch_result.error`` in a tuple when that error is truthy.
    """
    if not fetch_result:
        return table
    if fetch_result.status == 'error':
        return fetch_result

    frame = fetch_result.dataframe
    promote_header: bool = params['first_row_is_header']

    # With zero rows there is no first row to promote, so this is a no-op.
    if promote_header and len(frame) >= 1:
        first_row = frame.iloc[0, :]
        # Blank header cells fall back to a 1-based positional name.
        candidate_names = (
            str(cell) or ('Column %d' % (position + 1))
            for position, cell in enumerate(first_row)
        )
        frame.columns = list(utils.uniquize_colnames(candidate_names))
        frame.drop(index=0, inplace=True)
        frame.reset_index(drop=True, inplace=True)
        # NOTE(review): presumably re-run because removing the header row
        # can change which dtype fits each column -- confirm.
        utils.autocast_dtypes_in_place(frame)

    if fetch_result.error:
        return (frame, fetch_result.error)
    return frame
def test_autocast_float_from_str_categories_with_dup_floats(self):
    """Categories "1" and "1.0" are both expected to become float 1.0."""
    want = pd.DataFrame({"A": [1.0, 1.0]}, dtype=np.float64)
    frame = pd.DataFrame({"A": ["1", "1.0"]}, dtype="category")

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_float_from_str_categories_with_empty_str(self):
    """Numeric-looking categories become floats; "" is expected to be NaN."""
    # Example: data was read with read_csv(dtype='category') and floats are
    # wanted now.
    want = pd.DataFrame({"A": [1.0, 2.1, np.nan]}, dtype=np.float64)
    frame = pd.DataFrame({"A": ["1", "2.1", ""]}, dtype="category")

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_int_from_str_categories(self):
    """Integer-looking string categories are expected to become ints."""
    # Example: data was read with read_csv(dtype='category') and ints are
    # wanted now.
    want = pd.DataFrame({"A": [1, 2]})
    frame = pd.DataFrame({"A": ["1", "2"]}, dtype="category")

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_int_from_str(self):
    """Integer-looking strings are expected to become ints."""
    want = pd.DataFrame({"A": [1, 2]})
    frame = pd.DataFrame({"A": ["1", "2"]})

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_all_empty_or_null_categories_is_text(self):
    """A categorical column holding only "" and NaN is expected unchanged."""
    want = pd.DataFrame({"A": ["", np.nan, ""]}, dtype="category")
    frame = pd.DataFrame({"A": ["", np.nan, ""]}, dtype="category")

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_int_from_str(self):
    """Integer-looking strings are expected to become ints."""
    want = pd.DataFrame({'A': [1, 2]})
    frame = pd.DataFrame({'A': ['1', '2']})

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
async def fetch(params):
    """Download the page at ``params['url']`` and extract one of its tables.

    Args:
        params: step parameters. ``params['url']`` is stripped of
            surrounding whitespace before use; ``params['tablenum']`` is
            the 1-based number of the table to pick.

    Returns:
        A ``ProcessResult``. On success it wraps the selected table (after
        header merging, dtype autocasting and truncation); on failure it
        carries only an ``error`` message describing what went wrong.
    """
    # We reach into pd.read_html()'s internals below, and that requires some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params['url'].strip()
    table_index: int = params['tablenum'] - 1  # users count tables from 1

    if table_index < 0:
        return ProcessResult(error='Table number must be at least 1')

    # pandas.read_html() does automatic type conversion, but we prefer our
    # own. Calling pd.io.html._parse() directly lets us pass every
    # conversion kwarg we want through to TextParser.
    parse_kwargs = dict(
        # Arguments pd.read_html() would normally supply:
        flavor='html5lib',  # force algorithm, for reproducibility
        match='.+',
        attrs=None,
        encoding=None,  # the text stream is already decoded
        displayed_only=False,  # avoid dud feature: it ignores CSS
        # Required kwargs that pd.read_html() would set by default:
        header=None,
        skiprows=None,
        # kwargs we only get to add because of this hack:
        na_filter=False,  # do not autoconvert
        dtype=str,  # do not autoconvert
    )

    try:
        async with utils.spooled_data_from_url(url) as (
            spool,
            _headers,
            charset,
        ):
            with utils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(io=textio, **parse_kwargs)
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as exc:
        return ProcessResult(
            error=('Error from server: %d %s' % (exc.status, exc.message))
        )
    except aiohttp.ClientError as exc:
        return ProcessResult(error=str(exc))
    except ValueError:
        return ProcessResult(
            error='Did not find any <table> tags on that page'
        )
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error='Table has no columns')

    if not tables:
        return ProcessResult(
            error='Did not find any <table> tags on that page'
        )

    if table_index >= len(tables):
        return ProcessResult(
            error=(f'The maximum table number on this page is {len(tables)}')
        )

    # pd.read_html() guarantees unique colnames
    chosen = tables[table_index]
    merge_colspan_headers_in_place(chosen)
    utils.autocast_dtypes_in_place(chosen)
    if len(chosen) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        chosen.reset_index(drop=True, inplace=True)

    outcome = ProcessResult(dataframe=chosen)
    outcome.truncate_in_place_if_too_big()
    return outcome
def test_autocast_str_categories_from_str_categories(self):
    """Categories with a non-numeric value are expected to stay as they are."""
    want = pd.DataFrame({'A': ['1', '2.1', 'Yay']}, dtype='category')
    frame = pd.DataFrame({'A': ['1', '2.1', 'Yay']}, dtype='category')

    # Should be a no-op for this input.
    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_int_from_str_categories_with_empty_str(self):
    """Int-like categories plus "" are expected to become float64 with NaN."""
    want = pd.DataFrame({'A': [np.nan, np.nan, 1.0]}, dtype=np.float64)
    frame = pd.DataFrame({'A': ['', '', '1']}, dtype='category')

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_float_from_str_categories_with_dup_floats(self):
    """Categories '1' and '1.0' are both expected to become float 1.0."""
    want = pd.DataFrame({'A': [1.0, 1.0]}, dtype=np.float64)
    frame = pd.DataFrame({'A': ['1', '1.0']}, dtype='category')

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_float_from_str_categories(self):
    """Float-looking string categories are expected to become float64."""
    # Example: data was read with read_csv(dtype='category') and floats are
    # wanted now.
    want = pd.DataFrame({'A': [1.0, 2.1]}, dtype=np.float64)
    frame = pd.DataFrame({'A': ['1', '2.1']}, dtype='category')

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_int_from_str_categories_with_empty_str(self):
    """Int-like categories plus "" are expected to become float64 with NaN."""
    want = pd.DataFrame({"A": [np.nan, np.nan, 1.0]}, dtype=np.float64)
    frame = pd.DataFrame({"A": ["", "", "1"]}, dtype="category")

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_str_categories_from_str_categories(self):
    """Categories with a non-numeric value are expected to stay as they are."""
    want = pd.DataFrame({"A": ["1", "2.1", "Yay"]}, dtype="category")
    frame = pd.DataFrame({"A": ["1", "2.1", "Yay"]}, dtype="category")

    # Should be a no-op for this input.
    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_all_null_is_text(self):
    """An all-NaN object column is expected to keep its object dtype."""
    want = pd.DataFrame({"A": [np.nan, np.nan]}, dtype=object)
    frame = pd.DataFrame({"A": [np.nan, np.nan]}, dtype=object)

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_all_empty_str_is_text(self):
    """A column of only empty strings is expected to be left unchanged."""
    frame = pd.DataFrame({"A": ["", ""]})

    autocast_dtypes_in_place(frame)

    want = pd.DataFrame({"A": ["", ""]})
    assert_frame_equal(frame, want)
def test_autocast_all_empty_or_null_categories_is_text(self):
    """A categorical column holding only '' and NaN is expected unchanged."""
    want = pd.DataFrame({'A': ['', np.nan, '']}, dtype='category')
    frame = pd.DataFrame({'A': ['', np.nan, '']}, dtype='category')

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)