def test_autocast_mixed_types_to_str(self):
    """A column mixing str and int values is expected to end up all-str."""
    # This matters in particular for Excel data, which is often a mix of
    # int and str.
    frame = pd.DataFrame({"A": ["1A", 2]})
    want = pd.DataFrame({"A": ["1A", "2"]})

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_cast_crazy_types(self):
    """Arbitrary objects in a column are expected to become their str()."""

    class Stringy:
        # Minimal object whose only interesting behaviour is __str__.
        def __init__(self, text):
            self.text = text

        def __str__(self):
            return self.text

    cells = [Stringy("o1"), Stringy("o2")]
    frame = pd.DataFrame({"A": cells})
    want = pd.DataFrame({"A": ["o1", "o2"]})

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def render(table, params, *, fetch_result):
    """Turn a fetch result into this step's output.

    * No fetch result: the incoming ``table`` is returned untouched.
    * Fetch result with status ``"error"``: the fetch result is returned.
    * Otherwise the fetched dataframe is returned, paired with
      ``fetch_result.error`` in a tuple when that error is truthy.

    When ``params["first_row_is_header"]`` is set and the fetched table has
    at least one row, that row is promoted to column names (blank names
    fall back to "Column N") and removed from the data. The fetched
    dataframe is modified in place.
    """
    if not fetch_result:
        return table
    if fetch_result.status == "error":
        return fetch_result

    fetched = fetch_result.dataframe
    promote_header: bool = params["first_row_is_header"]

    # With zero rows there is no header row to promote: no-op.
    if promote_header and len(fetched) >= 1:
        first_row = fetched.iloc[0, :]
        raw_names = (
            str(cell) or ("Column %d" % (position + 1))
            for position, cell in enumerate(first_row)
        )
        fetched.columns = list(moduleutils.uniquize_colnames(raw_names))
        fetched.drop(index=0, inplace=True)
        fetched.reset_index(drop=True, inplace=True)
        moduleutils.autocast_dtypes_in_place(fetched)

    warning = fetch_result.error
    return (fetched, warning) if warning else fetched
def render(table, params, *, fetch_result):
    """Turn a fetch result into this step's output.

    Returns the incoming ``table`` when there is no fetch result, the fetch
    result itself when its status is ``"error"``, and otherwise the fetched
    dataframe -- as a ``(dataframe, error)`` tuple if ``fetch_result.error``
    is truthy.

    When ``params["first_row_is_header"]`` is set and the fetched table has
    at least one row, the first row's values (as str) are passed through
    ``gen_unique_clean_colnames`` to become the column names, and the row is
    removed. The fetched dataframe is modified in place.
    """
    if not fetch_result:
        return table
    if fetch_result.status == "error":
        return fetch_result

    fetched = fetch_result.dataframe
    promote_header: bool = params["first_row_is_header"]

    # With zero rows there is no header row to promote: no-op.
    if promote_header and len(fetched) >= 1:
        # TODO inform user of column-rename warnings
        header_cells = [str(cell) for cell in fetched.iloc[0, :]]
        cleaned = gen_unique_clean_colnames(header_cells)
        fetched.columns = [colname.name for colname in cleaned]
        fetched.drop(index=0, inplace=True)
        fetched.reset_index(drop=True, inplace=True)
        moduleutils.autocast_dtypes_in_place(fetched)

    warning = fetch_result.error
    if warning:
        return (fetched, warning)
    return fetched
async def fetch(params):
    """Download a web page and return one of its HTML tables.

    Reads ``params["url"]`` (surrounding whitespace is stripped) and
    ``params["tablenum"]`` (1-based, as shown to the user).

    Returns a ``ProcessResult``. On success it wraps the selected table
    after colspan headers are merged, dtypes are autocast and the result is
    truncated if too big. Otherwise it carries an ``error`` message: bad
    table number, timeout, invalid URL, HTTP/client error, no ``<table>``
    tags found, or a table without columns.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params["url"].strip()
    tablenum: int = params["tablenum"] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error="Table number must be at least 1")

    try:
        async with moduleutils.spooled_data_from_url(url) as (
            spool,
            headers,
            charset,
        ):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with moduleutils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments (passed by name):
                    flavor="html5lib",  # force algorithm, for reproducibility
                    io=textio,
                    match=".+",
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f"Timeout fetching {url}")
    except aiohttp.InvalidURL:
        return ProcessResult(error="Invalid URL")
    except aiohttp.ClientResponseError as err:
        return ProcessResult(
            error=("Error from server: %d %s" % (err.status, err.message))
        )
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(error="Did not find any <table> tags on that page")
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error="Table has no columns")

    if not tables:
        return ProcessResult(error="Did not find any <table> tags on that page")

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f"The maximum table number on this page is {len(tables)}")
        )

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    moduleutils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)

    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
def test_autocast_float_from_str_categories_with_empty_str(self):
    """Categorical number strings plus "" are expected as float64 with NaN."""
    # example: used read_csv(dtype='category'), now want floats
    raw = ["1", "2.1", ""]
    frame = pd.DataFrame({"A": raw}, dtype="category")
    want = pd.DataFrame({"A": [1.0, 2.1, np.nan]}, dtype=np.float64)

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_int_from_str_categories(self):
    """Categorical integer strings are expected to become an int column."""
    # example: used read_csv(dtype='category'), now want ints
    raw = ["1", "2"]
    frame = pd.DataFrame({"A": raw}, dtype="category")
    want = pd.DataFrame({"A": [1, 2]})

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_int_from_str(self):
    """Plain integer strings are expected to become an int column."""
    frame = pd.DataFrame({"A": ["1", "2"]})
    want = pd.DataFrame({"A": [1, 2]})

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_all_empty_or_null_categories_is_text(self):
    """A categorical column of only "" and NaN is expected to stay as-is."""
    cells = ["", np.nan, ""]
    frame = pd.DataFrame({"A": cells}, dtype="category")
    want = pd.DataFrame({"A": cells}, dtype="category")

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_all_empty_str_is_text(self):
    """A column holding only empty strings is expected to stay unchanged."""
    frame = pd.DataFrame({"A": ["", ""]})
    want = pd.DataFrame({"A": ["", ""]})

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_all_null_is_text(self):
    """An object column holding only NaN is expected to stay object dtype."""
    cells = [np.nan, np.nan]
    frame = pd.DataFrame({"A": cells}, dtype=object)
    want = pd.DataFrame({"A": cells}, dtype=object)

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_str_categories_from_str_categories(self):
    """Categories containing a non-numeric string are expected to survive."""
    cells = ["1", "2.1", "Yay"]
    frame = pd.DataFrame({"A": cells}, dtype="category")
    want = pd.DataFrame({"A": cells}, dtype="category")

    autocast_dtypes_in_place(frame)  # should be no-op

    assert_frame_equal(frame, want)
def test_autocast_int_from_str_categories_with_empty_str(self):
    """Categorical int strings plus "" are expected as float64 with NaN."""
    raw = ["", "", "1"]
    frame = pd.DataFrame({"A": raw}, dtype="category")
    want = pd.DataFrame({"A": [np.nan, np.nan, 1.0]}, dtype=np.float64)

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)
def test_autocast_float_from_str_categories_with_dup_floats(self):
    """Categories "1" and "1.0" are expected to both become float 1.0."""
    raw = ["1", "1.0"]
    frame = pd.DataFrame({"A": raw}, dtype="category")
    want = pd.DataFrame({"A": [1.0, 1.0]}, dtype=np.float64)

    autocast_dtypes_in_place(frame)

    assert_frame_equal(frame, want)