def test_empty_columns(self):
    # A ProcessResult built with no arguments should report no column
    # names and no columns.
    empty = ProcessResult()
    self.assertEqual(empty.column_names, [])
    self.assertEqual(empty.columns, [])
def test_right_is_error(self):
    # When the right-hand input carries an error, render() is expected to
    # hand back a result holding that same error.
    left = pd.DataFrame({'A': [1]})
    params = P()
    right = PR('error')
    result = render(left, params, right)
    expected = ProcessResult(error='error')
    self.assertEqual(result, expected)
def Err(error):
    """Shortcut: build a ProcessResult that carries only ``error``."""
    failure = ProcessResult(error=error)
    return failure
def test_load_csv(self):
    # Fetching the .csv URL should yield the mocked CSV table.
    csv_url = 'http://test.com/the.csv'
    fetch_result = fetch(url=csv_url)
    expected = ProcessResult(mock_csv_table)
    self.assertEqual(fetch_result, expected)
def test_no_left(self):
    # With an empty left-hand table, render() is expected to produce an
    # empty ProcessResult even though the right-hand side has data.
    left = pd.DataFrame()
    params = P()
    right = PR('', {'A': [1]})
    result = render(left, params, right)
    expected = ProcessResult()
    self.assertEqual(result, expected)
async def fake_fetch(*args, **kwargs):
    """Stand-in fetch coroutine: ignore all arguments, return a fixed table."""
    canned_table = pd.DataFrame({'A': [1]})
    return ProcessResult(canned_table)
def test_load_404(self):
    # 404 error should put module in error state
    missing_url = 'http://example.org/x.csv'
    fetch_result = fetch(url=missing_url)
    expected = ProcessResult(error='Error from server: 404 Not Found')
    self.assertEqual(fetch_result, expected)
def test_render_has_header_false(self):
    # With has_header=False the former header 'A' is expected to appear as
    # the first data row under a column named '0', with values as text.
    uploaded = ProcessResult(pd.DataFrame({'A': [1]}))
    result = render(False, uploaded)
    expected = ProcessResult(pd.DataFrame({'0': ['A', '1']}))
    self.assertEqual(result, expected)
def test_render_file_error(self):
    # An error stored in the fetch result should be passed through by
    # render() unchanged.
    failed_upload = ProcessResult(error='x')
    result = render(False, failed_upload)
    expected = ProcessResult(error='x')
    self.assertEqual(result, expected)
def test_render_no_file(self):
    # Rendering with no fetch result at all should give an empty
    # ProcessResult.
    result = render(True, None)
    expected = ProcessResult()
    self.assertEqual(result, expected)
def test_render_has_header_true(self):
    # With has_header=True the uploaded table should come back as-is.
    uploaded = ProcessResult(pd.DataFrame({'A': [1]}))
    result = render(True, uploaded)
    expected = ProcessResult(pd.DataFrame({'A': [1]}))
    self.assertEqual(result, expected)
def render(has_header, fetch_result):
    """Run uploadfile.render() on an empty table and normalize its output.

    The raw return value is coerced into a ProcessResult and sanitized in
    place before being returned.
    """
    empty_input = pd.DataFrame()
    params = {'has_header': has_header}
    raw_output = uploadfile.render(empty_input, params,
                                   fetch_result=fetch_result)
    result = ProcessResult.coerce(raw_output)
    result.sanitize_in_place()
    return result
def test_empty_table_shape(self):
    # A ProcessResult built with no arguments should have a shape of zero
    # rows and no columns.
    empty = ProcessResult()
    expected_shape = TableShape(0, [])
    self.assertEqual(empty.table_shape, expected_shape)
def test_table_shape(self):
    # A three-row, single-column numeric table should report three rows
    # and one NUMBER column named 'A'.
    table = pd.DataFrame({'A': [1, 2, 3]})
    result = ProcessResult(table)
    expected_columns = [Column('A', ColumnType.NUMBER())]
    expected_shape = TableShape(3, expected_columns)
    self.assertEqual(result.table_shape, expected_shape)
def test_xlsx_cast_colnames_to_str(self):
    # Parsing the all-numeric spreadsheet should produce a column whose
    # name is the string "1", not a number.
    fixture_path = TestDataPath / "all-numeric.xlsx"
    with fixture_path.open("rb") as file:
        result = parse_bytesio(file, "application/vnd.ms-excel", None)
    expected_table = pd.DataFrame({"1": [2]})
    expected = ProcessResult(expected_table)
    self.assertEqual(result, expected)
def fake_result(colnames):
    """Wrap the ``colnames`` selection of ``a_table`` in a ProcessResult."""
    selection = a_table[colnames]
    return ProcessResult(selection)
def test_parse_empty_csv(self):
    # Zero bytes of CSV should parse to the same result as coercing an
    # empty DataFrame whose index has been reset.
    empty_stream = io.BytesIO(b"")
    result = parse_bytesio(empty_stream, "text/csv", "utf-8")
    empty_table = pd.DataFrame().reset_index(drop=True)
    expected = ProcessResult.coerce(empty_table)
    self.assertEqual(result, expected)
def render(table, reorder_history):
    """Run reordercolumns.render() on a copy of ``table``.

    The caller's table is copied before being passed in, and the raw return
    value is coerced into a ProcessResult.
    """
    table_copy = table.copy()
    raw_output = reordercolumns.render(
        table_copy,
        {'reorder-history': reorder_history},
    )
    return ProcessResult.coerce(raw_output)
def test_load_csv_use_ext_given_bad_content_type(self):
    # return text/plain type and rely on filename detection, as
    # https://raw.githubusercontent.com/ does
    csv_url = 'http://test.com/the.csv'
    fetch_result = fetch(url=csv_url)
    expected = ProcessResult(mock_csv_table)
    self.assertEqual(fetch_result, expected)
async def fetch(params):
    """Download the page at ``params['url']`` and return one of its tables.

    ``params['url']`` is stripped of surrounding whitespace before use.
    ``params['tablenum']`` is the 1-based (user-facing) number of the table
    to extract.

    Returns a ProcessResult wrapping the selected table, after
    ``truncate_in_place_if_too_big()`` has been applied. If the table number
    is out of range, the download fails, or no usable table is found, a
    ProcessResult carrying only an ``error`` message is returned instead.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params['url'].strip()
    tablenum: int = params['tablenum'] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error='Table number must be at least 1')

    try:
        async with utils.spooled_data_from_url(url) as (spool, headers,
                                                        charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with utils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor='html5lib',  # force algorithm, for reproducibility
                    io=textio,
                    match='.+',
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        return ProcessResult(error=('Error from server: %d %s'
                                    % (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error='Did not find any <table> tags on that page')
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error='Table has no columns')

    if not tables:
        return ProcessResult(
            error='Did not find any <table> tags on that page')

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f'The maximum table number on this page is {len(tables)}'))

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    utils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
def test_bad_url(self):
    # A string that is not a URL should produce the 'Invalid URL' error.
    fetch_result = fetch(url='not a url')
    expected = ProcessResult(error='Invalid URL')
    self.assertEqual(fetch_result, expected)
def test_json_syntax_error(self):
    # Malformed JSON bytes should be reported as an "Invalid JSON" error
    # that includes the parser's own message.
    broken_json = io.BytesIO(b"{not JSON")
    result = parse_bytesio(broken_json, "application/json")
    message = ("Invalid JSON (Unexpected character found when "
               "decoding 'null')")
    expected = ProcessResult(error=message)
    self.assertEqual(result, expected)
def PR(error, *args, **kwargs):
    """Shortcut ProcessResult builder.

    Every argument after ``error`` is forwarded unchanged to
    ``pd.DataFrame`` to build the result's table.
    """
    dataframe = pd.DataFrame(*args, **kwargs)
    return ProcessResult(dataframe, error)
def test_txt_detect_separator_comma(self):
    # Comma-separated text served as text/plain should be split into two
    # columns.
    payload = io.BytesIO(b"A,C\nB,D")
    result = parse_bytesio(payload, "text/plain", "utf-8")
    expected_table = pd.DataFrame({"A": ["B"], "C": ["D"]})
    expected = ProcessResult(expected_table)
    self.assertEqual(result, expected)
def test_no_right(self):
    # With no right-hand input (None), render() is expected to produce an
    # empty ProcessResult.
    left = pd.DataFrame({'A': [1]})
    params = P()
    result = render(left, params, None)
    expected = ProcessResult()
    self.assertEqual(result, expected)
def test_csv_detect_separator_semicolon(self):
    # Semicolon-separated data served as text/csv should still be split
    # into two columns.
    payload = io.BytesIO(b"A;C\nB;D")
    result = parse_bytesio(payload, "text/csv", "utf-8")
    expected_table = pd.DataFrame({"A": ["B"], "C": ["D"]})
    expected = ProcessResult(expected_table)
    self.assertEqual(result, expected)
def table_to_result(table):
    """Wrap ``table`` in a ProcessResult and sanitize it in place."""
    wrapped = ProcessResult(table)
    # alters dataframe.equals() result
    wrapped.sanitize_in_place()
    return wrapped
def test_xls(self):
    # Parsing the example .xls fixture should yield its two numeric
    # columns, "foo" and "bar".
    fixture_path = TestDataPath / "example.xls"
    with fixture_path.open("rb") as file:
        result = parse_bytesio(file, "application/vnd.ms-excel", None)
    expected_table = pd.DataFrame({"foo": [1, 2], "bar": [2, 3]})
    expected = ProcessResult(expected_table)
    self.assertEqual(result, expected)
def test_return_str_for_error(self):
    # When the user-supplied process() returns a plain string, the result is
    # expected to carry that string as its error, with EMPTY_OUTPUT as json.
    #
    # NOTE(review): the triple-quoted source below is shown on a single line;
    # its original line breaks and indentation appear to have been lost.
    # Confirm the literal against the upstream file before relying on it.
    result = safe_eval_process(""" def process(table): return 'hi' """, EMPTY_DATAFRAME)
    self.assertEqual(result, ProcessResult(error='hi', json=EMPTY_OUTPUT))
def test_status_unreachable(self):
    # An empty table combined with an empty error string should report the
    # 'unreachable' status.
    empty_table = pd.DataFrame()
    result = ProcessResult(empty_table, '')
    self.assertEqual(result.status, 'unreachable')