Beispiel #1
0
 def test_empty_columns(self):
     # A ProcessResult built with no arguments should report no column
     # names and no columns.
     empty = ProcessResult()
     self.assertEqual(empty.column_names, [])
     self.assertEqual(empty.columns, [])
Beispiel #2
0
 def test_right_is_error(self):
     # The third argument (the "right" input, per the test name) carries the
     # error 'error'; render() is expected to return that same error.
     left = pd.DataFrame({'A': [1]})
     outcome = render(left, P(), PR('error'))
     expected = ProcessResult(error='error')
     self.assertEqual(outcome, expected)
Beispiel #3
0
def Err(error):
    """Build a ProcessResult from just an ``error`` keyword argument."""
    failed = ProcessResult(error=error)
    return failed
Beispiel #4
0
 def test_load_csv(self):
     # Fetching the CSV URL should give a result equal to one wrapping
     # mock_csv_table.
     fetched = fetch(url='http://test.com/the.csv')
     expected = ProcessResult(mock_csv_table)
     self.assertEqual(fetched, expected)
Beispiel #5
0
 def test_no_left(self):
     # With an empty first ("left") table, render() is expected to return
     # an empty ProcessResult.
     left = pd.DataFrame()
     params = P()
     right = PR('', {'A': [1]})
     outcome = render(left, params, right)
     self.assertEqual(outcome, ProcessResult())
Beispiel #6
0
 async def fake_fetch(*args, **kwargs):
     # Stand-in coroutine: ignores its arguments and always returns a
     # result wrapping a one-column, one-row table.
     canned_table = pd.DataFrame({'A': [1]})
     return ProcessResult(canned_table)
Beispiel #7
0
 def test_load_404(self):
     # A 404 error should put the module in error state: the fetch result
     # is expected to carry the server's status line as its error.
     fetched = fetch(url='http://example.org/x.csv')
     expected = ProcessResult(error='Error from server: 404 Not Found')
     self.assertEqual(fetched, expected)
Beispiel #8
0
 def test_render_has_header_false(self):
     # With has_header=False, the former header 'A' is expected to become
     # a data row in a single column named '0'.
     uploaded = ProcessResult(pd.DataFrame({'A': [1]}))
     outcome = render(False, uploaded)
     expected = ProcessResult(pd.DataFrame({'0': ['A', '1']}))
     self.assertEqual(outcome, expected)
Beispiel #9
0
 def test_render_file_error(self):
     # An error in the fetch result is expected to pass through render().
     failed_upload = ProcessResult(error='x')
     outcome = render(False, failed_upload)
     expected = ProcessResult(error='x')
     self.assertEqual(outcome, expected)
Beispiel #10
0
 def test_render_no_file(self):
     # With no fetch result at all (None), render() is expected to return
     # an empty ProcessResult.
     outcome = render(True, None)
     expected = ProcessResult()
     self.assertEqual(outcome, expected)
Beispiel #11
0
 def test_render_has_header_true(self):
     # With has_header=True, the uploaded table is expected to come back
     # unchanged.
     uploaded = ProcessResult(pd.DataFrame({'A': [1]}))
     outcome = render(True, uploaded)
     expected = ProcessResult(pd.DataFrame({'A': [1]}))
     self.assertEqual(outcome, expected)
Beispiel #12
0
def render(has_header, fetch_result):
    """Run uploadfile.render() on an empty table and return a sanitized result.

    ``has_header`` is passed as the module's 'has_header' param and
    ``fetch_result`` as its ``fetch_result`` keyword. The module's return
    value is coerced to a ProcessResult, which is then sanitized in place.
    """
    params = {'has_header': has_header}
    raw = uploadfile.render(pd.DataFrame(), params, fetch_result=fetch_result)
    coerced = ProcessResult.coerce(raw)
    coerced.sanitize_in_place()
    return coerced
Beispiel #13
0
 def test_empty_table_shape(self):
     # A ProcessResult built with no arguments should have a shape of zero
     # rows and no columns.
     shape = ProcessResult().table_shape
     self.assertEqual(shape, TableShape(0, []))
Beispiel #14
0
 def test_table_shape(self):
     # A three-row numeric column 'A' should give a shape of 3 rows and a
     # single NUMBER column.
     three_rows = pd.DataFrame({'A': [1, 2, 3]})
     shape = ProcessResult(three_rows).table_shape
     expected = TableShape(3, [Column('A', ColumnType.NUMBER())])
     self.assertEqual(shape, expected)
Beispiel #15
0
 def test_xlsx_cast_colnames_to_str(self):
     # all-numeric.xlsx is expected to parse into a table whose column name
     # is the string "1" rather than a number.
     xlsx_path = TestDataPath / "all-numeric.xlsx"
     with xlsx_path.open("rb") as xlsx:
         outcome = parse_bytesio(xlsx, "application/vnd.ms-excel", None)
     self.assertEqual(outcome, ProcessResult(pd.DataFrame({"1": [2]})))
Beispiel #16
0
def fake_result(colnames):
    """Return a ProcessResult wrapping ``a_table`` indexed by ``colnames``."""
    selected = a_table[colnames]
    return ProcessResult(selected)
Beispiel #17
0
 def test_parse_empty_csv(self):
     # Zero bytes of CSV are expected to parse to the same result as
     # coercing an empty DataFrame whose index has been reset.
     empty_csv = io.BytesIO(b"")
     outcome = parse_bytesio(empty_csv, "text/csv", "utf-8")
     empty_table = pd.DataFrame().reset_index(drop=True)
     self.assertEqual(outcome, ProcessResult.coerce(empty_table))
Beispiel #18
0
def render(table, reorder_history):
    """Run reordercolumns.render() on a copy of ``table``.

    ``reorder_history`` is passed as the 'reorder-history' param, and the
    module's return value is coerced to a ProcessResult.
    """
    table_copy = table.copy()
    module_output = reordercolumns.render(
        table_copy, {'reorder-history': reorder_history})
    return ProcessResult.coerce(module_output)
Beispiel #19
0
 def test_load_csv_use_ext_given_bad_content_type(self):
     # The server returns a text/plain content type, as
     # https://raw.githubusercontent.com/ does, so the fetch has to rely on
     # filename detection to treat the response as CSV.
     fetched = fetch(url='http://test.com/the.csv')
     expected = ProcessResult(mock_csv_table)
     self.assertEqual(fetched, expected)
Beispiel #20
0
async def fetch(params):
    """Fetch an HTML page and return one of its tables as a ProcessResult.

    Reads ``params['url']`` (stripped of surrounding whitespace) and
    ``params['tablenum']`` (1-based, as the user sees it).

    Failures are returned as ``ProcessResult(error=...)`` instead of being
    raised: a table number below 1, a timeout, an invalid URL, an HTTP
    response error, any other aiohttp client error, a page without tables
    (``ValueError`` or an empty table list), a table with no columns
    (``IndexError``), or a table number past the last table on the page.

    On success the chosen table has its colspan headers merged and its
    dtypes autocast in place, and the result is truncated in place if it
    is too big.
    """
    # We delve into pd.read_html()'s innards, below. Part of that means some
    # first-use initialization.
    pd.io.html._importers()

    url: str = params['url'].strip()
    tablenum: int = params['tablenum'] - 1  # 1-based for user

    if tablenum < 0:
        return ProcessResult(error='Table number must be at least 1')

    try:
        async with utils.spooled_data_from_url(url) as (spool, headers,
                                                        charset):
            # pandas.read_html() does automatic type conversion, but we prefer
            # our own. Delve into its innards so we can pass all the conversion
            # kwargs we want.
            with utils.wrap_text(spool, charset) as textio:
                tables = pd.io.html._parse(
                    # Positional arguments:
                    flavor='html5lib',  # force algorithm, for reproducibility
                    io=textio,
                    match='.+',
                    attrs=None,
                    encoding=None,  # textio is already decoded
                    displayed_only=False,  # avoid dud feature: it ignores CSS
                    # Required kwargs that pd.read_html() would set by default:
                    header=None,
                    skiprows=None,
                    # Now the reason we used pd.io.html._parse() instead of
                    # pd.read_html(): we get to pass whatever kwargs we want to
                    # TextParser.
                    #
                    # kwargs we get to add as a result of this hack:
                    na_filter=False,  # do not autoconvert
                    dtype=str,  # do not autoconvert
                )
    except asyncio.TimeoutError:
        return ProcessResult(error=f'Timeout fetching {url}')
    except aiohttp.InvalidURL:
        return ProcessResult(error='Invalid URL')
    except aiohttp.ClientResponseError as err:
        # Must be caught before the broader aiohttp.ClientError below.
        return ProcessResult(error=('Error from server: %d %s' %
                                    (err.status, err.message)))
    except aiohttp.ClientError as err:
        return ProcessResult(error=str(err))
    except ValueError:
        return ProcessResult(
            error='Did not find any <table> tags on that page')
    except IndexError:
        # pandas.read_html() gives this unhelpful error message....
        return ProcessResult(error='Table has no columns')

    if not tables:
        return ProcessResult(
            error='Did not find any <table> tags on that page')

    if tablenum >= len(tables):
        return ProcessResult(
            error=(f'The maximum table number on this page is {len(tables)}'))

    # pd.read_html() guarantees unique colnames
    table = tables[tablenum]
    merge_colspan_headers_in_place(table)
    utils.autocast_dtypes_in_place(table)
    if len(table) == 0:
        # read_html() produces an empty Index. We want a RangeIndex.
        table.reset_index(drop=True, inplace=True)
    result = ProcessResult(dataframe=table)
    result.truncate_in_place_if_too_big()
    return result
Beispiel #21
0
 def test_bad_url(self):
     # A string that is not a URL should produce an 'Invalid URL' error
     # result.
     fetched = fetch(url='not a url')
     expected = ProcessResult(error='Invalid URL')
     self.assertEqual(fetched, expected)
Beispiel #22
0
 def test_json_syntax_error(self):
     # Malformed JSON is expected to yield an error result that quotes the
     # parser's message.
     malformed = io.BytesIO(b"{not JSON")
     outcome = parse_bytesio(malformed, "application/json")
     message = ("Invalid JSON (Unexpected character found when "
                "decoding 'null')")
     self.assertEqual(outcome, ProcessResult(error=message))
Beispiel #23
0
def PR(error, *args, **kwargs):
    """Shortcut ProcessResult builder.

    Extra positional and keyword arguments go to ``pd.DataFrame``; the
    resulting table and ``error`` are passed positionally to ProcessResult.
    """
    table = pd.DataFrame(*args, **kwargs)
    return ProcessResult(table, error)
Beispiel #24
0
 def test_txt_detect_separator_comma(self):
     # Comma-separated text served as text/plain is expected to be split
     # into two columns, with the first line used as the header.
     payload = io.BytesIO(b"A,C\nB,D")
     outcome = parse_bytesio(payload, "text/plain", "utf-8")
     two_columns = pd.DataFrame({"A": ["B"], "C": ["D"]})
     self.assertEqual(outcome, ProcessResult(two_columns))
Beispiel #25
0
 def test_no_right(self):
     # With None as the third ("right") argument, render() is expected to
     # return an empty ProcessResult.
     left = pd.DataFrame({'A': [1]})
     outcome = render(left, P(), None)
     expected = ProcessResult()
     self.assertEqual(outcome, expected)
Beispiel #26
0
 def test_csv_detect_separator_semicolon(self):
     # Semicolon-separated data served as text/csv is expected to be split
     # into two columns, with the first line used as the header.
     payload = io.BytesIO(b"A;C\nB;D")
     outcome = parse_bytesio(payload, "text/csv", "utf-8")
     two_columns = pd.DataFrame({"A": ["B"], "C": ["D"]})
     self.assertEqual(outcome, ProcessResult(two_columns))
Beispiel #27
0
def table_to_result(table):
    """Wrap ``table`` in a ProcessResult and sanitize it in place."""
    wrapped = ProcessResult(table)
    # Sanitizing alters the dataframe.equals() result.
    wrapped.sanitize_in_place()
    return wrapped
Beispiel #28
0
 def test_xls(self):
     # example.xls is expected to parse into the two numeric columns 'foo'
     # and 'bar'.
     xls_path = TestDataPath / "example.xls"
     with xls_path.open("rb") as xls:
         outcome = parse_bytesio(xls, "application/vnd.ms-excel", None)
     two_columns = pd.DataFrame({"foo": [1, 2], "bar": [2, 3]})
     self.assertEqual(outcome, ProcessResult(two_columns))
Beispiel #29
0
    def test_return_str_for_error(self):
        result = safe_eval_process("""
def process(table):
    return 'hi'
""", EMPTY_DATAFRAME)
        self.assertEqual(result, ProcessResult(error='hi', json=EMPTY_OUTPUT))
Beispiel #30
0
 def test_status_unreachable(self):
     # An empty table combined with an empty error string should report
     # the status 'unreachable'.
     empty_table = pd.DataFrame()
     unreachable = ProcessResult(empty_table, '')
     self.assertEqual(unreachable.status, 'unreachable')