def test_render_xlsx_bad_content(self):
    # The HTTP body is plain UTF-8 text served under the XLSX MIME type.
    # render must hand back a default ArrowTable() plus exactly one
    # RenderError carrying the Excel reader's message.
    bogus_body = io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8"))
    expected_error = RenderError(
        I18nMessage.TODO_i18n(
            'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
        )
    )
    with tempfile_context("fetch-") as fetched_path:
        httpfile.write(
            fetched_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", XLSX_MIME_TYPE)],
            bogus_body,
        )
        actual = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(fetched_path),
            self.output_path,
        )
        self.assertEqual(actual, RenderResult(ArrowTable(), [expected_error]))
def test_render_deprecated_parquet(self):
    # A fetch result that is a Parquet file (built by the parquet_file
    # helper) must render to the same two columns and report no errors.
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as parquet_path:
        fetch_result = FetchResult(parquet_path)
        rendered = render_arrow(
            ArrowTable(), P(), "tab-x", fetch_result, self.output_path
        )
        assert_arrow_table_equals(rendered.table, {"A": [1, 2], "B": [3, 4]})
        self.assertEqual(rendered.errors, [])
def test_render_deprecated_parquet_warning(self):
    # Errors attached to the FetchResult must come back unchanged in the
    # render result, alongside the table read from the Parquet file.
    fetch_warnings = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as parquet_path:
        fetch_result = FetchResult(parquet_path, errors=fetch_warnings)
        rendered = render_arrow(
            ArrowTable(),
            P(),
            "tab-x",
            fetch_result,
            self.output_path,
        )
        assert_arrow_table_equals(rendered.table, {"A": [1, 2], "B": [3, 4]})
        self.assertEqual(rendered.errors, fetch_warnings)
def test_render_fetch_error(self):
    # When the fetch produced only errors (the fetched file is an empty
    # temp file), render must return a default ArrowTable() and pass the
    # fetch errors through untouched.
    #
    # The errors are RenderError objects, as in the sibling tests. Wrapping
    # the message in RenderResult would hand the I18nMessage to
    # RenderResult's first positional argument, which other tests here fill
    # with a table.
    fetch_errors = [RenderError(I18nMessage("x", {"y": "z"}))]
    with tempfile_context() as empty_path:
        result = render_arrow(
            ArrowTable(),
            P(),
            "tab-x",
            FetchResult(empty_path, fetch_errors),
            self.output_path,
        )
        assert_arrow_table_equals(result.table, ArrowTable())
        self.assertEqual(result.errors, fetch_errors)
def test_render_json(self):
    # An application/json response holding a one-record array must render
    # as a single-row table with no errors.
    response_headers = [("content-type", "application/json")]
    response_body = io.BytesIO(b'[{"A": "a"}]')
    with tempfile_context("fetch-") as fetched_path:
        httpfile.write(
            fetched_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            response_headers,
            response_body,
        )
        rendered = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(fetched_path),
            self.output_path,
        )
        self.assertEqual(rendered.errors, [])
        assert_arrow_table_equals(rendered.table, {"A": ["a"]})
def test_render_has_header_true(self):
    # With has_header=True the first CSV row supplies the column names.
    csv_body = io.BytesIO(b"A,B\na,b")
    with tempfile_context("http") as fetched_path:
        httpfile.write(
            fetched_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", "text/csv")],
            csv_body,
        )
        params = P(has_header=True)
        rendered = render_arrow(
            ArrowTable(),
            params,
            "tab-x",
            FetchResult(fetched_path),
            self.output_path,
        )
        assert_arrow_table_equals(rendered.table, {"A": ["a"], "B": ["b"]})
        self.assertEqual(rendered.errors, [])
def test_render_xlsx(self):
    # Store the bundled example.xlsx as an HTTP response with the XLSX MIME
    # type, then check it renders to the expected "foo"/"bar" columns.
    workbook_path = TestDataPath / "example.xlsx"
    with tempfile_context("fetch-") as fetched_path:
        with workbook_path.open("rb") as workbook:
            httpfile.write(
                fetched_path,
                {"url": "http://example.com/hello"},
                "200 OK",
                [("content-type", XLSX_MIME_TYPE)],
                workbook,
            )
        rendered = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(fetched_path),
            self.output_path,
        )
        self.assertEqual(rendered.errors, [])
        assert_arrow_table_equals(rendered.table, {"foo": [1, 2], "bar": [2, 3]})
def test_render_text_plain(self):
    # guess_mime_type_or_none() treats text/plain specially.
    #
    # The URL extension here is unknown and the body is ";"-separated; the
    # expected table has separate "A" and "B" columns.
    semicolon_body = io.BytesIO(b"A;B\na;b")
    with tempfile_context(prefix="fetch-") as fetched_path:
        httpfile.write(
            fetched_path,
            {"url": "http://example.com/file.unknownext"},
            "200 OK",
            [("content-type", "text/plain")],
            semicolon_body,
        )
        rendered = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(fetched_path),
            self.output_path,
        )
        self.assertEqual(rendered.errors, [])
        assert_arrow_table_equals(rendered.table, {"A": ["a"], "B": ["b"]})
def test_render_csv_handle_nonstandard_mime_type(self):
    # Made-up MIME types such as 'application/csv' get rewritten to
    # 'text/csv', etc.
    #
    # Sysadmins sometimes invent MIME types. The fake ones we have seen in
    # the wild that seem unambiguous are hard-coded and rewritten.
    invented_mime_type = "application/x-csv"
    with tempfile_context(prefix="fetch-") as fetched_path:
        httpfile.write(
            fetched_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", invented_mime_type)],
            io.BytesIO(b"A,B\na,b"),
        )
        rendered = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(fetched_path),
            self.output_path,
        )
        assert_arrow_table_equals(rendered.table, {"A": ["a"], "B": ["b"]})
        self.assertEqual(rendered.errors, [])
def test_render_deprecated_parquet_has_header_false(self):
    # Backwards-compatibility case. The behavior is totally awful, but it
    # is still supported.
    #
    # Parsing used to happen during fetch, yet has_header can change between
    # fetch and render. Out of laziness, fetch() followed the most common
    # path (has_header=True) and render() then "undid" that change whenever
    # has_header=False. The undo was lossy and took a long time to figure
    # out; it was _never_ wise to code this. These lossy, mangled files
    # still need to be supported.
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as parquet_path:
        params = P(has_header=False)
        rendered = render_arrow(
            ArrowTable(),
            params,
            "tab-x",
            FetchResult(parquet_path),
            self.output_path,
        )
        # Old header names become the first data row and every value is text.
        expected_columns = {"0": ["A", "1", "2"], "1": ["B", "3", "4"]}
        assert_arrow_table_equals(rendered.table, expected_columns)
        self.assertEqual(rendered.errors, [])
def test_render_csv_use_url_ext_given_bad_content_type(self):
    # The server says text/plain, so the ".csv" in the URL has to decide the
    # format -- this is what https://raw.githubusercontent.com/ does.
    #
    # The ";"-separated bytes prove "csv" was used explicitly: we did not
    # take "text/plain" and let a CSV sniffer pick the delimiter.
    semicolon_body = io.BytesIO(b"A;B\na;b")
    with tempfile_context(prefix="fetch-") as fetched_path:
        httpfile.write(
            fetched_path,
            {"url": "http://example.com/file.csv"},
            "200 OK",
            [("content-type", "text/plain")],
            semicolon_body,
        )
        rendered = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(fetched_path),
            self.output_path,
        )
        assert_arrow_table_equals(rendered.table, {"A;B": ["a;b"]})
        self.assertEqual(rendered.errors, [])
def test_render_has_header_false(self):
    # With has_header=False every CSV row is data: the expected columns are
    # named "Column 1" / "Column 2" and hold int8 values.
    with tempfile_context("http") as fetched_path:
        httpfile.write(
            fetched_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", "text/csv")],
            io.BytesIO(b"1,2\n3,4"),
        )
        rendered = render_arrow(
            ArrowTable(),
            P(has_header=False),
            "tab-x",
            FetchResult(fetched_path),
            self.output_path,
        )
        first_column = pyarrow.array([1, 3], pyarrow.int8())
        second_column = pyarrow.array([2, 4], pyarrow.int8())
        assert_arrow_table_equals(
            rendered.table,
            {"Column 1": first_column, "Column 2": second_column},
        )
        self.assertEqual(rendered.errors, [])
def test_render_missing_fetch_result_returns_empty(self):
    # Passing None as the fetch result must give a table equal to {} and
    # an empty error list.
    no_fetch_result = None
    rendered = render_arrow(
        ArrowTable(), P(), "tab-x", no_fetch_result, self.output_path
    )
    assert_arrow_table_equals(rendered.table, {})
    self.assertEqual(rendered.errors, [])
def test_render_no_file(self):
    # Passing None as the fetch result must give a table equal to a default
    # ArrowTable() and an empty error list.
    no_fetch_result = None
    rendered = render_arrow(
        ArrowTable(), P(), "tab-x", no_fetch_result, self.output_path
    )
    assert_arrow_table_equals(rendered.table, ArrowTable())
    self.assertEqual(rendered.errors, [])