def test_tsv(self):
    """A tab-delimited CSV param is split on tabs into two columns."""
    result = render_arrow(P(csv="A\tB\na\tb\nc\td"))
    assert_arrow_table_equals(
        result.table, {"A": ["a", "c"], "B": ["b", "d"]}
    )
    self.assertEqual(result.errors, [])
def test_fetch_integration(self, send_update, queue_render):
    """End-to-end fetch: module's fetch() result is stored as Parquet,
    then a render is queued and a client update is sent."""
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)
    workflow = Workflow.create_and_init()
    create_module_zipfile(
        "mod",
        python_code=(
            "import pandas as pd\n"
            "def fetch(params): return pd.DataFrame({'A': [1]})\n"
            "def render(table, params): return table"
        ),
    )
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    cjwstate.modules.init_module_system()
    now = timezone.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(
                workflow_id=workflow.id, wf_module_id=wf_module.id, now=now
            )
        )
    wf_module.refresh_from_db()
    so = wf_module.stored_objects.get(stored_at=wf_module.stored_data_version)
    # The stored object must be a readable Parquet file with the fetched data.
    with minio.temporarily_download(
        minio.StoredObjectsBucket, so.key
    ) as parquet_path:
        table = pyarrow.parquet.read_table(str(parquet_path), use_threads=False)
        assert_arrow_table_equals(table, {"A": [1]})
    workflow.refresh_from_db()
    queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
    send_update.assert_called()
def test_no_header(self):
    """With has_header_row=False the first row is data and columns are auto-named."""
    result = render_arrow(P(csv="A,B", has_header_row=False))
    assert_arrow_table_equals(
        result.table, {"Column 1": ["A"], "Column 2": ["B"]}
    )
    self.assertEqual(result.errors, [])
def test_render_deprecated_parquet(self):
    """A legacy Parquet fetch result is passed through to the output unchanged."""
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        result = render_arrow(
            ArrowTable(), P(), "tab-x", FetchResult(fetched_path), self.output_path
        )
    assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
    self.assertEqual(result.errors, [])
def test_render_error(self):
    """CSV bytes uploaded with a .json suffix produce a JSON parse error
    and an empty table."""
    path = self._file(b"A,B\nx,y", suffix=".json")
    result = upload.render_arrow(
        ArrowTable(),
        {"file": path, "has_header": True},
        "tab-x",
        None,
        self.output_path,
    )
    assert_arrow_table_equals(result.table, {})
    expected_error = RenderError(
        message=I18nMessage(
            id="TODO_i18n",
            args={"text": "JSON parse error at byte 0: Invalid value."},
        ),
        quick_fixes=[],
    )
    self.assertEqual(result.errors, [expected_error])
def test_render_fetch_error(self):
    """Errors attached to the fetch result are forwarded to the render result."""
    fetch_errors = [RenderError(I18nMessage("x", {"y": "z"}))]
    with tempfile_context() as empty_path:
        fetch_result = FetchResult(empty_path, fetch_errors)
        with self.render(P(), fetch_result) as result:
            assert_arrow_table_equals(result.table, ArrowTable())
            self.assertEqual(result.errors, fetch_errors)
def test_detect_semicolon_csv_by_suffix(self):
    """A .txt file with semicolons is sniffed as semicolon-delimited."""
    with _data_file(b"A;B\nx;y\nz;a", suffix=".txt") as txt_path:
        result = parse_file(txt_path, output_path=self.output_path)
    assert_arrow_table_equals(
        result.table, {"A": ["x", "z"], "B": ["y", "a"]}
    )
def test_detect_tsv_by_suffix(self):
    """A .tsv suffix selects tab-delimited parsing."""
    with _data_file(b"A\tB\nx\ty\nz\ta", suffix=".tsv") as tsv_path:
        result = parse_file(tsv_path, output_path=self.output_path)
    assert_arrow_table_equals(
        result.table, {"A": ["x", "z"], "B": ["y", "a"]}
    )
def test_detect_csv_by_suffix(self):
    """A .csv suffix selects comma-delimited parsing."""
    with _data_file(b"A,B\nx,y\nz,a", suffix=".csv") as csv_path:
        result = parse_file(csv_path, output_path=self.output_path)
    assert_arrow_table_equals(
        result.table, {"A": ["x", "z"], "B": ["y", "a"]}
    )
def test_mime_type_overrides_suffix(self):
    """An explicit mime_type wins over the filename suffix."""
    # File is ".csv" but we parse as ".json" because mime_type=MimeType.JSON
    with _data_file(b'[{"X":"x"}]', suffix=".csv") as json_path:
        result = parse_file(
            json_path, output_path=self.output_path, mime_type=MimeType.JSON
        )
    assert_arrow_table_equals(result.table, {"X": ["x"]})
def test_detect_xlsx_by_suffix(self):
    """A .xlsx suffix routes the file through the Excel parser."""
    result = parse_file(TestDataPath / "test.xlsx", output_path=self.output_path)
    assert_arrow_table_equals(
        result.table, {"Month": ["Jan", "Feb"], "Amount": [10, 20]}
    )
def test_fetch_integration(self, send_update, queue_render):
    """End-to-end fetch with module code stored in minio: result lands in
    stored_objects as Parquet, then render is queued and update is sent."""
    queue_render.side_effect = async_value(None)
    send_update.side_effect = async_value(None)
    workflow = Workflow.create_and_init()
    ModuleVersion.create_or_replace_from_spec(
        {"id_name": "mod", "name": "Mod", "category": "Clean", "parameters": []},
        source_version_hash="abc123",
    )
    wf_module = workflow.tabs.first().wf_modules.create(
        order=0, slug="step-1", module_id_name="mod"
    )
    # Upload the module's code where the module loader expects it.
    minio.put_bytes(
        minio.ExternalModulesBucket,
        "mod/abc123/code.py",
        b"import pandas as pd\n"
        b"def fetch(params): return pd.DataFrame({'A': [1]})\n"
        b"def render(table, params): return table",
    )
    cjwstate.modules.init_module_system()
    now = timezone.now()
    with self.assertLogs(level=logging.INFO):
        self.run_with_async_db(
            fetch.fetch(
                workflow_id=workflow.id, wf_module_id=wf_module.id, now=now
            )
        )
    wf_module.refresh_from_db()
    so = wf_module.stored_objects.get(stored_at=wf_module.stored_data_version)
    with minio.temporarily_download(so.bucket, so.key) as parquet_path:
        table = pyarrow.parquet.read_table(str(parquet_path), use_threads=False)
        assert_arrow_table_equals(table, {"A": [1]})
    workflow.refresh_from_db()
    queue_render.assert_called_with(workflow.id, workflow.last_delta_id)
    send_update.assert_called()
def render(*args, fetch_result, **kwargs):
    # Stub render(): verify the fetch_result handed to us carries the
    # expected error and Parquet data, then return an empty result.
    # (`self` is captured from the enclosing test method's scope.)
    expected_errors = [RenderError(I18nMessage.TODO_i18n("maybe an error"))]
    self.assertEqual(fetch_result.errors, expected_errors)
    fetched = pyarrow.parquet.read_table(str(fetch_result.path))
    assert_arrow_table_equals(fetched, {"A": [1]})
    return RenderResult()
def test_no_nan(self):
    """The literal string "NA" is kept as text, never coerced to NaN."""
    # https://www.pivotaltracker.com/story/show/163106728
    result = render_arrow(P(csv="A,B\nx,y\nz,NA"))
    assert_arrow_table_equals(
        result.table, {"A": ["x", "z"], "B": ["y", "NA"]}
    )
    self.assertEqual(result.errors, [])
def test_render_deprecated_parquet(self):
    """A legacy Parquet fetch result renders as-is with no errors."""
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        with self.render(P(), FetchResult(fetched_path)) as result:
            assert_arrow_table_equals(
                result.table, {"A": [1, 2], "B": [3, 4]}
            )
            self.assertEqual(result.errors, [])
def test_empty_column_name_gets_automatic_name(self):
    """An empty header cell is replaced with a positional "Column N" name."""
    result = render_arrow(P(csv="A,,B\na,b,c", has_header_row=True))
    assert_arrow_table_equals(
        result.table, {"A": ["a"], "Column 2": ["b"], "B": ["c"]}
    )
    self.assertEqual(result.errors, [])
def test_read_issue_375_snappy(self):
    """Snappy-compressed fixture for fastparquet issue 375 reads correctly."""
    fixture = self._testPath("fastparquet-issue-375-snappy.par")
    expected = {
        "A": ["A" * 32760] * 10,
        "__index_level_0__": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    }
    assert_arrow_table_equals(parquet.read(fixture), expected)
def test_json_detect_encoding_by_default(self):
    """With encoding=None the parser sniffs windows-1252 and decodes correctly."""
    with _data_file('[{"A":"café"}]'.encode("windows-1252")) as path:
        result = parse_file(
            path,
            output_path=self.output_path,
            mime_type=MimeType.JSON,
            encoding=None,
        )
    assert_arrow_table_equals(result.table, {"A": ["café"]})
def test_dataframe_all_null_text_column(self):
    """An all-null str-dtype column converts to an Arrow string column of nulls."""
    converted = dataframe_to_arrow_table(
        pd.DataFrame({"A": [None]}, dtype=str),
        [Column("A", ColumnType.TEXT())],
        self.path,
    )
    expected = arrow_table({"A": pyarrow.array([None], pyarrow.string())})
    assert_arrow_table_equals(converted, expected)
def test_render_arrow_table(self):
    """A render() declaring the magic "arrow_table" param writes Arrow directly."""
    # The param name "arrow_table" is a special case

    def render(arrow_table, params, output_path, **kwargs):
        out = pa.table({"A": [2]})
        with pa.ipc.RecordBatchFileWriter(output_path, out.schema) as writer:
            writer.write_table(out)

    result = self._test_render(render, {"A": [1]})
    assert_arrow_table_equals(result.table, {"A": [2]})
def test_xlsx_nix_control_characters_from_colnames(self):
    """Control characters in xlsx header cells are stripped from column names."""
    path = TestDataPath / "headers-have-control-characters.xlsx"
    with tempfile_context(suffix=".arrow") as output_path:
        result = parse_xlsx_file(
            path,
            output_path=output_path,
            has_header=True,
            autoconvert_types=False,
        )
        assert_arrow_table_equals(result.table, {"AB": ["a"], "C": ["b"]})
        self.assertEqual(result.errors, [])
def test_xlsx_cast_colnames_to_str(self):
    """Numeric xlsx header cells become string column names."""
    path = TestDataPath / "all-numeric.xlsx"
    with tempfile_context(suffix=".arrow") as output_path:
        result = parse_xlsx_file(
            path,
            output_path=output_path,
            has_header=True,
            autoconvert_types=True,
        )
        assert_arrow_table_equals(result.table, {"1": [2]})
        self.assertEqual(result.errors, [])
def test_render_deprecated_parquet_warning(self):
    """Warnings stored with a legacy Parquet fetch result are surfaced."""
    errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
    with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
        with self.render(P(), FetchResult(fetched_path, errors)) as result:
            assert_arrow_table_equals(
                result.table, {"A": [1, 2], "B": [3, 4]}
            )
            self.assertEqual(result.errors, errors)
def _test_read_write_table(self, table, expected=None):
    """Round-trip `table` through Parquet and compare the result.

    Parameters:
        table: table spec accepted by `arrow_table()`; written to Parquet.
        expected: optional table spec the read-back result must equal.
            Defaults to `table` itself — pass it when the write/read cycle
            is expected to normalize the data.
    """
    table = arrow_table(table).table
    if expected is None:
        expected = table
    else:
        expected = arrow_table(expected).table
    parquet.write(self.temp_path, table)
    result = parquet.read(self.temp_path)
    # BUG FIX: previously asserted against `table`, silently ignoring the
    # `expected` argument, so callers passing `expected` tested nothing.
    assert_arrow_table_equals(result, expected)
def test_xls(self):
    """A legacy .xls workbook parses into typed columns."""
    path = TestDataPath / "example.xls"
    with tempfile_context(suffix=".arrow") as output_path:
        result = parse_xls_file(
            path,
            output_path=output_path,
            has_header=True,
            autoconvert_types=True,
        )
        assert_arrow_table_equals(
            result.table, {"foo": [1, 2], "bar": [2, 3]}
        )
        self.assertEqual(result.errors, [])
def test_csv_has_header_false(self):
    """With has_header=False every row (including "A") is data, kept as text."""
    with _data_file(b"A\n1.00\n2") as path:
        result = parse_file(
            path,
            output_path=self.output_path,
            mime_type=MimeType.CSV,
            has_header=False,
        )
    assert_arrow_table_equals(result.table, {"Column 1": ["A", "1.00", "2"]})
def test_json_override_encoding_by_argument(self):
    """A caller-selected encoding overrides the autodetected one.

    The file holds UTF-8 bytes ("é" = 0xC3 0xA9); forcing windows-1252
    must decode those two bytes as "Ã©" — mojibake is the proof that the
    override took effect.
    """
    # caller-selected encoding overrides autodetected encoding
    with _data_file('[{"A":"café"}]'.encode("utf-8")) as path:
        result = parse_file(
            path,
            output_path=self.output_path,
            mime_type=MimeType.JSON,
            encoding="windows-1252",
        )
    # BUG FIX: expecting "café" would only pass if the encoding argument
    # were ignored; decoding UTF-8 bytes as windows-1252 yields "cafÃ©".
    assert_arrow_table_equals(result.table, {"A": ["cafÃ©"]})
def test_render_truncate(self):
    """Output beyond the row limit is truncated and reported as an error."""

    def render(table, params):
        return pd.DataFrame({"A": [1, 2, 3]})

    result = self._test_render(render)
    assert_arrow_table_equals(result.table, {"A": [1, 2]})
    expected_errors = [
        RenderError(I18nMessage.TODO_i18n("Truncated output from 3 rows to 2"))
    ]
    self.assertEqual(result.errors, expected_errors)
def test_read_issue_375_uncompressed(self):
    """Uncompressed fixture for fastparquet issue 375 reads correctly."""
    # https://github.com/dask/fastparquet/issues/375
    # large dictionary written by pyarrow.parquet.
    fixture = self._testPath("fastparquet-issue-375.par")
    expected = {
        "A": ["A" * 32755] * 10,
        "__index_level_0__": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    }
    assert_arrow_table_equals(parquet.read(fixture), expected)
def test_duplicate_column_names_renamed(self):
    """Duplicate header names are deduplicated ("A" → "A 2") with a warning."""
    result = render_arrow(P(csv="A,A\na,b", has_header_row=True))
    assert_arrow_table_equals(result.table, {"A": ["a"], "A 2": ["b"]})
    expected_errors = [
        RenderError(
            I18nMessage.TODO_i18n(
                "Renamed 1 duplicate column names (see “A 2”)"
            )
        )
    ]
    self.assertEqual(result.errors, expected_errors)