def test_coerce_infer_columns(self):
    # With no column metadata supplied, coerce() is expected to report a
    # NUMBER column for the ints and a TEXT column for the strings.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    coerced = ProcessResult.coerce(frame)
    inferred = coerced.columns
    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(inferred, expected)
def test_coerce_infer_columns_format_supercedes_try_fallback_columns(self):
    # An explicit "column_formats" entry is expected to win over the format
    # carried by a same-named fallback column.
    frame = pd.DataFrame({"A": [1, 2]})
    value = {"dataframe": frame, "column_formats": {"A": "{:,d}"}}
    fallback = [Column("A", ColumnType.NUMBER("{:,.2f}"))]
    coerced = ProcessResult.coerce(value, try_fallback_columns=fallback)
    self.assertEqual(
        coerced.columns,
        [Column("A", ColumnType.NUMBER("{:,d}"))],
    )
def test_arrow_uint8_column(self):
    # A uint8 Arrow column is expected to come back as a uint8 pandas column,
    # with the column's number format carried through.
    source = arrow_table(
        {"A": pyarrow.array([1, 2, 3, 253], type=pyarrow.uint8())},
        columns=[atypes.Column("A", ColumnType.Number("{:,d}"))],
    )
    dataframe, columns = arrow_table_to_dataframe(source)
    expected_frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
    assert_frame_equal(dataframe, expected_frame)
    self.assertEqual(columns, [Column("A", ColumnType.Number("{:,d}"))])
def test_arrow_all_null_text_column(self):
    # NOTE(review): despite the test name, the fixture holds three strings and
    # a single null -- it is not an all-null column.
    source = arrow_table(
        {"A": pyarrow.array(["a", "b", None, "c"])},
        columns=[atypes.Column("A", ColumnType.Text())],
    )
    dataframe, columns = arrow_table_to_dataframe(source)
    # The Arrow null is expected to appear as NaN on the pandas side.
    expected_frame = pd.DataFrame({"A": ["a", "b", np.nan, "c"]})
    assert_frame_equal(dataframe, expected_frame)
    self.assertEqual(columns, [Column("A", ColumnType.Text())])
def test_coerce_infer_columns_with_format(self):
    # A "column_formats" entry for "A" is expected to become the NUMBER
    # format of that column; "B" has no entry and stays plain TEXT.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    value = {"dataframe": frame, "column_formats": {"A": "{:,d}"}}
    coerced = ProcessResult.coerce(value)
    inferred = coerced.columns
    expected = [
        Column("A", ColumnType.NUMBER(format="{:,d}")),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(inferred, expected)
def test_dataframe_uint8_column(self):
    # A uint8 pandas column is expected to be written as a uint8 Arrow column
    # that carries the same number format.
    frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
    actual = dataframe_to_arrow_table(
        frame,
        [Column("A", ColumnType.Number("{:,d}"))],
        self.path,
    )
    expected = arrow_table(
        {"A": pyarrow.array([1, 2, 3, 253], type=pyarrow.uint8())},
        [atypes.Column("A", ColumnType.Number("{:,d}"))],
    )
    assert_arrow_table_equals(actual, expected)
def test_coerce_infer_columns_try_fallback_columns(self):
    # When the fallback columns agree with the data's types, their formats
    # are expected to be adopted as-is.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    fallback = [
        Column("A", ColumnType.Number("{:,d}")),
        Column("B", ColumnType.Text()),
    ]
    coerced = ProcessResult.coerce(frame, try_fallback_columns=fallback)
    inferred = coerced.columns
    expected = [
        Column("A", ColumnType.Number("{:,d}")),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(inferred, expected)
def test_coerce_infer_columns_try_fallback_columns_ignore_wrong_type(self):
    # The fallback columns deliberately have the types swapped (TEXT for the
    # ints, NUMBER for the strings); the result is expected to follow the
    # data instead.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    mismatched_fallback = [
        Column("A", ColumnType.TEXT()),
        Column("B", ColumnType.NUMBER()),
    ]
    coerced = ProcessResult.coerce(
        frame, try_fallback_columns=mismatched_fallback
    )
    inferred = coerced.columns
    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(inferred, expected)
def test_coerce_infer_columns_with_unit(self):
    # For a Period column, the "column_formats" entry ("year") is expected to
    # surface as the Date column's unit.
    frame = pd.DataFrame(
        {"A": [pd.Period("2021-01-01", freq="D"), None], "B": ["x", "y"]}
    )
    value = {"dataframe": frame, "column_formats": {"A": "year"}}
    coerced = ProcessResult.coerce(value)
    inferred = coerced.columns
    expected = [
        Column("A", ColumnType.Date(unit="year")),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(inferred, expected)
def test_ctor_infer_columns(self):
    # Building a ProcessResult straight from a DataFrame is expected to infer
    # Number, Text and Timestamp columns from the data.
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
        }
    )
    process_result = ProcessResult(frame)
    inferred = process_result.columns
    expected = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
    ]
    self.assertEqual(inferred, expected)
def test_to_arrow_normal_dataframe(self):
    # Reserve a unique temporary path, then delete the placeholder file so
    # that to_arrow() is handed a path where nothing exists yet.
    fd, filename = tempfile.mkstemp()
    os.close(fd)
    os.unlink(filename)
    try:
        process_result = ProcessResult.coerce(pd.DataFrame({"A": [1, 2]}))
        result = process_result.to_arrow(Path(filename))
        expected = atypes.RenderResult(
            atypes.ArrowTable(
                Path(filename),
                pyarrow.table({"A": [1, 2]}),
                atypes.TableMetadata(
                    2,
                    [
                        atypes.Column(
                            "A",
                            # Reuse whatever number format
                            # ProcessResult.coerce() picked.
                            ColumnType.Number(
                                process_result.columns[0].type.format
                            ),
                        )
                    ],
                ),
            ),
            [],
            {},
        )
        self.assertEqual(result, expected)
    finally:
        # Unconditional cleanup; os.unlink() raises if no file exists at the
        # path (presumably to_arrow() is expected to have created it).
        os.unlink(filename)
def test_to_arrow(self):
    # TableShape.to_arrow() is expected to produce a TableMetadata with the
    # same row count and equivalent atypes columns.
    shape = TableShape(
        3,
        [
            Column("A", ColumnType.NUMBER("{:,d}")),
            Column("B", ColumnType.TEXT()),
        ],
    )
    converted = shape.to_arrow()
    expected = atypes.TableMetadata(
        3,
        [
            atypes.Column("A", atypes.ColumnType.Number("{:,d}")),
            atypes.Column("B", atypes.ColumnType.Text()),
        ],
    )
    self.assertEqual(converted, expected)
def test_dataframe_uint8_column(self):
    # uint8 pandas data with a "{:,d}" Number column, checked against a
    # uint8 Arrow column built with the same format.
    frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
    columns = [Column("A", ColumnType.Number("{:,d}"))]
    expected_table = make_table(
        make_column("A", [1, 2, 3, 253], type=pa.uint8(), format="{:,d}")
    )
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_format_whole_float_as_int(self):
    """
    Mimic d3-format, which cannot differentiate between float and int.
    """
    # Whole-valued floats (2.0, 123456789.0) are expected to format without a
    # decimal part, while 1.1 keeps its fraction.
    values = pd.Series([1.1, 2.0, 123456789.0])
    formatted = ColumnType.NUMBER("{:,}").format_series(values)
    expected = pd.Series(["1.1", "2", "123,456,789"])
    assert_series_equal(formatted, expected)
def test_arrow_timestamp_column(self):
    # A nanosecond, timezone-naive Arrow timestamp column (one value, one
    # null) is expected to become a datetime64[ns] pandas column.
    timestamps = pyarrow.array(
        [dt.fromisoformat("2019-09-17T21:21:00.123456"), None],
        type=pyarrow.timestamp(unit="ns", tz=None),
    )
    source = arrow_table(
        {"A": timestamps},
        [atypes.Column("A", ColumnType.Timestamp())],
    )
    dataframe, columns = arrow_table_to_dataframe(source)
    expected_frame = pd.DataFrame(
        {"A": ["2019-09-17T21:21:00.123456Z", None]},
        dtype="datetime64[ns]",
    )
    assert_frame_equal(dataframe, expected_frame)
    self.assertEqual(columns, [Column("A", ColumnType.Timestamp())])
def test_columns(self):
    # One column per dtype of interest; the categorical column is expected
    # to be reported as Text.
    frame = pd.DataFrame(
        {
            "A": [1],  # number
            "B": ["foo"],  # str
            "C": dt(2018, 8, 20),  # datetime64
        }
    )
    # The categorical column is attached after construction.
    frame["D"] = pd.Series(["cat"], dtype="category")
    process_result = ProcessResult(frame)
    self.assertEqual(process_result.column_names, ["A", "B", "C", "D"])
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
        Column("D", ColumnType.Text()),
    ]
    self.assertEqual(process_result.columns, expected_columns)
def test_dataframe_all_null_text_column(self):
    # A text column holding only a null is expected to be written as an
    # Arrow string column containing one null.
    frame = pd.DataFrame({"A": [None]}, dtype=str)
    actual = dataframe_to_arrow_table(
        frame,
        [Column("A", ColumnType.TEXT())],
        self.path,
    )
    expected = arrow_table({"A": pyarrow.array([None], pyarrow.string())})
    assert_arrow_table_equals(actual, expected)
def test_ctor_infer_columns(self):
    # Building a ProcessResult straight from a DataFrame is expected to infer
    # NUMBER, TEXT and DATETIME columns from the data.
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
        }
    )
    process_result = ProcessResult(frame)
    inferred = process_result.columns
    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
        Column("C", ColumnType.DATETIME()),
    ]
    self.assertEqual(inferred, expected)
def test_dataframe_datetime_column(self):
    # A datetime64[ns] pandas column (one value, one null) is expected to be
    # written as a nanosecond, timezone-naive Arrow timestamp column.
    frame = pd.DataFrame(
        {"A": ["2019-09-17T21:21:00.123456Z", None]},
        dtype="datetime64[ns]",
    )
    actual = dataframe_to_arrow_table(
        frame,
        [Column("A", ColumnType.Timestamp())],
        self.path,
    )
    timestamps = pyarrow.array(
        [dt.fromisoformat("2019-09-17T21:21:00.123456"), None],
        type=pyarrow.timestamp(unit="ns", tz=None),
    )
    expected = arrow_table(
        {"A": timestamps},
        [atypes.Column("A", ColumnType.Timestamp())],
    )
    assert_arrow_table_equals(actual, expected)
def test_dataframe_datetime_column(self):
    # datetime64[ns] pandas data (one value, one null) with a Timestamp
    # column, checked against the equivalent Arrow column.
    frame = pd.DataFrame(
        {"A": ["2019-09-17T21:21:00.123456Z", None]},
        dtype="datetime64[ns]",
    )
    columns = [Column("A", ColumnType.Timestamp())]
    expected_table = make_table(
        make_column("A", [dt.fromisoformat("2019-09-17T21:21:00.123456"), None])
    )
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_ctor_infer_columns(self):
    # Building a ProcessResult straight from a DataFrame is expected to infer
    # Number, Text, Timestamp and Date("day") columns from the data.
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
            "D": [pd.Period("2021-01-01", freq="D"), pd.NaT],
        }
    )
    process_result = ProcessResult(frame)
    inferred = process_result.columns
    expected = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
        Column("D", ColumnType.Date("day")),
    ]
    self.assertEqual(inferred, expected)
def test_format(self):
    # Datetimes are expected to format as ISO-8601 strings with microseconds
    # and a trailing "Z"; the missing value stays NaN.
    values = pd.Series(
        [dt(1999, 2, 3, 4, 5, 6, 7), np.nan, dt(2000, 3, 4, 5, 6, 7, 8)]
    )
    formatted = ColumnType.DATETIME().format_series(values)
    expected = pd.Series(
        ["1999-02-03T04:05:06.000007Z", np.nan, "2000-03-04T05:06:07.000008Z"]
    )
    assert_series_equal(formatted, expected)
def test_arrow_category_column(self):
    # A dictionary-encoded Arrow column (int8 indices into ["A", "B"], one
    # null) is expected to become a pandas "category" column typed as Text.
    indices = pyarrow.array([0, 1, None, 0], type=pyarrow.int8())
    dictionary = pyarrow.array(["A", "B"], type=pyarrow.string())
    atable = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(indices, dictionary)}
    )
    dataframe, columns = arrow_table_to_dataframe(atable)
    self.assertEqual(columns, [Column("A", ColumnType.Text())])
    expected_frame = pd.DataFrame(
        {"A": ["A", "B", None, "A"]}, dtype="category"
    )
    assert_frame_equal(dataframe, expected_frame)
def test_dataframe_category_column(self):
    # A pandas "category" column (with one null) typed as Text, checked
    # against a dictionary-encoded Arrow column with int8 indices.
    frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    columns = [Column("A", ColumnType.Text())]
    encoded = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, None, 0], pa.int8()),
        pa.array(["A", "B"], pa.string()),
    )
    expected_table = pa.table({"A": encoded})
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_dataframe_all_null_category_column(self):
    # A categorical column holding only a null, checked against a
    # dictionary-encoded Arrow column with one null index and an empty
    # string dictionary.
    frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    columns = [Column("A", ColumnType.Text())]
    encoded = pa.DictionaryArray.from_arrays(
        pa.array([None], pa.int8()),
        pa.array([], pa.string()),
    )
    expected_table = pa.table({"A": encoded})
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_arrow_all_null_category_column(self):
    # A dictionary-encoded Arrow column with one null index and an empty
    # dictionary is expected to become an all-null pandas "category" column.
    indices = pyarrow.array([None], type=pyarrow.int8())
    dictionary = pyarrow.array([], type=pyarrow.string())
    atable = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(indices, dictionary)}
    )
    dataframe, columns = arrow_table_to_dataframe(atable)
    self.assertEqual(columns, [Column("A", ColumnType.TEXT())])
    expected_frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    assert_frame_equal(dataframe, expected_frame)
def test_dataframe_category_column(self):
    # A pandas "category" column (with one null) is expected to be written as
    # a dictionary-encoded Arrow column with int8 indices.
    frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    actual = dataframe_to_arrow_table(
        frame,
        [Column("A", ColumnType.Text())],
        self.path,
    )
    encoded = pyarrow.DictionaryArray.from_arrays(
        pyarrow.array([0, 1, None, 0], type=pyarrow.int8()),
        pyarrow.array(["A", "B"], type=pyarrow.string()),
    )
    expected = arrow_table({"A": encoded})
    assert_arrow_table_equals(actual, expected)
def test_dataframe_all_null_category_column(self):
    # A categorical column holding only a null is expected to be written as a
    # dictionary-encoded Arrow column with one null index and an empty
    # string dictionary.
    frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    actual = dataframe_to_arrow_table(
        frame,
        [Column("A", ColumnType.TEXT())],
        self.path,
    )
    encoded = pyarrow.DictionaryArray.from_arrays(
        pyarrow.array([None], type=pyarrow.int8()),
        pyarrow.array([], type=pyarrow.string()),
    )
    expected = arrow_table({"A": encoded})
    assert_arrow_table_equals(actual, expected)
def test_format_nulls_becomes_str(self):
    # An all-NaN float64 series is expected to come back with dtype object,
    # still holding NaN in every position.
    all_null = pd.Series([np.nan, np.nan], dtype=np.float64)
    formatted = ColumnType.NUMBER().format_series(all_null)
    expected = pd.Series([np.nan, np.nan], dtype=object)
    assert_series_equal(formatted, expected)
def test_format_zero_length_becomes_str(self):
    # (even though there's no way for pandas to detect type of result)
    # (luckily, pandas defaults to `object`)
    empty = pd.Series([], dtype=np.int64)
    formatted = ColumnType.NUMBER().format_series(empty)
    expected = pd.Series([], dtype=object)
    assert_series_equal(formatted, expected)