def test_coerce_infer_columns(self):
    # Coercing a bare DataFrame is expected to produce a NUMBER column for
    # the integer data and a TEXT column for the string data.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    coerced = ProcessResult.coerce(frame)
    expected_columns = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(coerced.columns, expected_columns)
def test_coerce_infer_columns_format_supercedes_try_fallback_columns(self):
    # When both "column_formats" and try_fallback_columns give a format for
    # the same column, the expected result carries the "column_formats" one.
    frame = pd.DataFrame({"A": [1, 2]})
    payload = {"dataframe": frame, "column_formats": {"A": "{:,d}"}}
    fallback_columns = [Column("A", ColumnType.NUMBER("{:,.2f}"))]
    coerced = ProcessResult.coerce(payload, try_fallback_columns=fallback_columns)
    expected_columns = [Column("A", ColumnType.NUMBER("{:,d}"))]
    self.assertEqual(coerced.columns, expected_columns)
def test_coerce_infer_columns_with_format(self):
    # A "column_formats" entry for "A" should end up as the format of the
    # inferred NUMBER column; "B" has no entry and stays plain TEXT.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    payload = {"dataframe": frame, "column_formats": {"A": "{:,d}"}}
    coerced = ProcessResult.coerce(payload)
    expected_columns = [
        Column("A", ColumnType.NUMBER(format="{:,d}")),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(coerced.columns, expected_columns)
def test_coerce_infer_columns_try_fallback_columns(self):
    # Fallback columns whose types match the data are expected to be used
    # as-is, including the number format.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    fallback_columns = [
        Column("A", ColumnType.Number("{:,d}")),
        Column("B", ColumnType.Text()),
    ]
    coerced = ProcessResult.coerce(frame, try_fallback_columns=fallback_columns)
    expected_columns = [
        Column("A", ColumnType.Number("{:,d}")),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(coerced.columns, expected_columns)
def test_coerce_infer_columns_try_fallback_columns_ignore_wrong_type(self):
    # The fallback columns deliberately swap the types (TEXT for the ints,
    # NUMBER for the strings); the expected result uses the inferred types.
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    mismatched_fallback = [
        Column("A", ColumnType.TEXT()),
        Column("B", ColumnType.NUMBER()),
    ]
    coerced = ProcessResult.coerce(frame, try_fallback_columns=mismatched_fallback)
    expected_columns = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(coerced.columns, expected_columns)
def test_coerce_infer_columns_with_unit(self):
    # For a Period column, the "column_formats" value ("year") is expected
    # to become the unit of the resulting Date column.
    period_values = [pd.Period("2021-01-01", freq="D"), None]
    frame = pd.DataFrame({"A": period_values, "B": ["x", "y"]})
    payload = {"dataframe": frame, "column_formats": {"A": "year"}}
    coerced = ProcessResult.coerce(payload)
    expected_columns = [
        Column("A", ColumnType.Date(unit="year")),
        Column("B", ColumnType.Text()),
    ]
    self.assertEqual(coerced.columns, expected_columns)
def test_ctor_infer_columns(self):
    # Constructing a ProcessResult straight from a DataFrame should infer
    # Number, Text and Timestamp columns from the data.
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
        }
    )
    built = ProcessResult(frame)
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
    ]
    self.assertEqual(built.columns, expected_columns)
def test_dataframe_uint8_column(self):
    # A uint8 DataFrame column with a "{:,d}" Number type should map to a
    # uint8 Arrow column that carries the same format.
    frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
    columns = [Column("A", ColumnType.Number("{:,d}"))]
    expected_table = make_table(
        make_column("A", [1, 2, 3, 253], type=pa.uint8(), format="{:,d}")
    )
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_to_arrow(self):
    # TableShape.to_arrow() should give a TableMetadata with the same row
    # count and the Arrow-side equivalents of each column.
    shape = TableShape(
        3,
        [
            Column("A", ColumnType.NUMBER("{:,d}")),
            Column("B", ColumnType.TEXT()),
        ],
    )
    converted = shape.to_arrow()
    expected_metadata = atypes.TableMetadata(
        3,
        [
            atypes.Column("A", atypes.ColumnType.Number("{:,d}")),
            atypes.Column("B", atypes.ColumnType.Text()),
        ],
    )
    self.assertEqual(converted, expected_metadata)
def test_arrow_uint8_column(self):
    # Converting a uint8 Arrow column should give a uint8 DataFrame column
    # and a Number column that keeps the "{:,d}" format.
    source_table = arrow_table(
        {"A": pyarrow.array([1, 2, 3, 253], type=pyarrow.uint8())},
        columns=[atypes.Column("A", ColumnType.Number("{:,d}"))],
    )
    frame, inferred_columns = arrow_table_to_dataframe(source_table)
    expected_frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
    assert_frame_equal(frame, expected_frame)
    self.assertEqual(inferred_columns, [Column("A", ColumnType.Number("{:,d}"))])
def test_dataframe_all_null_text_column(self):
    # A str-typed DataFrame column holding only None should become an
    # Arrow string column holding a single null.
    frame = pd.DataFrame({"A": [None]}, dtype=str)
    columns = [Column("A", ColumnType.TEXT())]
    actual = dataframe_to_arrow_table(frame, columns, self.path)
    expected = arrow_table({"A": pyarrow.array([None], pyarrow.string())})
    assert_arrow_table_equals(actual, expected)
def test_arrow_all_null_text_column(self):
    # Converting an Arrow string column should give an equivalent DataFrame
    # (null -> NaN) and a single TEXT column.
    # NOTE(review): the method name says "all null", but the fixture holds
    # three non-null strings and one null -- confirm the intended coverage.
    source_table = arrow_table(
        {"A": pyarrow.array(["a", "b", None, "c"])},
        columns=[atypes.Column("A", atypes.ColumnType.Text())],
    )
    frame, inferred_columns = arrow_table_to_dataframe(source_table)
    expected_frame = pd.DataFrame({"A": ["a", "b", np.nan, "c"]})
    assert_frame_equal(frame, expected_frame)
    self.assertEqual(inferred_columns, [Column("A", ColumnType.TEXT())])
def test_columns(self):
    # column_names and columns should reflect all four DataFrame columns,
    # with the category column reported as Text.
    frame = pd.DataFrame(
        {
            "A": [1],  # number
            "B": ["foo"],  # str
            "C": dt(2018, 8, 20),  # datetime64
        }
    )
    # The categorical column is attached after construction, as a Series.
    frame["D"] = pd.Series(["cat"], dtype="category")
    built = ProcessResult(frame)
    self.assertEqual(built.column_names, ["A", "B", "C", "D"])
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
        Column("D", ColumnType.Text()),
    ]
    self.assertEqual(built.columns, expected_columns)
def test_ctor_infer_columns(self):
    # Constructing a ProcessResult straight from a DataFrame should infer
    # NUMBER, TEXT and DATETIME columns from the data.
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
        }
    )
    built = ProcessResult(frame)
    expected_columns = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
        Column("C", ColumnType.DATETIME()),
    ]
    self.assertEqual(built.columns, expected_columns)
def test_dataframe_datetime_column(self):
    # A datetime64[ns] DataFrame column (one value, one null) should map to
    # an Arrow column holding the same instant and a null.
    frame = pd.DataFrame(
        {"A": ["2019-09-17T21:21:00.123456Z", None]}, dtype="datetime64[ns]"
    )
    columns = [Column("A", ColumnType.Timestamp())]
    expected_table = make_table(
        make_column("A", [dt.fromisoformat("2019-09-17T21:21:00.123456"), None])
    )
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_ctor_infer_columns(self):
    # Constructing a ProcessResult straight from a DataFrame should infer
    # Number, Text, Timestamp and (for the daily Period data) Date("day").
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
            "D": [pd.Period("2021-01-01", freq="D"), pd.NaT],
        }
    )
    built = ProcessResult(frame)
    expected_columns = [
        Column("A", ColumnType.Number()),
        Column("B", ColumnType.Text()),
        Column("C", ColumnType.Timestamp()),
        Column("D", ColumnType.Date("day")),
    ]
    self.assertEqual(built.columns, expected_columns)
def test_dataframe_uint8_column(self):
    # A uint8 DataFrame column with a "{:,d}" NUMBER type should be written
    # as a uint8 Arrow column whose metadata keeps that format.
    frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
    columns = [Column("A", ColumnType.NUMBER("{:,d}"))]
    actual = dataframe_to_arrow_table(frame, columns, self.path)
    expected = arrow_table(
        {"A": pyarrow.array([1, 2, 3, 253], type=pyarrow.uint8())},
        [atypes.Column("A", atypes.ColumnType.Number("{:,d}"))],
    )
    assert_arrow_table_equals(actual, expected)
def test_arrow_category_column(self):
    # An Arrow dictionary-encoded string column should come back as a Text
    # column backed by a pandas "category" Series.
    dictionary_array = pyarrow.DictionaryArray.from_arrays(
        pyarrow.array([0, 1, None, 0], type=pyarrow.int8()),
        pyarrow.array(["A", "B"], type=pyarrow.string()),
    )
    atable = arrow_table({"A": dictionary_array})
    frame, inferred_columns = arrow_table_to_dataframe(atable)
    self.assertEqual(inferred_columns, [Column("A", ColumnType.Text())])
    expected_frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    assert_frame_equal(frame, expected_frame)
def test_dataframe_category_column(self):
    # A pandas "category" column should map to an Arrow dictionary array
    # with int8 indices and a string dictionary.
    frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    columns = [Column("A", ColumnType.Text())]
    dictionary_array = pa.DictionaryArray.from_arrays(
        pa.array([0, 1, None, 0], pa.int8()),
        pa.array(["A", "B"], pa.string()),
    )
    expected_table = pa.table({"A": dictionary_array})
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_dataframe_all_null_category_column(self):
    # A category column containing only a null should map to an Arrow
    # dictionary array with one null index and an empty string dictionary.
    frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    columns = [Column("A", ColumnType.Text())]
    dictionary_array = pa.DictionaryArray.from_arrays(
        pa.array([None], pa.int8()),
        pa.array([], pa.string()),
    )
    expected_table = pa.table({"A": dictionary_array})
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_arrow_all_null_category_column(self):
    # An Arrow dictionary array with one null index and an empty dictionary
    # should come back as a TEXT column backed by an all-null category.
    dictionary_array = pyarrow.DictionaryArray.from_arrays(
        pyarrow.array([None], type=pyarrow.int8()),
        pyarrow.array([], type=pyarrow.string()),
    )
    atable = arrow_table({"A": dictionary_array})
    frame, inferred_columns = arrow_table_to_dataframe(atable)
    self.assertEqual(inferred_columns, [Column("A", ColumnType.TEXT())])
    expected_frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    assert_frame_equal(frame, expected_frame)
def test_dataframe_category_column(self):
    # A pandas "category" column should be written as an Arrow dictionary
    # array with int8 indices and a string dictionary.
    frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    columns = [Column("A", ColumnType.Text())]
    actual = dataframe_to_arrow_table(frame, columns, self.path)
    dictionary_array = pyarrow.DictionaryArray.from_arrays(
        pyarrow.array([0, 1, None, 0], type=pyarrow.int8()),
        pyarrow.array(["A", "B"], type=pyarrow.string()),
    )
    expected = arrow_table({"A": dictionary_array})
    assert_arrow_table_equals(actual, expected)
def test_dataframe_all_null_category_column(self):
    # A category column containing only a null should be written as an
    # Arrow dictionary array with one null index and an empty dictionary.
    frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    columns = [Column("A", ColumnType.TEXT())]
    actual = dataframe_to_arrow_table(frame, columns, self.path)
    dictionary_array = pyarrow.DictionaryArray.from_arrays(
        pyarrow.array([None], type=pyarrow.int8()),
        pyarrow.array([], type=pyarrow.string()),
    )
    expected = arrow_table({"A": dictionary_array})
    assert_arrow_table_equals(actual, expected)
def test_arrow_timestamp_column(self):
    # A nanosecond, tz-less Arrow timestamp column should come back as a
    # datetime64[ns] DataFrame column and a Timestamp column type.
    timestamps = pyarrow.array(
        [dt.fromisoformat("2019-09-17T21:21:00.123456"), None],
        type=pyarrow.timestamp(unit="ns", tz=None),
    )
    source_table = arrow_table(
        {"A": timestamps},
        [atypes.Column("A", ColumnType.Timestamp())],
    )
    frame, inferred_columns = arrow_table_to_dataframe(source_table)
    expected_frame = pd.DataFrame(
        {"A": ["2019-09-17T21:21:00.123456Z", None]}, dtype="datetime64[ns]"
    )
    assert_frame_equal(frame, expected_frame)
    self.assertEqual(inferred_columns, [Column("A", ColumnType.Timestamp())])
def test_dataframe_datetime_column(self):
    # A datetime64[ns] DataFrame column (one value, one null) should be
    # written as a nanosecond, tz-less Arrow timestamp column.
    frame = pd.DataFrame(
        {"A": ["2019-09-17T21:21:00.123456Z", None]}, dtype="datetime64[ns]"
    )
    columns = [Column("A", ColumnType.DATETIME())]
    actual = dataframe_to_arrow_table(frame, columns, self.path)
    timestamps = pyarrow.array(
        [dt.fromisoformat("2019-09-17T21:21:00.123456"), None],
        type=pyarrow.timestamp(unit="ns", tz=None),
    )
    expected = arrow_table(
        {"A": timestamps},
        [atypes.Column("A", atypes.ColumnType.Datetime())],
    )
    assert_arrow_table_equals(actual, expected)
def test_from_arrow(self):
    # Column.from_arrow() should turn an Arrow-side Number column into the
    # equivalent Column with the same name and format.
    arrow_column = atypes.Column("A", atypes.ColumnType.Number("{:,d}"))
    converted = Column.from_arrow(arrow_column)
    expected = Column("A", ColumnType.NUMBER("{:,d}"))
    self.assertEqual(converted, expected)
def test_dataframe_all_null_text_column(self):
    # A str-typed DataFrame column holding only None should map to an
    # Arrow string column holding a single null.
    frame = pd.DataFrame({"A": [None]}, dtype=str)
    columns = [Column("A", ColumnType.Text())]
    expected = make_table(make_column("A", [None], pa.string()))
    self._test_dataframe_to_arrow_table(frame, columns, expected_table=expected)
def test_to_arrow(self):
    # Column.to_arrow() should give the Arrow-side Column with the same
    # name and number format.
    column = Column("A", ColumnType.NUMBER("{:,d}"))
    converted = column.to_arrow()
    expected = atypes.Column("A", atypes.ColumnType.Number("{:,d}"))
    self.assertEqual(converted, expected)
def test_dataframe_date_column(self):
    # A daily Period column declared as Date(unit="month") should map to
    # an Arrow date column tagged with the "month" unit.
    frame = pd.DataFrame({"A": [pd.Period("2021-04-01", freq="D"), None]})
    columns = [Column("A", ColumnType.Date(unit="month"))]
    expected_table = make_table(
        make_column("A", [date(2021, 4, 1), None], unit="month")
    )
    self._test_dataframe_to_arrow_table(frame, columns, expected_table)
def test_table_shape(self):
    # table_shape should report the row count (3) and the single inferred
    # NUMBER column.
    frame = pd.DataFrame({"A": [1, 2, 3]})
    built = ProcessResult(frame)
    expected_shape = TableShape(3, [Column("A", ColumnType.NUMBER())])
    self.assertEqual(built.table_shape, expected_shape)