def test_coerce_infer_columns_try_fallback_columns_ignore_wrong_type(self):
    """Fallback columns whose types contradict the data are not used.

    The fallbacks declare "A" as text and "B" as number, the opposite of
    what the values are; the coerced result must still report the types
    inferred from the data itself.
    """
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    mismatched_fallbacks = [
        Column("A", ColumnType.TEXT()),
        Column("B", ColumnType.NUMBER()),
    ]

    result = ProcessResult.coerce(
        frame, try_fallback_columns=mismatched_fallbacks
    )

    actual = result.columns
    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(actual, expected)
def test_coerce_infer_columns(self):
    """Coercing a bare DataFrame infers one column type per data column."""
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})

    actual = ProcessResult.coerce(frame).columns

    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(actual, expected)
def test_arrow_all_null_text_column(self):
    """An Arrow text column with a null converts to a DataFrame text column.

    NOTE(review): the name says "all_null", but the input holds three
    strings and a single null -- confirm whether the name or the data is
    what was intended.
    """
    source = arrow_table(
        {"A": pyarrow.array(["a", "b", None, "c"])},
        columns=[atypes.Column("A", atypes.ColumnType.Text())],
    )

    frame, frame_columns = arrow_table_to_dataframe(source)

    # The Arrow null is expected to surface as NaN on the pandas side.
    expected_frame = pd.DataFrame({"A": ["a", "b", np.nan, "c"]})
    assert_frame_equal(frame, expected_frame)
    self.assertEqual(frame_columns, [Column("A", ColumnType.TEXT())])
def test_dataframe_all_null_text_column(self):
    """A str-dtype DataFrame column holding only None becomes a null Arrow string column."""
    frame = pd.DataFrame({"A": [None]}, dtype=str)
    text_columns = [Column("A", ColumnType.TEXT())]

    actual = dataframe_to_arrow_table(frame, text_columns, self.path)

    # Explicit string type: a lone None carries no type information.
    expected = arrow_table({"A": pyarrow.array([None], pyarrow.string())})
    assert_arrow_table_equals(actual, expected)
def test_columns(self):
    """ProcessResult exposes column names and types for every dtype used here.

    Covers a number, a string, a datetime and a categorical column; the
    categorical column is expected to be reported as text.
    """
    frame = pd.DataFrame(
        {
            "A": [1],  # number
            "B": ["foo"],  # str
            "C": dt(2018, 8, 20),  # datetime64
        }
    )
    frame["D"] = pd.Series(["cat"], dtype="category")

    result = ProcessResult(frame)

    # Insertion-ordered mapping: column name -> expected column type.
    expected_types = {
        "A": ColumnType.NUMBER(),
        "B": ColumnType.TEXT(),
        "C": ColumnType.DATETIME(),
        "D": ColumnType.TEXT(),
    }
    self.assertEqual(result.column_names, list(expected_types))
    self.assertEqual(
        result.columns,
        [Column(name, ctype) for name, ctype in expected_types.items()],
    )
def test_coerce_infer_columns_with_format(self):
    """A dict input with "column_formats" applies the format to the inferred number column."""
    number_format = "{:,d}"
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    payload = {"dataframe": frame, "column_formats": {"A": number_format}}

    actual = ProcessResult.coerce(payload).columns

    expected = [
        Column("A", ColumnType.NUMBER(format=number_format)),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(actual, expected)
def test_arrow_all_null_category_column(self):
    """An all-null Arrow dictionary column converts to an empty-category text column."""
    null_indices = pyarrow.array([None], type=pyarrow.int8())
    no_values = pyarrow.array([], type=pyarrow.string())
    source = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(null_indices, no_values)}
    )

    frame, frame_columns = arrow_table_to_dataframe(source)

    self.assertEqual(frame_columns, [Column("A", ColumnType.TEXT())])
    expected_frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    assert_frame_equal(frame, expected_frame)
def test_arrow_category_column(self):
    """An Arrow dictionary column converts to a pandas categorical text column."""
    indices = pyarrow.array([0, 1, None, 0], type=pyarrow.int8())
    dictionary = pyarrow.array(["A", "B"], type=pyarrow.string())
    source = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(indices, dictionary)}
    )

    frame, frame_columns = arrow_table_to_dataframe(source)

    self.assertEqual(frame_columns, [Column("A", ColumnType.TEXT())])
    # The null index is expected to come back as a missing category value.
    expected_frame = pd.DataFrame(
        {"A": ["A", "B", None, "A"]}, dtype="category"
    )
    assert_frame_equal(frame, expected_frame)
def test_dataframe_all_null_category_column(self):
    """An all-null categorical column becomes an Arrow dictionary with no values."""
    frame = pd.DataFrame({"A": [None]}, dtype=str).astype("category")
    text_columns = [Column("A", ColumnType.TEXT())]

    actual = dataframe_to_arrow_table(frame, text_columns, self.path)

    # One null index pointing into an empty string dictionary.
    null_indices = pyarrow.array([None], type=pyarrow.int8())
    no_values = pyarrow.array([], type=pyarrow.string())
    expected = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(null_indices, no_values)}
    )
    assert_arrow_table_equals(actual, expected)
def test_dataframe_category_column(self):
    """A pandas categorical column becomes an Arrow dictionary-encoded column."""
    frame = pd.DataFrame({"A": ["A", "B", None, "A"]}, dtype="category")
    text_columns = [Column("A", ColumnType.TEXT())]

    actual = dataframe_to_arrow_table(frame, text_columns, self.path)

    # int8 indices into the two-entry dictionary; the missing value is a
    # null index.
    indices = pyarrow.array([0, 1, None, 0], type=pyarrow.int8())
    dictionary = pyarrow.array(["A", "B"], type=pyarrow.string())
    expected = arrow_table(
        {"A": pyarrow.DictionaryArray.from_arrays(indices, dictionary)}
    )
    assert_arrow_table_equals(actual, expected)
def test_to_arrow(self):
    """TableShape.to_arrow() yields a TableMetadata with the same row count and columns."""
    shape = TableShape(
        3,
        [
            Column("A", ColumnType.NUMBER("{:,d}")),
            Column("B", ColumnType.TEXT()),
        ],
    )

    actual = shape.to_arrow()

    expected = atypes.TableMetadata(
        3,
        [
            atypes.Column("A", atypes.ColumnType.Number("{:,d}")),
            atypes.Column("B", atypes.ColumnType.Text()),
        ],
    )
    self.assertEqual(actual, expected)
def test_ctor_infer_columns(self):
    """The ProcessResult constructor infers column types from the DataFrame.

    Column "C" mixes NaN with a datetime and is expected to be reported
    as a datetime column.
    """
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
        }
    )

    actual = ProcessResult(frame).columns

    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
        Column("C", ColumnType.DATETIME()),
    ]
    self.assertEqual(actual, expected)
def test_to_arrow(self):
    """ColumnType.TEXT() converts to the Arrow-side Text column type.

    NOTE(review): another test named ``test_to_arrow`` exists for
    TableShape; presumably the two live in different TestCase classes --
    confirm, since within one class the later definition would shadow
    the earlier one.
    """
    actual = ColumnType.TEXT().to_arrow()
    expected = atypes.ColumnType.Text()
    self.assertEqual(actual, expected)
def test_from_arrow(self):
    """ColumnType.from_arrow() maps the Arrow-side Text type to ColumnType.TEXT()."""
    actual = ColumnType.from_arrow(atypes.ColumnType.Text())
    expected = ColumnType.TEXT()
    self.assertEqual(actual, expected)
def test_text_type(self):
    """Formatting a text series returns the same values, NaN included."""
    values = pd.Series(["x", np.nan, "z"])

    formatted = ColumnType.TEXT().format_series(values)

    expected = pd.Series(["x", np.nan, "z"])
    assert_series_equal(formatted, expected)