def test_coerce_infer_columns_format_supercedes_try_fallback_columns(self):
    """An explicit ``column_formats`` entry wins over the fallback column's format."""
    frame = pd.DataFrame({"A": [1, 2]})
    payload = {"dataframe": frame, "column_formats": {"A": "{:,d}"}}
    # The fallback carries a different format for the same column.
    fallback = [Column("A", ColumnType.NUMBER("{:,.2f}"))]
    coerced = ProcessResult.coerce(payload, try_fallback_columns=fallback)
    expected = [Column("A", ColumnType.NUMBER("{:,d}"))]
    self.assertEqual(coerced.columns, expected)
def test_coerce_infer_columns_try_fallback_columns_ignore_wrong_type(self):
    """Fallback columns whose type disagrees with the data are not applied."""
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    # Both fallbacks are mismatched on purpose: A holds numbers, B holds text.
    mismatched = [
        Column("A", ColumnType.TEXT()),
        Column("B", ColumnType.NUMBER()),
    ]
    coerced = ProcessResult.coerce(frame, try_fallback_columns=mismatched)
    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(coerced.columns, expected)
def test_coerce_infer_columns(self):
    """Coercing a bare DataFrame infers NUMBER and TEXT column types."""
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    coerced = ProcessResult.coerce(frame)
    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(coerced.columns, expected)
def test_format_whole_float_as_int(self):
    """
    Whole-valued floats are rendered without a fractional part.

    This mimics d3-format, which cannot differentiate between float and int.
    """
    values = pd.Series([1.1, 2.0, 123456789.0])
    formatted = ColumnType.NUMBER("{:,}").format_series(values)
    expected = pd.Series(["1.1", "2", "123,456,789"])
    assert_series_equal(formatted, expected)
def test_arrow_uint8_column(self):
    """A uint8 Arrow column converts to a uint8 DataFrame column, keeping its format."""
    source = arrow_table(
        {"A": pyarrow.array([1, 2, 3, 253], type=pyarrow.uint8())},
        columns=[atypes.Column("A", atypes.ColumnType.Number("{:,d}"))],
    )
    dataframe, columns = arrow_table_to_dataframe(source)
    expected_frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
    assert_frame_equal(dataframe, expected_frame)
    expected_columns = [Column("A", ColumnType.NUMBER("{:,d}"))]
    self.assertEqual(columns, expected_columns)
def test_coerce_infer_columns_with_format(self):
    """A ``column_formats`` entry is applied to the inferred NUMBER column."""
    frame = pd.DataFrame({"A": [1, 2], "B": ["x", "y"]})
    payload = {"dataframe": frame, "column_formats": {"A": "{:,d}"}}
    coerced = ProcessResult.coerce(payload)
    expected = [
        Column("A", ColumnType.NUMBER(format="{:,d}")),
        Column("B", ColumnType.TEXT()),
    ]
    self.assertEqual(coerced.columns, expected)
def test_dataframe_uint8_column(self):
    """A uint8 DataFrame column converts to a uint8 Arrow column, keeping its format."""
    frame = pd.DataFrame({"A": [1, 2, 3, 253]}, dtype=np.uint8)
    actual = dataframe_to_arrow_table(
        frame,
        [Column("A", ColumnType.NUMBER("{:,d}"))],
        self.path,
    )
    expected = arrow_table(
        {"A": pyarrow.array([1, 2, 3, 253], type=pyarrow.uint8())},
        [atypes.Column("A", atypes.ColumnType.Number("{:,d}"))],
    )
    assert_arrow_table_equals(actual, expected)
def test_to_arrow(self):
    """``TableShape.to_arrow()`` yields the equivalent ``atypes.TableMetadata``."""
    shape = TableShape(
        3,
        [
            Column("A", ColumnType.NUMBER("{:,d}")),
            Column("B", ColumnType.TEXT()),
        ],
    )
    converted = shape.to_arrow()
    expected = atypes.TableMetadata(
        3,
        [
            atypes.Column("A", atypes.ColumnType.Number("{:,d}")),
            atypes.Column("B", atypes.ColumnType.Text()),
        ],
    )
    self.assertEqual(converted, expected)
def test_ctor_infer_columns(self):
    """Constructing from a DataFrame infers NUMBER, TEXT and DATETIME columns."""
    frame = pd.DataFrame(
        {
            "A": [1, 2],
            "B": ["x", "y"],
            # One missing value alongside one real timestamp.
            "C": [np.nan, dt(2019, 3, 3, 4, 5, 6, 7)],
        }
    )
    processed = ProcessResult(frame)
    expected = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
        Column("C", ColumnType.DATETIME()),
    ]
    self.assertEqual(processed.columns, expected)
def test_columns(self):
    """Names and types are reported per column; the category column reports TEXT."""
    frame = pd.DataFrame(
        {
            "A": [1],  # number
            "B": ["foo"],  # str
            "C": dt(2018, 8, 20),  # datetime64
        }
    )
    # The categorical column is attached after the frame is built.
    frame["D"] = pd.Series(["cat"], dtype="category")
    processed = ProcessResult(frame)
    self.assertEqual(processed.column_names, ["A", "B", "C", "D"])
    expected_columns = [
        Column("A", ColumnType.NUMBER()),
        Column("B", ColumnType.TEXT()),
        Column("C", ColumnType.DATETIME()),
        Column("D", ColumnType.TEXT()),
    ]
    self.assertEqual(processed.columns, expected_columns)
def test_format_float_as_int(self):
    """Formatting the float 1.1 with an integer format code gives "1"."""
    values = pd.Series([1.1])
    formatted = ColumnType.NUMBER(format="{:d}").format_series(values)
    expected = pd.Series(["1"])
    assert_series_equal(formatted, expected)
def test_format_int_as_float(self):
    """Integers formatted with a one-decimal float format gain a ".0" suffix."""
    values = pd.Series([1, 2, 3, 4], dtype=int)
    formatted = ColumnType.NUMBER(format="{:.1f}").format_series(values)
    expected = pd.Series(["1.0", "2.0", "3.0", "4.0"])
    assert_series_equal(formatted, expected)
def test_table_shape(self):
    """``table_shape`` reports the row count together with the inferred columns."""
    frame = pd.DataFrame({"A": [1, 2, 3]})
    processed = ProcessResult(frame)
    expected = TableShape(3, [Column("A", ColumnType.NUMBER())])
    self.assertEqual(processed.table_shape, expected)
def test_custom_format(self):
    """A prefixed custom format is applied to each value; NaN stays NaN."""
    values = pd.Series([1.1, 2231, np.nan, 0.123])
    formatted = ColumnType.NUMBER(format="${:0,.2f}").format_series(values)
    expected = pd.Series(["$1.10", "$2,231.00", np.nan, "$0.12"])
    assert_series_equal(formatted, expected)
def test_to_arrow(self):
    """``Column.to_arrow()`` yields the equivalent ``atypes.Column``."""
    column = Column("A", ColumnType.NUMBER("{:,d}"))
    converted = column.to_arrow()
    expected = atypes.Column("A", atypes.ColumnType.Number("{:,d}"))
    self.assertEqual(converted, expected)
def test_format_zero_length_becomes_str(self):
    """Formatting an empty int64 series gives an empty ``object`` series."""
    # pandas has no values from which to detect the result's type here;
    # luckily, it defaults to `object`.
    empty = pd.Series([], dtype=np.int64)
    formatted = ColumnType.NUMBER().format_series(empty)
    expected = pd.Series([], dtype=object)
    assert_series_equal(formatted, expected)
def test_format_disallow_field_converter(self):
    """A format containing a ``!r`` field converter is rejected with ValueError."""
    expected_message = "Field converters are not allowed"
    with self.assertRaisesRegex(ValueError, expected_message):
        ColumnType.NUMBER("{!r:f}")
def test_format_disallow_invalid_type(self):
    """A format with the unknown type code ``T`` is rejected with ValueError."""
    expected_message = "Unknown format code 'T'"
    with self.assertRaisesRegex(ValueError, expected_message):
        ColumnType.NUMBER("{:T}")
def test_format_percent(self):
    """A percent format scales values by 100 and appends "%"; NaN stays NaN."""
    values = pd.Series([0.3, 11.111, 0.0001, np.nan])
    formatted = ColumnType.NUMBER(format="{:,.1%}").format_series(values)
    expected = pd.Series(["30.0%", "1,111.1%", "0.0%", np.nan])
    assert_series_equal(formatted, expected)
def test_format_disallow_non_format(self):
    """A printf-style string such as ``%d`` is rejected with ValueError."""
    expected_message = 'Format must look like "{:...}"'
    with self.assertRaisesRegex(ValueError, expected_message):
        ColumnType.NUMBER("%d")
def test_format_int_as_percent(self):
    """Integer values work with a percent format: 1 becomes "100.0%"."""
    values = pd.Series([1, 11])
    formatted = ColumnType.NUMBER(format="{:,.1%}").format_series(values)
    expected = pd.Series(["100.0%", "1,100.0%"])
    assert_series_equal(formatted, expected)
def test_format_disallow_field_name(self):
    """A format that names its field (``{value:f}``) is rejected with ValueError."""
    expected_message = "Field names or numbers are not allowed"
    with self.assertRaisesRegex(ValueError, expected_message):
        ColumnType.NUMBER("{value:f}")
def test_default_format(self):
    """With no explicit format, floats render as plain text and NaN stays NaN."""
    values = pd.Series([1.1, 2.231, np.nan])
    formatted = ColumnType.NUMBER().format_series(values)
    expected = pd.Series(["1.1", "2.231", np.nan])
    assert_series_equal(formatted, expected)
def test_from_arrow(self):
    """``Column.from_arrow()`` rebuilds the equivalent ``Column`` from an ``atypes.Column``."""
    arrow_column = atypes.Column("A", atypes.ColumnType.Number("{:,d}"))
    converted = Column.from_arrow(arrow_column)
    expected = Column("A", ColumnType.NUMBER("{:,d}"))
    self.assertEqual(converted, expected)
def test_format_nulls_becomes_str(self):
    """Formatting an all-NaN float64 series gives an all-NaN ``object`` series."""
    all_null = pd.Series([np.nan, np.nan], dtype=np.float64)
    formatted = ColumnType.NUMBER().format_series(all_null)
    expected = pd.Series([np.nan, np.nan], dtype=object)
    assert_series_equal(formatted, expected)
def test_format_too_many_arguments(self):
    """A format with two replacement fields is rejected with ValueError."""
    expected_message = "Can only format one number"
    with self.assertRaisesRegex(ValueError, expected_message):
        ColumnType.NUMBER("{:d}{:f}")