def df_timestamp(npartitions):
    """Return a Dask dataframe with one tz-aware (UTC) column ``a``.

    Parameters
    ----------
    npartitions : int
        Number of partitions for the resulting Dask dataframe.
    """
    frame = pd.DataFrame(
        {'a': list(range(10))},
        dtype=pd.DatetimeTZDtype(tz='UTC'),
    )
    return dd.from_pandas(frame, npartitions=npartitions)
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    cases = [
        (pd.CategoricalDtype(),
         pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"), None),
        (pd.DatetimeTZDtype(tz='UTC'),
         pd.Series(pd.date_range(start="20200101", end="20200301"),
                   dtype="datetime64[ns, utc]"), None),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"], dtype="string"), None),
        (pd.PeriodDtype(freq='D'),
         pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')), None),
        # Sparse data must be declared nullable for validation to pass.
        (pd.SparseDtype("float"),
         pd.Series(range(100)).where(lambda s: s < 5, other=np.nan).astype("Sparse[float]"),
         {"nullable": True}),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (pd.IntervalDtype(subtype="int64"),
         pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])), None),
    ]

    for dtype, sample, schema_kwargs in cases:
        schema = SeriesSchema(pandas_dtype=dtype, **(schema_kwargs or {}))
        # A passing validation returns the (possibly coerced) series.
        assert isinstance(schema.validate(sample), pd.Series)
def __post_init__(self): if self.tz is None: type_ = np.dtype("datetime64[ns]") else: type_ = pd.DatetimeTZDtype(self.unit, self.tz) # DatetimeTZDtype converted tz to tzinfo for us object.__setattr__(self, "tz", type_.tz) object.__setattr__(self, "type", type_)
def test_timezones_fixed_format_series_nonempty(setup_path, tz_aware_fixture):
    # GH 20594: tz-aware series must survive a fixed-format HDF round trip.
    tz_dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture)
    with ensure_clean_store(setup_path) as store:
        original = Series([0], dtype=tz_dtype)
        store["s"] = original
        roundtripped = store["s"]
        tm.assert_series_equal(roundtripped, original)
def test_timezones_fixed_format_frame_empty(setup_path, tz_aware_fixture):
    # GH 20594: an EMPTY tz-aware frame must survive a fixed-format round trip.
    tz_dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture)
    with ensure_clean_store(setup_path) as store:
        expected = DataFrame({"A": Series(dtype=tz_dtype)})
        store["df"] = expected
        roundtripped = store["df"]
        tm.assert_frame_equal(roundtripped, expected)
def df_timestamp(npartitions):
    """Return a Dask dataframe whose ``a`` column is tz-aware (UTC).

    Columns ``b`` and ``c`` hold plain strings; only ``a`` is cast to
    ``DatetimeTZDtype`` after construction.
    """
    frame = pd.DataFrame({
        'a': list(range(10)),
        'b': list('wwwwwxxxxx'),
        'c': list('yyyzzzyyzz'),
    })
    frame['a'] = frame['a'].astype(pd.DatetimeTZDtype(tz='UTC'))
    return dd.from_pandas(frame, npartitions=npartitions)
def test_meta_nonempty_scalar():
    """meta_nonempty on scalars: numpy scalars keep their type, Timestamps
    pass through by identity, and a DatetimeTZDtype maps to a tz-aware
    Timestamp with matching tz/unit."""
    assert isinstance(meta_nonempty(np.float64(1.0)), np.float64)

    ts = pd.Timestamp(2000, 1, 1)
    assert meta_nonempty(ts) is ts

    # DatetimeTZDtype
    tz_dtype = pd.DatetimeTZDtype(tz="UTC")
    assert meta_nonempty(tz_dtype) == pd.Timestamp(
        1, tz=tz_dtype.tz, unit=tz_dtype.unit)
def test_timezones_fixed_format_empty(setup_path, tz_aware_fixture, frame_or_series):
    # GH 20594: empty tz-aware Series/DataFrame round-trips through fixed format.
    tz_dtype = pd.DatetimeTZDtype(tz=tz_aware_fixture)
    expected = Series(dtype=tz_dtype, name="A")
    if frame_or_series is DataFrame:
        expected = expected.to_frame()

    with ensure_clean_store(setup_path) as store:
        store["obj"] = expected
        roundtripped = store["obj"]
        tm.assert_equal(roundtripped, expected)
def _extract_schema(self, df):
    """Build a BigQuery schema (list of ``SchemaField``) from ``df``'s dtypes.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column dtypes determine the BigQuery column types.

    Returns
    -------
    list of bigquery.SchemaField, one per column, in column order.

    Raises
    ------
    KeyError
        If a column's dtype has no BigQuery mapping (same as before).
    """
    # dtype -> BigQuery type name. Built once; the original rebuilt a dict
    # of six SchemaField objects on every loop iteration.
    bq_type_for_dtype = {
        np.dtype('object'): 'STRING',
        np.dtype('int32'): 'INTEGER',
        np.dtype('int64'): 'INTEGER',
        np.dtype('float64'): 'FLOAT',
        np.dtype('datetime64[ns]'): 'DATE',
        pd.DatetimeTZDtype(tz='UTC'): 'DATE',
    }
    return [
        bigquery.SchemaField(column_name, bq_type_for_dtype[dtype])
        for column_name, dtype in df.dtypes.items()
    ]
def df_time() -> pd.DataFrame:
    """Return a 3-row frame with one column per temporal dtype:
    naive datetime, UTC-aware datetime, timedelta, and daily period."""
    return pd.DataFrame({
        "date": [pd.Timestamp("now") for _ in range(3)],
        "datetz": pd.Series(
            [pd.Timestamp("now") for _ in range(3)]
        ).astype(pd.DatetimeTZDtype(tz="UTC")),
        "timedelta": [pd.Timedelta(1) for _ in range(3)],
        "period": pd.arrays.PeriodArray([0, 1, 2], freq="D"),
    })
def sql_to_python_type(sql_type: str) -> type:
    """Turn an SQL type into a dataframe dtype"""
    if sql_type.startswith(("CHAR(", "VARCHAR(")):
        return pd.StringDtype()
    if sql_type.startswith("INTERVAL"):
        return np.dtype("<m8[ns]")
    if sql_type.startswith(("TIMESTAMP(", "TIME(")):
        return np.dtype("<M8[ns]")
    if sql_type.startswith("TIMESTAMP_WITH_LOCAL_TIME_ZONE("):
        # Everything is converted to UTC
        # So far, this did not break
        return pd.DatetimeTZDtype(unit="ns", tz="UTC")
    if sql_type.startswith("DECIMAL("):
        # We use np.float64 always, even though we might
        # be able to use a smaller type
        return np.float64

    # Fixed-name types fall through to the lookup table.
    try:
        return _SQL_TO_PYTHON_FRAMES[sql_type]
    except KeyError:  # pragma: no cover
        raise NotImplementedError(
            f"The SQL type {sql_type} is not implemented (yet)")
]
datetime_dtypes = [np.datetime64, np.timedelta64]
string_dtypes = [object]
# Extension dtypes appeared incrementally across pandas versions; fall back
# to an empty list on builds that lack any of them.
# NOTE(review): the list mixes dtype classes (pd.Int8Dtype) and dtype
# instances (pd.DatetimeTZDtype(...)) — presumably intentional for these
# benchmarks; confirm downstream usage accepts both.
try:
    extension_dtypes = [
        pd.Int8Dtype,
        pd.Int16Dtype,
        pd.Int32Dtype,
        pd.Int64Dtype,
        pd.UInt8Dtype,
        pd.UInt16Dtype,
        pd.UInt32Dtype,
        pd.UInt64Dtype,
        pd.CategoricalDtype,
        pd.IntervalDtype,
        pd.DatetimeTZDtype("ns", "UTC"),
        pd.PeriodDtype("D"),
    ]
except AttributeError:
    extension_dtypes = []


def setup(*args, **kwargs):
    # This function just needs to be imported into each benchmark file to
    # set up the random seed before each function.
    # https://asv.readthedocs.io/en/latest/writing_benchmarks.html
    np.random.seed(1234)


class BaseIO:
    """
DatetimeArray._from_sequence(["2000", "2001"]), ), ( pd.DatetimeIndex(["2000", "2001"]), None, DatetimeArray._from_sequence(["2000", "2001"]), ), ( ["2000", "2001"], np.dtype("datetime64[ns]"), DatetimeArray._from_sequence(["2000", "2001"]), ), # Datetime (tz-aware) ( ["2000", "2001"], pd.DatetimeTZDtype(tz="CET"), DatetimeArray._from_sequence(["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")), ), # Timedelta ( ["1H", "2H"], np.dtype("timedelta64[ns]"), TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), np.dtype("timedelta64[ns]"), TimedeltaArray._from_sequence(["1H", "2H"]), ), (
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DatetimeIndex,
    Index,
)
import pandas._testing as tm

# Datetime-like dtypes cross-tested pairwise below: every (left, right)
# combination of these should yield "no matches" from get_indexer_non_unique.
dtlike_dtypes = [
    np.dtype("timedelta64[ns]"),
    np.dtype("datetime64[ns]"),
    pd.DatetimeTZDtype("ns", "Asia/Tokyo"),
    pd.PeriodDtype("ns"),
]


@pytest.mark.parametrize("ldtype", dtlike_dtypes)
@pytest.mark.parametrize("rdtype", dtlike_dtypes)
def test_get_indexer_non_unique_wrong_dtype(ldtype, rdtype):
    # Non-unique integer epochs (hours), duplicated via tile.
    vals = np.tile(3600 * 10**9 * np.arange(3), 2)

    def construct(dtype):
        if dtype is dtlike_dtypes[-1]:
            # PeriodArray will try to cast ints to strings
            return DatetimeIndex(vals).astype(dtype)
        return Index(vals, dtype=dtype)

    left = construct(ldtype)
# datetime ([pd.Timestamp('2000', ), pd.Timestamp('2001') ], pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), ([datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1) ], pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), (np.array([1, 2], dtype='M8[ns]'), pd.arrays.DatetimeArray(np.array([1, 2], dtype='M8[ns]'))), (np.array([1, 2], dtype='M8[us]'), pd.arrays.DatetimeArray(np.array([1000, 2000], dtype='M8[ns]'))), # datetimetz ([pd.Timestamp('2000', tz='CET'), pd.Timestamp('2001', tz='CET')], pd.arrays.DatetimeArray._from_sequence( ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz='CET'))), ([ datetime.datetime(2000, 1, 1, tzinfo=cet), datetime.datetime(2001, 1, 1, tzinfo=cet) ], pd.arrays.DatetimeArray._from_sequence(['2000', '2001'], tz=cet)), # timedelta ([pd.Timedelta('1H'), pd.Timedelta('2H') ], pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), (np.array([1, 2], dtype='m8[ns]'), pd.arrays.TimedeltaArray(np.array([1, 2], dtype='m8[ns]'))), (np.array([1, 2], dtype='m8[us]'), pd.arrays.TimedeltaArray(np.array([1000, 2000], dtype='m8[ns]'))), ]) def test_array_inference(data, expected): result = pd.array(data)
(np.array([1, 2], dtype='datetime64[ns]'), None, pd.arrays.DatetimeArray._from_sequence( np.array([1, 2], dtype='datetime64[ns]'))), (pd.DatetimeIndex(['2000', '2001']), np.dtype('datetime64[ns]'), pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), (pd.DatetimeIndex(['2000', '2001']), None, pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), (['2000', '2001'], np.dtype('datetime64[ns]'), pd.arrays.DatetimeArray._from_sequence(['2000', '2001'])), # Datetime (tz-aware) (['2000', '2001'], pd.DatetimeTZDtype(tz="CET"), pd.arrays.DatetimeArray._from_sequence( ['2000', '2001'], dtype=pd.DatetimeTZDtype(tz="CET"))), # Timedelta (['1H', '2H'], np.dtype('timedelta64[ns]'), pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), (pd.TimedeltaIndex(['1H', '2H']), np.dtype('timedelta64[ns]'), pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), (pd.TimedeltaIndex(['1H', '2H']), None, pd.arrays.TimedeltaArray._from_sequence(['1H', '2H'])), # Category (['a', 'b'], 'category', pd.Categorical(['a', 'b'])),
def test_python_to_sql():
    """Map a few dataframe dtypes to their SQL type names."""
    assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER"
    assert str(python_to_sql_type(np.dtype(">M8[ns]"))) == "TIMESTAMP"
    tz_dtype = pd.DatetimeTZDtype(unit="ns", tz="UTC")
    assert str(python_to_sql_type(tz_dtype)) == "TIMESTAMP_WITH_LOCAL_TIME_ZONE"
for checks in [ pa.Check.le(check_arg), pa.Check.ge(check_arg), pa.Check.eq(check_arg), pa.Check.isin([check_arg]), ]: column_schema = pa.Column("datetime", checks=checks, name="test_datetime") column_schema(data.draw(column_schema.strategy())) @pytest.mark.parametrize( "dtype", ( pd.DatetimeTZDtype(tz="UTC"), pd.DatetimeTZDtype(tz="dateutil/US/Central"), ), ) @pytest.mark.parametrize( "check_arg", [ pd.Timestamp("2006-01-01", tz="CET"), pd.Timestamp("2006-01-01", tz="UTC"), ], ) @hypothesis.given(st.data()) @hypothesis.settings( suppress_health_check=[hypothesis.HealthCheck.too_slow], ) def test_datetime_tz_example(dtype, check_arg, data) -> None: """Test Column schema example method generate examples of
class TestSeriesConvertDtypes:
    # The answerdict has keys that have 4 tuples, corresponding to the arguments
    # infer_objects, convert_string, convert_integer, convert_boolean
    # This allows all 16 possible combinations to be tested. Since common
    # combinations expect the same answer, this provides an easy way to list
    # all the possibilities
    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int32",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("int32"),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("int64"),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    ((True, False), (True,), (True, False), (True, False)):
                    pd.StringDtype(),
                    ((True, False), (False,), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    ((True, False), (True, False), (True, False), (True,)):
                    pd.BooleanDtype(),
                    ((True, False), (True, False), (True, False), (False,)):
                    np.dtype("O"),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    ((True, False), (True,), (True, False), (True, False)):
                    pd.StringDtype(),
                    ((True, False), (False,), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("float"),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    np.dtype("float"),
                },
            ),
            (
                [3, 4, 5],
                "Int8",
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    "Int8"
                },
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "UInt32",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("uint32"),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int8",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("i1"),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int64",
                    ((True,), (True, False), (False,), (True, False)):
                    np.dtype("float"),
                    ((False,), (True, False), (False,), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    pd.CategoricalDtype(),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    pd.DatetimeTZDtype(tz="UTC"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    np.dtype("datetime64[ns]"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    ((True,), (True, False), (True, False), (True, False)):
                    np.dtype("datetime64[ns]"),
                    ((False,), (True, False), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    pd.PeriodDtype("M"),
                },
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
                None,
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    pd.IntervalDtype("int64"),
                },
            ),
        ],
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        """Check convert_dtypes against the expected dtype for every flag
        combination, and that the result is a copy of the input."""
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)
        # Expand each key (a tuple of per-flag option tuples) into every
        # concrete 4-flag combination it covers.
        answers = {
            k: a
            for (kk, a) in answerdict.items()
            for k in product(*kk)
        }

        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)
'plant_id_eia': 'uint16', 'unitid': 'category', 'gross_load_mw': 'float32', 'steam_load_1000_lbs': 'float32', 'so2_mass_lbs': 'float32', 'so2_mass_measurement_code': 'category', 'nox_rate_lbs_mmbtu': 'float32', 'nox_rate_measurement_code': 'category', 'nox_mass_lbs': 'float32', 'nox_mass_measurement_code': 'category', 'co2_mass_tons': 'float32', 'co2_mass_measurement_code': 'category', 'heat_content_mmbtu': 'float32', 'facility_id': 'category', 'unit_id_epa': 'category', 'operating_datetime_utc': pd.DatetimeTZDtype(tz="UTC"), 'operating_time_hours': 'float32' } def parse_command_line(argv): """ Parse command line arguments. See the -h option. :param argv: arguments on the command line must include caller file name. """ parser = argparse.ArgumentParser() parser.add_argument( '-q', '--quiet',
        break
    except (ImportError, TypeError, ValueError):
        pass

numeric_dtypes = [np.int64, np.int32, np.uint32, np.uint64, np.float32,
                  np.float64, np.int16, np.int8, np.uint16, np.uint8]
datetime_dtypes = [np.datetime64, np.timedelta64]
# NOTE(review): np.object was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `object` (as the sibling benchmark module uses) is the safe
# spelling — confirm before bumping the NumPy pin.
string_dtypes = [np.object]
# Extension dtypes are unavailable on older pandas; degrade gracefully.
try:
    extension_dtypes = [pd.Int8Dtype, pd.Int16Dtype, pd.Int32Dtype,
                        pd.Int64Dtype, pd.UInt8Dtype, pd.UInt16Dtype,
                        pd.UInt32Dtype, pd.UInt64Dtype, pd.CategoricalDtype,
                        pd.IntervalDtype,
                        pd.DatetimeTZDtype('ns', 'UTC'),
                        pd.PeriodDtype('D')]
except AttributeError:
    extension_dtypes = []


def setup(*args, **kwargs):
    # This function just needs to be imported into each benchmark file to
    # set up the random seed before each function.
    # http://asv.readthedocs.io/en/latest/writing_benchmarks.html
    np.random.seed(1234)


class BaseIO(object):
    """
    Base class for IO benchmarks
def test_make_meta():
    """Exercise ``make_meta`` for every supported input kind: pandas
    objects, dask collections, dtype mappings/specs, scalars and dtypes.
    The resulting meta is always length 0 but dtype/index-faithful."""
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": list("abc"),
        "c": [1.0, 2.0, 3.0]
    }, index=[10, 20, 30])

    # Pandas dataframe
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict
    meta = make_meta({"a": "i8", "b": "O", "c": "f8"})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # List
    meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")])
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta(("a", "i8"))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # Iterable
    class CustomMetadata(Iterable):
        """Custom class iterator returning pandas types."""

        def __init__(self, max=0):
            self.types = [("a", "i8"), ("c", "f8"), ("b", "O")]

        def __iter__(self):
            self.n = 0
            return self

        def __next__(self):
            if self.n < len(self.types):
                ret = self.types[self.n]
                self.n += 1
                return ret
            else:
                raise StopIteration

    meta = make_meta(CustomMetadata())
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # With index
    idx = pd.Index([1, 2], name="foo")
    meta = make_meta(
        {
            "a": "i8",
            "b": "i4"
        },
        index=idx,
    )
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0

    meta = make_meta(("a", "i8"), index=idx)
    assert type(meta.index) is type(idx)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0

    # Categoricals: unknown categories are represented by the sentinel.
    meta = make_meta({"a": "category"}, parent_meta=df)
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(("a", "category"), parent_meta=df)
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0), parent_meta=df)
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta(1.0, parent_meta=df)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x, parent_meta=df)
    assert meta is x

    # DatetimeTZDtype
    x = pd.DatetimeTZDtype(tz="UTC")
    meta = make_meta(x)
    assert meta == pd.Timestamp(1, tz=x.tz, unit=x.unit)

    # Dtype expressions
    meta = make_meta("i8", parent_meta=df)
    assert isinstance(meta, np.int64)
    meta = make_meta(float, parent_meta=df)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype("bool"), parent_meta=df)
    assert isinstance(meta, np.bool_)
    # NOTE(review): pytest.raises(TypeError, func) performs the check itself
    # and returns a truthy ExceptionInfo; the leading `assert` is redundant.
    assert pytest.raises(TypeError, lambda: make_meta(None))
def test_make_meta():
    """Exercise ``make_meta`` for pandas/dask inputs, dtype specs, scalars
    and dtypes; meta is always empty but dtype/index-faithful."""
    df = pd.DataFrame({
        "a": [1, 2, 3],
        "b": list("abc"),
        "c": [1.0, 2.0, 3.0]
    }, index=[10, 20, 30])

    # Pandas dataframe
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict
    meta = make_meta({"a": "i8", "b": "O", "c": "f8"})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable
    meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")])
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta(("a", "i8"))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # With index
    # NOTE(review): pd.Int64Index was removed in pandas 2.0 — this module
    # presumably pins an older pandas; confirm before upgrading.
    meta = make_meta(
        {
            "a": "i8",
            "b": "i4"
        },
        index=pd.Int64Index([1, 2], name="foo"),
    )
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0

    meta = make_meta(("a", "i8"), index=pd.Int64Index([1, 2], name="foo"))
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0

    # Categoricals: unknown categories are represented by the sentinel.
    meta = make_meta({"a": "category"}, parent_meta=df)
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(("a", "category"), parent_meta=df)
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0), parent_meta=df)
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta(1.0, parent_meta=df)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x, parent_meta=df)
    assert meta is x

    # DatetimeTZDtype
    x = pd.DatetimeTZDtype(tz="UTC")
    meta = make_meta(x)
    assert meta == pd.Timestamp(1, tz=x.tz, unit=x.unit)

    # Dtype expressions
    meta = make_meta("i8", parent_meta=df)
    assert isinstance(meta, np.int64)
    meta = make_meta(float, parent_meta=df)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype("bool"), parent_meta=df)
    assert isinstance(meta, np.bool_)
    # NOTE(review): pytest.raises(TypeError, func) performs the check itself;
    # the leading `assert` is redundant but harmless.
    assert pytest.raises(TypeError, lambda: make_meta(None))
# Alias tables mapping user-facing dtype specifications to the canonical
# pandas dtype (or dtype string) each one should resolve to.
object_dtypes = {object: "object", np.object_: "object"}

category_dtypes = {
    pa.Category: "category",
    pa.Category(["A", "B"], ordered=True): pd.CategoricalDtype(
        ["A", "B"], ordered=True
    ),
    pd.CategoricalDtype(["A", "B"], ordered=True): pd.CategoricalDtype(
        ["A", "B"], ordered=True
    ),
}

timestamp_dtypes = {
    datetime.datetime: "datetime64[ns]",
    np.datetime64: "datetime64[ns]",
    pa.Timestamp: "datetime64[ns]",
    pd.DatetimeTZDtype(tz="CET"): "datetime64[ns, CET]",
    pandas_engine.DateTime: "datetime64[ns]",
    pandas_engine.DateTime(unit="ns", tz="CET"): "datetime64[ns, CET]",  # type: ignore
}

# Fix: the original listed the literal key ``datetime.timedelta`` twice;
# duplicate dict keys silently collapse, so one entry is removed.
timedelta_dtypes = {
    datetime.timedelta: "timedelta64",
    np.timedelta64: "timedelta64",
    pd.Timedelta: "timedelta64",
    pa.Timedelta: "timedelta64",
}

period_dtypes = {pd.PeriodDtype(freq="D"): "period[D]"}
# Series.astype does not accept a string alias for SparseDtype.
class TestSeriesConvertDtypes:
    # The answerdict has keys that have 4 tuples, corresponding to the arguments
    # infer_objects, convert_string, convert_integer, convert_boolean
    # This allows all 16 possible combinations to be tested. Since common
    # combinations expect the same answer, this provides an easy way to list
    # all the possibilities
    @pytest.mark.parametrize(
        "data, maindtype, answerdict",
        [
            (
                [1, 2, 3],
                np.dtype("int32"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int32",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("int32"),
                },
            ),
            (
                [1, 2, 3],
                np.dtype("int64"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("int64"),
                },
            ),
            (
                ["x", "y", "z"],
                np.dtype("O"),
                {
                    ((True, False), (True,), (True, False), (True, False)):
                    pd.StringDtype(),
                    ((True, False), (False,), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [True, False, np.nan],
                np.dtype("O"),
                {
                    ((True, False), (True, False), (True, False), (True,)):
                    pd.BooleanDtype(),
                    ((True, False), (True, False), (True, False), (False,)):
                    np.dtype("O"),
                },
            ),
            (
                ["h", "i", np.nan],
                np.dtype("O"),
                {
                    ((True, False), (True,), (True, False), (True, False)):
                    pd.StringDtype(),
                    ((True, False), (False,), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (  # GH32117
                ["h", "i", 1],
                np.dtype("O"),
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [10, np.nan, 20],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int64",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("float"),
                },
            ),
            (
                [np.nan, 100.5, 200],
                np.dtype("float"),
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    np.dtype("float"),
                },
            ),
            (
                [3, 4, 5],
                "Int8",
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    "Int8"
                },
            ),
            (
                [[1, 2], [3, 4], [5]],
                None,
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                [4, 5, 6],
                np.dtype("uint32"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "UInt32",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("uint32"),
                },
            ),
            (
                [-10, 12, 13],
                np.dtype("i1"),
                {
                    ((True, False), (True, False), (True,), (True, False)):
                    "Int8",
                    ((True, False), (True, False), (False,), (True, False)):
                    np.dtype("i1"),
                },
            ),
            (
                [1, 2.0],
                object,
                {
                    ((True,), (True, False), (True,), (True, False)):
                    "Int64",
                    ((True,), (True, False), (False,), (True, False)):
                    np.dtype("float"),
                    ((False,), (True, False), (True, False), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                [1, 2.5],
                object,
                {
                    ((True,), (True, False), (True, False), (True, False)):
                    np.dtype("float"),
                    ((False,), (True, False), (True, False), (True, False)):
                    np.dtype("object"),
                },
            ),
            (
                ["a", "b"],
                pd.CategoricalDtype(),
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    pd.CategoricalDtype(),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                pd.DatetimeTZDtype(tz="UTC"),
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    pd.DatetimeTZDtype(tz="UTC"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                "datetime64[ns]",
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    np.dtype("datetime64[ns]"),
                },
            ),
            (
                pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
                object,
                {
                    ((True,), (True, False), (True, False), (True, False)):
                    np.dtype("datetime64[ns]"),
                    ((False,), (True, False), (True, False), (True, False)):
                    np.dtype("O"),
                },
            ),
            (
                pd.period_range("1/1/2011", freq="M", periods=3),
                None,
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    pd.PeriodDtype("M"),
                },
            ),
            (
                pd.arrays.IntervalArray([pd.Interval(0, 1), pd.Interval(1, 5)]),
                None,
                {
                    ((True, False), (True, False), (True, False), (True, False)):
                    pd.IntervalDtype("int64"),
                },
            ),
        ],
    )
    @pytest.mark.parametrize("params", product(*[(True, False)] * 4))
    def test_convert_dtypes(self, data, maindtype, params, answerdict):
        """Check convert_dtypes against the expected dtype for every flag
        combination, and that the result is a copy of the input."""
        if maindtype is not None:
            series = pd.Series(data, dtype=maindtype)
        else:
            series = pd.Series(data)
        # Expand each key (a tuple of per-flag option tuples) into every
        # concrete 4-flag combination it covers.
        answers = {
            k: a
            for (kk, a) in answerdict.items()
            for k in product(*kk)
        }

        ns = series.convert_dtypes(*params)
        expected_dtype = answers[tuple(params)]
        expected = pd.Series(series.values, dtype=expected_dtype)
        tm.assert_series_equal(ns, expected)

        # Test that it is a copy
        copy = series.copy(deep=True)
        if is_interval_dtype(ns.dtype) and ns.dtype.subtype.kind in ["i", "u"]:
            # Integer-backed intervals cannot absorb float NaN in-place.
            msg = "Cannot set float NaN to integer-backed IntervalArray"
            with pytest.raises(ValueError, match=msg):
                ns[ns.notna()] = np.nan
        else:
            ns[ns.notna()] = np.nan

        # Make sure original not changed
        tm.assert_series_equal(series, copy)

    def test_convert_string_dtype(self):
        # https://github.com/pandas-dev/pandas/issues/31731 -> converting columns
        # that are already string dtype
        df = pd.DataFrame({
            "A": ["a", "b", pd.NA],
            "B": ["ä", "ö", "ü"]
        }, dtype="string")
        result = df.convert_dtypes()
        tm.assert_frame_equal(df, result)

    def test_convert_bool_dtype(self):
        # GH32287
        df = pd.DataFrame({"A": pd.array([True])})
        tm.assert_frame_equal(df, df.convert_dtypes())
def astype_datetime(self):
    # Benchmark: cast the pre-built float column to a tz-aware datetime
    # dtype; the result is intentionally discarded (only timing matters).
    target = pd.DatetimeTZDtype(tz="US/Pacific")
    self.df["float"].astype(target)
DatetimeArray._from_sequence(["2000", "2001"]), ), ( pd.DatetimeIndex(["2000", "2001"]), None, DatetimeArray._from_sequence(["2000", "2001"]), ), ( ["2000", "2001"], np.dtype("datetime64[ns]"), DatetimeArray._from_sequence(["2000", "2001"]), ), # Datetime (tz-aware) ( ["2000", "2001"], pd.DatetimeTZDtype(tz="CET"), DatetimeArray._from_sequence( ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET") ), ), # Timedelta ( ["1H", "2H"], np.dtype("timedelta64[ns]"), TimedeltaArray._from_sequence(["1H", "2H"]), ), ( pd.TimedeltaIndex(["1H", "2H"]), np.dtype("timedelta64[ns]"), TimedeltaArray._from_sequence(["1H", "2H"]), ),
np.dtype("object"), }, ), ( [1, 2.5], object, "Float64", { ("convert_floating", False): np.dtype("float"), ("infer_objects", False): np.dtype("object"), }, ), (["a", "b"], pd.CategoricalDtype(), pd.CategoricalDtype(), {}), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), pd.DatetimeTZDtype(tz="UTC"), pd.DatetimeTZDtype(tz="UTC"), {}, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), "datetime64[ns]", np.dtype("datetime64[ns]"), {}, ), ( pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]), object, np.dtype("datetime64[ns]"), { ("infer_objects", False): np.dtype("object")