def test_update_dtype_string(self, ordered): dtype = CategoricalDtype(list('abc'), ordered) expected_categories = dtype.categories expected_ordered = dtype.ordered result = dtype.update_dtype('category') tm.assert_index_equal(result.categories, expected_categories) assert result.ordered is expected_ordered
def test_update_dtype(self, ordered, new_categories, new_ordered): dtype = CategoricalDtype(list('abc'), ordered) new_dtype = CategoricalDtype(new_categories, new_ordered) expected_categories = new_dtype.categories if expected_categories is None: expected_categories = dtype.categories expected_ordered = new_dtype.ordered if expected_ordered is None: expected_ordered = dtype.ordered result = dtype.update_dtype(new_dtype) tm.assert_index_equal(result.categories, expected_categories) assert result.ordered is expected_ordered
def __new__(cls, data=None, categories=None, ordered=None, dtype=None,
            copy=False, name=None, fastpath=None):
    """Construct an instance from ``data`` and categorical dtype arguments.

    Parameters
    ----------
    data : optional
        Values to wrap. A scalar is rejected through
        ``cls._scalar_data_error`` unless it is ``None`` and ``categories``
        is given, in which case an empty list is used as the data.
    categories, ordered, dtype : optional
        Forwarded together with ``data`` to
        ``CategoricalDtype._from_values_or_dtype`` to resolve the dtype.
    copy : bool, default False
        When true, the created categorical is copied before being wrapped.
    name : optional
        Name for the result; defaults to ``data.name`` when ``data`` has one.
    fastpath : optional
        Deprecated. Any non-None value emits a ``FutureWarning``; a truthy
        value returns ``cls._simple_new(data, name=name, dtype=dtype)``
        directly.
    """
    if fastpath is not None:
        warnings.warn("The 'fastpath' keyword is deprecated, and will be "
                      "removed in a future version.",
                      FutureWarning, stacklevel=2)
        if fastpath:
            return cls._simple_new(data, name=name, dtype=dtype)

    dtype = CategoricalDtype._from_values_or_dtype(
        data, categories, ordered, dtype)

    if name is None and hasattr(data, 'name'):
        name = data.name

    # don't allow scalars
    # if data is None, then categories must be provided
    if not is_categorical_dtype(data) and is_scalar(data):
        if not (data is None and categories is not None):
            cls._scalar_data_error(data)
        data = []

    data = cls._create_categorical(data, dtype=dtype)
    if copy:
        data = data.copy()

    return cls._simple_new(data, name=name)
def test_is_dtype(self): self.assertTrue(CategoricalDtype.is_dtype(self.dtype)) self.assertTrue(CategoricalDtype.is_dtype('category')) self.assertTrue(CategoricalDtype.is_dtype(CategoricalDtype())) self.assertFalse(CategoricalDtype.is_dtype('foo')) self.assertFalse(CategoricalDtype.is_dtype(np.float64))
def test_from_values_or_dtype_invalid_dtype(self):
    """Passing ``object`` as the dtype must raise ``ValueError``."""
    expected_message = ("Cannot not construct CategoricalDtype from "
                        "<class 'object'>")
    with pytest.raises(ValueError, match=expected_message):
        CategoricalDtype._from_values_or_dtype(None, None, None, object)
def test_from_values_or_dtype(self, values, categories, ordered, dtype, expected): result = CategoricalDtype._from_values_or_dtype( values, categories, ordered, dtype) assert result == expected
class TestDataFrameDataTypes(TestData):
    """Tests for DataFrame dtype reporting, selection and casting.

    Exercises ``dtypes``/``ftypes``, ``select_dtypes``, ``astype``,
    ``get_dtype_counts``, string-dtype construction and
    ``_is_homogeneous_type``. Several tests read fixtures from ``self``
    (``frame``, ``mixed_frame``, ``mixed_float``, ``all_mixed``).
    """

    def test_concat_empty_dataframe_dtypes(self):
        """Concatenating empty frames keeps or upcasts the column dtypes."""
        df = DataFrame(columns=list("abc"))
        df['a'] = df['a'].astype(np.bool_)
        df['b'] = df['b'].astype(np.int32)
        df['c'] = df['c'].astype(np.float64)

        result = pd.concat([df, df])
        assert result['a'].dtype == np.bool_
        assert result['b'].dtype == np.int32
        assert result['c'].dtype == np.float64

        result = pd.concat([df, df.astype(np.float64)])
        assert result['a'].dtype == np.object_
        assert result['b'].dtype == np.float64
        assert result['c'].dtype == np.float64

    def test_empty_frame_dtypes_ftypes(self):
        """``dtypes``/``ftypes`` of frames with no rows and/or no columns."""
        empty_df = pd.DataFrame()
        assert_series_equal(empty_df.dtypes, pd.Series(dtype=np.object))
        assert_series_equal(empty_df.ftypes, pd.Series(dtype=np.object))

        nocols_df = pd.DataFrame(index=[1, 2, 3])
        assert_series_equal(nocols_df.dtypes, pd.Series(dtype=np.object))
        assert_series_equal(nocols_df.ftypes, pd.Series(dtype=np.object))

        norows_df = pd.DataFrame(columns=list("abc"))
        assert_series_equal(norows_df.dtypes,
                            pd.Series(np.object, index=list("abc")))
        assert_series_equal(norows_df.ftypes,
                            pd.Series('object:dense', index=list("abc")))

        norows_int_df = pd.DataFrame(columns=list("abc")).astype(np.int32)
        assert_series_equal(norows_int_df.dtypes,
                            pd.Series(np.dtype('int32'), index=list("abc")))
        assert_series_equal(norows_int_df.ftypes,
                            pd.Series('int32:dense', index=list("abc")))

        odict = OrderedDict
        df = pd.DataFrame(odict([('a', 1), ('b', True), ('c', 1.0)]),
                          index=[1, 2, 3])
        ex_dtypes = pd.Series(
            odict([('a', np.int64), ('b', np.bool), ('c', np.float64)]))
        ex_ftypes = pd.Series(
            odict([('a', 'int64:dense'), ('b', 'bool:dense'),
                   ('c', 'float64:dense')]))
        assert_series_equal(df.dtypes, ex_dtypes)
        assert_series_equal(df.ftypes, ex_ftypes)

        # same but for empty slice of df
        assert_series_equal(df[:0].dtypes, ex_dtypes)
        assert_series_equal(df[:0].ftypes, ex_ftypes)

    def test_datetime_with_tz_dtypes(self):
        """tz-aware columns report ``DatetimeTZDtype`` even with NaT set."""
        tzframe = DataFrame({
            'A': date_range('20130101', periods=3),
            'B': date_range('20130101', periods=3, tz='US/Eastern'),
            'C': date_range('20130101', periods=3, tz='CET')
        })
        tzframe.iloc[1, 1] = pd.NaT
        tzframe.iloc[1, 2] = pd.NaT
        result = tzframe.dtypes.sort_index()
        expected = Series([
            np.dtype('datetime64[ns]'),
            DatetimeTZDtype('ns', 'US/Eastern'),
            DatetimeTZDtype('ns', 'CET')
        ], ['A', 'B', 'C'])

        assert_series_equal(result, expected)

    def test_dtypes_are_correct_after_column_slice(self):
        """Slicing columns must not disturb the parent frame's dtypes."""
        # GH6525
        df = pd.DataFrame(index=range(5), columns=list("abc"),
                          dtype=np.float_)
        odict = OrderedDict
        assert_series_equal(
            df.dtypes,
            pd.Series(
                odict([('a', np.float_), ('b', np.float_),
                       ('c', np.float_)])))
        assert_series_equal(df.iloc[:, 2:].dtypes,
                            pd.Series(odict([('c', np.float_)])))
        assert_series_equal(
            df.dtypes,
            pd.Series(
                odict([('a', np.float_), ('b', np.float_),
                       ('c', np.float_)])))

    def test_select_dtypes_include_using_list_like(self):
        """``select_dtypes`` with list-like ``include`` arguments."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3)
        })

        ri = df.select_dtypes(include=[np.number])
        ei = df[['b', 'c', 'd', 'k']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number], exclude=['timedelta'])
        ei = df[['b', 'c', 'd']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
                              exclude=['timedelta'])
        ei = df[['b', 'c', 'd', 'f']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetime64'])
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=['datetimetz'])
        ei = df[['h', 'i']]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include=['period'])

    def test_select_dtypes_exclude_using_list_like(self):
        """``select_dtypes`` with a list-like ``exclude`` argument."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True]
        })
        re = df.select_dtypes(exclude=[np.number])
        ee = df[['a', 'e']]
        assert_frame_equal(re, ee)

    def test_select_dtypes_exclude_include_using_list_like(self):
        """``select_dtypes`` with tuple ``include`` and ``exclude`` together."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values
        })
        exclude = (np.datetime64,)
        include = (np.bool_, 'integer')
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'c', 'e']]
        assert_frame_equal(r, e)

        exclude = ('datetime',)
        include = ('bool', 'int64', 'int32')
        r = df.select_dtypes(include=include, exclude=exclude)
        e = df[['b', 'e']]
        assert_frame_equal(r, e)

    def test_select_dtypes_include_using_scalars(self):
        """``select_dtypes`` with a scalar ``include`` argument."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3)
        })

        ri = df.select_dtypes(include=np.number)
        ei = df[['b', 'c', 'd', 'k']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='datetime')
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='datetime64')
        ei = df[['g']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include='category')
        ei = df[['f']]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(include='period')

    def test_select_dtypes_exclude_using_scalars(self):
        """``select_dtypes`` with a scalar ``exclude`` argument."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3)
        })

        ri = df.select_dtypes(exclude=np.number)
        ei = df[['a', 'e', 'f', 'g', 'h', 'i', 'j']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(exclude='category')
        ei = df[['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'j', 'k']]
        assert_frame_equal(ri, ei)

        with pytest.raises(NotImplementedError, match=r"^$"):
            df.select_dtypes(exclude='period')

    def test_select_dtypes_include_exclude_using_scalars(self):
        """``select_dtypes`` with scalar ``include`` and ``exclude``."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3)
        })

        ri = df.select_dtypes(include=np.number, exclude='floating')
        ei = df[['b', 'c', 'k']]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_include_exclude_mixed_scalars_lists(self):
        """``select_dtypes`` mixing scalar and list arguments."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.Categorical(list('abc')),
            'g': pd.date_range('20130101', periods=3),
            'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
            'i': pd.date_range('20130101', periods=3, tz='CET'),
            'j': pd.period_range('2013-01', periods=3, freq='M'),
            'k': pd.timedelta_range('1 day', periods=3)
        })

        ri = df.select_dtypes(include=np.number,
                              exclude=['floating', 'timedelta'])
        ei = df[['b', 'c']]
        assert_frame_equal(ri, ei)

        ri = df.select_dtypes(include=[np.number, 'category'],
                              exclude='floating')
        ei = df[['b', 'c', 'f', 'k']]
        assert_frame_equal(ri, ei)

    def test_select_dtypes_duplicate_columns(self):
        """``select_dtypes`` on a frame with duplicated column labels."""
        # GH20839
        odict = OrderedDict
        df = DataFrame(
            odict([('a', list('abc')),
                   ('b', list(range(1, 4))),
                   ('c', np.arange(3, 6).astype('u1')),
                   ('d', np.arange(4.0, 7.0, dtype='float64')),
                   ('e', [True, False, True]),
                   ('f', pd.date_range('now', periods=3).values)]))
        df.columns = ['a', 'a', 'b', 'b', 'b', 'c']

        expected = DataFrame({
            'a': list(range(1, 4)),
            'b': np.arange(3, 6).astype('u1')
        })

        result = df.select_dtypes(include=[np.number], exclude=['floating'])
        assert_frame_equal(result, expected)

    def test_select_dtypes_not_an_attr_but_still_valid_dtype(self):
        """Dtype strings that are not numpy attributes are still accepted."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values
        })
        df['g'] = df.f.diff()
        assert not hasattr(np, 'u8')
        r = df.select_dtypes(include=['i8', 'O'], exclude=['timedelta'])
        e = df[['a', 'b']]
        assert_frame_equal(r, e)

        r = df.select_dtypes(include=['i8', 'O', 'timedelta64[ns]'])
        e = df[['a', 'b', 'g']]
        assert_frame_equal(r, e)

    def test_select_dtypes_empty(self):
        """Calling ``select_dtypes`` with no arguments raises ValueError."""
        df = DataFrame({'a': list('abc'), 'b': list(range(1, 4))})
        msg = 'at least one of include or exclude must be nonempty'
        with pytest.raises(ValueError, match=msg):
            df.select_dtypes()

    def test_select_dtypes_bad_datetime64(self):
        """Unit-specific datetime64 dtypes are rejected as too specific."""
        df = DataFrame({
            'a': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values
        })
        with pytest.raises(ValueError, match='.+ is too specific'):
            df.select_dtypes(include=['datetime64[D]'])

        with pytest.raises(ValueError, match='.+ is too specific'):
            df.select_dtypes(exclude=['datetime64[as]'])

    def test_select_dtypes_datetime_with_tz(self):
        """Including ``datetime64[ns]`` selects no tz-aware columns."""
        df2 = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'),
                             B=Timestamp('20130603', tz='CET')),
                        index=range(5))
        df3 = pd.concat([df2.A.to_frame(), df2.B.to_frame()], axis=1)
        result = df3.select_dtypes(include=['datetime64[ns]'])
        expected = df3.reindex(columns=[])
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "dtype",
        [str, "str", np.string_, "S1", "unicode", np.unicode_, "U1"])
    @pytest.mark.parametrize("arg", ["include", "exclude"])
    def test_select_dtypes_str_raises(self, dtype, arg):
        """String dtypes raise TypeError for both include and exclude."""
        df = DataFrame({
            "a": list("abc"),
            "g": list("abc"),
            "b": list(range(1, 4)),
            "c": np.arange(3, 6).astype("u1"),
            "d": np.arange(4.0, 7.0, dtype="float64"),
            "e": [True, False, True],
            "f": pd.date_range("now", periods=3).values
        })
        msg = "string dtypes are not allowed"
        kwargs = {arg: [dtype]}

        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(**kwargs)

    def test_select_dtypes_bad_arg_raises(self):
        """An unparseable dtype string raises TypeError."""
        df = DataFrame({
            'a': list('abc'),
            'g': list('abc'),
            'b': list(range(1, 4)),
            'c': np.arange(3, 6).astype('u1'),
            'd': np.arange(4.0, 7.0, dtype='float64'),
            'e': [True, False, True],
            'f': pd.date_range('now', periods=3).values
        })

        msg = 'data type.*not understood'
        with pytest.raises(TypeError, match=msg):
            df.select_dtypes(['blargy, blarg, blarg'])

    def test_select_dtypes_typecodes(self):
        """Numpy float typecodes select every column of a float frame."""
        # GH 11990
        df = mkdf(30, 3, data_gen_f=lambda x, y: np.random.random())
        expected = df
        FLOAT_TYPES = list(np.typecodes['AllFloat'])
        assert_frame_equal(df.select_dtypes(FLOAT_TYPES), expected)

    def test_dtypes_gh8722(self):
        """``dtypes`` matches per-column dtypes, also under use_inf_as_na."""
        self.mixed_frame['bool'] = self.mixed_frame['A'] > 0
        result = self.mixed_frame.dtypes
        expected = Series({k: v.dtype
                           for k, v in self.mixed_frame.items()},
                          index=result.index)
        assert_series_equal(result, expected)

        # compat, GH 8722
        with option_context('use_inf_as_na', True):
            df = DataFrame([[1]])
            result = df.dtypes
            assert_series_equal(result, Series({0: np.dtype('int64')}))

    def test_ftypes(self):
        """``ftypes`` of the mixed-float fixture frame."""
        frame = self.mixed_float
        expected = Series(
            dict(A='float32:dense',
                 B='float32:dense',
                 C='float16:dense',
                 D='float64:dense')).sort_values()
        result = frame.ftypes.sort_values()
        assert_series_equal(result, expected)

    def test_astype(self):
        """``astype`` on whole frames, including mixed-dtype frames."""
        casted = self.frame.astype(int)
        expected = DataFrame(self.frame.values.astype(int),
                             index=self.frame.index,
                             columns=self.frame.columns)
        assert_frame_equal(casted, expected)

        casted = self.frame.astype(np.int32)
        expected = DataFrame(self.frame.values.astype(np.int32),
                             index=self.frame.index,
                             columns=self.frame.columns)
        assert_frame_equal(casted, expected)

        self.frame['foo'] = '5'
        casted = self.frame.astype(int)
        expected = DataFrame(self.frame.values.astype(int),
                             index=self.frame.index,
                             columns=self.frame.columns)
        assert_frame_equal(casted, expected)

        # mixed casting
        def _check_cast(df, v):
            assert (list({s.dtype.name for
                          _, s in df.items()})[0] == v)

        mn = self.all_mixed._get_numeric_data().copy()
        mn['little_float'] = np.array(12345., dtype='float16')
        mn['big_float'] = np.array(123456789101112., dtype='float64')

        casted = mn.astype('float64')
        _check_cast(casted, 'float64')

        casted = mn.astype('int64')
        _check_cast(casted, 'int64')

        casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float32')
        _check_cast(casted, 'float32')

        casted = mn.reindex(columns=['little_float']).astype('float16')
        _check_cast(casted, 'float16')

        casted = self.mixed_float.reindex(columns=['A', 'B']).astype('float16')
        _check_cast(casted, 'float16')

        casted = mn.astype('float32')
        _check_cast(casted, 'float32')

        casted = mn.astype('int32')
        _check_cast(casted, 'int32')

        # to object
        casted = mn.astype('O')
        _check_cast(casted, 'object')

    def test_astype_with_exclude_string(self):
        """``errors='ignore'`` leaves a non-castable string column as is."""
        df = self.frame.copy()
        expected = self.frame.astype(int)
        df['string'] = 'foo'
        casted = df.astype(int, errors='ignore')

        expected['string'] = 'foo'
        assert_frame_equal(casted, expected)

        df = self.frame.copy()
        expected = self.frame.astype(np.int32)
        df['string'] = 'foo'
        casted = df.astype(np.int32, errors='ignore')

        expected['string'] = 'foo'
        assert_frame_equal(casted, expected)

    def test_astype_with_view(self):
        """Smoke test: these casts only need to run without raising."""
        tf = self.mixed_float.reindex(columns=['A', 'B', 'C'])

        casted = tf.astype(np.int64)

        casted = tf.astype(np.float32)

        # this is the only real reason to do it this way
        tf = np.round(self.frame).astype(np.int32)
        casted = tf.astype(np.float32, copy=False)

        # TODO(wesm): verification?
        tf = self.frame.astype(np.float64)
        casted = tf.astype(np.int64, copy=False)  # noqa

    @pytest.mark.parametrize("dtype", [np.int32, np.int64])
    @pytest.mark.parametrize("val", [np.nan, np.inf])
    def test_astype_cast_nan_inf_int(self, val, dtype):
        """Casting NaN or inf to an integer dtype raises ValueError."""
        # see gh-14265
        #
        # Check NaN and inf --> raise error when converting to int.
        msg = "Cannot convert non-finite values \\(NA or inf\\) to integer"
        df = DataFrame([val])

        with pytest.raises(ValueError, match=msg):
            df.astype(dtype)

    def test_astype_str(self):
        """``astype(str)`` on datetime, tz, timedelta, int and float data."""
        # see gh-9757
        a = Series(date_range("2010-01-04", periods=5))
        b = Series(date_range("3/6/2012 00:00", periods=5, tz="US/Eastern"))
        c = Series([Timedelta(x, unit="d") for x in range(5)])
        d = Series(range(5))
        e = Series([0.0, 0.2, 0.4, 0.6, 0.8])

        df = DataFrame({"a": a, "b": b, "c": c, "d": d, "e": e})

        # Datetime-like
        result = df.astype(str)

        expected = DataFrame({
            "a": list(map(str,
                          map(lambda x: Timestamp(x)._date_repr, a._values))),
            "b": list(map(str, map(Timestamp, b._values))),
            "c": list(
                map(
                    str,
                    map(lambda x: Timedelta(x)._repr_base(format="all"),
                        c._values))),
            "d": list(map(str, d._values)),
            "e": list(map(str, e._values)),
        })

        assert_frame_equal(result, expected)

    def test_astype_str_float(self):
        """``astype(str)`` on NaN and on a long float literal."""
        # see gh-11302
        result = DataFrame([np.NaN]).astype(str)
        expected = DataFrame(["nan"])

        assert_frame_equal(result, expected)
        result = DataFrame([1.12345678901234567890]).astype(str)

        # < 1.14 truncates
        # >= 1.14 preserves the full repr
        val = ("1.12345678901" if _np_version_under1p14 else
               "1.1234567890123457")
        expected = DataFrame([val])
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize("dtype_class", [dict, Series])
    def test_astype_dict_like(self, dtype_class):
        """``astype`` with a dict or Series mapping column names to dtypes."""
        # GH7271 & GH16717
        a = Series(date_range('2010-01-04', periods=5))
        b = Series(range(5))
        c = Series([0.0, 0.2, 0.4, 0.6, 0.8])
        d = Series(['1.0', '2', '3.14', '4', '5.4'])
        df = DataFrame({'a': a, 'b': b, 'c': c, 'd': d})
        original = df.copy(deep=True)

        # change type of a subset of columns
        dt1 = dtype_class({'b': 'str', 'd': 'float32'})
        result = df.astype(dt1)
        expected = DataFrame({
            'a': a,
            'b': Series(['0', '1', '2', '3', '4']),
            'c': c,
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float32')
        })
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        dt2 = dtype_class({'b': np.float32, 'c': 'float32', 'd': np.float64})
        result = df.astype(dt2)
        expected = DataFrame({
            'a': a,
            'b': Series([0.0, 1.0, 2.0, 3.0, 4.0], dtype='float32'),
            'c': Series([0.0, 0.2, 0.4, 0.6, 0.8], dtype='float32'),
            'd': Series([1.0, 2.0, 3.14, 4.0, 5.4], dtype='float64')
        })
        assert_frame_equal(result, expected)
        assert_frame_equal(df, original)

        # change all columns
        dt3 = dtype_class({'a': str, 'b': str, 'c': str, 'd': str})
        assert_frame_equal(df.astype(dt3), df.astype(str))
        assert_frame_equal(df, original)

        # error should be raised when using something other than column labels
        # in the keys of the dtype dict
        dt4 = dtype_class({'b': str, 2: str})
        dt5 = dtype_class({'e': str})
        msg = ("Only a column name can be used for the key in a dtype mappings"
               " argument")
        with pytest.raises(KeyError, match=msg):
            df.astype(dt4)
        with pytest.raises(KeyError, match=msg):
            df.astype(dt5)
        assert_frame_equal(df, original)

        # if the dtypes provided are the same as the original dtypes, the
        # resulting DataFrame should be the same as the original DataFrame
        dt6 = dtype_class({col: df[col].dtype for col in df.columns})
        equiv = df.astype(dt6)
        assert_frame_equal(df, equiv)
        assert_frame_equal(df, original)

        # GH 16717
        # if dtypes provided is empty, the resulting DataFrame
        # should be the same as the original DataFrame
        dt7 = dtype_class({})
        result = df.astype(dt7)
        # Compare the frame produced by the empty mapping itself; checking
        # ``equiv`` here would only repeat the previous assertion.
        assert_frame_equal(df, result)
        assert_frame_equal(df, original)

    def test_astype_duplicate_col(self):
        """``astype`` on a frame with duplicated column labels."""
        a1 = Series([1, 2, 3, 4, 5], name='a')
        b = Series([0.1, 0.2, 0.4, 0.6, 0.8], name='b')
        a2 = Series([0, 1, 2, 3, 4], name='a')
        df = concat([a1, b, a2], axis=1)

        result = df.astype(str)
        a1_str = Series(['1', '2', '3', '4', '5'], dtype='str', name='a')
        b_str = Series(['0.1', '0.2', '0.4', '0.6', '0.8'], dtype=str,
                       name='b')
        a2_str = Series(['0', '1', '2', '3', '4'], dtype='str', name='a')
        expected = concat([a1_str, b_str, a2_str], axis=1)
        assert_frame_equal(result, expected)

        result = df.astype({'a': 'str'})
        expected = concat([a1_str, b, a2_str], axis=1)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize('dtype', [
        'category',
        CategoricalDtype(),
        CategoricalDtype(ordered=True),
        CategoricalDtype(ordered=False),
        CategoricalDtype(categories=list('abcdef')),
        CategoricalDtype(categories=list('edba'), ordered=False),
        CategoricalDtype(categories=list('edcb'), ordered=True)
    ], ids=repr)
    def test_astype_categorical(self, dtype):
        """``astype`` to categorical matches per-column ``Categorical``."""
        # GH 18099
        d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
        df = DataFrame(d)
        result = df.astype(dtype)
        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("cls", [
        pd.api.types.CategoricalDtype,
        pd.api.types.DatetimeTZDtype,
        pd.api.types.IntervalDtype
    ])
    def test_astype_categoricaldtype_class_raises(self, cls):
        """Passing a dtype class instead of an instance raises TypeError."""
        df = DataFrame({"A": ['a', 'a', 'b', 'c']})
        xpr = "Expected an instance of {}".format(cls.__name__)
        with pytest.raises(TypeError, match=xpr):
            df.astype({"A": cls})

        with pytest.raises(TypeError, match=xpr):
            df['A'].astype(cls)

    @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16'])
    def test_astype_extension_dtypes(self, dtype):
        """``astype`` to and from nullable integer dtypes, two columns."""
        # GH 22578
        df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b'])

        expected1 = pd.DataFrame({
            'a': integer_array([1, 3, 5], dtype=dtype),
            'b': integer_array([2, 4, 6], dtype=dtype)
        })
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)
        tm.assert_frame_equal(df.astype(dtype).astype('float64'), df)

        df = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], columns=['a', 'b'])
        df['b'] = df['b'].astype(dtype)
        expected2 = pd.DataFrame({
            'a': [1., 3., 5.],
            'b': integer_array([2, 4, 6], dtype=dtype)
        })
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ['Int64', 'Int32', 'Int16'])
    def test_astype_extension_dtypes_1d(self, dtype):
        """``astype`` to nullable integer dtypes on a single column."""
        # GH 22578
        df = pd.DataFrame({'a': [1., 2., 3.]})

        expected1 = pd.DataFrame({'a': integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)

        df = pd.DataFrame({'a': [1., 2., 3.]})
        df['a'] = df['a'].astype(dtype)
        expected2 = pd.DataFrame({'a': integer_array([1, 2, 3], dtype=dtype)})
        tm.assert_frame_equal(df, expected2)

        tm.assert_frame_equal(df.astype(dtype), expected1)
        tm.assert_frame_equal(df.astype('int64').astype(dtype), expected1)

    @pytest.mark.parametrize("dtype", ['category', 'Int64'])
    def test_astype_extension_dtypes_duplicate_col(self, dtype):
        """Extension-dtype ``astype`` with duplicated column labels."""
        # GH 24704
        a1 = Series([0, np.nan, 4], name='a')
        a2 = Series([np.nan, 3, 5], name='a')
        df = concat([a1, a2], axis=1)

        result = df.astype(dtype)
        expected = concat([a1.astype(dtype), a2.astype(dtype)], axis=1)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize('dtype', [{
        100: 'float64',
        200: 'uint64'
    }, 'category', 'float64'])
    def test_astype_column_metadata(self, dtype):
        """``astype`` leaves the columns index unchanged."""
        # GH 19920
        columns = pd.UInt64Index([100, 200, 300], name='foo')
        df = DataFrame(np.arange(15).reshape(5, 3), columns=columns)
        df = df.astype(dtype)
        tm.assert_index_equal(df.columns, columns)

    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_from_datetimelike_to_objectt(self, dtype, unit):
        """Datetime-like frames cast to object hold pandas scalar values."""
        # tests astype to object dtype
        # gh-19223 / gh-12425
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(object)
        assert (result.dtypes == object).all()

        if dtype.startswith('M8'):
            assert result.iloc[0, 0] == pd.to_datetime(1, unit=unit)
        else:
            assert result.iloc[0, 0] == pd.to_timedelta(1, unit=unit)

    @pytest.mark.parametrize("arr_dtype", [np.int64, np.float64])
    @pytest.mark.parametrize("dtype", ["M8", "m8"])
    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_datetimelike_unit(self, arr_dtype, dtype, unit):
        """Numeric frames cast to datetime-like dtypes of every unit."""
        # tests all units from numeric origination
        # gh-19223 / gh-12425
        dtype = "{}[{}]".format(dtype, unit)
        arr = np.array([[1, 2, 3]], dtype=arr_dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_datetime_unit(self, unit):
        """Datetime frames cast to the same datetime64 unit."""
        # tests all units from datetime origination
        # gh-19223
        dtype = "M8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['ns'])
    def test_astype_to_timedelta_unit_ns(self, unit):
        """Timedelta frames cast to ``m8[ns]`` stay timedelta."""
        # preserver the timedelta conversion
        # gh-19223
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(arr.astype(dtype))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_timedelta_unit(self, unit):
        """Timedelta frames cast to a non-ns unit are expected as float."""
        # coerce to float
        # gh-19223
        dtype = "m8[{}]".format(unit)
        arr = np.array([[1, 2, 3]], dtype=dtype)
        df = DataFrame(arr)
        result = df.astype(dtype)
        expected = DataFrame(df.values.astype(dtype).astype(float))

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("unit", ['ns', 'us', 'ms', 's', 'h', 'm', 'D'])
    def test_astype_to_incorrect_datetimelike(self, unit):
        """Casting between datetime64 and timedelta64 raises TypeError."""
        # trying to astype a m to a M, or vice-versa
        # gh-19224
        dtype = "M8[{}]".format(unit)
        other = "m8[{}]".format(unit)

        df = DataFrame(np.array([[1, 2, 3]], dtype=dtype))
        msg = (r"cannot astype a datetimelike from \[datetime64\[ns\]\] to"
               r" \[timedelta64\[{}\]\]").format(unit)
        with pytest.raises(TypeError, match=msg):
            df.astype(other)

        msg = (r"cannot astype a timedelta from \[timedelta64\[ns\]\] to"
               r" \[datetime64\[{}\]\]").format(unit)
        df = DataFrame(np.array([[1, 2, 3]], dtype=other))
        with pytest.raises(TypeError, match=msg):
            df.astype(dtype)

    def test_timedeltas(self):
        """``get_dtype_counts`` with datetime, timedelta and int columns."""
        df = DataFrame(
            dict(A=Series(date_range('2012-1-1', periods=3, freq='D')),
                 B=Series([timedelta(days=i) for i in range(3)])))
        result = df.get_dtype_counts().sort_index()
        expected = Series({
            'datetime64[ns]': 1,
            'timedelta64[ns]': 1
        }).sort_index()
        assert_series_equal(result, expected)

        df['C'] = df['A'] + df['B']
        expected = Series({
            'datetime64[ns]': 2,
            'timedelta64[ns]': 1
        }).sort_values()
        result = df.get_dtype_counts().sort_values()
        assert_series_equal(result, expected)

        # mixed int types
        df['D'] = 1
        expected = Series({
            'datetime64[ns]': 2,
            'timedelta64[ns]': 1,
            'int64': 1
        }).sort_values()
        result = df.get_dtype_counts().sort_values()
        assert_series_equal(result, expected)

    def test_arg_for_errors_in_astype(self):
        """A non-string ``errors`` raises; ``errors='ignore'`` is accepted."""
        # issue #14878
        df = DataFrame([1, 2, 3])

        with pytest.raises(ValueError):
            df.astype(np.float64, errors=True)

        df.astype(np.int8, errors='ignore')

    def test_arg_for_errors_in_astype_dictlist(self):
        """``errors='ignore'`` is honoured per column for dict dtypes."""
        # GH-25905
        df = pd.DataFrame([{
            'a': '1',
            'b': '16.5%',
            'c': 'test'
        }, {
            'a': '2.2',
            'b': '15.3',
            'c': 'another_test'
        }])
        expected = pd.DataFrame([{
            'a': 1.0,
            'b': '16.5%',
            'c': 'test'
        }, {
            'a': 2.2,
            'b': '15.3',
            'c': 'another_test'
        }])
        type_dict = {'a': 'float64', 'b': 'float64', 'c': 'object'}

        result = df.astype(dtype=type_dict, errors='ignore')

        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize('input_vals', [
        ([1, 2]),
        (['1', '2']),
        (list(pd.date_range('1/1/2011', periods=2, freq='H'))),
        (list(pd.date_range('1/1/2011', periods=2, freq='H',
                            tz='US/Eastern'))),
        ([pd.Interval(left=0, right=5)]),
    ])
    def test_constructor_list_str(self, input_vals, string_dtype):
        """Constructing with a string dtype matches a later ``astype``."""
        # GH 16605
        # Ensure that data elements are converted to strings when
        # dtype is str, 'str', or 'U'

        result = DataFrame({'A': input_vals}, dtype=string_dtype)
        expected = DataFrame({'A': input_vals}).astype({'A': string_dtype})
        assert_frame_equal(result, expected)

    def test_constructor_list_str_na(self, string_dtype):
        """``None`` stays ``None`` when constructing with a string dtype."""
        result = DataFrame({"A": [1.0, 2.0, None]}, dtype=string_dtype)
        expected = DataFrame({"A": ['1.0', '2.0', None]}, dtype=object)
        assert_frame_equal(result, expected)

    @pytest.mark.parametrize(
        "data, expected",
        [
            # empty
            (DataFrame(), True),
            # multi-same
            (DataFrame({
                "A": [1, 2],
                "B": [1, 2]
            }), True),
            # multi-object
            (DataFrame({
                "A": np.array([1, 2], dtype=object),
                "B": np.array(["a", "b"], dtype=object)
            }), True),
            # multi-extension
            (DataFrame({
                "A": pd.Categorical(['a', 'b']),
                "B": pd.Categorical(['a', 'b'])
            }), True),
            # differ types
            (DataFrame({
                "A": [1, 2],
                "B": [1., 2.]
            }), False),
            # differ sizes
            (DataFrame({
                "A": np.array([1, 2], dtype=np.int32),
                "B": np.array([1, 2], dtype=np.int64)
            }), False),
            # multi-extension differ
            (DataFrame({
                "A": pd.Categorical(['a', 'b']),
                "B": pd.Categorical(['b', 'c'])
            }), False),
        ])
    def test_is_homogeneous_type(self, data, expected):
        """``_is_homogeneous_type`` for each parametrized frame."""
        assert data._is_homogeneous_type is expected

    def test_asarray_homogenous(self):
        """``np.asarray`` of an all-categorical frame is an object array."""
        df = pd.DataFrame({
            "A": pd.Categorical([1, 2]),
            "B": pd.Categorical([1, 2])
        })
        result = np.asarray(df)
        # may change from object in the future
        expected = np.array([[1, 1], [2, 2]], dtype='object')
        tm.assert_numpy_array_equal(result, expected)
def test_unordered_same(self, ordered): c1 = CategoricalDtype(['a', 'b'], ordered=ordered) c2 = CategoricalDtype(['b', 'a'], ordered=ordered) assert hash(c1) == hash(c2)
def test_from_categorical_dtype_categories(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # override categories result = CategoricalDtype._from_categorical_dtype( c1, categories=[2, 3]) assert result == CategoricalDtype([2, 3], ordered=True)
def test_hash_vs_equality(self): dtype = self.dtype dtype2 = CategoricalDtype() assert dtype == dtype2 assert dtype2 == dtype assert hash(dtype) == hash(dtype2)
def test_same_categories_different_order(self): c1 = CategoricalDtype(['a', 'b'], ordered=True) c2 = CategoricalDtype(['b', 'a'], ordered=True) assert c1 is not c2
def test_non_unique_invalid(self):
    """Duplicate categories must be rejected with ``ValueError``."""
    duplicated = [1, 2, 1]
    with pytest.raises(ValueError):
        CategoricalDtype(duplicated)
def test_nan_invalid(self):
    """A NaN among the categories must be rejected with ``ValueError``."""
    with_nan = [1, 2, np.nan]
    with pytest.raises(ValueError):
        CategoricalDtype(with_nan)
def test_order_hashes_different(self, v1, v2): c1 = CategoricalDtype(v1, ordered=False) c2 = CategoricalDtype(v2, ordered=True) c3 = CategoricalDtype(v1, ordered=None) assert c1 is not c2 assert c1 is not c3
def test_equal_but_different(self, ordered): c1 = CategoricalDtype([1, 2, 3]) c2 = CategoricalDtype([1., 2., 3.]) assert c1 is not c2 assert c1 != c2
def test_categories(self):
    """Categories become an Index and ``ordered`` is left as ``None``."""
    labels = ['a', 'b', 'c']
    dtype = CategoricalDtype(labels)

    tm.assert_index_equal(dtype.categories, pd.Index(['a', 'b', 'c']))
    assert dtype.ordered is None
def test_from_values_or_dtype_raises(self, values, categories, ordered,
                                     dtype):
    """Combining ``dtype`` with ``categories``/``ordered`` must raise."""
    expected_message = (
        "Cannot specify `categories` or `ordered` together with `dtype`.")
    with pytest.raises(ValueError, match=expected_message):
        CategoricalDtype._from_values_or_dtype(
            values, categories, ordered, dtype)
def test_categorical_equality_strings(self, categories, ordered, other): c1 = CategoricalDtype(categories, ordered) result = c1 == other expected = other == 'category' assert result is expected
def test_update_dtype_errors(self, bad_dtype):
    """Updating with something that is not a CategoricalDtype must raise.

    Parameters
    ----------
    bad_dtype
        Supplied by the test parametrization/fixture; expected to be a value
        that ``update_dtype`` rejects.
    """
    dtype = CategoricalDtype(list('abc'), False)
    msg = 'a CategoricalDtype must be passed to perform an update, '
    # Use the public ``update_dtype`` and ``pytest.raises(match=...)`` so this
    # test agrees with the other update_dtype and raising tests in this file.
    with pytest.raises(ValueError, match=msg):
        dtype.update_dtype(bad_dtype)
def test_invalid_raises(self):
    """Bad ``ordered`` and non-list-like ``categories`` raise TypeError."""
    # Each entry pairs the expected message pattern with the failing call.
    cases = [
        ('ordered',
         lambda: CategoricalDtype(['a', 'b'], ordered='foo')),
        ("'categories' must be list-like",
         lambda: CategoricalDtype('category')),
    ]
    for pattern, build in cases:
        with pytest.raises(TypeError, match=pattern):
            build()
def test_construction_from_string(self):
    """``construct_from_string`` accepts 'category' and rejects 'foo'.

    The failure case uses the ``pytest.raises`` context manager and checks
    the error message, in line with the other versions of this test in
    the file.
    """
    result = CategoricalDtype.construct_from_string('category')
    assert is_dtype_equal(self.dtype, result)
    msg = "cannot construct a CategoricalDtype"
    with pytest.raises(TypeError, match=msg):
        CategoricalDtype.construct_from_string('foo')
def test_mixed(self): a = CategoricalDtype(['a', 'b', 1, 2]) b = CategoricalDtype(['a', 'b', '1', '2']) assert hash(a) != hash(b)
def test_set_dtype_nans(self): c = Categorical(["a", "b", np.nan]) result = c._set_dtype(CategoricalDtype(["a", "c"])) tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8"))
def test_from_categorical_dtype_identity(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # Identity test for no changes c2 = CategoricalDtype._from_categorical_dtype(c1) assert c2 is c1
@pytest.mark.parametrize( "dtype", [CategoricalDtype, IntervalDtype, DatetimeTZDtype, PeriodDtype]) def test_registry(dtype): assert dtype in registry.dtypes @pytest.mark.parametrize( "dtype, expected", [ ("int64", None), ("interval", IntervalDtype()), ("interval[int64]", IntervalDtype()), ("interval[datetime64[ns]]", IntervalDtype("datetime64[ns]")), ("period[D]", PeriodDtype("D")), ("category", CategoricalDtype()), ("datetime64[ns, US/Eastern]", DatetimeTZDtype("ns", "US/Eastern")), ], ) def test_registry_find(dtype, expected): assert registry.find(dtype) == expected @pytest.mark.parametrize( "dtype, expected", [ (str, False), (int, False), (bool, True), (np.bool, True), (np.array(["a", "b"]), False),
def test_from_categorical_dtype_categories(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # override categories result = CategoricalDtype._from_categorical_dtype(c1, categories=[2, 3]) assert result == CategoricalDtype([2, 3], ordered=True)
def test_from_values_or_dtype_raises(self, values, categories, ordered,
                                     dtype):
    """``dtype`` together with ``categories``/``ordered`` must raise."""
    pattern = "Cannot specify `categories` or `ordered` together with `dtype`."
    arguments = (values, categories, ordered, dtype)
    with pytest.raises(ValueError, match=pattern):
        CategoricalDtype._from_values_or_dtype(*arguments)
def test_from_categorical_dtype_ordered(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # override ordered result = CategoricalDtype._from_categorical_dtype(c1, ordered=False) assert result == CategoricalDtype([1, 2, 3], ordered=False)
def test_is_dtype(self, dtype): assert CategoricalDtype.is_dtype(dtype) assert CategoricalDtype.is_dtype("category") assert CategoricalDtype.is_dtype(CategoricalDtype()) assert not CategoricalDtype.is_dtype("foo") assert not CategoricalDtype.is_dtype(np.float64)
def test_str_vs_repr(self, ordered):
    """``str`` gives 'category'; ``repr`` spells out categories/ordered."""
    dtype = CategoricalDtype(['a', 'b'], ordered=ordered)
    assert str(dtype) == 'category'
    # Py2 will have unicode prefixes
    template = r"CategoricalDtype\(categories=\[.*\], ordered={ordered}\)"
    pattern = template.format(ordered=ordered)
    assert re.match(pattern, repr(dtype))
def test_construction_from_string(self):
    """'category' builds a dtype equal to ``self.dtype``; 'foo' raises."""
    built = CategoricalDtype.construct_from_string('category')
    assert is_dtype_equal(self.dtype, built)

    expected_message = "cannot construct a CategoricalDtype"
    with pytest.raises(TypeError, match=expected_message):
        CategoricalDtype.construct_from_string('foo')
def test_equality(self): assert is_dtype_equal(self.dtype, 'category') assert is_dtype_equal(self.dtype, CategoricalDtype()) assert not is_dtype_equal(self.dtype, 'foo')
def test_from_values_or_dtype( self, values, categories, ordered, dtype, expected): result = CategoricalDtype._from_values_or_dtype(values, categories, ordered, dtype) assert result == expected
def test_categorical_categories(self): # GH17884 c1 = CategoricalDtype(Categorical(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b']))
def test_update_dtype_errors(self, bad_dtype):
    """``update_dtype`` raises ``ValueError`` when given ``bad_dtype``."""
    base = CategoricalDtype(list('abc'), False)
    expected_message = (
        'a CategoricalDtype must be passed to perform an update, '
    )
    with pytest.raises(ValueError, match=expected_message):
        base.update_dtype(bad_dtype)
def test_update_dtype_errors(self, bad_dtype):
    """An unsuitable ``bad_dtype`` makes ``update_dtype`` raise."""
    pattern = 'a CategoricalDtype must be passed to perform an update, '
    unordered_abc = CategoricalDtype(list('abc'), False)
    with pytest.raises(ValueError, match=pattern):
        unordered_abc.update_dtype(bad_dtype)
def test_construction_from_string(self):
    """The string 'category' is accepted; the string 'foo' is not."""
    from_string = CategoricalDtype.construct_from_string
    assert is_dtype_equal(self.dtype, from_string('category'))

    pattern = "cannot construct a CategoricalDtype"
    with pytest.raises(TypeError, match=pattern):
        from_string('foo')
def test_constructor_invalid(self):
    """A plain string as ``categories`` raises ``TypeError``."""
    expected_message = "Parameter 'categories' must be list-like"
    not_list_like = "category"
    with pytest.raises(TypeError, match=expected_message):
        CategoricalDtype(not_list_like)
def test_from_categorical_dtype_identity(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # Identity test for no changes c2 = CategoricalDtype._from_categorical_dtype(c1) assert c2 is c1
assert not com.is_offsetlike(np.array([pd.DateOffset(), pd.Timestamp(0)])) @pytest.mark.parametrize('input_param,result', [ (int, np.dtype(int)), ('int32', np.dtype('int32')), (float, np.dtype(float)), ('float64', np.dtype('float64')), (np.dtype('float64'), np.dtype('float64')), (str, np.dtype(str)), (pd.Series([1, 2], dtype=np.dtype('int16')), np.dtype('int16')), (pd.Series(['a', 'b']), np.dtype(object)), (pd.Index([1, 2]), np.dtype('int64')), (pd.Index(['a', 'b']), np.dtype(object)), ('category', 'category'), (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])), (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])), (CategoricalDtype(), CategoricalDtype()), (CategoricalDtype(['a', 'b']), CategoricalDtype()), (pd.DatetimeIndex([1, 2]), np.dtype('=M8[ns]')), (pd.DatetimeIndex([1, 2]).dtype, np.dtype('=M8[ns]')), ('<M8[ns]', np.dtype('<M8[ns]')), ('datetime64[ns, Europe/London]', DatetimeTZDtype('ns', 'Europe/London')), (pd.SparseSeries([1, 2], dtype='int32'), SparseDtype('int32')), (pd.SparseSeries([1, 2], dtype='int32').dtype, SparseDtype('int32')), (PeriodDtype(freq='D'), PeriodDtype(freq='D')), ('period[D]', PeriodDtype(freq='D')), (IntervalDtype(), IntervalDtype()), ])
def test_from_categorical_dtype_both(self): c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) # override ordered result = CategoricalDtype._from_categorical_dtype( c1, categories=[1, 2], ordered=False) assert result == CategoricalDtype([1, 2], ordered=False)
def test_categorical_dtype(self): assert com.pandas_dtype('category') == CategoricalDtype()
def test_is_dtype(self): assert CategoricalDtype.is_dtype(self.dtype) assert CategoricalDtype.is_dtype('category') assert CategoricalDtype.is_dtype(CategoricalDtype()) assert not CategoricalDtype.is_dtype('foo') assert not CategoricalDtype.is_dtype(np.float64)
def test_basic(self, categories, ordered): c1 = CategoricalDtype(categories, ordered=ordered) tm.assert_index_equal(c1.categories, pd.Index(categories)) assert c1.ordered is ordered