def test_timedeltas(self):
    """Check dtype counts of a frame mixing datetime and timedelta columns."""
    stamps = Series(date_range('2012-1-1', periods=3, freq='D'))
    deltas = Series([timedelta(days=n) for n in range(3)])
    frame = DataFrame(dict(A=stamps, B=deltas))

    # one datetime column and one timedelta column to begin with
    counts = frame.get_dtype_counts().sort_values()
    wanted = Series({
        'datetime64[ns]': 1,
        'timedelta64[ns]': 1,
    }).sort_values()
    assert_series_equal(counts, wanted)

    # datetime + timedelta adds a second datetime column
    frame['C'] = frame['A'] + frame['B']
    wanted = Series({
        'datetime64[ns]': 2,
        'timedelta64[ns]': 1,
    }).sort_values()
    counts = frame.get_dtype_counts().sort_values()
    assert_series_equal(counts, wanted)

    # mixed int types
    frame['D'] = 1
    wanted = Series({
        'datetime64[ns]': 2,
        'timedelta64[ns]': 1,
        'int64': 1,
    }).sort_values()
    counts = frame.get_dtype_counts().sort_values()
    assert_series_equal(counts, wanted)
class GetDtypeCounts(object):
    """Benchmarks for ``get_dtype_counts`` and ``info`` on a wide frame."""
    # 2807

    def setup(self):
        """Create the 10 x 10000 random frame shared by the timings."""
        values = np.random.randn(10, 10000)
        self.df = DataFrame(values)

    def time_frame_get_dtype_counts(self):
        """Time a call to ``get_dtype_counts`` on the frame."""
        self.df.get_dtype_counts()

    def time_info(self):
        """Time a call to ``info`` on the frame."""
        self.df.info()
class GetDtypeCounts:
    """Benchmarks for ``get_dtype_counts`` and ``info`` on a wide frame."""
    # 2807

    def setup(self):
        """Create the 10 x 10000 random frame shared by the timings."""
        values = np.random.randn(10, 10000)
        self.df = DataFrame(values)

    def time_frame_get_dtype_counts(self):
        """Time ``get_dtype_counts`` while capturing any warnings it emits."""
        frame = self.df
        with warnings.catch_warnings(record=True):
            frame.get_dtype_counts()

    def time_info(self):
        """Time a call to ``info`` on the frame."""
        self.df.info()
def test_construction_with_mixed(self, float_string_frame):
    """Check dtype counts for frames built from mixed datetime-like data.

    Parameters
    ----------
    float_string_frame : DataFrame
        Fixture frame; this test adds a ``datetime`` and a ``timedelta``
        column to it and expects four float64 columns and one object
        column alongside them.
    """
    # test construction edge cases with mixed types

    # f7u12, this does not work without extensive workaround
    data = [
        [datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
        [datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 1)],
    ]
    df = DataFrame(data)

    # check dtypes
    result = df.get_dtype_counts().sort_values()
    expected = Series({'datetime64[ns]': 3})
    # actually compare the two; without this the dtype check asserts nothing
    assert_series_equal(result, expected)

    # mixed-type frames
    float_string_frame['datetime'] = datetime.now()
    float_string_frame['timedelta'] = timedelta(days=1, seconds=1)
    assert float_string_frame['datetime'].dtype == 'M8[ns]'
    assert float_string_frame['timedelta'].dtype == 'm8[ns]'

    result = float_string_frame.get_dtype_counts().sort_values()
    expected = Series({
        'float64': 4,
        'object': 1,
        'datetime64[ns]': 1,
        'timedelta64[ns]': 1,
    }).sort_values()
    assert_series_equal(result, expected)
def test_fillna_dtype_conversion(self):
    """Check values and dtypes produced by ``fillna`` in several cases.

    Covers an all-missing object frame filled with an int, an all-missing
    float64 frame filled with a string, and equivalence between
    ``fillna(v)`` and ``replace(np.nan, v)``.
    """
    # make sure that fillna on an empty frame works
    df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
    result = df.get_dtype_counts().sort_values()
    expected = Series({'object': 5})
    assert_series_equal(result, expected)

    result = df.fillna(1)
    expected = DataFrame(1, index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
    # verify the filled frame itself, not only its dtype counts
    assert_frame_equal(result, expected)

    result = result.get_dtype_counts().sort_values()
    expected = Series({'int64': 5})
    assert_series_equal(result, expected)

    # empty block
    df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
    result = df.fillna('nan')
    expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
    assert_frame_equal(result, expected)

    # equiv of replace
    df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
    for v in ['', 1, np.nan, 1.0]:
        expected = df.replace(np.nan, v)
        result = df.fillna(v)
        assert_frame_equal(result, expected)
def test_insert(self):
    """Exercise ``DataFrame.insert``: position, dtypes, duplicates, names."""
    frame = DataFrame(np.random.randn(5, 3),
                      index=np.arange(5),
                      columns=['c', 'b', 'a'])

    # insert at the front
    frame.insert(0, 'foo', frame['a'])
    tm.assert_index_equal(frame.columns, Index(['foo', 'c', 'b', 'a']))
    tm.assert_series_equal(frame['a'], frame['foo'], check_names=False)

    # insert in the middle
    frame.insert(2, 'bar', frame['c'])
    tm.assert_index_equal(frame.columns,
                          Index(['foo', 'c', 'bar', 'b', 'a']))
    tm.assert_almost_equal(frame['c'], frame['bar'], check_names=False)

    # diff dtype

    # new item
    frame['x'] = frame['a'].astype('float32')
    wanted = Series(dict(float32=1, float64=5))
    counts = frame.get_dtype_counts().sort_index()
    assert (counts == wanted).all()

    # replacing current (in different block)
    frame['a'] = frame['a'].astype('float32')
    wanted = Series(dict(float32=2, float64=4))
    counts = frame.get_dtype_counts().sort_index()
    assert (counts == wanted).all()

    frame['y'] = frame['a'].astype('int32')
    wanted = Series(dict(float32=2, float64=4, int32=1))
    counts = frame.get_dtype_counts().sort_index()
    assert (counts == wanted).all()

    # inserting a label that is already present must fail
    with pytest.raises(ValueError, match='already exists'):
        frame.insert(1, 'a', frame['b'])

    msg = "cannot insert c, already exists"
    with pytest.raises(ValueError, match=msg):
        frame.insert(1, 'c', frame['b'])

    frame.columns.name = 'some_name'
    # preserve columns name field
    frame.insert(0, 'baz', frame['c'])
    assert frame.columns.name == 'some_name'

    # GH 13522
    frame = DataFrame(index=['A', 'B', 'C'])
    frame['X'] = frame.index
    frame['X'] = ['x', 'y', 'z']
    wanted_frame = DataFrame(data={'X': ['x', 'y', 'z']},
                             index=['A', 'B', 'C'])
    assert_frame_equal(frame, wanted_frame)
def test_insert(self):
    """Exercise ``DataFrame.insert``: position, dtypes, duplicates, names."""
    frame = DataFrame(np.random.randn(5, 3),
                      index=np.arange(5),
                      columns=['c', 'b', 'a'])

    # insert at the front
    frame.insert(0, 'foo', frame['a'])
    self.assert_index_equal(frame.columns, Index(['foo', 'c', 'b', 'a']))
    tm.assert_series_equal(frame['a'], frame['foo'], check_names=False)

    # insert in the middle
    frame.insert(2, 'bar', frame['c'])
    self.assert_index_equal(frame.columns,
                            Index(['foo', 'c', 'bar', 'b', 'a']))
    tm.assert_almost_equal(frame['c'], frame['bar'], check_names=False)

    # diff dtype

    # new item
    frame['x'] = frame['a'].astype('float32')
    wanted = Series(dict(float64=5, float32=1))
    self.assertTrue((frame.get_dtype_counts() == wanted).all())

    # replacing current (in different block)
    frame['a'] = frame['a'].astype('float32')
    wanted = Series(dict(float64=4, float32=2))
    self.assertTrue((frame.get_dtype_counts() == wanted).all())

    frame['y'] = frame['a'].astype('int32')
    wanted = Series(dict(float64=4, float32=2, int32=1))
    self.assertTrue((frame.get_dtype_counts() == wanted).all())

    # inserting a label that is already present must fail
    with assertRaisesRegexp(ValueError, 'already exists'):
        frame.insert(1, 'a', frame['b'])
    self.assertRaises(ValueError, frame.insert, 1, 'c', frame['b'])

    frame.columns.name = 'some_name'
    # preserve columns name field
    frame.insert(0, 'baz', frame['c'])
    self.assertEqual(frame.columns.name, 'some_name')

    # GH 13522
    frame = DataFrame(index=['A', 'B', 'C'])
    frame['X'] = frame.index
    frame['X'] = ['x', 'y', 'z']
    wanted_frame = DataFrame(data={'X': ['x', 'y', 'z']},
                             index=['A', 'B', 'C'])
    assert_frame_equal(frame, wanted_frame)
def test_unstack_dtypes(self):
    """Check the column dtypes that result from ``unstack``."""
    # GH 2929
    rows = [[1, 1, 3, 4],
            [1, 2, 3, 4],
            [2, 1, 3, 4],
            [2, 2, 3, 4]]
    base = DataFrame(rows, columns=list('ABCD'))
    counts = base.get_dtype_counts()
    wanted = Series({'int64': 4})
    assert_series_equal(counts, wanted)

    # single dtype
    indexed = base.set_index(['A', 'B'])
    wide = indexed.unstack('B')
    counts = wide.get_dtype_counts()
    wanted = Series({'int64': 4})
    assert_series_equal(counts, wanted)

    # mixed
    indexed = base.set_index(['A', 'B'])
    indexed['C'] = 3.
    wide = indexed.unstack('B')
    counts = wide.get_dtype_counts()
    wanted = Series({'int64': 2, 'float64': 2})
    assert_series_equal(counts, wanted)

    indexed['D'] = 'foo'
    wide = indexed.unstack('B')
    counts = wide.get_dtype_counts()
    wanted = Series({'float64': 2, 'object': 2})
    assert_series_equal(counts, wanted)

    # GH7405
    float_pairs = (
        (np.zeros(5), np.zeros(5)),
        (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')),
    )
    for c, d in float_pairs:
        frame = DataFrame({
            'A': ['a'] * 5,
            'C': c,
            'D': d,
            'B': pd.date_range('2012-01-01', periods=5),
        })

        # keep an untouched copy of the first three rows for the right side
        right = frame.iloc[:3].copy(deep=True)

        # left: cast 'D' to int64 before unstacking
        frame = frame.set_index(['A', 'B'])
        frame['D'] = frame['D'].astype('int64')
        left = frame.iloc[:3].unstack(0)

        # right: unstack first, then cast the unstacked 'D' column
        right = right.set_index(['A', 'B']).unstack(0)
        right[('D', 'a')] = right[('D', 'a')].astype('int64')

        assert left.shape == (3, 2)
        tm.assert_frame_equal(left, right)
def test_unstack_dtypes(self):
    """Check the column dtypes that result from ``unstack``."""
    # GH 2929
    rows = [[1, 1, 3, 4],
            [1, 2, 3, 4],
            [2, 1, 3, 4],
            [2, 2, 3, 4]]
    source = DataFrame(rows, columns=list('ABCD'))
    observed = source.get_dtype_counts()
    target = Series({'int64': 4})
    assert_series_equal(observed, target)

    # single dtype
    keyed = source.set_index(['A', 'B'])
    unstacked = keyed.unstack('B')
    observed = unstacked.get_dtype_counts()
    target = Series({'int64': 4})
    assert_series_equal(observed, target)

    # mixed
    keyed = source.set_index(['A', 'B'])
    keyed['C'] = 3.
    unstacked = keyed.unstack('B')
    observed = unstacked.get_dtype_counts()
    target = Series({'int64': 2, 'float64': 2})
    assert_series_equal(observed, target)

    keyed['D'] = 'foo'
    unstacked = keyed.unstack('B')
    observed = unstacked.get_dtype_counts()
    target = Series({'float64': 2, 'object': 2})
    assert_series_equal(observed, target)

    # GH7405
    cases = [
        (np.zeros(5), np.zeros(5)),
        (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')),
    ]
    for c, d in cases:
        data = DataFrame({'A': ['a'] * 5,
                          'C': c,
                          'D': d,
                          'B': pd.date_range('2012-01-01', periods=5)})

        # untouched copy of the first three rows, used for the right side
        right = data.iloc[:3].copy(deep=True)

        # left side casts 'D' to int64 before unstacking
        data = data.set_index(['A', 'B'])
        data['D'] = data['D'].astype('int64')
        left = data.iloc[:3].unstack(0)

        # right side unstacks first and casts the unstacked column after
        right = right.set_index(['A', 'B']).unstack(0)
        right[('D', 'a')] = right[('D', 'a')].astype('int64')

        assert left.shape == (3, 2)
        tm.assert_frame_equal(left, right)
def test_apply_with_mixed_dtype():
    """Check dtype handling of row-wise ``apply`` and ``as_index=False``."""
    # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
    labels = ['one', 'two', 'two', 'three', 'one', 'two']
    mixed = DataFrame({'foo1': np.random.randn(6), 'foo2': labels})
    applied = mixed.apply(lambda row: row, axis=1)
    tm.assert_series_equal(mixed.get_dtype_counts(),
                           applied.get_dtype_counts())

    # GH 3610 incorrect dtype conversion with as_index=False
    frame = DataFrame({"c1": [1, 2, 6, 6, 8]})
    frame["c2"] = frame.c1 / 2.0
    via_reset = frame.groupby("c2").mean().reset_index().c2
    via_flag = frame.groupby("c2", as_index=False).mean().c2
    tm.assert_series_equal(via_reset, via_flag)
def test_timedeltas(self):
    """Check dtype counts of a frame mixing datetime and timedelta columns."""
    dates = Series(date_range('2012-1-1', periods=3, freq='D'))
    offsets = Series([timedelta(days=k) for k in range(3)])
    data = DataFrame(dict(A=dates, B=offsets))

    # initially one column of each datetime-like dtype
    observed = data.get_dtype_counts().sort_values()
    target = Series({'datetime64[ns]': 1,
                     'timedelta64[ns]': 1}).sort_values()
    assert_series_equal(observed, target)

    # adding the two columns produces a second datetime column
    data['C'] = data['A'] + data['B']
    target = Series({'datetime64[ns]': 2,
                     'timedelta64[ns]': 1}).sort_values()
    observed = data.get_dtype_counts().sort_values()
    assert_series_equal(observed, target)

    # mixed int types
    data['D'] = 1
    target = Series({'datetime64[ns]': 2,
                     'timedelta64[ns]': 1,
                     'int64': 1}).sort_values()
    observed = data.get_dtype_counts().sort_values()
    assert_series_equal(observed, target)
def test_insert(self):
    """Exercise ``DataFrame.insert``: position, dtypes, duplicates, names."""
    data = DataFrame(np.random.randn(5, 3),
                     index=np.arange(5),
                     columns=['c', 'b', 'a'])

    # insert at the front
    data.insert(0, 'foo', data['a'])
    self.assert_numpy_array_equal(data.columns, ['foo', 'c', 'b', 'a'])
    assert_almost_equal(data['a'], data['foo'])

    # insert in the middle
    data.insert(2, 'bar', data['c'])
    self.assert_numpy_array_equal(data.columns,
                                  ['foo', 'c', 'bar', 'b', 'a'])
    assert_almost_equal(data['c'], data['bar'])

    # diff dtype

    # new item
    data['x'] = data['a'].astype('float32')
    target = Series(dict(float64=5, float32=1))
    self.assertTrue((data.get_dtype_counts() == target).all())

    # replacing current (in different block)
    data['a'] = data['a'].astype('float32')
    target = Series(dict(float64=4, float32=2))
    self.assertTrue((data.get_dtype_counts() == target).all())

    data['y'] = data['a'].astype('int32')
    target = Series(dict(float64=4, float32=2, int32=1))
    self.assertTrue((data.get_dtype_counts() == target).all())

    # inserting a label that is already present must fail
    with assertRaisesRegexp(ValueError, 'already exists'):
        data.insert(1, 'a', data['b'])
    self.assertRaises(ValueError, data.insert, 1, 'c', data['b'])

    data.columns.name = 'some_name'
    # preserve columns name field
    data.insert(0, 'baz', data['c'])
    self.assertEqual(data.columns.name, 'some_name')
def test_frame_no_datetime64_dtype(self):
    """Check dtypes of frames built from naive and tz-localized dates.

    Expects a tz-localized index stored as a column to compare equal to
    ``'M8[ns]'``, and a frame built from the naive/localized ranges and
    their ``datetime`` lists to hold three datetime64[ns] columns and one
    object column.
    """
    dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
    dr_tz = dr.tz_localize('US/Eastern')
    e = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr)
    # assertEqual instead of the deprecated ``assert_`` alias (removed in
    # Python 3.12); it performs the same ``==`` comparison
    self.assertEqual(e['B'].dtype, 'M8[ns]')

    # GH 2810 (with timezones)
    datetimes_naive = [ts.to_pydatetime() for ts in dr]
    datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
    df = DataFrame({'dr': dr,
                    'dr_tz': dr_tz,
                    'datetimes_naive': datetimes_naive,
                    'datetimes_with_tz': datetimes_with_tz})
    result = df.get_dtype_counts()
    expected = Series({'datetime64[ns]': 3, 'object': 1})
    assert_series_equal(result, expected)
def test_frame_no_datetime64_dtype(self):
    """Check dtypes of frames built from naive and tz-localized dates.

    Expects a tz-localized index stored as a column to compare equal to
    ``"M8[ns]"``, and a frame built from the naive/localized ranges and
    their ``datetime`` lists to hold three datetime64[ns] columns and one
    object column.
    """
    dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
    dr_tz = dr.tz_localize("US/Eastern")
    e = DataFrame({"A": "foo", "B": dr_tz}, index=dr)
    # assertEqual instead of the deprecated ``assert_`` alias (removed in
    # Python 3.12); it performs the same ``==`` comparison
    self.assertEqual(e["B"].dtype, "M8[ns]")

    # GH 2810 (with timezones)
    datetimes_naive = [ts.to_pydatetime() for ts in dr]
    datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
    df = DataFrame(
        {
            "dr": dr,
            "dr_tz": dr_tz,
            "datetimes_naive": datetimes_naive,
            "datetimes_with_tz": datetimes_with_tz,
        }
    )
    result = df.get_dtype_counts()
    expected = Series({"datetime64[ns]": 3, "object": 1})
    assert_series_equal(result, expected)
def test_unstack_dtypes(self):
    """Check the column dtypes that result from ``unstack``."""
    # GH 2929
    rows = [
        [1, 1, 3, 4],
        [1, 2, 3, 4],
        [2, 1, 3, 4],
        [2, 2, 3, 4],
    ]
    original = DataFrame(rows, columns=list("ABCD"))
    got = original.get_dtype_counts()
    want = Series({"int64": 4})
    assert_series_equal(got, want)

    # single dtype
    by_ab = original.set_index(["A", "B"])
    spread = by_ab.unstack("B")
    got = spread.get_dtype_counts()
    want = Series({"int64": 4})
    assert_series_equal(got, want)

    # mixed
    by_ab = original.set_index(["A", "B"])
    by_ab["C"] = 3.0
    spread = by_ab.unstack("B")
    got = spread.get_dtype_counts()
    want = Series({"int64": 2, "float64": 2})
    assert_series_equal(got, want)

    by_ab["D"] = "foo"
    spread = by_ab.unstack("B")
    got = spread.get_dtype_counts()
    want = Series({"float64": 2, "object": 2})
    assert_series_equal(got, want)

    # GH7405
    column_pairs = (
        (np.zeros(5), np.zeros(5)),
        (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")),
    )
    for c, d in column_pairs:
        table = DataFrame(
            {
                "A": ["a"] * 5,
                "C": c,
                "D": d,
                "B": pd.date_range("2012-01-01", periods=5),
            }
        )

        # untouched copy of the first three rows, used for the right side
        right = table.iloc[:3].copy(deep=True)

        # left side casts "D" to int64 before unstacking
        table = table.set_index(["A", "B"])
        table["D"] = table["D"].astype("int64")
        left = table.iloc[:3].unstack(0)

        # right side unstacks first and casts the unstacked column after
        right = right.set_index(["A", "B"]).unstack(0)
        right[("D", "a")] = right[("D", "a")].astype("int64")

        self.assertEqual(left.shape, (3, 2))
        assert_frame_equal(left, right)
def test_get_numeric_data(self):
    """Check dtype counts and ``_get_numeric_data`` on mixed-dtype frames.

    ``_get_numeric_data`` is expected to keep the float and int columns
    and drop the string and timestamp columns, to return an empty
    selection when no numeric columns exist, and to return an equal frame
    when every column is numeric.
    """
    datetime64name = np.dtype('M8[ns]').name
    objectname = np.dtype(np.object_).name

    df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                    'f': Timestamp('20010102')},
                   index=np.arange(10))
    result = df.get_dtype_counts()
    expected = Series({'int64': 1, 'float64': 1,
                       datetime64name: 1, objectname: 1})
    # sort_index returns a new Series, so the sorted result must be kept
    # for the comparison to be order-independent
    result = result.sort_index()
    expected = expected.sort_index()
    assert_series_equal(result, expected)

    df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                    'd': np.array([1.] * 10, dtype='float32'),
                    'e': np.array([1] * 10, dtype='int32'),
                    'f': np.array([1] * 10, dtype='int16'),
                    'g': Timestamp('20010102')},
                   index=np.arange(10))

    result = df._get_numeric_data()
    expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
    assert_frame_equal(result, expected)

    # no numeric columns at all -> empty column selection
    only_obj = df.loc[:, ['c', 'g']]
    result = only_obj._get_numeric_data()
    expected = df.loc[:, []]
    assert_frame_equal(result, expected)

    df = DataFrame.from_dict(
        {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
    result = df._get_numeric_data()
    expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
    assert_frame_equal(result, expected)

    # an all-numeric frame comes back equal to itself
    df = result.copy()
    result = df._get_numeric_data()
    expected = df
    assert_frame_equal(result, expected)
def test_frame_no_datetime64_dtype(self):
    """Check dtypes of frames built from naive and tz-localized dates."""
    # after 7822
    # these retain the timezones on dict construction
    naive = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
    localized = naive.tz_localize(self.tzstr("US/Eastern"))
    frame = DataFrame({"A": "foo", "B": localized}, index=naive)
    self.assertEqual(frame["B"].dtype, "O")

    # GH 2810 (with timezones)
    datetimes_naive = [stamp.to_pydatetime() for stamp in naive]
    datetimes_with_tz = [stamp.to_pydatetime() for stamp in localized]
    columns = {
        "dr": naive,
        "dr_tz": localized,
        "datetimes_naive": datetimes_naive,
        "datetimes_with_tz": datetimes_with_tz,
    }
    frame = DataFrame(columns)

    counts = frame.get_dtype_counts()
    wanted = Series({"datetime64[ns]": 2, "object": 2})
    tm.assert_series_equal(counts, wanted)
def test_frame_no_datetime64_dtype(self):
    """Check dtypes of frames built from naive and tz-localized dates."""
    # after 7822
    # these retain the timezones on dict construction
    plain = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
    aware = plain.tz_localize(self.tzstr('US/Eastern'))
    data = DataFrame({'A': 'foo', 'B': aware}, index=plain)
    self.assertEqual(data['B'].dtype, 'O')

    # GH 2810 (with timezones)
    datetimes_naive = [t.to_pydatetime() for t in plain]
    datetimes_with_tz = [t.to_pydatetime() for t in aware]
    data = DataFrame({'dr': plain,
                      'dr_tz': aware,
                      'datetimes_naive': datetimes_naive,
                      'datetimes_with_tz': datetimes_with_tz})

    observed = data.get_dtype_counts()
    target = Series({'datetime64[ns]': 2, 'object': 2})
    tm.assert_series_equal(observed, target)
def test_frame_no_datetime64_dtype(self, tz):
    """Check that tz-aware data keeps a tz-aware dtype in a frame.

    Parameters
    ----------
    tz
        Timezone passed to ``tz_localize``.
    """
    # after GH#7822
    # these retain the timezones on dict construction
    naive = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
    aware = naive.tz_localize(tz)
    frame = DataFrame({'A': 'foo', 'B': aware}, index=naive)
    tz_expected = DatetimeTZDtype('ns', aware.tzinfo)
    assert frame['B'].dtype == tz_expected

    # GH#2810 (with timezones)
    datetimes_naive = [stamp.to_pydatetime() for stamp in naive]
    datetimes_with_tz = [stamp.to_pydatetime() for stamp in aware]
    frame = DataFrame({
        'dr': naive,
        'dr_tz': aware,
        'datetimes_naive': datetimes_naive,
        'datetimes_with_tz': datetimes_with_tz,
    })

    counts = frame.get_dtype_counts().sort_index()
    wanted = Series({
        'datetime64[ns]': 2,
        str(tz_expected): 2,
    }).sort_index()
    tm.assert_series_equal(counts, wanted)
def test_construction_with_mixed(self):
    """Check dtype counts for frames built from mixed datetime-like data.

    Adds a ``datetime`` and a ``timedelta`` column to ``self.mixed_frame``
    and expects four float64 columns and one object column alongside them.
    """
    # test construction edge cases with mixed types

    # f7u12, this does not work without extensive workaround
    data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
            [datetime(2000, 1, 2), datetime(2000, 1, 3),
             datetime(2000, 1, 1)]]
    df = DataFrame(data)

    # check dtypes
    result = df.get_dtype_counts().sort_values()
    expected = Series({'datetime64[ns]': 3})
    # actually compare the two; without this the dtype check asserts nothing
    assert_series_equal(result, expected)

    # mixed-type frames
    self.mixed_frame['datetime'] = datetime.now()
    self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
    assert self.mixed_frame['datetime'].dtype == 'M8[ns]'
    assert self.mixed_frame['timedelta'].dtype == 'm8[ns]'

    result = self.mixed_frame.get_dtype_counts().sort_values()
    expected = Series({'float64': 4,
                       'object': 1,
                       'datetime64[ns]': 1,
                       'timedelta64[ns]': 1}).sort_values()
    assert_series_equal(result, expected)
def test_deprecated_get_dtype_counts(self):
    """Check that ``get_dtype_counts`` emits a ``FutureWarning``."""
    # GH 18262
    frame = DataFrame([1])
    with tm.assert_produces_warning(FutureWarning):
        frame.get_dtype_counts()