Example #1
0
class GetDtypeCounts(object):
    """Benchmarks for dtype-summary calls on a wide frame.

    The original source tags this class with the number 2807.
    """

    def setup(self):
        # 10 rows by 10000 columns of random normal floats.
        values = np.random.randn(10, 10000)
        self.df = DataFrame(values)

    def time_frame_get_dtype_counts(self):
        frame = self.df
        frame.get_dtype_counts()

    def time_info(self):
        frame = self.df
        frame.info()
Example #2
0
    def test_fillna_dtype_conversion(self):
        """Check the dtypes produced by ``fillna`` on empty and mixed frames."""
        # make sure that fillna on an empty frame works
        df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
        result = df.get_dtype_counts().sort_values()
        expected = Series({'object': 5})
        assert_series_equal(result, expected)

        # filling with an integer: all five columns are expected to be int64
        result = df.fillna(1).get_dtype_counts().sort_values()
        expected = Series({'int64': 5})
        assert_series_equal(result, expected)

        # empty block
        df = DataFrame(index=lrange(3), columns=['A', 'B'], dtype='float64')
        result = df.fillna('nan')
        expected = DataFrame('nan', index=lrange(3), columns=['A', 'B'])
        assert_frame_equal(result, expected)

        # equiv of replace
        df = DataFrame(dict(A=[1, np.nan], B=[1., 2.]))
        for v in ['', 1, np.nan, 1.0]:
            expected = df.replace(np.nan, v)
            result = df.fillna(v)
            assert_frame_equal(result, expected)
Example #3
0
    def test_insert(self):
        """Exercise ``DataFrame.insert``: placement, dtypes, duplicates, names."""
        frame = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                          columns=['c', 'b', 'a'])

        # insert at the front
        frame.insert(0, 'foo', frame['a'])
        expected_cols = Index(['foo', 'c', 'b', 'a'])
        tm.assert_index_equal(frame.columns, expected_cols)
        tm.assert_series_equal(frame['a'], frame['foo'], check_names=False)

        # insert in the middle
        frame.insert(2, 'bar', frame['c'])
        expected_cols = Index(['foo', 'c', 'bar', 'b', 'a'])
        tm.assert_index_equal(frame.columns, expected_cols)
        tm.assert_almost_equal(frame['c'], frame['bar'], check_names=False)

        # diff dtype

        # new item
        frame['x'] = frame['a'].astype('float32')
        expected_counts = Series({'float32': 1, 'float64': 5})
        counts = frame.get_dtype_counts().sort_index()
        assert (counts == expected_counts).all()

        # replacing current (in different block)
        frame['a'] = frame['a'].astype('float32')
        expected_counts = Series({'float32': 2, 'float64': 4})
        counts = frame.get_dtype_counts().sort_index()
        assert (counts == expected_counts).all()

        frame['y'] = frame['a'].astype('int32')
        expected_counts = Series({'float32': 2, 'float64': 4, 'int32': 1})
        counts = frame.get_dtype_counts().sort_index()
        assert (counts == expected_counts).all()

        # inserting a label that is already present must raise
        with pytest.raises(ValueError, match='already exists'):
            frame.insert(1, 'a', frame['b'])
        msg = "cannot insert c, already exists"
        with pytest.raises(ValueError, match=msg):
            frame.insert(1, 'c', frame['b'])

        # preserve columns name field
        frame.columns.name = 'some_name'
        frame.insert(0, 'baz', frame['c'])
        assert frame.columns.name == 'some_name'

        # GH 13522
        frame = DataFrame(index=['A', 'B', 'C'])
        frame['X'] = frame.index
        frame['X'] = ['x', 'y', 'z']
        exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
        assert_frame_equal(frame, exp)
Example #4
0
    def test_insert(self):
        """Exercise ``DataFrame.insert``: placement, dtypes, duplicates, names."""
        frame = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                          columns=['c', 'b', 'a'])

        # insert at the front
        frame.insert(0, 'foo', frame['a'])
        expected_cols = Index(['foo', 'c', 'b', 'a'])
        self.assert_index_equal(frame.columns, expected_cols)
        tm.assert_series_equal(frame['a'], frame['foo'], check_names=False)

        # insert in the middle
        frame.insert(2, 'bar', frame['c'])
        expected_cols = Index(['foo', 'c', 'bar', 'b', 'a'])
        self.assert_index_equal(frame.columns, expected_cols)
        tm.assert_almost_equal(frame['c'], frame['bar'], check_names=False)

        # diff dtype

        # new item
        frame['x'] = frame['a'].astype('float32')
        expected_counts = Series({'float64': 5, 'float32': 1})
        matches = frame.get_dtype_counts() == expected_counts
        self.assertTrue(matches.all())

        # replacing current (in different block)
        frame['a'] = frame['a'].astype('float32')
        expected_counts = Series({'float64': 4, 'float32': 2})
        matches = frame.get_dtype_counts() == expected_counts
        self.assertTrue(matches.all())

        frame['y'] = frame['a'].astype('int32')
        expected_counts = Series({'float64': 4, 'float32': 2, 'int32': 1})
        matches = frame.get_dtype_counts() == expected_counts
        self.assertTrue(matches.all())

        # inserting a label that is already present must raise
        with assertRaisesRegexp(ValueError, 'already exists'):
            frame.insert(1, 'a', frame['b'])
        self.assertRaises(ValueError, frame.insert, 1, 'c', frame['b'])

        # preserve columns name field
        frame.columns.name = 'some_name'
        frame.insert(0, 'baz', frame['c'])
        self.assertEqual(frame.columns.name, 'some_name')

        # GH 13522
        frame = DataFrame(index=['A', 'B', 'C'])
        frame['X'] = frame.index
        frame['X'] = ['x', 'y', 'z']
        exp = DataFrame(data={'X': ['x', 'y', 'z']}, index=['A', 'B', 'C'])
        assert_frame_equal(frame, exp)
Example #5
0
    def test_unstack_dtypes(self):
        """Check the dtype counts of frames produced by ``unstack``."""

        # GH 2929
        rows = [[1, 1, 3, 4],
                [1, 2, 3, 4],
                [2, 1, 3, 4],
                [2, 2, 3, 4]]

        frame = DataFrame(rows, columns=list('ABCD'))
        expected = Series({'int64': 4})
        assert_series_equal(frame.get_dtype_counts(), expected)

        # single dtype
        indexed = frame.set_index(['A', 'B'])
        unstacked = indexed.unstack('B')
        expected = Series({'int64': 4})
        assert_series_equal(unstacked.get_dtype_counts(), expected)

        # mixed
        indexed = frame.set_index(['A', 'B'])
        indexed['C'] = 3.
        unstacked = indexed.unstack('B')
        expected = Series({'int64': 2, 'float64': 2})
        assert_series_equal(unstacked.get_dtype_counts(), expected)

        indexed['D'] = 'foo'
        unstacked = indexed.unstack('B')
        expected = Series({'float64': 2, 'object': 2})
        assert_series_equal(unstacked.get_dtype_counts(), expected)

        # GH7405
        cases = (
            (np.zeros(5), np.zeros(5)),
            (np.arange(5, dtype='f8'), np.arange(5, 10, dtype='f8')),
        )
        for c, d in cases:
            frame = DataFrame({'A': ['a'] * 5, 'C': c, 'D': d,
                               'B': pd.date_range('2012-01-01', periods=5)})

            # copy the first three rows before the frame is re-indexed and cast
            right = frame.iloc[:3].copy(deep=True)

            frame = frame.set_index(['A', 'B'])
            frame['D'] = frame['D'].astype('int64')

            left = frame.iloc[:3].unstack(0)
            right = right.set_index(['A', 'B']).unstack(0)
            right[('D', 'a')] = right[('D', 'a')].astype('int64')

            assert left.shape == (3, 2)
            tm.assert_frame_equal(left, right)
Example #6
0
    def test_timedeltas(self):
        """Check dtype counts for frames mixing datetime, timedelta and int columns."""
        dates = Series(date_range('2012-1-1', periods=3, freq='D'))
        deltas = Series([timedelta(days=i) for i in range(3)])
        frame = DataFrame(dict(A=dates, B=deltas))

        counts = frame.get_dtype_counts().sort_values()
        wanted = Series(
            {'datetime64[ns]': 1, 'timedelta64[ns]': 1}).sort_values()
        assert_series_equal(counts, wanted)

        # datetime + timedelta column
        frame['C'] = frame['A'] + frame['B']
        wanted = Series(
            {'datetime64[ns]': 2, 'timedelta64[ns]': 1}).sort_values()
        counts = frame.get_dtype_counts().sort_values()
        assert_series_equal(counts, wanted)

        # mixed int types
        frame['D'] = 1
        wanted = Series({'datetime64[ns]': 2,
                         'timedelta64[ns]': 1,
                         'int64': 1}).sort_values()
        counts = frame.get_dtype_counts().sort_values()
        assert_series_equal(counts, wanted)
Example #7
0
def test_apply_with_mixed_dtype():
    """Check dtype handling for row-wise ``apply`` and ``groupby`` resets."""
    # GH3480, apply with mixed dtype on axis=1 breaks in 0.11
    frame = DataFrame({'foo1': np.random.randn(6),
                       'foo2': ['one', 'two', 'two', 'three', 'one', 'two']})
    applied = frame.apply(lambda x: x, axis=1)
    tm.assert_series_equal(frame.get_dtype_counts(),
                           applied.get_dtype_counts())

    # GH 3610 incorrect dtype conversion with as_index=False
    frame = DataFrame({"c1": [1, 2, 6, 6, 8]})
    frame["c2"] = frame.c1 / 2.0
    via_reset = frame.groupby("c2").mean().reset_index().c2
    via_flag = frame.groupby("c2", as_index=False).mean().c2
    tm.assert_series_equal(via_reset, via_flag)
Example #8
0
    def test_frame_no_datetime64_dtype(self):
        """Check column dtypes when a frame is built from tz-aware date ranges."""

        dr = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
        dr_tz = dr.tz_localize('US/Eastern')
        e = DataFrame({'A': 'foo', 'B': dr_tz}, index=dr)
        # ``assert_`` is a deprecated unittest alias; assertEqual performs the
        # same ``==`` comparison and reports both operands on failure.
        self.assertEqual(e['B'].dtype, 'M8[ns]')

        # GH 2810 (with timezones)
        datetimes_naive = [ts.to_pydatetime() for ts in dr]
        datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
        df = DataFrame({'dr': dr,
                        'dr_tz': dr_tz,
                        'datetimes_naive': datetimes_naive,
                        'datetimes_with_tz': datetimes_with_tz})
        result = df.get_dtype_counts()
        expected = Series({'datetime64[ns]': 3, 'object': 1})
        assert_series_equal(result, expected)
Example #9
0
    def test_insert(self):
        """Exercise ``DataFrame.insert``: placement, dtypes, duplicates, names."""
        frame = DataFrame(np.random.randn(5, 3), index=np.arange(5),
                          columns=['c', 'b', 'a'])

        # insert at the front
        frame.insert(0, 'foo', frame['a'])
        expected_cols = ['foo', 'c', 'b', 'a']
        self.assert_numpy_array_equal(frame.columns, expected_cols)
        assert_almost_equal(frame['a'], frame['foo'])

        # insert in the middle
        frame.insert(2, 'bar', frame['c'])
        expected_cols = ['foo', 'c', 'bar', 'b', 'a']
        self.assert_numpy_array_equal(frame.columns, expected_cols)
        assert_almost_equal(frame['c'], frame['bar'])

        # diff dtype

        # new item
        frame['x'] = frame['a'].astype('float32')
        expected_counts = Series({'float64': 5, 'float32': 1})
        matches = frame.get_dtype_counts() == expected_counts
        self.assertTrue(matches.all())

        # replacing current (in different block)
        frame['a'] = frame['a'].astype('float32')
        expected_counts = Series({'float64': 4, 'float32': 2})
        matches = frame.get_dtype_counts() == expected_counts
        self.assertTrue(matches.all())

        frame['y'] = frame['a'].astype('int32')
        expected_counts = Series({'float64': 4, 'float32': 2, 'int32': 1})
        matches = frame.get_dtype_counts() == expected_counts
        self.assertTrue(matches.all())

        # inserting a label that is already present must raise
        with assertRaisesRegexp(ValueError, 'already exists'):
            frame.insert(1, 'a', frame['b'])
        self.assertRaises(ValueError, frame.insert, 1, 'c', frame['b'])

        # preserve columns name field
        frame.columns.name = 'some_name'
        frame.insert(0, 'baz', frame['c'])
        self.assertEqual(frame.columns.name, 'some_name')
Example #10
0
    def test_frame_no_datetime64_dtype(self):
        """Check column dtypes when a frame is built from tz-aware date ranges."""

        dr = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
        dr_tz = dr.tz_localize("US/Eastern")
        e = DataFrame({"A": "foo", "B": dr_tz}, index=dr)
        # ``assert_`` is a deprecated unittest alias; assertEqual performs the
        # same ``==`` comparison and reports both operands on failure.
        self.assertEqual(e["B"].dtype, "M8[ns]")

        # GH 2810 (with timezones)
        datetimes_naive = [ts.to_pydatetime() for ts in dr]
        datetimes_with_tz = [ts.to_pydatetime() for ts in dr_tz]
        df = DataFrame(
            {
                "dr": dr,
                "dr_tz": dr_tz,
                "datetimes_naive": datetimes_naive,
                "datetimes_with_tz": datetimes_with_tz,
            }
        )
        result = df.get_dtype_counts()
        expected = Series({"datetime64[ns]": 3, "object": 1})
        assert_series_equal(result, expected)
Example #11
0
    def test_unstack_dtypes(self):
        """Check the dtype counts of frames produced by ``unstack``."""

        # GH 2929
        rows = [
            [1, 1, 3, 4],
            [1, 2, 3, 4],
            [2, 1, 3, 4],
            [2, 2, 3, 4],
        ]

        frame = DataFrame(rows, columns=list("ABCD"))
        expected = Series({"int64": 4})
        assert_series_equal(frame.get_dtype_counts(), expected)

        # single dtype
        indexed = frame.set_index(["A", "B"])
        unstacked = indexed.unstack("B")
        expected = Series({"int64": 4})
        assert_series_equal(unstacked.get_dtype_counts(), expected)

        # mixed
        indexed = frame.set_index(["A", "B"])
        indexed["C"] = 3.0
        unstacked = indexed.unstack("B")
        expected = Series({"int64": 2, "float64": 2})
        assert_series_equal(unstacked.get_dtype_counts(), expected)

        indexed["D"] = "foo"
        unstacked = indexed.unstack("B")
        expected = Series({"float64": 2, "object": 2})
        assert_series_equal(unstacked.get_dtype_counts(), expected)

        # GH7405
        cases = (
            (np.zeros(5), np.zeros(5)),
            (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")),
        )
        for c, d in cases:
            frame = DataFrame(
                {
                    "A": ["a"] * 5,
                    "C": c,
                    "D": d,
                    "B": pd.date_range("2012-01-01", periods=5),
                }
            )

            # copy the first three rows before the frame is re-indexed and cast
            right = frame.iloc[:3].copy(deep=True)

            frame = frame.set_index(["A", "B"])
            frame["D"] = frame["D"].astype("int64")

            left = frame.iloc[:3].unstack(0)
            right = right.set_index(["A", "B"]).unstack(0)
            right[("D", "a")] = right[("D", "a")].astype("int64")

            self.assertEqual(left.shape, (3, 2))
            assert_frame_equal(left, right)
Example #12
0
    def test_get_numeric_data(self):
        """Check dtype counts and ``_get_numeric_data`` on mixed-dtype frames."""
        datetime64name = np.dtype('M8[ns]').name
        objectname = np.dtype(np.object_).name

        df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                        'f': Timestamp('20010102')},
                       index=np.arange(10))
        result = df.get_dtype_counts()
        expected = Series({'int64': 1, 'float64': 1,
                           datetime64name: 1, objectname: 1})
        # sort_index returns a new Series; rebind so both sides really are
        # ordered by label before they are compared.
        result = result.sort_index()
        expected = expected.sort_index()
        assert_series_equal(result, expected)

        df = DataFrame({'a': 1., 'b': 2, 'c': 'foo',
                        'd': np.array([1.] * 10, dtype='float32'),
                        'e': np.array([1] * 10, dtype='int32'),
                        'f': np.array([1] * 10, dtype='int16'),
                        'g': Timestamp('20010102')},
                       index=np.arange(10))

        # only the float and int columns are expected to be kept
        result = df._get_numeric_data()
        expected = df.loc[:, ['a', 'b', 'd', 'e', 'f']]
        assert_frame_equal(result, expected)

        # a frame without numeric columns is expected to yield no columns
        only_obj = df.loc[:, ['c', 'g']]
        result = only_obj._get_numeric_data()
        expected = df.loc[:, []]
        assert_frame_equal(result, expected)

        df = DataFrame.from_dict(
            {'a': [1, 2], 'b': ['foo', 'bar'], 'c': [np.pi, np.e]})
        result = df._get_numeric_data()
        expected = DataFrame.from_dict({'a': [1, 2], 'c': [np.pi, np.e]})
        assert_frame_equal(result, expected)

        # an all-numeric frame is expected to come back unchanged
        df = result.copy()
        result = df._get_numeric_data()
        expected = df
        assert_frame_equal(result, expected)
    def test_frame_no_datetime64_dtype(self):
        """Check column dtypes when a frame is built from tz-aware date ranges."""

        # after 7822
        # these retain the timezones on dict construction

        naive_index = date_range("2011/1/1", "2012/1/1", freq="W-FRI")
        aware_index = naive_index.tz_localize(self.tzstr("US/Eastern"))
        frame = DataFrame({"A": "foo", "B": aware_index}, index=naive_index)
        self.assertEqual(frame["B"].dtype, "O")

        # GH 2810 (with timezones)
        naive_datetimes = [stamp.to_pydatetime() for stamp in naive_index]
        aware_datetimes = [stamp.to_pydatetime() for stamp in aware_index]
        columns = {
            "dr": naive_index,
            "dr_tz": aware_index,
            "datetimes_naive": naive_datetimes,
            "datetimes_with_tz": aware_datetimes,
        }
        counts = DataFrame(columns).get_dtype_counts()
        wanted = Series({"datetime64[ns]": 2, "object": 2})
        tm.assert_series_equal(counts, wanted)
Example #14
0
    def test_frame_no_datetime64_dtype(self):
        """Check column dtypes when a frame is built from tz-aware date ranges."""

        # after 7822
        # these retain the timezones on dict construction

        naive_index = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
        aware_index = naive_index.tz_localize(self.tzstr('US/Eastern'))
        frame = DataFrame({'A': 'foo', 'B': aware_index}, index=naive_index)
        self.assertEqual(frame['B'].dtype, 'O')

        # GH 2810 (with timezones)
        naive_datetimes = [stamp.to_pydatetime() for stamp in naive_index]
        aware_datetimes = [stamp.to_pydatetime() for stamp in aware_index]
        columns = {
            'dr': naive_index,
            'dr_tz': aware_index,
            'datetimes_naive': naive_datetimes,
            'datetimes_with_tz': aware_datetimes,
        }
        counts = DataFrame(columns).get_dtype_counts()
        wanted = Series({'datetime64[ns]': 2, 'object': 2})
        tm.assert_series_equal(counts, wanted)
Example #15
0
    def test_frame_no_datetime64_dtype(self, tz):
        """Check that tz-aware columns get a ``DatetimeTZDtype`` for ``tz``."""
        # after GH#7822
        # these retain the timezones on dict construction
        naive_index = date_range('2011/1/1', '2012/1/1', freq='W-FRI')
        aware_index = naive_index.tz_localize(tz)
        frame = DataFrame({'A': 'foo', 'B': aware_index}, index=naive_index)
        tz_expected = DatetimeTZDtype('ns', aware_index.tzinfo)
        assert frame['B'].dtype == tz_expected

        # GH#2810 (with timezones)
        naive_datetimes = [stamp.to_pydatetime() for stamp in naive_index]
        aware_datetimes = [stamp.to_pydatetime() for stamp in aware_index]
        columns = {
            'dr': naive_index,
            'dr_tz': aware_index,
            'datetimes_naive': naive_datetimes,
            'datetimes_with_tz': aware_datetimes,
        }
        counts = DataFrame(columns).get_dtype_counts().sort_index()
        wanted = Series({'datetime64[ns]': 2,
                         str(tz_expected): 2}).sort_index()
        tm.assert_series_equal(counts, wanted)
Example #16
0
    def test_construction_with_mixed(self):
        """Check dtype counts for frames built from, or extended with, datetimes."""
        # test construction edge cases with mixed types

        # f7u12, this does not work without extensive workaround
        data = [[datetime(2001, 1, 5), nan, datetime(2001, 1, 2)],
                [datetime(2000, 1, 2), datetime(2000, 1, 3),
                 datetime(2000, 1, 1)]]
        df = DataFrame(data)

        # check dtypes
        result = df.get_dtype_counts().sort_values()
        expected = Series({'datetime64[ns]': 3})
        # This pair was previously computed but never compared.
        assert_series_equal(result, expected)

        # mixed-type frames
        self.mixed_frame['datetime'] = datetime.now()
        self.mixed_frame['timedelta'] = timedelta(days=1, seconds=1)
        assert self.mixed_frame['datetime'].dtype == 'M8[ns]'
        assert self.mixed_frame['timedelta'].dtype == 'm8[ns]'
        result = self.mixed_frame.get_dtype_counts().sort_values()
        expected = Series({'float64': 4,
                           'object': 1,
                           'datetime64[ns]': 1,
                           'timedelta64[ns]': 1}).sort_values()
        assert_series_equal(result, expected)