Beispiel #1
0
    def test_datetime_cut(self):
        # GH 14714
        # cut() must accept datetime-like data in every common container and
        # yield the same three ordered interval categories each time
        day_strings = ['2013-01-01', '2013-01-02', '2013-01-03']
        day_stamps = [np.datetime64(day) for day in day_strings]
        interval_labels = [
            '(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]',
            '(2013-01-01 16:00:00, 2013-01-02 08:00:00]',
            '(2013-01-02 08:00:00, 2013-01-03 00:00:00]',
        ]
        expected = Series(interval_labels).astype("category", ordered=True)

        # Series input: the result is compared as returned
        binned, _ = cut(to_datetime(Series(day_strings)), 3, retbins=True)
        tm.assert_series_equal(binned, expected)

        # list, ndarray and DatetimeIndex input: wrap the result in a Series
        # before comparing
        containers = (day_stamps, np.array(day_stamps),
                      DatetimeIndex(day_strings))
        for data in containers:
            binned, _ = cut(data, 3, retbins=True)
            tm.assert_series_equal(Series(binned), expected)
Beispiel #2
0
    def test_label_formatting(self):
        # '1.000' must be trimmed down to '1'
        # (assertEqual replaces the deprecated unittest alias assertEquals)
        self.assertEqual(tmod._trim_zeros('1.000'), '1')

        # smoke tests: building the labels must not raise, for ordinary
        # magnitudes and for very small ones alike
        cut(np.arange(11.), 2)
        cut(np.arange(11.) / 1e10, 2)
Beispiel #3
0
    def test_labels(self):
        # level labels generated for four equal-width bins over 0..1
        arr = np.tile(np.arange(0, 1.01, 0.1), 4)

        # default, right-closed intervals
        result, _ = cut(arr, 4, retbins=True)
        ex_levels = ["(-0.001, 0.25]", "(0.25, 0.5]", "(0.5, 0.75]",
                     "(0.75, 1]"]
        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(np.array_equal(result.levels, ex_levels))

        # left-closed intervals
        result, _ = cut(arr, 4, retbins=True, right=False)
        ex_levels = ["[0, 0.25)", "[0.25, 0.5)", "[0.5, 0.75)",
                     "[0.75, 1.001)"]
        self.assertTrue(np.array_equal(result.levels, ex_levels))
Beispiel #4
0
    def test_cut_pass_labels(self):
        # passing labels= must give the default result with its categories
        # replaced by those labels
        values = [50, 5, 10, 15, 20, 30, 70]
        edges = [0, 25, 50, 100]
        names = ['Small', 'Medium', 'Large']

        expected = cut(values, edges)
        expected.categories = names

        actual = cut(values, edges, labels=names)
        tm.assert_categorical_equal(actual, expected)
Beispiel #5
0
    def test_cut_pass_labels(self):
        # passing labels= must give the default result with its categories
        # replaced by those labels
        values = [50, 5, 10, 15, 20, 30, 70]
        edges = [0, 25, 50, 100]
        names = ["Small", "Medium", "Large"]

        expected = cut(values, edges)
        expected.categories = names

        actual = cut(values, edges, labels=names)
        self.assertTrue(actual.equals(expected))
Beispiel #6
0
    def test_single_bin(self):
        # issue 14652
        # a constant Series cut into one bin: every value gets code 0,
        # whether the constant is positive or negative
        expected = Series([0, 0])

        for constant in (9., -9.):
            codes = cut(Series([constant, constant]), 1, labels=False)
            tm.assert_series_equal(codes, expected)
Beispiel #7
0
    def test_cut_pass_labels(self):
        # user-supplied labels must replace the default interval levels
        arr = [50, 5, 10, 15, 20, 30, 70]
        bins = [0, 25, 50, 100]
        labels = ['Small', 'Medium', 'Large']

        result = cut(arr, bins, labels=labels)

        # expected: the default result with its levels swapped for the labels
        exp = cut(arr, bins)
        exp.levels = labels

        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(result.equals(exp))
Beispiel #8
0
    def test_cut_pass_labels(self):
        # user-supplied labels must replace the default interval levels
        arr = [50, 5, 10, 15, 20, 30, 70]
        bins = [0, 25, 50, 100]
        labels = ['Small', 'Medium', 'Large']

        result = cut(arr, bins, labels=labels)

        # expected: the default result with its levels swapped for the labels
        exp = cut(arr, bins)
        exp.levels = labels

        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(result.equals(exp))
Beispiel #9
0
    def test_single_bin(self):
        # issue 14652
        # one bin over constant data: both entries must land in bin 0 for a
        # positive and for a negative constant
        expected = Series([0, 0])

        positive = cut(Series([9., 9.]), 1, labels=False)
        tm.assert_series_equal(positive, expected)

        negative = cut(Series([-9., -9.]), 1, labels=False)
        tm.assert_series_equal(negative, expected)
Beispiel #10
0
    def test_na_handling(self):
        # every third input value is NaN; the binned output must already be
        # NaN wherever the input is, for interval labels and for integer
        # codes
        arr = np.arange(0, 0.75, 0.01)
        arr[::3] = np.nan
        missing = com.isnull(arr)

        for options in ({}, {'labels': False}):
            binned = cut(arr, 4, **options)
            tm.assert_almost_equal(binned, np.where(missing, np.nan, binned))
Beispiel #11
0
    def test_labels(self):
        # category labels generated for four equal-width bins over 0..1
        arr = np.tile(np.arange(0, 1.01, 0.1), 4)

        # default, right-closed intervals
        closed_right = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
                        '(0.75, 1]']
        binned, _ = cut(arr, 4, retbins=True)
        self.assert_index_equal(binned.categories, Index(closed_right))

        # left-closed intervals
        closed_left = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
                       '[0.75, 1.001)']
        binned, _ = cut(arr, 4, retbins=True, right=False)
        self.assert_index_equal(binned.categories, Index(closed_left))
Beispiel #12
0
    def test_labels(self):
        # category labels generated for four equal-width bins over 0..1,
        # first with the default right-closed intervals, then left-closed
        arr = np.tile(np.arange(0, 1.01, 0.1), 4)

        cases = [
            ({},
             ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', '(0.75, 1]']),
            ({'right': False},
             ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', '[0.75, 1.001)']),
        ]
        for options, labels in cases:
            binned, _ = cut(arr, 4, retbins=True, **options)
            self.assert_index_equal(binned.categories, Index(labels))
Beispiel #13
0
    def test_labels(self):
        # level labels generated for four equal-width bins over 0..1
        arr = np.tile(np.arange(0, 1.01, 0.1), 4)

        # default, right-closed intervals
        closed_right = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
                        '(0.75, 1]']
        binned, _ = cut(arr, 4, retbins=True)
        self.assert_numpy_array_equal(binned.levels, closed_right)

        # left-closed intervals
        closed_left = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
                       '[0.75, 1.001)']
        binned, _ = cut(arr, 4, retbins=True, right=False)
        self.assert_numpy_array_equal(binned.levels, closed_left)
Beispiel #14
0
    def test_label_formatting(self):
        # '1.000' must be trimmed down to '1'
        # (assertEqual replaces the deprecated unittest alias assertEquals)
        self.assertEqual(tmod._trim_zeros("1.000"), "1")

        # smoke tests: building the labels must not raise
        cut(np.arange(11.0), 2)
        cut(np.arange(11.0) / 1e10, 2)

        # #1979, negative numbers
        result = tmod._format_label(-117.9998, precision=3)
        self.assertEqual(result, "-118")
        result = tmod._format_label(117.9998, precision=3)
        self.assertEqual(result, "118")
Beispiel #15
0
    def test_labels(self):
        # distinct labels produced for four equal-width bins over 0..1
        arr = np.tile(np.arange(0, 1.01, 0.1), 4)

        # default, right-closed intervals
        binned, _ = cut(arr, 4, retbins=True)
        closed_right = ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
                        '(0.75, 1]']
        self.assertEqual(sorted(unique(binned)), closed_right)

        # left-closed intervals
        binned, _ = cut(arr, 4, retbins=True, right=False)
        closed_left = ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
                       '[0.75, 1.001)']
        self.assertEqual(sorted(unique(binned)), closed_left)
Beispiel #16
0
    def test_label_formatting(self):
        # '1.000' must be trimmed down to '1'
        self.assertEqual(tmod._trim_zeros('1.000'), '1')

        # smoke tests only: neither call may raise
        cut(np.arange(11.), 2)
        cut(np.arange(11.) / 1e10, 2)

        # #1979: a negative value formats like its positive counterpart
        for value, text in [(-117.9998, '-118'), (117.9998, '118')]:
            self.assertEqual(tmod._format_label(value, precision=3), text)
Beispiel #17
0
    def test_single_bin(self):
        # issue 14652
        # cut() takes its integer codes from numpy.searchsorted, which yields
        # pointer-sized integers (np.intp): int32 on i386 and int64 on 64-bit
        # builds. np.dtype(int) follows the C long instead, which is 32-bit
        # on 64-bit Windows with older NumPy, so pin the expectation to
        # np.intp to match on every platform.
        expected = Series([0, 0], dtype=np.intp)

        # a constant Series cut into one bin: every value gets code 0
        s = Series([9., 9.])
        result = cut(s, 1, labels=False)
        tm.assert_series_equal(result, expected)

        # same for a negative constant
        s = Series([-9., -9.])
        result = cut(s, 1, labels=False)
        tm.assert_series_equal(result, expected)
Beispiel #18
0
    def test_label_formatting(self):
        # '1.000' must be trimmed down to '1'
        trimmed = tmod._trim_zeros('1.000')
        self.assertEqual(trimmed, '1')

        # smoke tests only: neither call may raise
        whole_numbers = np.arange(11.)
        cut(whole_numbers, 2)
        cut(np.arange(11.) / 1e10, 2)

        # #1979, negative numbers
        negative = tmod._format_label(-117.9998, precision=3)
        self.assertEqual(negative, '-118')
        positive = tmod._format_label(117.9998, precision=3)
        self.assertEqual(positive, '118')
Beispiel #19
0
    def test_inf_handling(self):
        # infinite outer edges must work for an ndarray and for a Series
        edges = [-np.inf, 2, 4, np.inf]
        data = np.arange(6)
        data_ser = Series(data, dtype='int64')

        result = cut(data, edges)
        result_ser = cut(data_ser, edges)

        ex_categories = Index(['(-inf, 2]', '(2, 4]', '(4, inf]'])
        tm.assert_index_equal(result.categories, ex_categories)
        tm.assert_index_equal(result_ser.cat.categories, ex_categories)

        # the extreme values fall into the open-ended outer bins
        for binned in (result, result_ser):
            self.assertEqual(binned[5], '(4, inf]')
            self.assertEqual(binned[0], '(-inf, 2]')
Beispiel #20
0
    def test_na_handling(self):
        # every third input value is NaN; the binned output must already be
        # NaN wherever the input is
        arr = np.arange(0, 0.75, 0.01)
        arr[::3] = np.nan
        is_missing = com.isnull(arr)

        # interval labels, compared as a plain ndarray
        as_array = np.asarray(cut(arr, 4))
        tm.assert_almost_equal(as_array,
                               np.where(is_missing, np.nan, as_array))

        # integer codes
        codes = cut(arr, 4, labels=False)
        tm.assert_almost_equal(codes, np.where(is_missing, np.nan, codes))
Beispiel #21
0
    def test_inf_handling(self):
        # infinite outer edges must work for an ndarray and for a Series
        data = np.arange(6)
        data_ser = Series(data, dtype='int64')
        lowest, highest = '(-inf, 2]', '(4, inf]'

        result = cut(data, [-np.inf, 2, 4, np.inf])
        result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf])

        ex_categories = Index(['(-inf, 2]', '(2, 4]', '(4, inf]'])
        tm.assert_index_equal(result.categories, ex_categories)
        tm.assert_index_equal(result_ser.cat.categories, ex_categories)

        # the extreme values fall into the open-ended outer bins
        self.assertEqual(result[5], highest)
        self.assertEqual(result[0], lowest)
        self.assertEqual(result_ser[5], highest)
        self.assertEqual(result_ser[0], lowest)
Beispiel #22
0
    def test_inf_handling(self):
        # infinite outer edges must work for an ndarray and for a Series
        data = np.arange(6)
        data_ser = Series(data, dtype='int64')

        result = cut(data, [-np.inf, 2, 4, np.inf])
        result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf])

        ex_levels = ['(-inf, 2]', '(2, 4]', '(4, inf]']

        np.testing.assert_array_equal(result.levels, ex_levels)
        np.testing.assert_array_equal(result_ser.levels, ex_levels)
        # the extreme values fall into the open-ended outer bins
        # (assertEqual replaces the deprecated unittest alias assertEquals)
        self.assertEqual(result[5], '(4, inf]')
        self.assertEqual(result[0], '(-inf, 2]')
        self.assertEqual(result_ser[5], '(4, inf]')
        self.assertEqual(result_ser[0], '(-inf, 2]')
Beispiel #23
0
    def test_na_handling(self):
        # every third input value is NaN; the binned output must already be
        # NaN wherever the input is
        arr = np.arange(0, 0.75, 0.01)
        arr[::3] = np.nan

        # interval labels, compared as a plain ndarray
        label_arr = np.asarray(cut(arr, 4))
        expected_labels = np.where(com.isnull(arr), np.nan, label_arr)
        tm.assert_almost_equal(label_arr, expected_labels)

        # integer codes
        codes = cut(arr, 4, labels=False)
        expected_codes = np.where(com.isnull(arr), np.nan, codes)
        tm.assert_almost_equal(codes, expected_codes)
Beispiel #24
0
    def test_inf_handling(self):
        # infinite outer edges must work for an ndarray and for a Series
        edges = [-np.inf, 2, 4, np.inf]
        data = np.arange(6)
        data_ser = Series(data, dtype="int64")

        result = cut(data, edges)
        result_ser = cut(data_ser, edges)

        ex_categories = ["(-inf, 2]", "(2, 4]", "(4, inf]"]
        np.testing.assert_array_equal(result.categories, ex_categories)
        np.testing.assert_array_equal(result_ser.cat.categories,
                                      ex_categories)

        # the extreme values fall into the open-ended outer bins
        for binned in (result, result_ser):
            self.assertEqual(binned[5], "(4, inf]")
            self.assertEqual(binned[0], "(-inf, 2]")
Beispiel #25
0
    def test_inf_handling(self):
        # infinite outer edges must work for an ndarray and for a Series
        data = np.arange(6)
        data_ser = Series(data, dtype='int64')

        result = cut(data, [-np.inf, 2, 4, np.inf])
        result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf])

        ex_levels = ['(-inf, 2]', '(2, 4]', '(4, inf]']

        np.testing.assert_array_equal(result.levels, ex_levels)
        np.testing.assert_array_equal(result_ser.levels, ex_levels)
        # the extreme values fall into the open-ended outer bins
        # (assertEqual replaces the deprecated unittest alias assertEquals)
        self.assertEqual(result[5], '(4, inf]')
        self.assertEqual(result[0], '(-inf, 2]')
        self.assertEqual(result_ser[5], '(4, inf]')
        self.assertEqual(result_ser[0], '(-inf, 2]')
Beispiel #26
0
 def test_noright(self):
     # left-closed bins: 2.575 equals an inner edge and is expected in bin 1
     values = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
     binned, edges = cut(values, 4, right=False, retbins=True)

     tm.assert_numpy_array_equal(
         binned.codes, np.array([0, 0, 0, 2, 3, 0, 1], dtype=np.int8))
     tm.assert_almost_equal(
         edges, np.array([0.2, 2.575, 4.95, 7.325, 9.7095]))
Beispiel #27
0
    def test_label_precision(self):
        # level labels produced when cut() is called with precision=2
        arr = np.arange(0, 0.73, 0.01)

        result = cut(arr, 4, precision=2)
        ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]',
                     '(0.54, 0.72]']
        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(np.array_equal(result.levels, ex_levels))
Beispiel #28
0
    def test_label_precision(self):
        # level labels produced when cut() is called with precision=2
        arr = np.arange(0, 0.73, 0.01)

        result = cut(arr, 4, precision=2)
        ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]',
                     '(0.54, 0.72]']
        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(np.array_equal(result.levels, ex_levels))
Beispiel #29
0
 def test_arraylike(self):
     # a plain Python list is accepted as input
     values = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]
     binned, edges = cut(values, 3, retbins=True)

     tm.assert_numpy_array_equal(
         binned.codes, np.array([0, 0, 0, 1, 2, 0], dtype=np.int8))
     tm.assert_almost_equal(
         edges, np.array([0.1905, 3.36666667, 6.53333333, 9.7]))
Beispiel #30
0
    def test_label_precision(self):
        # category labels produced when cut() is called with precision=2
        values = np.arange(0, 0.73, 0.01)
        labels = ['(-0.00072, 0.18]', '(0.18, 0.36]',
                  '(0.36, 0.54]', '(0.54, 0.72]']

        binned = cut(values, 4, precision=2)
        self.assert_index_equal(binned.categories, Index(labels))
Beispiel #31
0
 def test_arraylike(self):
     # a plain Python list is accepted as input
     binned, edges = cut([.2, 1.4, 2.5, 6.2, 9.7, 2.1], 3, retbins=True)

     expected_codes = np.array([0, 0, 0, 1, 2, 0], dtype=np.int8)
     expected_edges = np.array([0.1905, 3.36666667, 6.53333333, 9.7])
     tm.assert_numpy_array_equal(binned.codes, expected_codes)
     tm.assert_almost_equal(edges, expected_edges)
Beispiel #32
0
 def test_noright(self):
     # left-closed bins: 2.575 equals an inner edge and is expected in bin 1
     values = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])
     expected_codes = np.array([0, 0, 0, 2, 3, 0, 1], dtype=np.int8)
     expected_edges = np.array([0.2, 2.575, 4.95, 7.325, 9.7095])

     binned, edges = cut(values, 4, right=False, retbins=True)
     tm.assert_numpy_array_equal(binned.codes, expected_codes)
     tm.assert_almost_equal(edges, expected_edges)
Beispiel #33
0
    def test_cut_out_of_bounds(self):
        # values outside the outermost edges must carry the code -1
        arr = np.random.randn(100)
        binned = cut(arr, [-1, 0, 1])

        outside = (arr < -1) | (arr > 1)
        self.assert_numpy_array_equal(binned.codes == -1, outside)
Beispiel #34
0
    def test_label_precision(self):
        # distinct labels produced when cut() is called with precision=2
        values = np.arange(0, 0.75, 0.01)
        expected = ['(-0.00074, 0.18]', '(0.18, 0.37]', '(0.37, 0.55]',
                    '(0.55, 0.74]']

        binned = cut(values, 4, precision=2)
        self.assertEqual(sorted(unique(binned)), expected)
Beispiel #35
0
    def test_cut_out_of_bounds(self):
        # values outside the outermost edges must carry the label code -1
        arr = np.random.randn(100)
        binned = cut(arr, [-1, 0, 1])

        outside = (arr < -1) | (arr > 1)
        self.assertTrue(np.array_equal(binned.labels == -1, outside))
Beispiel #36
0
 def test_cut_return_categorical(self):
     # cutting a Series must give a Series of ordered categorical intervals
     labels = ["(-0.008, 2.667]", "(2.667, 5.333]", "(5.333, 8]"]
     codes = [0, 0, 0, 1, 1, 1, 2, 2, 2]
     expected = Series(Categorical.from_codes(codes, labels, ordered=True))

     actual = cut(Series([0, 1, 2, 3, 4, 5, 6, 7, 8]), 3)
     tm.assert_series_equal(actual, expected)
Beispiel #37
0
    def test_cut_out_of_bounds(self):
        # values outside the outermost edges must carry the code -1
        sample = np.random.randn(100)
        too_low = sample < -1
        too_high = sample > 1

        binned = cut(sample, [-1, 0, 1])
        unbinned = binned.codes == -1
        self.assert_numpy_array_equal(unbinned, too_low | too_high)
Beispiel #38
0
    def test_qcut(self):
        arr = np.random.randn(1000)

        # the edges reported by qcut must match the sample quartiles
        labels, bins = qcut(arr, 4, retbins=True)
        ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
        assert_almost_equal(bins, ex_bins)

        # and the labels must match cutting on those edges directly
        ex_levels = cut(arr, ex_bins, include_lowest=True)
        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(np.array_equal(labels, ex_levels))
Beispiel #39
0
    def test_qcut(self):
        # qcut into quartiles must agree with cut() on the sample quartiles
        sample = np.random.randn(1000)
        quartiles = quantile(sample, [0, .25, .5, .75, 1.])

        binned, edges = qcut(sample, 4, retbins=True)
        tm.assert_almost_equal(edges, quartiles)

        expected = cut(sample, quartiles, include_lowest=True)
        self.assert_categorical_equal(binned, expected)
Beispiel #40
0
    def test_qcut(self):
        # qcut into quartiles must agree with cut() on the sample quartiles
        sample = np.random.randn(1000)
        binned, edges = qcut(sample, 4, retbins=True)

        probabilities = [0, .25, .5, .75, 1.]
        quartiles = quantile(sample, probabilities)
        tm.assert_almost_equal(edges, quartiles)

        self.assert_categorical_equal(
            binned, cut(sample, quartiles, include_lowest=True))
Beispiel #41
0
    def test_qcut(self):
        arr = np.random.randn(1000)

        labels, bins = qcut(arr, 4, retbins=True)
        ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
        # the expected lowest edge sits 0.1% of the data range below the
        # sample minimum
        ex_bins[0] -= (arr.max() - arr.min()) * 0.001
        assert_almost_equal(bins, ex_bins)

        # the labels must match cutting on those edges directly
        ex_levels = cut(arr, ex_bins)
        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(np.array_equal(labels, ex_levels))
Beispiel #42
0
    def test_series_retbins(self):
        # GH 8589
        # retbins=True with Series input, for cut and for qcut
        ser = Series(np.arange(4))
        halves = [0, 0, 1, 1]

        binned, edges = cut(ser, 2, retbins=True)
        assert_equal(binned.cat.codes.values, halves)
        assert_almost_equal(edges, [-0.003, 1.5, 3])

        binned, edges = qcut(ser, 2, retbins=True)
        assert_equal(binned.cat.codes.values, halves)
        assert_almost_equal(edges, [0, 1.5, 3])
Beispiel #43
0
    def test_series_retbins(self):
        # GH 8589
        # retbins=True with Series input, for cut and for qcut
        ser = Series(np.arange(4))
        halves = [0, 0, 1, 1]

        binned, edges = cut(ser, 2, retbins=True)
        tm.assert_numpy_array_equal(binned.cat.codes.values, halves)
        tm.assert_almost_equal(edges, [-0.003, 1.5, 3])

        binned, edges = qcut(ser, 2, retbins=True)
        tm.assert_numpy_array_equal(binned.cat.codes.values, halves)
        tm.assert_almost_equal(edges, [0, 1.5, 3])
Beispiel #44
0
    def test_qcut(self):
        arr = np.random.randn(1000)

        labels, bins = qcut(arr, 4, retbins=True)
        ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
        # the expected lowest edge sits 0.1% of the data range below the
        # sample minimum
        ex_bins[0] -= (arr.max() - arr.min()) * 0.001
        assert_almost_equal(bins, ex_bins)

        # the labels must match cutting on those edges directly
        ex_levels = cut(arr, ex_bins)
        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(np.array_equal(labels, ex_levels))
Beispiel #45
0
    def test_cut_out_of_bounds(self):
        # fixed seed keeps the random sample reproducible
        np.random.seed(12345)

        arr = np.random.randn(100)

        result = cut(arr, [-1, 0, 1])

        # values outside the outermost edges must carry the label code -1
        mask = result.labels == -1
        ex_mask = (arr < -1) | (arr > 1)
        # assertTrue replaces the deprecated unittest alias assert_
        self.assertTrue(np.array_equal(mask, ex_mask))
Beispiel #46
0
    def test_datetime_bin(self):
        # bin edges supplied as datetime-likes in several forms must all
        # produce the same two ordered interval categories
        data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')]
        bin_data = ['2012-12-12', '2012-12-14', '2012-12-16']
        expected = Series(['(2012-12-12 00:00:00, 2012-12-14 00:00:00]',
                           '(2012-12-14 00:00:00, 2012-12-16 00:00:00]'],
                          ).astype("category", ordered=True)

        # edges as a list of scalars, each scalar type exercised once
        for conv in [Timestamp, np.datetime64]:
            bins = [conv(v) for v in bin_data]
            result = cut(data, bins=bins)
            tm.assert_series_equal(Series(result), expected)

        # edges as a list of python datetime objects
        bin_pydatetime = [Timestamp(v).to_pydatetime() for v in bin_data]
        result = cut(data, bins=bin_pydatetime)
        tm.assert_series_equal(Series(result), expected)

        # edges as the object returned by to_datetime; pass `bins` itself so
        # this case really exercises that input rather than repeating the
        # python-datetime case
        bins = to_datetime(bin_data)
        result = cut(data, bins=bins)
        tm.assert_series_equal(Series(result), expected)
Beispiel #47
0
    def test_value_counts(self):
        from pandas.tools.tile import cut

        # counting a Categorical must agree with counting its plain values
        factor = cut(np.random.randn(4), 4)
        tm.assert_isinstance(factor, Categorical)

        expected = algos.value_counts(np.asarray(factor))
        tm.assert_series_equal(algos.value_counts(factor), expected)
Beispiel #48
0
    def test_series_retbins(self):
        # GH 8589
        # retbins=True with Series input, for cut and for qcut
        ser = Series(np.arange(4))
        halves = np.array([0, 0, 1, 1], dtype=np.int8)

        binned, edges = cut(ser, 2, retbins=True)
        tm.assert_numpy_array_equal(binned.cat.codes.values, halves)
        tm.assert_numpy_array_equal(edges, np.array([-0.003, 1.5, 3]))

        binned, edges = qcut(ser, 2, retbins=True)
        tm.assert_numpy_array_equal(binned.cat.codes.values, halves)
        tm.assert_numpy_array_equal(edges, np.array([0, 1.5, 3]))
Beispiel #49
0
    def test_value_counts(self):
        # fixed seed so the four bin labels below are reproducible
        np.random.seed(1234)
        from pandas.tools.tile import cut

        factor = cut(np.random.randn(4), 4)
        tm.assertIsInstance(factor, Categorical)

        # every one of the four bins is expected to hold exactly one value
        cats = ['(-1.194, -0.535]', '(-0.535, 0.121]', '(0.121, 0.777]',
                '(0.777, 1.433]']
        expected = Series([1, 1, 1, 1],
                          index=CategoricalIndex(cats, cats, ordered=True))

        counted = algos.value_counts(factor)
        tm.assert_series_equal(counted.sort_index(), expected.sort_index())
Beispiel #50
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the total of the computed counts, so they still sum to 1
        when NaN entries were dropped
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` raises a TypeError for the data

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    # remember the name before the input is reduced to its raw values
    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # from here on the integer bin codes are what gets counted
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)
        # kept so that normalize below divides by what was actually counted
        counts = result.values

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(
                dtype) or is_period or is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            # count on the int64 representation
            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                # drop the NaT sentinel from the int64 keys
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            # report nulls as one leading NaN entry, only if there are any
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.nan)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            # one row per bin code (empty bins get 0), then relabel the rows
            # with the left edge of each bin
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        # divide by the total of the computed counts rather than by
        # values.size: the latter still includes NaN/NaT entries that dropna
        # removed, which made the frequencies sum to less than 1
        result = result / float(counts.sum())

    return result
Beispiel #51
0
    def test_cut_pass_series_name_to_factor(self):
        # the name of the input Series must carry over to the cut result
        named = Series(np.random.randn(100), name='foo')

        self.assertEqual(cut(named, 4).name, 'foo')
Beispiel #52
0
 def test_cut_out_of_range_more(self):
     # #1511
     # with the single bin (0, 1] only the value 1 is binned (code 0);
     # every other entry is expected to come back as NaN
     ser = Series([0, -1, 0, 1, -3], name='x')
     nan = np.nan
     expected = Series([nan, nan, nan, 0, nan], name='x')

     tm.assert_series_equal(cut(ser, [0, 1], labels=False), expected)
Beispiel #53
0
 def test_simple(self):
     # constant input: every value is expected in the bin with code 1
     codes = cut(np.ones(5), 4, labels=False)
     expected = np.array([1] * 5)
     tm.assert_numpy_array_equal(codes, expected, check_dtype=False)
Beispiel #54
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Count how often each distinct value occurs and return a Series.

    Parameters
    ----------
    values : ndarray (1-d)
        Data to tabulate; its ``name`` attribute, if any, names the result
    sort : boolean, default True
        Order the result by count
    ascending : boolean, default False
        Direction of that ordering
    normalize : boolean, default False
        Divide every count by the sum of the computed counts
    bins : integer, optional
        Bin the data with ``cut`` first and count per bin; the result is
        then indexed by the left bin edges. Numeric data only
    dropna : boolean, default True
        Leave NaN out of the counts

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        When ``bins`` is given and binning raises a TypeError

    """
    from pandas.core.series import Series

    name = getattr(values, 'name', None)
    use_bins = bins is not None

    if use_bins:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # the integer bin codes are what gets counted below
        values = cat.codes

    if not is_extension_type(values) or is_datetimetz(values):
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index
        index = keys if isinstance(keys, Index) else Index(keys)
        result = Series(counts, index=index, name=name)
    else:
        # Categorical and sparse data count themselves;
        # datetime tz is handled in the ndarray path above
        result = Series(values).values.value_counts(dropna=dropna)
        result.name = name
        counts = result.values

    if use_bins:
        # TODO: This next line should be more efficient
        # one row per bin code (empty bins get 0), relabelled with the left
        # edge of each bin
        n_bins = len(cat.categories)
        result = result.reindex(np.arange(n_bins), fill_value=0)
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        total = float(counts.sum())
        result = result / total

    return result
Beispiel #55
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the total of the computed counts, so they still sum to 1
        when NaN entries were dropped
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` raises a TypeError for the data

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas.tseries.period import PeriodIndex

    # period detection has to look at the input before it is reduced to its
    # raw values
    is_period = com.is_period_arraylike(values)
    values = Series(values).values
    is_category = com.is_categorical_dtype(values.dtype)

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes
    elif is_category:
        # treat the categories as the "bins" so the per-category reindexing
        # further down applies to categorical input as well
        bins = values.categories
        cat = values
        values = cat.codes

    dtype = values.dtype

    if issubclass(values.dtype.type,
                  (np.datetime64, np.timedelta64)) or is_period:
        if is_period:
            values = PeriodIndex(values)

        # count on the int64 representation
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        if dropna:
            # drop the NaT sentinel from the int64 keys
            from pandas.tslib import iNaT
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]
        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    else:
        values = com._ensure_object(values)
        mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        # report nulls as one leading NaN entry, but only when there are
        # any; otherwise dropna=False would add a spurious zero-count row
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.nan)
            counts = np.insert(counts, 0, mask.sum())

    result = Series(counts, index=com._values_from_object(keys))
    if bins is not None:
        # TODO: This next line should be more efficient
        # one row per code (missing codes get 0), then relabel the rows with
        # the left bin edges or with the categories
        result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
        if not is_category:
            result.index = bins[:-1]
        else:
            result.index = cat.categories

    if sort:
        # in-place sort, then reverse for descending order
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        # divide by the total of the computed counts rather than by
        # values.size: the latter still includes NaN/NaT entries that dropna
        # removed, which made the frequencies sum to less than 1
        result = result / float(counts.sum())

    return result