def test_datetime_cut(self):
    # GH 14714
    # Equal-width binning of datetime data must yield the same categories
    # regardless of the container the dates arrive in.
    dates = ['2013-01-01', '2013-01-02', '2013-01-03']
    expected = Series(['(2012-12-31 23:57:07.200000, 2013-01-01 16:00:00]',
                       '(2013-01-01 16:00:00, 2013-01-02 08:00:00]',
                       '(2013-01-02 08:00:00, 2013-01-03 00:00:00]'],
                      ).astype("category", ordered=True)

    # time data present as a Series: cut hands back a Series directly
    as_series = to_datetime(Series(dates))
    result, bins = cut(as_series, 3, retbins=True)
    tm.assert_series_equal(result, expected)

    # time data present as a list, an ndarray and a DatetimeIndex:
    # the categorical result is wrapped in a Series before comparing
    as_list = [np.datetime64(d) for d in dates]
    as_array = np.array([np.datetime64(d) for d in dates])
    as_index = DatetimeIndex(dates)
    for data in (as_list, as_array, as_index):
        result, bins = cut(data, 3, retbins=True)
        tm.assert_series_equal(Series(result), expected)
def test_label_formatting(self):
    # Trailing zeros (and the then-dangling decimal point) are trimmed.
    self.assertEqual(tmod._trim_zeros('1.000'), '1')

    # it works: smoke tests only, the labels themselves are not inspected
    cut(np.arange(11.), 2)
    cut(np.arange(11.) / 1e10, 2)
def test_labels(self):
    # Interval labels for right-closed (default) and left-closed bins.
    arr = np.tile(np.arange(0, 1.01, 0.1), 4)

    result, bins = cut(arr, 4, retbins=True)
    ex_levels = ["(-0.001, 0.25]", "(0.25, 0.5]", "(0.5, 0.75]",
                 "(0.75, 1]"]
    self.assertTrue(np.array_equal(result.levels, ex_levels))

    result, bins = cut(arr, 4, retbins=True, right=False)
    ex_levels = ["[0, 0.25)", "[0.25, 0.5)", "[0.5, 0.75)",
                 "[0.75, 1.001)"]
    self.assertTrue(np.array_equal(result.levels, ex_levels))
def test_cut_pass_labels(self):
    # Passing ``labels`` must equal cutting without labels and then
    # renaming the resulting categories.
    values = [50, 5, 10, 15, 20, 30, 70]
    edges = [0, 25, 50, 100]
    names = ['Small', 'Medium', 'Large']

    labelled = cut(values, edges, labels=names)

    relabelled = cut(values, edges)
    relabelled.categories = names

    tm.assert_categorical_equal(labelled, relabelled)
def test_cut_pass_labels(self):
    # Explicit labels should give the same categorical as renaming the
    # categories of an unlabelled cut.
    data = [50, 5, 10, 15, 20, 30, 70]
    bin_edges = [0, 25, 50, 100]
    wanted = ["Small", "Medium", "Large"]

    got = cut(data, bin_edges, labels=wanted)

    reference = cut(data, bin_edges)
    reference.categories = wanted

    matches = got.equals(reference)
    self.assertTrue(matches)
def test_single_bin(self):
    # issue 14652
    # A single bin over constant data puts every value in bin 0,
    # for positive and negative constants alike.
    expected = Series([0, 0])

    for constant in (9., -9.):
        s = Series([constant, constant])
        result = cut(s, 1, labels=False)
        tm.assert_series_equal(result, expected)
def test_cut_pass_labels(self):
    # Passing ``labels`` must equal cutting without labels and then
    # overwriting the levels of the result.
    arr = [50, 5, 10, 15, 20, 30, 70]
    bins = [0, 25, 50, 100]
    labels = ['Small', 'Medium', 'Large']

    result = cut(arr, bins, labels=labels)

    exp = cut(arr, bins)
    exp.levels = labels

    self.assertTrue(result.equals(exp))
def test_na_handling(self):
    # NaN inputs must stay NaN in the output, both for interval labels
    # and for integer bin codes (labels=False).
    arr = np.arange(0, 0.75, 0.01)
    arr[::3] = np.nan
    missing = com.isnull(arr)

    binned = cut(arr, 4)
    tm.assert_almost_equal(binned, np.where(missing, np.nan, binned))

    binned = cut(arr, 4, labels=False)
    tm.assert_almost_equal(binned, np.where(missing, np.nan, binned))
def test_labels(self):
    # Category labels for right-closed (default) and left-closed bins.
    arr = np.tile(np.arange(0, 1.01, 0.1), 4)

    right_closed = Index(['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
                          '(0.75, 1]'])
    result, bins = cut(arr, 4, retbins=True)
    self.assert_index_equal(result.categories, right_closed)

    left_closed = Index(['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
                         '[0.75, 1.001)'])
    result, bins = cut(arr, 4, retbins=True, right=False)
    self.assert_index_equal(result.categories, left_closed)
def test_labels(self):
    # The generated categories depend on which side of the bin is closed.
    arr = np.tile(np.arange(0, 1.01, 0.1), 4)

    cases = [
        ({}, ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]',
              '(0.75, 1]']),
        ({'right': False}, ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)',
                            '[0.75, 1.001)']),
    ]
    for extra_kwargs, labels in cases:
        result, bins = cut(arr, 4, retbins=True, **extra_kwargs)
        ex_levels = Index(labels)
        self.assert_index_equal(result.categories, ex_levels)
def test_labels(self):
    # Level labels for right-closed (default) and left-closed bins.
    arr = np.tile(np.arange(0, 1.01, 0.1), 4)

    closed_right, bins = cut(arr, 4, retbins=True)
    self.assert_numpy_array_equal(
        closed_right.levels,
        ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', '(0.75, 1]'])

    closed_left, bins = cut(arr, 4, retbins=True, right=False)
    self.assert_numpy_array_equal(
        closed_left.levels,
        ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', '[0.75, 1.001)'])
def test_label_formatting(self):
    # Trailing zeros (and the then-dangling decimal point) are trimmed.
    self.assertEqual(tmod._trim_zeros("1.000"), "1")

    # it works: smoke tests only, the labels themselves are not inspected
    cut(np.arange(11.0), 2)
    cut(np.arange(11.0) / 1e10, 2)

    # #1979, negative numbers
    result = tmod._format_label(-117.9998, precision=3)
    self.assertEqual(result, "-118")

    result = tmod._format_label(117.9998, precision=3)
    self.assertEqual(result, "118")
def test_labels(self):
    # The distinct labels, once sorted, must match the expected intervals
    # for right-closed (default) and left-closed bins.
    arr = np.tile(np.arange(0, 1.01, 0.1), 4)

    labels, bins = cut(arr, 4, retbins=True)
    self.assertEqual(
        sorted(unique(labels)),
        ['(-0.001, 0.25]', '(0.25, 0.5]', '(0.5, 0.75]', '(0.75, 1]'])

    labels, bins = cut(arr, 4, retbins=True, right=False)
    self.assertEqual(
        sorted(unique(labels)),
        ['[0, 0.25)', '[0.25, 0.5)', '[0.5, 0.75)', '[0.75, 1.001)'])
def test_label_formatting(self):
    # Trailing zeros (and the then-dangling decimal point) are trimmed.
    trimmed = tmod._trim_zeros('1.000')
    self.assertEqual(trimmed, '1')

    # it works
    result = cut(np.arange(11.), 2)
    result = cut(np.arange(11.) / 1e10, 2)

    # #1979, negative numbers
    for value, label in ((-117.9998, '-118'), (117.9998, '118')):
        result = tmod._format_label(value, precision=3)
        self.assertEqual(result, label)
def test_single_bin(self):
    # issue 14652
    # Explicit dtype since Series produces int64 for ints, while cut
    # (due to numpy.searchsorted) would use int32 on i386, so let's assure
    # correct default to the architecture int
    platform_int = np.dtype(int)
    expected = Series([0, 0], dtype=platform_int)

    for constant in (9., -9.):
        s = Series([constant, constant])
        result = cut(s, 1, labels=False)
        tm.assert_series_equal(result, expected)
def test_inf_handling(self):
    # Infinite outer edges are allowed and appear verbatim in the labels,
    # for ndarray and Series input alike.
    edges = [-np.inf, 2, 4, np.inf]
    data = np.arange(6)
    data_ser = Series(data, dtype='int64')

    result = cut(data, edges)
    result_ser = cut(data_ser, edges)

    ex_categories = Index(['(-inf, 2]', '(2, 4]', '(4, inf]'])
    tm.assert_index_equal(result.categories, ex_categories)
    tm.assert_index_equal(result_ser.cat.categories, ex_categories)

    # last element falls in the top bin, first element in the bottom bin
    for binned in (result, result_ser):
        self.assertEqual(binned[5], '(4, inf]')
        self.assertEqual(binned[0], '(-inf, 2]')
def test_na_handling(self):
    # NaN inputs must stay NaN in the output, both for interval labels
    # and for integer bin codes (labels=False).
    arr = np.arange(0, 0.75, 0.01)
    arr[::3] = np.nan
    missing = com.isnull(arr)

    result_arr = np.asarray(cut(arr, 4))
    ex_arr = np.where(missing, np.nan, result_arr)
    tm.assert_almost_equal(result_arr, ex_arr)

    codes = cut(arr, 4, labels=False)
    ex_codes = np.where(missing, np.nan, codes)
    tm.assert_almost_equal(codes, ex_codes)
def test_inf_handling(self):
    # Infinite outer edges are allowed and appear verbatim in the labels,
    # for ndarray and Series input alike.
    data = np.arange(6)
    data_ser = Series(data, dtype='int64')

    result = cut(data, [-np.inf, 2, 4, np.inf])
    result_ser = cut(data_ser, [-np.inf, 2, 4, np.inf])

    ex_levels = ['(-inf, 2]', '(2, 4]', '(4, inf]']

    np.testing.assert_array_equal(result.levels, ex_levels)
    np.testing.assert_array_equal(result_ser.levels, ex_levels)

    # last element falls in the top bin, first element in the bottom bin
    self.assertEqual(result[5], '(4, inf]')
    self.assertEqual(result[0], '(-inf, 2]')
    self.assertEqual(result_ser[5], '(4, inf]')
    self.assertEqual(result_ser[0], '(-inf, 2]')
def test_inf_handling(self):
    # Infinite outer edges are allowed and appear verbatim in the labels,
    # for ndarray and Series input alike.
    edges = [-np.inf, 2, 4, np.inf]
    data = np.arange(6)
    data_ser = Series(data, dtype="int64")

    result = cut(data, edges)
    result_ser = cut(data_ser, edges)

    ex_categories = ["(-inf, 2]", "(2, 4]", "(4, inf]"]
    np.testing.assert_array_equal(result.categories, ex_categories)
    np.testing.assert_array_equal(result_ser.cat.categories, ex_categories)

    # last element falls in the top bin, first element in the bottom bin
    top, bottom = ex_categories[2], ex_categories[0]
    for binned in (result, result_ser):
        self.assertEqual(binned[5], top)
        self.assertEqual(binned[0], bottom)
def test_noright(self):
    # With right=False the bins are left-closed; check both the codes
    # assigned to each value and the computed bin edges.
    data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575])

    result, bins = cut(data, 4, right=False, retbins=True)

    expected_codes = np.array([0, 0, 0, 2, 3, 0, 1], dtype=np.int8)
    expected_edges = np.array([0.2, 2.575, 4.95, 7.325, 9.7095])
    tm.assert_numpy_array_equal(result.codes, expected_codes)
    tm.assert_almost_equal(bins, expected_edges)
def test_label_precision(self):
    # ``precision`` controls how many digits the interval labels keep.
    arr = np.arange(0, 0.73, 0.01)

    result = cut(arr, 4, precision=2)
    ex_levels = ['(-0.00072, 0.18]', '(0.18, 0.36]', '(0.36, 0.54]',
                 '(0.54, 0.72]']
    self.assertTrue(np.array_equal(result.levels, ex_levels))
def test_arraylike(self):
    # A plain Python list is accepted as input; check the codes assigned
    # to each value and the computed bin edges.
    data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1]

    result, bins = cut(data, 3, retbins=True)

    expected_codes = np.array([0, 0, 0, 1, 2, 0], dtype=np.int8)
    expected_edges = np.array([0.1905, 3.36666667, 6.53333333, 9.7])
    tm.assert_numpy_array_equal(result.codes, expected_codes)
    tm.assert_almost_equal(bins, expected_edges)
def test_label_precision(self):
    # ``precision`` controls how many digits the interval labels keep.
    values = np.arange(0, 0.73, 0.01)
    expected = Index(['(-0.00072, 0.18]', '(0.18, 0.36]',
                      '(0.36, 0.54]', '(0.54, 0.72]'])

    binned = cut(values, 4, precision=2)

    self.assert_index_equal(binned.categories, expected)
def test_cut_out_of_bounds(self):
    # Values outside the outermost bin edges must receive code -1.
    arr = np.random.randn(100)

    binned = cut(arr, [-1, 0, 1])

    is_unbinned = binned.codes == -1
    is_outside = (arr < -1) | (arr > 1)
    self.assert_numpy_array_equal(is_unbinned, is_outside)
def test_label_precision(self):
    # ``precision`` controls how many digits the interval labels keep.
    values = np.arange(0, 0.75, 0.01)
    expected = ['(-0.00074, 0.18]', '(0.18, 0.37]', '(0.37, 0.55]',
                '(0.55, 0.74]']

    labels = cut(values, 4, precision=2)

    self.assertEqual(sorted(unique(labels)), expected)
def test_cut_out_of_bounds(self):
    # Values outside the outermost bin edges must receive label -1.
    arr = np.random.randn(100)

    binned = cut(arr, [-1, 0, 1])

    is_unbinned = binned.labels == -1
    is_outside = (arr < -1) | (arr > 1)
    same = np.array_equal(is_unbinned, is_outside)
    self.assertTrue(same)
def test_cut_return_categorical(self):
    # Cutting a Series returns a Series holding an ordered Categorical.
    s = Series([0, 1, 2, 3, 4, 5, 6, 7, 8])

    codes = [0, 0, 0, 1, 1, 1, 2, 2, 2]
    categories = ["(-0.008, 2.667]", "(2.667, 5.333]", "(5.333, 8]"]
    expected = Series(Categorical.from_codes(codes, categories,
                                             ordered=True))

    tm.assert_series_equal(cut(s, 3), expected)
def test_qcut(self):
    # qcut with 4 quantiles must agree with cut over the quartile edges.
    arr = np.random.randn(1000)

    labels, bins = qcut(arr, 4, retbins=True)
    ex_bins = quantile(arr, [0, 0.25, 0.5, 0.75, 1.0])
    assert_almost_equal(bins, ex_bins)

    # include_lowest so the minimum lands in the first bin
    ex_levels = cut(arr, ex_bins, include_lowest=True)
    self.assertTrue(np.array_equal(labels, ex_levels))
def test_qcut(self):
    # qcut with 4 quantiles must agree with cut over the quartile edges.
    arr = np.random.randn(1000)
    quartiles = [0, .25, .5, .75, 1.]

    labels, bins = qcut(arr, 4, retbins=True)

    expected_edges = quantile(arr, quartiles)
    tm.assert_almost_equal(bins, expected_edges)

    # include_lowest so the minimum lands in the first bin
    expected_labels = cut(arr, expected_edges, include_lowest=True)
    self.assert_categorical_equal(labels, expected_labels)
def test_qcut(self):
    # qcut with 4 quantiles must agree with cut over the quartile edges,
    # once the lowest edge is widened the same way the test expects.
    arr = np.random.randn(1000)

    labels, bins = qcut(arr, 4, retbins=True)

    ex_bins = quantile(arr, [0, .25, .5, .75, 1.])
    # nudge the first edge down by 0.1% of the data range
    ex_bins[0] -= (arr.max() - arr.min()) * 0.001
    assert_almost_equal(bins, ex_bins)

    ex_levels = cut(arr, ex_bins)
    self.assertTrue(np.array_equal(labels, ex_levels))
def test_series_retbins(self):
    # GH 8589
    # retbins=True must work for Series input with both cut and qcut.
    s = Series(np.arange(4))

    cases = ((cut, [-0.003, 1.5, 3]),
             (qcut, [0, 1.5, 3]))
    for binner, expected_edges in cases:
        result, bins = binner(s, 2, retbins=True)
        assert_equal(result.cat.codes.values, [0, 0, 1, 1])
        assert_almost_equal(bins, expected_edges)
def test_series_retbins(self):
    # GH 8589
    # retbins=True must work for Series input with both cut and qcut.
    s = Series(np.arange(4))
    expected_codes = [0, 0, 1, 1]

    by_width, width_edges = cut(s, 2, retbins=True)
    tm.assert_numpy_array_equal(by_width.cat.codes.values, expected_codes)
    tm.assert_almost_equal(width_edges, [-0.003, 1.5, 3])

    by_quantile, quantile_edges = qcut(s, 2, retbins=True)
    tm.assert_numpy_array_equal(by_quantile.cat.codes.values,
                                expected_codes)
    tm.assert_almost_equal(quantile_edges, [0, 1.5, 3])
def test_cut_out_of_bounds(self):
    # Values outside the outermost bin edges must receive label -1.
    # Seeded so the sample is reproducible between runs.
    np.random.seed(12345)
    arr = np.random.randn(100)

    result = cut(arr, [-1, 0, 1])

    mask = result.labels == -1
    ex_mask = (arr < -1) | (arr > 1)
    self.assertTrue(np.array_equal(mask, ex_mask))
def test_datetime_bin(self):
    # Datetime-like bin edges must be accepted in every common form:
    # Timestamp, np.datetime64, datetime.datetime and DatetimeIndex.
    data = [np.datetime64('2012-12-13'), np.datetime64('2012-12-15')]
    bin_data = ['2012-12-12', '2012-12-14', '2012-12-16']
    expected = Series(['(2012-12-12 00:00:00, 2012-12-14 00:00:00]',
                       '(2012-12-14 00:00:00, 2012-12-16 00:00:00]'],
                      ).astype("category", ordered=True)

    # bins as a list of Timestamp / np.datetime64 scalars
    for conv in [Timestamp, np.datetime64]:
        bins = [conv(v) for v in bin_data]
        result = cut(data, bins=bins)
        tm.assert_series_equal(Series(result), expected)

    # bins as a list of datetime.datetime objects
    bin_pydatetime = [Timestamp(v).to_pydatetime() for v in bin_data]
    result = cut(data, bins=bin_pydatetime)
    tm.assert_series_equal(Series(result), expected)

    # bins as the DatetimeIndex returned by to_datetime; this case used to
    # pass bin_pydatetime again, so the index was never exercised
    bins = to_datetime(bin_data)
    result = cut(data, bins=bins)
    tm.assert_series_equal(Series(result), expected)
def test_value_counts(self):
    # Counting a Categorical must give the same result as counting the
    # plain array of its labels.
    from pandas.tools.tile import cut

    samples = np.random.randn(4)
    factor = cut(samples, 4)
    tm.assert_isinstance(factor, Categorical)

    from_categorical = algos.value_counts(factor)
    from_array = algos.value_counts(np.asarray(factor))
    tm.assert_series_equal(from_categorical, from_array)
def test_series_retbins(self):
    # GH 8589
    # retbins=True must work for Series input with both cut and qcut.
    s = Series(np.arange(4))

    cases = ((cut, [-0.003, 1.5, 3]),
             (qcut, [0, 1.5, 3]))
    for binner, edges in cases:
        result, bins = binner(s, 2, retbins=True)
        tm.assert_numpy_array_equal(result.cat.codes.values,
                                    np.array([0, 0, 1, 1], dtype=np.int8))
        tm.assert_numpy_array_equal(bins, np.array(edges))
def test_value_counts(self):
    # Four seeded samples cut into four bins: each bin holds exactly one
    # value and the counts are indexed by an ordered CategoricalIndex.
    np.random.seed(1234)
    from pandas.tools.tile import cut

    samples = np.random.randn(4)
    factor = cut(samples, 4)
    tm.assertIsInstance(factor, Categorical)

    counted = algos.value_counts(factor)

    cats = ['(-1.194, -0.535]', '(-0.535, 0.121]', '(0.121, 0.777]',
            '(0.777, 1.433]']
    expected = Series([1, 1, 1, 1],
                      index=CategoricalIndex(cats, cats, ordered=True))

    # compare order-independently, ties make the sort order arbitrary
    tm.assert_series_equal(counted.sort_index(), expected.sort_index())
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the number of values actually counted, so entries
        dropped by ``dropna`` do not shrink the fractions.
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` raises a TypeError for the data.
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # count the integer bin codes instead of the raw values
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)
        counts = result.values

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(
                dtype) or is_period or is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                # remember the tz so the keys can be re-localized below
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            # datetime-likes are counted through their int64 representation
            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            # only report a NaN row when something is actually missing
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            # label each bin by its left edge
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        # divide by what was counted, not by values.size: otherwise the
        # NaN/NaT entries removed by dropna still deflate every fraction
        result = result / float(counts.sum())

    return result
def test_cut_pass_series_name_to_factor(self):
    # The name of the input Series must carry over to the result of cut.
    named = Series(np.random.randn(100), name='foo')

    binned = cut(named, 4)

    self.assertEqual(binned.name, 'foo')
def test_cut_out_of_range_more(self):
    # #1511
    # With bins (0, 1] only the value 1 is binned; everything else,
    # including the excluded left edge 0, comes back as NaN.
    values = Series([0, -1, 0, 1, -3], name='x')
    expected = Series([np.nan, np.nan, np.nan, 0, np.nan], name='x')

    codes = cut(values, [0, 1], labels=False)

    tm.assert_series_equal(codes, expected)
def test_simple(self):
    # Constant input split into four bins: every value gets bin code 1.
    # dtype is not compared, only the values.
    ones = np.ones(5)
    desired = np.array([1, 1, 1, 1, 1])

    codes = cut(ones, 4, labels=False)

    tm.assert_numpy_array_equal(codes, desired, check_dtype=False)
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and binning the data raises a TypeError.
    """
    from pandas.core.series import Series

    name = getattr(values, 'name', None)
    # ``bins`` is rebound to the computed edges below, so remember up
    # front whether binning was requested at all
    binning = bins is not None

    if binning:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # count the integer bin codes instead of the raw values
        values = cat.codes

    use_extension_path = (is_extension_type(values) and
                          not is_datetimetz(values))

    if not use_extension_path:
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index, Series
        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)
    else:
        # handle Categorical and sparse,
        # datetime tz can be handled in ndarray path
        extension_values = Series(values).values
        result = extension_values.value_counts(dropna=dropna)
        result.name = name
        counts = result.values

    if binning:
        # TODO: This next line should be more efficient
        all_codes = np.arange(len(cat.categories))
        result = result.reindex(all_codes, fill_value=0)
        # label each bin by its left edge
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        total = float(counts.sum())
        result = result / total

    return result
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram. Frequencies are taken
        relative to the number of values actually counted, so entries
        dropped by ``dropna`` do not shrink the fractions.
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    Raises
    ------
    TypeError
        If ``bins`` is given and ``cut`` raises a TypeError for the data.
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas.tseries.period import PeriodIndex

    is_period = com.is_period_arraylike(values)
    values = Series(values).values
    is_category = com.is_categorical_dtype(values.dtype)

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        # count the integer bin codes instead of the raw values
        values = cat.codes
    elif is_category:
        # treat the existing categories as the "bins" so the reindex
        # below reports every category, including unobserved ones
        bins = values.categories
        cat = values
        values = cat.codes

    dtype = values.dtype

    if (issubclass(values.dtype.type, (np.datetime64, np.timedelta64)) or
            is_period):

        if is_period:
            values = PeriodIndex(values)

        # datetime-likes are counted through their int64 representation
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        if dropna:
            from pandas.tslib import iNaT
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    else:
        values = com._ensure_object(values)
        mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        # only report a NaN row when something is actually missing;
        # otherwise dropna=False would add a spurious zero-count NaN key
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    result = Series(counts, index=com._values_from_object(keys))

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.categories)),
                                fill_value=0)
        if not is_category:
            # label each bin by its left edge
            result.index = bins[:-1]
        else:
            result.index = cat.categories

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        # divide by what was counted, not by values.size: otherwise the
        # NaN/NaT entries removed by dropna still deflate every fraction
        result = result / float(counts.sum())

    return result