def _prep_values(self, values=None, kill_inf=True, how=None):

    if values is None:
        values = getattr(self._selected_obj, 'values', self._selected_obj)

    # GH #12373 : rolling functions error on float32 data
    # make sure the data is coerced to float64
    if com.is_float_dtype(values.dtype):
        values = com._ensure_float64(values)
    elif com.is_integer_dtype(values.dtype):
        values = com._ensure_float64(values)
    elif com.needs_i8_conversion(values.dtype):
        raise NotImplementedError("ops for {action} for this "
                                  "dtype {dtype} are not "
                                  "implemented".format(
                                      action=self._window_type,
                                      dtype=values.dtype))
    else:
        try:
            values = com._ensure_float64(values)
        except (ValueError, TypeError):
            raise TypeError("cannot handle this type -> {0}"
                            "".format(values.dtype))

    if kill_inf:
        values = values.copy()
        values[np.isinf(values)] = np.NaN

    return values
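# A minimal numpy-only sketch of the coercion above (GH #12373): the rolling
# kernels are compiled for float64, so float32 and integer input must be
# upcast before aggregating. `prep_values_sketch` is a hypothetical stand-in
# for the pandas internals (com._ensure_float64 etc.) and ignores the
# datetimelike branch, which raises NotImplementedError above.
import numpy as np

def prep_values_sketch(values, kill_inf=True):
    values = np.asarray(values)
    try:
        # approximates com._ensure_float64 for float/int/object input
        values = values.astype(np.float64)
    except (ValueError, TypeError):
        raise TypeError("cannot handle this type -> {0}".format(values.dtype))
    if kill_inf:
        values = values.copy()
        values[np.isinf(values)] = np.nan  # treat +/-inf as missing
    return values

# prep_values_sketch(np.array([1.0, np.inf], dtype=np.float32))
# -> array([1., nan]) with dtype float64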
def backfill_2d(values, limit=None, mask=None, dtype=None):
    if dtype is None:
        dtype = values.dtype
    _method = None
    if com.is_float_dtype(values):
        _method = getattr(algos, 'backfill_2d_inplace_%s' % dtype.name, None)
    elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values):
        _method = _backfill_2d_datetime
    elif com.is_integer_dtype(values):
        values = com._ensure_float64(values)
        _method = algos.backfill_2d_inplace_float64
    elif values.dtype == np.object_:
        _method = algos.backfill_2d_inplace_object

    if _method is None:
        raise ValueError('Invalid dtype for backfill_2d [%s]' % dtype.name)

    if mask is None:
        mask = com.isnull(values)
    mask = mask.view(np.uint8)

    if np.all(values.shape):
        _method(values, mask, limit=limit)
    else:
        # for test coverage
        pass
    return values
def _get_data_algo(values, func_map):
    mask = None
    if com.is_float_dtype(values):
        f = func_map['float64']
        values = com._ensure_float64(values)

    elif com.needs_i8_conversion(values):

        # if we have NaT, punt to object dtype
        mask = com.isnull(values)
        if mask.ravel().any():
            f = func_map['generic']
            values = com._ensure_object(values)
            values[mask] = np.nan
        else:
            f = func_map['int64']
            values = values.view('i8')

    elif com.is_integer_dtype(values):
        f = func_map['int64']
        values = com._ensure_int64(values)
    else:
        f = func_map['generic']
        values = com._ensure_object(values)
    return f, values
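# A runnable illustration of the i8 trick used above: datetime64 values can
# be reinterpreted in place as int64 nanosecond counts, so the fast int64
# kernel is reused. NaT maps to the int64 minimum sentinel, which is why any
# NaT forces the slower object path with np.nan substituted in.
import numpy as np

stamps = np.array(['2016-01-01', 'NaT'], dtype='datetime64[ns]')
as_i8 = stamps.view('i8')
print(as_i8)                               # [1451606400000000000, -9223372036854775808]
print(as_i8[1] == np.iinfo('int64').min)   # True: the NaT sentinel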
def _cython_agg_general(self, how):
    obj = self._obj_with_exclusions
    if self.axis == 1:
        obj = obj.T

    new_blocks = []

    for block in obj._data.blocks:
        values = block.values.T
        if not issubclass(values.dtype.type, (np.number, np.bool_)):
            continue

        values = com._ensure_float64(values)
        result, counts = self.grouper.aggregate(values, how)
        mask = counts > 0
        if len(mask) > 0:
            result = result[mask]
        newb = make_block(result.T, block.items, block.ref_items)
        new_blocks.append(newb)

    if len(new_blocks) == 0:
        raise GroupByError('No numeric types to aggregate')

    agg_axis = 0 if self.axis == 1 else 1
    agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

    if sum(len(x.items) for x in new_blocks) == len(agg_labels):
        output_keys = agg_labels
    else:
        all_items = []
        for b in new_blocks:
            all_items.extend(b.items)
        output_keys = agg_labels[agg_labels.isin(all_items)]

    if not self.as_index:
        index = np.arange(new_blocks[0].values.shape[1])
        mgr = BlockManager(new_blocks, [output_keys, index])
        result = DataFrame(mgr)

        group_levels = self.grouper.get_group_levels()
        zipped = zip(self.grouper.names, group_levels)

        for i, (name, labels) in enumerate(zipped):
            result.insert(i, name, labels)
        result = result.consolidate()
    else:
        index = self.grouper.result_index
        mgr = BlockManager(new_blocks, [output_keys, index])
        result = DataFrame(mgr)

    if self.axis == 1:
        result = result.T

    return result
def _value_counts_arraylike(values, dropna=True):
    is_datetimetz = com.is_datetimetz(values)
    is_period = (isinstance(values, gt.ABCPeriodIndex) or
                 com.is_period_arraylike(values))

    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz:
            if isinstance(orig, gt.ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    elif com.is_float_dtype(dtype):
        values = com._ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    else:
        values = com._ensure_object(values)
        mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
def _get_data_algo(values, func_map):
    if com.is_float_dtype(values):
        f = func_map['float64']
        values = com._ensure_float64(values)
    elif com.is_integer_dtype(values):
        f = func_map['int64']
        values = com._ensure_int64(values)
    else:
        f = func_map['generic']
        values = com._ensure_object(values)
    return f, values
def _get_hash_table_and_cast(values):
    if com.is_float_dtype(values):
        klass = lib.Float64HashTable
        values = com._ensure_float64(values)
    elif com.is_integer_dtype(values):
        klass = lib.Int64HashTable
        values = com._ensure_int64(values)
    else:
        klass = lib.PyObjectHashTable
        values = com._ensure_object(values)
    return klass, values
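# Usage sketch of the dispatch above: the caller instantiates the chosen
# table class, sized to the input, and runs a type-specialized operation on
# the coerced values, mirroring `unique1d` further below. Comment-only,
# since `lib` and `com` are pandas internals not importable standalone.
#
#   klass, casted = _get_hash_table_and_cast(np.array([1, 2, 2]))
#   # -> (lib.Int64HashTable, values coerced to int64)
#   table = klass(len(casted))
#   uniques = table.unique(casted)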
def _get_data_algo(values, func_map):
    if com.is_float_dtype(values):
        f = func_map["float64"]
        values = com._ensure_float64(values)
    elif com.is_datetime64_dtype(values):
        f = func_map["int64"]
        values = values.view("i8")
    elif com.is_integer_dtype(values):
        f = func_map["int64"]
        values = com._ensure_int64(values)
    else:
        f = func_map["generic"]
        values = com._ensure_object(values)
    return f, values
def _cython_agg_general(self, how):
    output = {}
    for name, obj in self._iterate_slices():
        if not issubclass(obj.dtype.type, (np.number, np.bool_)):
            continue

        obj = com._ensure_float64(obj)
        result, counts = self.grouper.aggregate(obj, how)
        mask = counts > 0
        output[name] = result[mask]

    if len(output) == 0:
        raise GroupByError('No numeric types to aggregate')

    return self._wrap_aggregated_output(output)
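# A pure-numpy sketch of what the cython aggregation path computes for
# how='sum': one accumulator slot per group id plus per-group counts, so that
# unobserved groups can be dropped with the same `counts > 0` mask used
# above. `agg_sum_sketch` is an illustrative helper, not the pandas kernel.
import numpy as np

def agg_sum_sketch(values, comp_ids, ngroups):
    result = np.zeros(ngroups, dtype=np.float64)
    counts = np.zeros(ngroups, dtype=np.int64)
    for v, gid in zip(values, comp_ids):
        result[gid] += v
        counts[gid] += 1
    return result, counts

vals = np.array([1.0, 2.0, 3.0])
ids = np.array([0, 0, 2])                    # group 1 is never observed
res, cnt = agg_sum_sketch(vals, ids, ngroups=3)
print(res[cnt > 0])                          # [3. 3.] -> groups 0 and 2 only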
def _get_data_algo(values, func_map):
    if com.is_float_dtype(values):
        f = func_map['float64']
        values = com._ensure_float64(values)
    elif com.needs_i8_conversion(values):
        f = func_map['int64']
        values = values.view('i8')
    elif com.is_integer_dtype(values):
        f = func_map['int64']
        values = com._ensure_int64(values)
    else:
        f = func_map['generic']
        values = com._ensure_object(values)
    return f, values
def unique1d(values):
    """
    Hash table-based unique
    """
    if np.issubdtype(values.dtype, np.floating):
        table = _hash.Float64HashTable(len(values))
        uniques = np.array(table.unique(com._ensure_float64(values)),
                           dtype=np.float64)
    elif np.issubdtype(values.dtype, np.datetime64):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
        uniques = uniques.view("M8[ns]")
    elif np.issubdtype(values.dtype, np.integer):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
    else:
        table = _hash.PyObjectHashTable(len(values))
        uniques = table.unique(com._ensure_object(values))
    return uniques
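# The hash-table `unique` above returns values in order of first appearance,
# unlike np.unique, which sorts. A dict-based sketch of the same contract
# (dicts preserve insertion order on Python 3.7+); `unique_order_preserving`
# is an illustrative helper, not the pandas implementation.
import numpy as np

def unique_order_preserving(values):
    return np.array(list(dict.fromkeys(values.tolist())))

print(unique_order_preserving(np.array([3, 1, 3, 2])))  # [3 1 2]
print(np.unique(np.array([3, 1, 3, 2])))                # [1 2 3] (sorted)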
def unique1d(values):
    """
    Hash table-based unique
    """
    if np.issubdtype(values.dtype, np.floating):
        table = _hash.Float64HashTable(len(values))
        uniques = np.array(table.unique(com._ensure_float64(values)),
                           dtype=np.float64)
    elif np.issubdtype(values.dtype, np.datetime64):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
        uniques = uniques.view('M8[ns]')
    elif np.issubdtype(values.dtype, np.integer):
        table = _hash.Int64HashTable(len(values))
        uniques = table.unique(com._ensure_int64(values))
    else:
        table = _hash.PyObjectHashTable(len(values))
        uniques = table.unique(com._ensure_object(values))
    return uniques
def unique1d(values):
    """
    Hash table-based unique
    """
    if np.issubdtype(values.dtype, np.floating):
        table = lib.Float64HashTable(len(values))
        uniques = np.array(table.unique(com._ensure_float64(values)),
                           dtype=np.float64)
    elif np.issubdtype(values.dtype, np.integer):
        table = lib.Int64HashTable(len(values))
        uniques = np.array(table.unique(com._ensure_int64(values)),
                           dtype=np.int64)
        if values.dtype == np.datetime64:
            uniques = uniques.view('M8[us]')
    else:
        table = lib.PyObjectHashTable(len(values))
        uniques = table.unique(com._ensure_object(values))
        uniques = lib.list_to_object_array(uniques)
    return uniques
def _cython_agg_general(self, how):
    # TODO: address inefficiencies, like duplicating effort (should
    # aggregate all the columns at once?)

    comp_ids, obs_group_ids, max_group = self._group_info

    output = {}
    for name, obj in self._iterate_slices():
        if not issubclass(obj.dtype.type, (np.number, np.bool_)):
            continue

        obj = com._ensure_float64(obj)
        result, counts = cython_aggregate(obj, comp_ids,
                                          max_group, how=how)
        mask = counts > 0
        output[name] = result[mask]

    if len(output) == 0:
        raise GroupByError('No numeric types to aggregate')

    return self._wrap_aggregated_output(output, mask, obs_group_ids)
def pad_1d(values, limit=None, mask=None, dtype=None):
    if dtype is None:
        dtype = values.dtype
    _method = None
    if com.is_float_dtype(values):
        _method = getattr(algos, 'pad_inplace_%s' % dtype.name, None)
    elif dtype in com._DATELIKE_DTYPES or com.is_datetime64_dtype(values):
        _method = _pad_1d_datetime
    elif com.is_integer_dtype(values):
        values = com._ensure_float64(values)
        _method = algos.pad_inplace_float64
    elif values.dtype == np.object_:
        _method = algos.pad_inplace_object

    if _method is None:
        raise ValueError('Invalid dtype for pad_1d [%s]' % dtype.name)

    if mask is None:
        mask = com.isnull(values)
    mask = mask.view(np.uint8)
    _method(values, mask, limit=limit)
    return values
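# A numpy-only sketch of the forward fill that the `pad_inplace_*` kernels
# perform (ignoring `limit` for brevity); `pad_1d_sketch` is an illustrative
# helper, not the compiled pandas kernel. backfill_2d above is the same idea
# with the fill direction reversed.
import numpy as np

def pad_1d_sketch(values):
    values = np.asarray(values, dtype=np.float64)
    mask = np.isnan(values)
    # index of the most recent valid slot at each position
    idx = np.where(~mask, np.arange(len(values)), 0)
    np.maximum.accumulate(idx, out=idx)
    # leading NaNs stay NaN, since values[0] is NaN at those slots
    return values[idx]

print(pad_1d_sketch(np.array([np.nan, 1.0, np.nan, 2.0])))  # [nan  1.  1.  2.]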
def _get_data_algo(values, func_map):
    mask = None
    if com.is_float_dtype(values):
        f = func_map["float64"]
        values = com._ensure_float64(values)

    elif com.is_datetime64_dtype(values):

        # if we have NaT, punt to object dtype
        mask = com.isnull(values)
        if mask.ravel().any():
            f = func_map["generic"]
            values = com._ensure_object(values)
            values[mask] = np.nan
        else:
            f = func_map["int64"]
            values = values.view("i8")

    elif com.is_integer_dtype(values):
        f = func_map["int64"]
        values = com._ensure_int64(values)
    else:
        f = func_map["generic"]
        values = com._ensure_object(values)
    return f, values
def _cython_agg_general(self, how):
    comp_ids, obs_group_ids, max_group = self._group_info

    obj = self._obj_with_exclusions
    if self.axis == 1:
        obj = obj.T

    new_blocks = []

    for block in obj._data.blocks:
        values = block.values.T
        if not issubclass(values.dtype.type, (np.number, np.bool_)):
            continue

        values = com._ensure_float64(values)
        result, counts = cython_aggregate(values, comp_ids,
                                          max_group, how=how)
        mask = counts > 0
        if len(mask) > 0:
            result = result[mask]
        newb = make_block(result.T, block.items, block.ref_items)
        new_blocks.append(newb)

    if len(new_blocks) == 0:
        raise GroupByError('No numeric types to aggregate')

    agg_axis = 0 if self.axis == 1 else 1
    agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

    if sum(len(x.items) for x in new_blocks) == len(agg_labels):
        output_keys = agg_labels
    else:
        output_keys = []
        for b in new_blocks:
            output_keys.extend(b.items)

        try:
            output_keys.sort()
        except TypeError:  # pragma: no cover
            pass

        if isinstance(agg_labels, MultiIndex):
            output_keys = MultiIndex.from_tuples(output_keys,
                                                 names=agg_labels.names)

    if not self.as_index:
        index = np.arange(new_blocks[0].values.shape[1])
        mgr = BlockManager(new_blocks, [output_keys, index])
        result = DataFrame(mgr)

        group_levels = self._get_group_levels(mask, obs_group_ids)
        for i, (name, labels) in enumerate(group_levels):
            result.insert(i, name, labels)
        result = result.consolidate()
    else:
        index = self._get_multi_index(mask, obs_group_ids)
        mgr = BlockManager(new_blocks, [output_keys, index])
        result = DataFrame(mgr)

    if self.axis == 1:
        result = result.T

    return result
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:
        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
                is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
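# Usage sketch of the public API above (assuming a standard pandas install):
# counts sort descending by default, and dropna=False keeps a NaN bucket,
# which the object path builds by prepending np.NaN via np.insert.
import pandas as pd

s = pd.Series([1, 1, 2, None])
print(s.value_counts())                 # 1.0 -> 2, 2.0 -> 1; NaN excluded
print(s.value_counts(dropna=False))     # adds a NaN -> 1 row
print(s.value_counts(normalize=True))   # relative frequencies summing to 1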
def func(arg, window, min_periods=None):
    minp = check_minp(min_periods, window)
    # GH #12373: rolling functions error on float32 data
    return cfunc(com._ensure_float64(arg),
                 window, minp, **kwargs)
def value_counts(values, sort=True, ascending=False, normalize=False,
                 bins=None, dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:
        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
                is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result