def _homogenize_dict(frames, intersect=True, dtype=None):
    """
    Conform a set of DataFrame-like objects to either an intersection
    of indices / columns or a union.

    Parameters
    ----------
    frames : dict
        Mapping of key -> DataFrame or dict-of-dicts (raw dicts are
        promoted to DataFrames before alignment).
    intersect : boolean, default True
        If True, align to the intersection of the indexes / columns,
        otherwise to their union.
    dtype : dtype, optional
        Currently unused; accepted for interface compatibility.

    Returns
    -------
    tuple of (dict of aligned frames, index, columns)
    """
    result = {}

    # Promote any raw dicts to DataFrames so every value exposes
    # .index / .columns / .reindex.
    adj_frames = {}
    # NOTE: .items() (rather than the Python-2-only .iteritems()) works
    # on both Python 2 and Python 3.
    for k, v in frames.items():
        if isinstance(v, dict):
            adj_frames[k] = DataFrame(v)
        else:
            adj_frames[k] = v

    all_indexes = [df.index for df in adj_frames.values()]
    all_columns = [df.columns for df in adj_frames.values()]

    index = _get_combined_index(all_indexes, intersect=intersect)
    columns = _get_combined_index(all_columns, intersect=intersect)

    for key, frame in adj_frames.items():
        result[key] = frame.reindex(index=index, columns=columns,
                                    copy=False)

    return result, index, columns
def _get_comb_axis(self, i):
    """
    Return the combined (union or intersection, per ``self.intersect``)
    axis ``i`` of all objects being concatenated.

    Parameters
    ----------
    i : int
        Axis number to combine.

    Raises
    ------
    TypeError
        If an object in ``self.objs`` does not expose axis ``i`` (e.g.
        an incompatible object slipped into the list).
    """
    if self._is_series:
        all_indexes = [x.index for x in self.objs]
    else:
        try:
            all_indexes = [x._data.axes[i] for x in self.objs]
        except IndexError:
            # Surface a clear error instead of leaking a bare
            # IndexError when an incompatible object is in the list.
            types = [type(x).__name__ for x in self.objs]
            raise TypeError("Cannot concatenate list of %s" % types)
    return _get_combined_index(all_indexes, intersect=self.intersect)
def _extract_axis(data, axis=0, intersect=False):
    """
    Determine the labels for ``axis`` from a dict whose values are
    DataFrames and/or raw array-likes with a ``.shape``.

    Parameters
    ----------
    data : dict
        Values are DataFrames or ndarray-likes.
    axis : int, default 0
        Axis to extract from each value.
    intersect : boolean, default False
        Combine DataFrame axes by intersection instead of union.

    Returns
    -------
    Index

    Raises
    ------
    ValueError
        If the raw ndarrays disagree in length along ``axis``.
    AssertionError
        If the common raw-array length disagrees with the combined
        frame index.
    """
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes = []

        have_raw_arrays = False
        have_frames = False

        for v in data.values():
            if isinstance(v, DataFrame):
                have_frames = True
                indexes.append(v._get_axis(axis))
            else:
                have_raw_arrays = True
                raw_lengths.append(v.shape[axis])

        if have_frames:
            index = _get_combined_index(indexes, intersect=intersect)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError('ndarrays must match shape on axis %d'
                                 % axis)

            if have_frames:
                # Raise explicitly instead of using a bare assert so
                # the check survives running under ``python -O``.
                if lengths[0] != len(index):
                    raise AssertionError('Length of data and index '
                                         'must match')
            else:
                index = Index(np.arange(lengths[0]))

    return _ensure_index(index)
def _extract_axis(data, axis=0, intersect=False):
    """
    Build the Index for ``axis`` from a dict of DataFrames and/or raw
    array-likes, combining the frame axes and validating that all raw
    arrays agree in length along ``axis``.
    """
    if not data:
        return _ensure_index(Index([]))

    frame_axes = []
    array_lengths = []

    for value in data.values():
        if isinstance(value, DataFrame):
            frame_axes.append(value._get_axis(axis))
        else:
            array_lengths.append(value.shape[axis])

    index = None
    if frame_axes:
        index = _get_combined_index(frame_axes, intersect=intersect)

    if array_lengths:
        unique_lengths = set(array_lengths)
        if len(unique_lengths) > 1:
            raise ValueError('ndarrays must match shape on axis %d' % axis)
        (length,) = unique_lengths
        if frame_axes:
            # raw arrays must line up with the combined frame index
            assert (length == len(index))
        else:
            index = Index(np.arange(length))

    return _ensure_index(index)
def _get_new_axes(self):
    """
    Compute the axes of the concatenation result.

    The concatenation axis is rebuilt via ``self._get_concat_axis()``
    (or left as None when ``ignore_index`` is set); every other axis is
    either taken from ``self.join_axes`` or combined across all objects
    per ``self.intersect``.

    Returns
    -------
    list of (Index or None), one entry per result dimension
    """
    ndim = self.objs[0].ndim
    new_axes = [None] * ndim

    if self.ignore_index:
        concat_axis = None
    else:
        concat_axis = self._get_concat_axis()

    new_axes[self.axis] = concat_axis

    if self.join_axes is None:
        for i in range(ndim):
            if i == self.axis:
                continue
            all_indexes = [x._data.axes[i] for x in self.objs]
            comb_axis = _get_combined_index(all_indexes,
                                            intersect=self.intersect)
            new_axes[i] = comb_axis
    else:
        # Explicit raise instead of a bare assert so the check
        # survives ``python -O``.
        if len(self.join_axes) != ndim - 1:
            raise AssertionError('length of join_axes must be %d'
                                 % (ndim - 1))

        # range() objects are immutable on Python 3; materialize to a
        # list before removing the concatenation axis.
        indices = list(range(ndim))
        indices.remove(self.axis)

        for i, ax in zip(indices, self.join_axes):
            new_axes[i] = ax

    return new_axes
def _get_new_axes(self):
    """
    Compute the axes of the concatenation result.

    The concatenation axis is built by appending the inputs' labels via
    ``_concat_indexes`` (or left as None when ``ignore_index`` is set);
    every other axis is either taken from ``self.join_axes`` or
    combined across all objects per ``self.intersect``.

    Returns
    -------
    list of (Index or None), one entry per result dimension
    """
    ndim = self.objs[0].ndim
    new_axes = [None] * ndim

    if self.ignore_index:
        concat_axis = None
    else:
        concat_axis = _concat_indexes([x._data.axes[self.axis]
                                       for x in self.objs])
        # NOTE(review): presumably validates the freshly built axis
        # (name suggests e.g. uniqueness) -- behavior defined elsewhere.
        self._maybe_check_integrity(concat_axis)

    new_axes[self.axis] = concat_axis

    if self.join_axes is None:
        for i in range(ndim):
            if i == self.axis:
                continue
            all_indexes = [x._data.axes[i] for x in self.objs]
            comb_axis = _get_combined_index(all_indexes,
                                            intersect=self.intersect)
            new_axes[i] = comb_axis
    else:
        # Explicit raise instead of a bare assert so the check
        # survives ``python -O``.
        if len(self.join_axes) != ndim - 1:
            raise AssertionError('length of join_axes must be %d'
                                 % (ndim - 1))

        # range() objects are immutable on Python 3; materialize to a
        # list before removing the concatenation axis.
        indices = list(range(ndim))
        indices.remove(self.axis)

        for i, ax in zip(indices, self.join_axes):
            new_axes[i] = ax

    return new_axes
def _get_comb_axis(self, i):
    """
    Combine axis ``i`` across all objects being concatenated, taking
    the union or intersection per ``self.intersect``.

    Raises TypeError when an object in ``self.objs`` does not expose
    axis ``i``.
    """
    if self._is_series:
        indexes = [obj.index for obj in self.objs]
    else:
        try:
            indexes = [obj._data.axes[i] for obj in self.objs]
        except IndexError:
            kinds = [type(obj).__name__ for obj in self.objs]
            raise TypeError("Cannot concatenate list of %s" % kinds)
    return _get_combined_index(indexes, intersect=self.intersect)
def _extract_axis(self, data, axis=0, intersect=False):
    """
    Derive the labels for ``axis`` from a dict whose values are sliced
    frames (instances of ``self._constructor_sliced``), raw array-likes
    with a ``.shape``, or None.

    None values contribute nothing; if nothing at all contributes an
    axis, an empty Index is returned.
    """
    if not data:
        return _ensure_index(Index([]))

    frame_axes = []
    raw_shape_lengths = []

    for value in data.values():
        if isinstance(value, self._constructor_sliced):
            frame_axes.append(value._get_axis(axis))
        elif value is not None:
            raw_shape_lengths.append(value.shape[axis])

    index = None
    if frame_axes:
        index = _get_combined_index(frame_axes, intersect=intersect)

    if raw_shape_lengths:
        distinct = set(raw_shape_lengths)
        if len(distinct) > 1:
            raise ValueError('ndarrays must match shape on axis %d' % axis)
        (common_length,) = distinct
        if frame_axes:
            if common_length != len(index):
                raise AssertionError('Length of data and index must match')
        else:
            index = Index(np.arange(common_length))

    if index is None:
        index = Index([])

    return _ensure_index(index)
def test_get_combined_index():
    """Combining an empty list of indexes yields the NULL_INDEX object."""
    from pandas.core.index import NULL_INDEX, _get_combined_index

    combined = _get_combined_index([])
    assert combined is NULL_INDEX
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, margins_name='All', dropna=True,
             normalize=False):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    aggfunc : function, optional
        If specified, requires `values` be specified as well
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    margins_name : string, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.

        .. versionadded:: 0.21.0

    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1

    Notes
    -----
    Any Series passed will have their name attributes used unless row or
    column names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Examples
    --------
    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
    >>> b = np.array(["one", "one", "one", "two", "one", "one",
    ...               "one", "two", "two", "two", "one"], dtype=object)
    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
    ...              dtype=object)
    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    ... # doctest: +NORMALIZE_WHITESPACE
    b   one        two
    c   dull shiny dull shiny
    a
    bar    1     2    1     0
    foo    2     2    1     2

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
    ...                     # but they still will be counted in the output
    ... # doctest: +SKIP
    col_0  d e f
    row_0
    a      1 0 0
    b      0 1 0
    c      0 0 0

    Returns
    -------
    crosstab : DataFrame
    """
    # Normalize both grouping arguments to lists of array-likes so the
    # rest of the function can treat single arrays and lists uniformly.
    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    # Resolve display names for the row/column levels (Series names are
    # used when available; otherwise 'row_N' / 'col_N' style defaults).
    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Align any index-bearing inputs (e.g. Series) on their common
    # (intersected) index; plain arrays contribute no index.
    obs_idxes = [obj.index for objs in (index, columns) for obj in objs
                 if hasattr(obj, 'index')]
    if obs_idxes:
        common_idx = _get_combined_index(obs_idxes, intersect=True)
    else:
        common_idx = None

    # Assemble all grouping arrays into one frame keyed by their names.
    # NOTE(review): a name shared between rownames and colnames would
    # silently overwrite here -- confirm callers cannot hit this.
    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    # `values` and `aggfunc` must be supplied together.
    if values is None and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")

    if values is not None and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    df = DataFrame(data, index=common_idx)

    # Pivot on a synthetic '__dummy__' column: counting rows (len) for a
    # plain frequency table, or aggregating `values` when requested.
    if values is None:
        df['__dummy__'] = 0
        kwargs = {'aggfunc': len, 'fill_value': 0}
    else:
        df['__dummy__'] = values
        kwargs = {'aggfunc': aggfunc}

    table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                           margins=margins, margins_name=margins_name,
                           dropna=dropna, **kwargs)

    # Post-process: optionally normalize over all values / rows / columns.
    if normalize is not False:
        table = _normalize(table, normalize=normalize, margins=margins,
                           margins_name=margins_name)

    return table
def test_get_combined_index():
    """An empty list of indexes should combine to the NULL_INDEX object."""
    from pandas.core.index import _get_combined_index, NULL_INDEX

    empty_combo = _get_combined_index([])
    assert empty_combo is NULL_INDEX
def test_get_combined_index():
    """Combining no indexes at all should produce an empty Index."""
    from pandas.core.index import _get_combined_index

    combined = _get_combined_index([])
    assert combined.equals(Index([]))