def test_apply_categorical_data(self):
    # GH 10138
    for ordered in [True, False]:
        dense = Categorical(list("abc"), ordered=ordered)
        # 'b' is in the categories but not in the list
        missing = Categorical(list("aaa"), categories=["a", "b"],
                              ordered=ordered)
        values = np.arange(len(dense))
        df = DataFrame({"missing": missing, "dense": dense,
                        "values": values})
        grouped = df.groupby(["missing", "dense"])

        # missing category 'b' should still exist in the output index
        idx = MultiIndex.from_product(
            [Categorical(["a", "b"], ordered=ordered),
             Categorical(["a", "b", "c"], ordered=ordered)],
            names=["missing", "dense"],
        )
        expected = DataFrame([0, 1, 2, np.nan, np.nan, np.nan],
                             index=idx, columns=["values"])

        assert_frame_equal(grouped.apply(lambda x: np.mean(x)), expected)
        assert_frame_equal(grouped.mean(), expected)
        assert_frame_equal(grouped.agg(np.mean), expected)

        # but for transform we should still get back the original index
        idx = MultiIndex.from_product([["a"], ["a", "b", "c"]],
                                      names=["missing", "dense"])
        expected = Series(1, index=idx)
        assert_series_equal(grouped.apply(lambda x: 1), expected)
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
    if len(keys) == 0:
        return Series([])

    key_names = [ping.name for ping in self.groupings]

    if isinstance(values[0], Series):
        if not_indexed_same:
            data_dict = dict(zip(keys, values))
            result = DataFrame(data_dict).T
            if len(self.groupings) > 1:
                result.index = MultiIndex.from_tuples(keys, names=key_names)
            return result
        else:
            cat_values = np.concatenate([x.values for x in values])
            cat_index = values[0].index
            if len(values) > 1:
                cat_index = cat_index.append([x.index for x in values[1:]])
            return Series(cat_values, index=cat_index)
    elif isinstance(values[0], DataFrame):
        # possible that Series -> DataFrame by applied function
        return self._wrap_frames(keys, values,
                                 not_indexed_same=not_indexed_same)
    else:
        if len(self.groupings) > 1:
            index = MultiIndex.from_tuples(keys, names=key_names)
            return Series(values, index)
        else:
            return Series(values, keys)
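# A hedged sketch of the behaviour the wrapper above produces, using only the
# public groupby API (the series and functions below are illustrative, not part
# of the snippet; exact index handling varies slightly across pandas versions):
import numpy as np
import pandas as pd

s = pd.Series(np.arange(6.0), index=list("aabbcc"))
grouped = s.groupby(level=0)

# a scalar per group -> Series indexed by the group keys
print(grouped.apply(lambda x: x.sum()))

# a Series per group with the original index -> pieces are concatenated back
# onto the original index (the not_indexed_same=False branch above)
print(grouped.apply(lambda x: x - x.mean()))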
def test_reset_index(self):
    df = tm.makeDataFrame()[:5]
    ser = df.stack()
    ser.index.names = ["hash", "category"]

    ser.name = "value"
    df = ser.reset_index()
    self.assertIn("value", df)

    df = ser.reset_index(name="value2")
    self.assertIn("value2", df)

    # check inplace
    s = ser.reset_index(drop=True)
    s2 = ser
    s2.reset_index(drop=True, inplace=True)
    assert_series_equal(s, s2)

    # level
    index = MultiIndex(
        levels=[["bar"], ["one", "two", "three"], [0, 1]],
        labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
    )
    s = Series(np.random.randn(6), index=index)
    rs = s.reset_index(level=1)
    self.assertEqual(len(rs.columns), 2)

    rs = s.reset_index(level=[0, 2], drop=True)
    self.assertTrue(rs.index.equals(Index(index.get_level_values(1))))
    tm.assertIsInstance(rs, Series)
def _coo_to_sparse_series(A, dense_index=False):
    """ Convert a scipy.sparse.coo_matrix to a SparseSeries.
    Use the defaults given in the SparseSeries constructor. """
    s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
    s = s.sort_index()
    s = s.to_sparse()  # TODO: specify kind?
    if dense_index:
        # is there a better constructor method to use here?
        i = range(A.shape[0])
        j = range(A.shape[1])
        ind = MultiIndex.from_product([i, j])
        s = s.reindex_axis(ind)
    return s
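# A minimal sketch of what the helper above does, expressed with public,
# non-sparse pandas API so it runs on current versions as well (the sparse
# wrapping and reindex_axis call in the snippet are version-specific):
import numpy as np
import pandas as pd
from scipy import sparse

A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))

# the (row, col) pairs become a two-level MultiIndex, the data the values
s = pd.Series(A.data, index=pd.MultiIndex.from_arrays((A.row, A.col)))
s = s.sort_index()

# dense_index=True corresponds to reindexing over the full row x col product
full = s.reindex(pd.MultiIndex.from_product([range(A.shape[0]),
                                             range(A.shape[1])]))
print(full)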
def test_droplevel_with_names(self):
    index = self.index[self.index.get_loc('foo')]
    dropped = index.droplevel(0)
    self.assertEqual(dropped.name, 'second')

    index = MultiIndex(levels=[Index(range(4)),
                               Index(range(4)),
                               Index(range(4))],
                       labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                               np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                               np.array([1, 0, 1, 1, 0, 0, 1, 0])],
                       names=['one', 'two', 'three'])
    dropped = index.droplevel(0)
    self.assertEqual(dropped.names, ['two', 'three'])
def test_slice_locs_not_sorted(self):
    index = MultiIndex(levels=[Index(range(4)),
                               Index(range(4)),
                               Index(range(4))],
                       labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                               np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                               np.array([1, 0, 1, 1, 0, 0, 1, 0])])

    self.assertRaises(Exception, index.slice_locs, (1, 0, 1), (2, 1, 0))

    # works
    sorted_index, _ = index.sortlevel(0)
    result = sorted_index.slice_locs((1, 0, 1), (2, 1, 0))
def test_get_loc(self):
    self.assert_(self.index.get_loc(('foo', 'two')) == 1)
    self.assert_(self.index.get_loc(('baz', 'two')) == 3)
    self.assertRaises(KeyError, self.index.get_loc, ('bar', 'two'))
    self.assertRaises(KeyError, self.index.get_loc, 'quux')

    # 3 levels
    index = MultiIndex(levels=[Index(range(4)),
                               Index(range(4)),
                               Index(range(4))],
                       labels=[np.array([0, 0, 1, 2, 2, 2, 3, 3]),
                               np.array([0, 1, 0, 0, 0, 1, 0, 1]),
                               np.array([1, 0, 1, 1, 0, 0, 1, 0])])
    self.assertRaises(KeyError, index.get_loc, (1, 1))
    self.assert_(index.get_loc((2, 0)) == slice(3, 5))
def test_alignment(self):
    x = Series(data=[1, 2, 3],
               index=MultiIndex.from_tuples([("A", 1), ("A", 2), ("B", 3)]))
    y = Series(data=[4, 5, 6],
               index=MultiIndex.from_tuples([("Z", 1), ("Z", 2), ("B", 3)]))

    res = x - y
    exp_index = x.index.union(y.index)
    exp = x.reindex(exp_index) - y.reindex(exp_index)
    assert_series_equal(res, exp)

    # hit non-monotonic code path
    res = x[::-1] - y[::-1]
    exp_index = x.index.union(y.index)
    exp = x.reindex(exp_index) - y.reindex(exp_index)
    assert_series_equal(res, exp)
def test_dataframe_insert_column_all_na(self):
    # GH #1534
    mix = MultiIndex.from_tuples([('1a', '2a'), ('1a', '2b'), ('1a', '2c')])
    df = DataFrame([[1, 2], [3, 4], [5, 6]], index=mix)
    s = Series({(1, 1): 1, (1, 2): 2})
    df['new'] = s
    self.assert_(df['new'].isnull().all())
def _wrap_result_expand(self, result, expand=False):
    if not isinstance(expand, bool):
        raise ValueError("expand must be True or False")

    from pandas.core.index import Index, MultiIndex

    if not hasattr(result, 'ndim'):
        return result

    if isinstance(self.series, Index):
        name = getattr(result, 'name', None)

        # if result is a boolean np.array, return the np.array
        # instead of wrapping it into a boolean Index (GH 8875)
        if hasattr(result, 'dtype') and is_bool_dtype(result):
            return result

        if expand:
            result = list(result)
            return MultiIndex.from_tuples(result, names=name)
        else:
            return Index(result, name=name)
    else:
        index = self.series.index
        if expand:
            def cons_row(x):
                if is_list_like(x):
                    return x
                else:
                    return [x]

            cons = self.series._constructor_expanddim
            data = [cons_row(x) for x in result]
            return cons(data, index=index)
        else:
            name = getattr(result, 'name', None)
            cons = self.series._constructor
            return cons(result, name=name, index=index)
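# A small usage sketch of the behaviour wrapped above, using the public pandas
# string methods rather than the internal helper itself:
import pandas as pd

s = pd.Series(["a_b", "c_d"])
idx = pd.Index(["a_b", "c_d"])

# expand=False keeps one object per element (lists of pieces)
print(s.str.split("_"))

# expand=True on a Series yields a DataFrame; on an Index it yields a
# MultiIndex with one level per piece
print(s.str.split("_", expand=True))
print(idx.str.split("_", expand=True))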
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
    if len(keys) == 0:
        # XXX
        return DataFrame({})

    key_names = [ping.name for ping in self.groupings]

    if isinstance(values[0], DataFrame):
        return self._wrap_frames(keys, values,
                                 not_indexed_same=not_indexed_same)
    else:
        if len(self.groupings) > 1:
            keys = MultiIndex.from_tuples(keys, names=key_names)

        if isinstance(values[0], np.ndarray):
            if self.axis == 0:
                stacked_values = np.vstack([np.asarray(x) for x in values])
                columns = values[0].index
                index = keys
            else:
                stacked_values = np.vstack([np.asarray(x)
                                            for x in values]).T
                index = values[0].index
                columns = keys
            return DataFrame(stacked_values, index=index, columns=columns)
        else:
            return Series(values, index=keys)
def test_delevel_infer_dtype(self):
    tuples = [tuple for tuple in cart_product(["foo", "bar"],
                                              [10, 20],
                                              [1.0, 1.1])]
    index = MultiIndex.from_tuples(tuples, names=["prm0", "prm1", "prm2"])
    df = DataFrame(np.random.randn(8, 3), columns=["A", "B", "C"],
                   index=index)
    deleveled = df.reset_index()
    self.assert_(com.is_integer_dtype(deleveled["prm1"]))
    self.assert_(com.is_float_dtype(deleveled["prm2"]))
def setUp(self):
    index = MultiIndex(
        levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
        labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
        names=["first", "second"],
    )
    self.frame = DataFrame(np.random.randn(10, 3), index=index,
                           columns=Index(["A", "B", "C"], name="exp"))

    self.single_level = MultiIndex(levels=[["foo", "bar", "baz", "qux"]],
                                   labels=[[0, 1, 2, 3]],
                                   names=["first"])

    # create test series object
    arrays = [["bar", "bar", "baz", "baz", "qux", "qux", "foo", "foo"],
              ["one", "two", "one", "two", "one", "two", "one", "two"]]
    tuples = zip(*arrays)
    index = MultiIndex.from_tuples(tuples)
    s = Series(randn(8), index=index)
    s[3] = np.NaN
    self.series = s

    tm.N = 100
    self.tdf = tm.makeTimeDataFrame()
    self.ymd = self.tdf.groupby([lambda x: x.year, lambda x: x.month,
                                 lambda x: x.day]).sum()

    # use Int64Index, to make sure things work
    self.ymd.index.levels = [lev.astype("i8")
                             for lev in self.ymd.index.levels]
    self.ymd.index.names = ["year", "month", "day"]
def test_mixed_depth_pop(self):
    arrays = [["a", "top", "top", "routine1", "routine1", "routine2"],
              ["", "OD", "OD", "result1", "result2", "result1"],
              ["", "wx", "wy", "", "", ""]]

    tuples = zip(*arrays)
    tuples.sort()
    index = MultiIndex.from_tuples(tuples)
    df = DataFrame(randn(4, 6), columns=index)

    df1 = df.copy()
    df2 = df.copy()
    result = df1.pop("a")
    expected = df2.pop(("a", "", ""))
    assert_series_equal(expected, result)
    assert_frame_equal(df1, df2)
    self.assertEquals(result.name, "a")

    expected = df1["top"]
    df1 = df1.drop(["top"], axis=1)
    result = df2.pop("top")
    assert_frame_equal(expected, result)
    assert_frame_equal(df1, df2)
def test_agg_compat(self):
    # GH 12334
    df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                    'B': ['one', 'one', 'two', 'two',
                          'two', 'two', 'one', 'two'],
                    'C': np.random.randn(8) + 1.0,
                    'D': np.arange(8)})

    g = df.groupby(['A', 'B'])

    expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
    expected.columns = MultiIndex.from_tuples([('C', 'sum'), ('C', 'std')])
    result = g['D'].agg({'C': ['sum', 'std']})
    assert_frame_equal(result, expected, check_like=True)

    expected = pd.concat([g['D'].sum(), g['D'].std()], axis=1)
    expected.columns = ['C', 'D']
    result = g['D'].agg({'C': 'sum', 'D': 'std'})
    assert_frame_equal(result, expected, check_like=True)
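# The dict-renaming form used above (g['D'].agg({'C': ...})) was deprecated in
# later pandas releases; a minimal sketch of a still-supported spelling, with
# illustrative data that is not part of the test above:
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': list('xxyy'), 'D': np.arange(4.0)})
res = df.groupby('A')['D'].agg(['sum', 'std'])  # one column per aggregation
print(res)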
def test_mixed_depth_drop(self):
    arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],
              ['', 'OD', 'OD', 'result1', 'result2', 'result1'],
              ['', 'wx', 'wy', '', '', '']]

    tuples = zip(*arrays)
    tuples.sort()
    index = MultiIndex.from_tuples(tuples)
    df = DataFrame(randn(4, 6), columns=index)

    result = df.drop('a', axis=1)
    expected = df.drop([('a', '', '')], axis=1)
    assert_frame_equal(expected, result)

    result = df.drop(['top'], axis=1)
    expected = df.drop([('top', 'OD', 'wx')], axis=1)
    expected = expected.drop([('top', 'OD', 'wy')], axis=1)
    assert_frame_equal(expected, result)

    result = df.drop(('top', 'OD', 'wx'), axis=1)
    expected = df.drop([('top', 'OD', 'wx')], axis=1)
    assert_frame_equal(expected, result)

    expected = df.drop([('top', 'OD', 'wy')], axis=1)
    expected = df.drop('top', axis=1)

    result = df.drop('result1', level=1, axis=1)
    expected = df.drop([('routine1', 'result1', ''),
                        ('routine2', 'result1', '')], axis=1)
    assert_frame_equal(expected, result)
def pivot_simple(index, columns, values):
    """
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.

    Parameters
    ----------
    index : ndarray
        Labels to use to make new frame's index
    columns : ndarray
        Labels to use to make new frame's columns
    values : ndarray
        Values to use for populating new frame's values

    Note
    ----
    Obviously, all 3 of the input arguments must have the same length

    Returns
    -------
    DataFrame
    """
    if (len(index) != len(columns)) or (len(columns) != len(values)):
        raise AssertionError('Length of index, columns, and values must be the'
                             ' same')

    if len(index) == 0:
        return DataFrame(index=[])

    hindex = MultiIndex.from_arrays([index, columns])
    series = Series(values.ravel(), index=hindex)
    series = series.sortlevel(0)
    return series.unstack()
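# A minimal sketch of what pivot_simple does, expressed with public pandas API
# (sortlevel is an older spelling of sort_index(level=...)); the arrays below
# are illustrative:
import numpy as np
import pandas as pd

index = np.array(["one", "one", "two", "two"])
columns = np.array(["A", "B", "A", "B"])
values = np.array([1.0, 2.0, 3.0, 4.0])

hindex = pd.MultiIndex.from_arrays([index, columns])
series = pd.Series(values.ravel(), index=hindex).sort_index(level=0)
print(series.unstack())  # rows: one/two, columns: A/B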
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    index_vals = self[index]
    column_vals = self[columns]
    mindex = MultiIndex.from_arrays([index_vals, column_vals])

    try:
        mindex._verify_integrity()
    except Exception:
        raise Exception("duplicate index/column pairs!")

    if values is None:
        items = self.columns - [index, columns]
        mat = self.reindex(columns=items).values
    else:
        items = [values]
        mat = np.atleast_2d(self[values].values).T

    stacked = DataFrame(mat, index=mindex, columns=items)

    if not mindex.is_lexsorted():
        stacked = stacked.sortlevel(level=0)

    unstacked = stacked.unstack()
    if values is not None:
        unstacked.columns = unstacked.columns.droplevel(0)
    return unstacked
def test_constructor(self):
    self.assertTrue(self.ts.index.is_all_dates)

    # Pass in Series
    derived = Series(self.ts)
    self.assertTrue(derived.index.is_all_dates)

    self.assertTrue(tm.equalContents(derived.index, self.ts.index))
    # Ensure new index is not created
    self.assertEqual(id(self.ts.index), id(derived.index))

    # Mixed type Series
    mixed = Series(['hello', np.NaN], index=[0, 1])
    self.assertEqual(mixed.dtype, np.object_)
    self.assertIs(mixed[1], np.NaN)

    self.assertFalse(self.empty.index.is_all_dates)
    self.assertFalse(Series({}).index.is_all_dates)
    self.assertRaises(Exception, Series, np.random.randn(3, 3),
                      index=np.arange(3))

    mixed.name = 'Series'
    rs = Series(mixed).name
    xp = 'Series'
    self.assertEqual(rs, xp)

    # raise on MultiIndex GH4187
    m = MultiIndex.from_arrays([[1, 2], [3, 4]])
    self.assertRaises(NotImplementedError, Series, m)
def _agg_index(self, index, try_parse_dates=True):
    if np.isscalar(self.index_col):
        if try_parse_dates and self._should_parse_dates(self.index_col):
            index = self._conv_date(index)
        na_values = self.na_values
        if isinstance(na_values, dict):
            na_values = _get_na_values(self.index_name, na_values)
        index, na_count = _convert_types(index, na_values)
        index = Index(index, name=self.index_name)
        if self.verbose and na_count:
            print 'Found %d NA values in the index' % na_count
    else:
        arrays = []
        for i, arr in enumerate(index):
            if (try_parse_dates and
                    self._should_parse_dates(self.index_col[i])):
                arr = self._conv_date(arr)
            col_na_values = self.na_values
            if isinstance(self.na_values, dict):
                col_name = self.index_name[i]
                if col_name is not None:
                    col_na_values = _get_na_values(col_name,
                                                   self.na_values)
            arr, _ = _convert_types(arr, col_na_values)
            arrays.append(arr)
        index = MultiIndex.from_arrays(arrays, names=self.index_name)
    return index
def test_from_arrays(self):
    arrays = []
    for lev, lab in zip(self.index.levels, self.index.labels):
        arrays.append(np.asarray(lev).take(lab))

    result = MultiIndex.from_arrays(arrays)
    self.assertEquals(list(result), list(self.index))
def _wrap_aggregated_output(self, output, mask, comp_ids):
    agg_axis = 0 if self.axis == 1 else 1
    agg_labels = self._obj_with_exclusions._get_axis(agg_axis)

    if len(output) == len(agg_labels):
        output_keys = agg_labels
    else:
        output_keys = sorted(output)
        try:
            output_keys.sort()
        except Exception:  # pragma: no cover
            pass

        if isinstance(agg_labels, MultiIndex):
            output_keys = MultiIndex.from_tuples(output_keys,
                                                 names=agg_labels.names)

    if not self.as_index:
        result = DataFrame(output, columns=output_keys)
        group_levels = self._get_group_levels(mask, comp_ids)
        for i, (name, labels) in enumerate(group_levels):
            result.insert(i, name, labels)
        result = result.consolidate()
    else:
        index = self._get_multi_index(mask, comp_ids)
        result = DataFrame(output, index=index, columns=output_keys)

    if self.axis == 1:
        result = result.T

    return result
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    index_vals = self[index]
    column_vals = self[columns]
    mindex = MultiIndex.from_arrays([index_vals, column_vals],
                                    names=[index, columns])

    if values is None:
        items = self.columns - [index, columns]
        mat = self.reindex(columns=items).values
    else:
        items = [values]
        mat = np.atleast_2d(self[values].values).T

    stacked = DataFrame(mat, index=mindex, columns=items)

    if not mindex.is_lexsorted():
        stacked = stacked.sortlevel(level=0)

    unstacked = stacked.unstack()
    if values is not None:
        unstacked.columns = unstacked.columns.droplevel(0)
    return unstacked
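# A short usage sketch of the public DataFrame.pivot method that the two
# helpers above back; the column names below are illustrative:
import pandas as pd

df = pd.DataFrame({"date": ["d1", "d1", "d2", "d2"],
                   "ticker": ["A", "B", "A", "B"],
                   "price": [1.0, 2.0, 3.0, 4.0]})

# one column per ticker, one row per date
wide = df.pivot(index="date", columns="ticker", values="price")
print(wide)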
def test_repr_name_coincide(self):
    index = MultiIndex.from_tuples([('a', 0, 'foo'), ('b', 1, 'bar')],
                                   names=['a', 'b', 'c'])
    df = DataFrame({'value': [0, 1]}, index=index)
    lines = repr(df).split('\n')
    self.assert_(lines[2].startswith('a 0 foo'))
def test_no_multiindexes(self):
    """ Only tear down single indexes. """
    from pandas.core.index import MultiIndex
    index = MultiIndex.from_tuples([(0, 1), (1, 2)])
    self.assertRaises(TypeError, teardown_index, index)
def test_drop(self):
    dropped = self.index.drop([('foo', 'two'), ('qux', 'one')])

    index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')])
    dropped2 = self.index.drop(index)

    expected = self.index[[0, 2, 3, 5]]
    self.assert_(dropped.equals(expected))
    self.assert_(dropped2.equals(expected))

    dropped = self.index.drop(['bar'])
    expected = self.index[[0, 1, 3, 4, 5]]
    self.assert_(dropped.equals(expected))

    index = MultiIndex.from_tuples([('bar', 'two')])
    self.assertRaises(Exception, self.index.drop, [('bar', 'two')])
    self.assertRaises(Exception, self.index.drop, index)
def _reshape_tables(self): """ TODO _reshape_tables should be cleaned !!! """ dfs = self.aggregates_of_dataframe dfs_erf = self.aggregates_erfs_dataframe labels_variables = self.labels_variables agg = Aggregates() # We need this for the columns labels to work print 'Resetting index to avoid later trouble on manipulation' for d in dfs: d.reset_index(inplace=True) d.set_index('Mesure', inplace=True, drop=False) d.reindex_axis(labels_variables, axis=0) d.reset_index(inplace=True, drop=True) # print d.to_string() for d in dfs_erf: d.reset_index(inplace=True) d['Mesure'] = agg.labels['dep'] d.set_index('index', inplace=True, drop=False) d.reindex_axis(agg.labels.values(), axis=0) d.reset_index(inplace=True, drop=True) # print d.to_string() # Concatening the openfisca tables for =/= years temp = dfs[0] if len(dfs) != 1: for d in dfs[1:]: temp = pd.concat([temp, d], ignore_index=True) del temp[agg.labels['entity']], temp['index'] gc.collect() print 'We split the real aggregates from the of table' temp2 = temp[[ agg.labels['var'], agg.labels['benef_real'], agg.labels['dep_real'], 'year' ]] del temp[agg.labels['benef_real']], temp[agg.labels['dep_real']] sauvegarde = temp.columns.get_level_values(0).unique() temp['source'] = 'of' temp2['source'] = 'reel' temp2.rename(columns={ agg.labels['benef_real']: agg.labels['benef'], agg.labels['dep_real']: agg.labels['dep'] }, inplace=True) temp = pd.concat([temp, temp2], ignore_index=True) print 'We add the erf data to the table' for df in dfs_erf: del df['level_0'], df['Mesure'] df.rename(columns={ 'index': agg.labels['var'], 1: agg.labels['dep'] }, inplace=True) temp3 = dfs_erf[0] if len(dfs) != 1: for d3 in dfs_erf[1:]: temp3 = pd.concat([temp3, d3], ignore_index=True) temp3['source'] = 'erfs' gc.collect() temp = pd.concat([temp, temp3], ignore_index=True) # print temp.to_string() print 'Index manipulation to reshape the output' temp.reset_index(drop=True, inplace=True) # We set the new index # temp.set_index('Mesure', drop = True, inplace = True) # temp.set_index('source', drop = True, append = True, inplace = True) # temp.set_index('year', drop = False, append = True, inplace = True) temp = temp.groupby(by=["Mesure", "source", "year"], sort=False).sum() # Tricky, the [mesure, source, year] index is unique so sum() will return the only value # Groupby automatically deleted the source, mesure... 
columns and added them to index assert (isinstance(temp, pd.DataFrame)) # print temp.to_string() # We want the years to be in columns, so we use unstack temp_unstacked = temp.unstack() # Unfortunately, unstack automatically sorts rows and columns, we have to reindex the table : ## Reindexing rows from pandas.core.index import MultiIndex indtemp1 = temp.index.get_level_values(0) indtemp2 = temp.index.get_level_values(1) indexi = zip(*[indtemp1, indtemp2]) indexi_bis = [] for i in xrange(0, len(indexi)): if indexi[i] not in indexi_bis: indexi_bis.append(indexi[i]) indexi = indexi_bis del indexi_bis indexi = MultiIndex.from_tuples(indexi, names=['Mesure', 'source']) # import pdb # pdb.set_trace() temp_unstacked = temp_unstacked.reindex_axis( indexi, axis=0) # axis = 0 for rows, 1 for columns ## Reindexing columns # TODO : still not working col_indexi = [] print temp.columns for i in xrange(len(sauvegarde)): # for col in temp.columns.get_level_values(0).unique(): col = sauvegarde[i] for yr in self.years: col_indexi.append((col, yr)) col_indexi = MultiIndex.from_tuples(col_indexi) # print col_indexi # print temp_unstacked.columns print col_indexi print temp_unstacked.columns temp_unstacked = temp_unstacked.reindex_axis(col_indexi, axis=1) # Our table is ready to be turned to Excel worksheet ! # print temp_unstacked.to_string() del temp_unstacked['Mesure'], temp_unstacked['year'] temp_unstacked.fillna(0, inplace=True) return temp_unstacked
def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes,
                                        xnull=False)

    if rlocs == []:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name='__placeholder__')
    else:
        dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                                 codes=rcodes + [comp_ids],
                                 names=rnames + ['__placeholder__'],
                                 verify_integrity=False)

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [v if i > v else v - 1 for v in clocs]

            return result

        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack('__placeholder__', fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(levels=new_levels, codes=new_codes,
                             names=new_names, verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
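# A short usage sketch of the public behaviour this helper implements:
# unstacking several index levels at once via Series/DataFrame.unstack.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([["x", "y"], ["a", "b"], [1, 2]],
                                 names=["l0", "l1", "l2"])
s = pd.Series(np.arange(8.0), index=idx)

# move the last two index levels into the columns in one call
wide = s.unstack(["l1", "l2"])
print(wide)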
def _get_multi_index(self, mask, obs_ids):
    masked = [labels for _, labels in self._get_group_levels(mask, obs_ids)]
    names = [ping.name for ping in self.groupings]
    return MultiIndex.from_arrays(masked, names=names)
def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) s2 = s1.copy() self.assert_(s1.equals(s2)) s1[1] = 99 self.assert_(not s1.equals(s2)) # NaNs compare as equal s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3]) s2 = s1.copy() self.assert_(s1.equals(s2)) s2[0] = 9.9 self.assert_(not s1.equals(s2)) idx = MultiIndex.from_tuples([(0, 'a'), (1, 'b'), (2, 'c')]) s1 = Series([1, 2, np.nan], index=idx) s2 = s1.copy() self.assert_(s1.equals(s2)) # Add object dtype column with nans index = np.random.random(10) df1 = DataFrame(np.random.random(10, ), index=index, columns=['floats']) df1['text'] = 'the sky is so blue. we could use more chocolate.'.split( ) df1['start'] = date_range('2000-1-1', periods=10, freq='T') df1['end'] = date_range('2000-1-1', periods=10, freq='D') df1['diff'] = df1['end'] - df1['start'] df1['bool'] = (np.arange(10) % 3 == 0) df1.ix[::2] = nan df2 = df1.copy() self.assert_(df1['text'].equals(df2['text'])) self.assert_(df1['start'].equals(df2['start'])) self.assert_(df1['end'].equals(df2['end'])) self.assert_(df1['diff'].equals(df2['diff'])) self.assert_(df1['bool'].equals(df2['bool'])) self.assert_(df1.equals(df2)) self.assert_(not df1.equals(object)) # different dtype different = df1.copy() different['floats'] = different['floats'].astype('float32') self.assert_(not df1.equals(different)) # different index different_index = -index different = df2.set_index(different_index) self.assert_(not df1.equals(different)) # different columns different = df2.copy() different.columns = df2.columns[::-1] self.assert_(not df1.equals(different)) # DatetimeIndex index = pd.date_range('2000-1-1', periods=10, freq='T') df1 = df1.set_index(index) df2 = df1.copy() self.assert_(df1.equals(df2)) # MultiIndex df3 = df1.set_index(['text'], append=True) df2 = df1.set_index(['text'], append=True) self.assert_(df3.equals(df2)) df2 = df1.set_index(['floats'], append=True) self.assert_(not df3.equals(df2)) # NaN in index df3 = df1.set_index(['floats'], append=True) df2 = df1.set_index(['floats'], append=True) self.assert_(df3.equals(df2))
def test_rename_mi(self):
    s = Series([11, 21, 31],
               index=MultiIndex.from_tuples([("A", x)
                                             for x in ["a", "B", "c"]]))
    result = s.rename(str.lower)
def _make_concat_multiindex(indexes, keys, levels=None, names=None): if ((levels is None and isinstance(keys[0], tuple)) or (levels is not None and len(levels) > 1)): zipped = lzip(*keys) if names is None: names = [None] * len(zipped) if levels is None: levels = [ Categorical.from_array(zp, ordered=True).categories for zp in zipped ] else: levels = [_ensure_index(x) for x in levels] else: zipped = [keys] if names is None: names = [None] if levels is None: levels = [_ensure_index(keys)] else: levels = [_ensure_index(x) for x in levels] if not _all_indexes_same(indexes): label_list = [] # things are potentially different sizes, so compute the exact labels # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): try: i = level.get_loc(key) except KeyError: raise ValueError('Key %s not in level %s' % (str(key), str(level))) to_concat.append(np.repeat(i, len(index))) label_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) label_list.extend(concat_index.labels) else: factor = Categorical.from_array(concat_index, ordered=True) levels.append(factor.categories) label_list.append(factor.codes) if len(names) == len(levels): names = list(names) else: # make sure that all of the passed indices have the same nlevels if not len(set([i.nlevels for i in indexes])) == 1: raise AssertionError("Cannot concat indices that do" " not have the same number of levels") # also copies names = names + _get_consensus_names(indexes) return MultiIndex(levels=levels, labels=label_list, names=names, verify_integrity=False) new_index = indexes[0] n = len(new_index) kpieces = len(indexes) # also copies new_names = list(names) new_levels = list(levels) # construct labels new_labels = [] # do something a bit more speedy for hlevel, level in zip(zipped, levels): hlevel = _ensure_index(hlevel) mapped = level.get_indexer(hlevel) mask = mapped == -1 if mask.any(): raise ValueError('Values not found in passed level: %s' % str(hlevel[mask])) new_labels.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels]) else: new_levels.append(new_index) new_labels.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) return MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False)
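# A brief usage sketch of the public behaviour _make_concat_multiindex
# supports: pd.concat with ``keys`` builds a hierarchical row index whose
# outer level identifies the originating object.
import pandas as pd

s1 = pd.Series([1, 2], index=["a", "b"])
s2 = pd.Series([3, 4], index=["a", "b"])

out = pd.concat([s1, s2], keys=["first", "second"],
                names=["source", "letter"])
print(out.index)  # MultiIndex: ('first'/'second') x ('a'/'b')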
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None, timezones=None): """ Create empty DataFrame to assign into In the simplest case, will return a Pandas dataframe of the given size, with columns of the given names and types. The second return value `views` is a dictionary of numpy arrays into which you can assign values that show up in the dataframe. For categorical columns, you get two views to assign into: if the column name is "col", you get both "col" (the category codes) and "col-catdef" (the category labels). For a single categorical index, you should use the `.set_categories` method of the appropriate "-catdef" columns, passing an Index of values ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)`` Multi-indexes work a lot like categoricals, even if the types of each index are not themselves categories, and will also have "-catdef" entries in the views. However, these will be Dummy instances, providing only a ``.set_categories`` method, to be used as above. Parameters ---------- types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples applies to non-categorical columns. If there are only categorical columns, an empty string of None will do. size: int Number of rows to allocate cats: dict {col: labels} Location and labels for categorical columns, e.g., {1: ['mary', 'mo]} will create column index 1 (inserted amongst the numerical columns) with two possible values. If labels is an integers, `{'col': 5}`, will generate temporary labels using range. If None, or column name is missing, will assume 16-bit integers (a reasonable default). cols: list of labels assigned column names, including categorical ones. index_types: list of str For one of more index columns, make them have this type. See general description, above, for caveats about multi-indexing. If None, the index will be the default RangeIndex. index_names: list of str Names of the index column(s), if using timezones: dict {col: timezone_str} for timestamp type columns, apply this timezone to the pandas series; the numpy view will be UTC. Returns ------- - dataframe with correct shape and data-types - list of numpy views, in order, of the columns of the dataframe. Assign to this. 
""" views = {} timezones = timezones or {} if isinstance(types, STR_TYPE): types = types.split(',') cols = cols if cols is not None else range(len(types)) def cat(col): if cats is None or col not in cats: return RangeIndex(0, 2**14) elif isinstance(cats[col], int): return RangeIndex(0, cats[col]) else: # explicit labels list return cats[col] df = OrderedDict() for t, col in zip(types, cols): if str(t) == 'category': df[six.text_type(col)] = Categorical([], categories=cat(col), fastpath=True) else: if hasattr(t, 'base'): # funky pandas not-dtype t = t.base d = np.empty(0, dtype=t) if d.dtype.kind == "M" and six.text_type(col) in timezones: try: d = Series(d).dt.tz_localize(timezones[six.text_type(col)]) except: warnings.warn("Inferring time-zone from %s in column %s " "failed, using time-zone-agnostic" "" % (timezones[six.text_type(col)], col)) df[six.text_type(col)] = d df = DataFrame(df) if not index_types: index = RangeIndex(size) elif len(index_types) == 1: t, col = index_types[0], index_names[0] if col is None: raise ValueError('If using an index, must give an index name') if str(t) == 'category': c = Categorical([], categories=cat(col), fastpath=True) vals = np.zeros(size, dtype=c.codes.dtype) index = CategoricalIndex(c) index._data._codes = vals views[col] = vals views[col + '-catdef'] = index._data else: if hasattr(t, 'base'): # funky pandas not-dtype t = t.base d = np.empty(size, dtype=t) if d.dtype.kind == "M" and six.text_type(col) in timezones: try: d = Series(d).dt.tz_localize(timezones[six.text_type(col)]) except: warnings.warn("Inferring time-zone from %s in column %s " "failed, using time-zone-agnostic" "" % (timezones[six.text_type(col)], col)) index = Index(d) views[col] = index.values else: index = MultiIndex([[]], [[]]) # index = MultiIndex.from_arrays(indexes) index._levels = list() index._labels = list() index._codes = list() for i, col in enumerate(index_names): index._levels.append(Index([None])) def set_cats(values, i=i, col=col, **kwargs): values.name = col if index._levels[i][0] is None: index._levels[i] = values elif not index._levels[i].equals(values): raise RuntimeError("Different dictionaries encountered" " while building categorical") x = Dummy() x._set_categories = set_cats d = np.zeros(size, dtype=int) if LooseVersion(pdver) >= LooseVersion("0.24.0"): index._codes = list(index._codes) + [d] else: index._labels.append(d) views[col] = d views[col + '-catdef'] = x axes = [df._data.axes[0], index] # allocate and create blocks blocks = [] for block in df._data.blocks: if block.is_categorical: categories = block.values.categories code = np.zeros(shape=size, dtype=block.values.codes.dtype) values = Categorical(values=code, categories=categories, fastpath=True) new_block = block.make_block_same_class(values=values) elif getattr(block.dtype, 'tz', None): new_shape = (size, ) values = np.empty(shape=new_shape, dtype='M8[ns]') new_block = block.make_block_same_class( type(block.values)(values, dtype=block.values.dtype)) else: new_shape = (block.values.shape[0], size) values = np.empty(shape=new_shape, dtype=block.values.dtype) new_block = block.make_block_same_class(values=values) blocks.append(new_block) # create block manager df = DataFrame(BlockManager(blocks, axes)) # create views for block in df._data.blocks: dtype = block.dtype inds = block.mgr_locs.indexer if isinstance(inds, slice): inds = list(range(inds.start, inds.stop, inds.step)) for i, ind in enumerate(inds): col = df.columns[ind] if is_categorical_dtype(dtype): views[col] = block.values._codes views[col 
+ '-catdef'] = block.values elif getattr(block.dtype, 'tz', None): views[col] = np.asarray(block.values, dtype='M8[ns]') else: views[col] = block.values[i] if index_names: df.index.names = [ None if re.match(r'__index_level_\d+__', n) else n for n in index_names ] return df, views
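# A hedged usage sketch of the ``empty`` helper defined above (assuming it is
# importable from the surrounding module); the types and column names below
# are illustrative only.
import numpy as np

df, views = empty('i4,f8', size=5, cols=['a', 'b'])
# ``views`` maps column names to the numpy buffers backing the frame;
# assigning into them fills the pre-allocated DataFrame in place.
views['a'][:] = np.arange(5)
views['b'][:] = 0.5
print(df)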
def test_format_integer_names(self):
    index = MultiIndex(levels=[[0, 1], [0, 1]],
                       labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                       names=[0, 1])
    index.format(names=True)
def test_multilevel_consolidate(self):
    index = MultiIndex.from_tuples([('foo', 'one'), ('foo', 'two'),
                                    ('bar', 'one'), ('bar', 'two')])
    df = DataFrame(np.random.randn(4, 4), index=index, columns=index)
    df['Totals', ''] = df.sum(1)
    df = df.consolidate()
def setUp(self): """ Setup the dataframes used for the groupby tests derived from pandas """ self.dateRange = bdate_range('1/1/2005', periods=250) self.stringIndex = Index([rands(8).upper() for x in range(250)]) self.groupId = Series([x[0] for x in self.stringIndex], index=self.stringIndex) self.groupDict = dict( (k, v) for k, v in compat.iteritems(self.groupId)) self.columnIndex = Index(['A', 'B', 'C', 'D', 'E']) randMat = np.random.randn(250, 5) self.stringMatrix = DataFrame(randMat, columns=self.columnIndex, index=self.stringIndex) self.timeMatrix = DataFrame(randMat, columns=self.columnIndex, index=self.dateRange) self.ts = tm.makeTimeSeries() self.seriesd = tm.getSeriesData() self.tsd = tm.getTimeSeriesData() self.frame = DataFrame(self.seriesd) self.tsframe = DataFrame(self.tsd) self.df = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.random.randn(8) }) self.df_mixed_floats = DataFrame({ 'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'], 'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C': np.random.randn(8), 'D': np.array(np.random.randn(8), dtype='float32') }) index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], names=['first', 'second']) self.mframe = DataFrame(np.random.randn(10, 3), index=index, columns=['A', 'B', 'C']) self.three_group = DataFrame({ 'A': [ 'foo', 'foo', 'foo', 'foo', 'bar', 'bar', 'bar', 'bar', 'foo', 'foo', 'foo' ], 'B': [ 'one', 'one', 'one', 'two', 'one', 'one', 'one', 'two', 'two', 'two', 'one' ], 'C': [ 'dull', 'dull', 'shiny', 'dull', 'dull', 'shiny', 'shiny', 'dull', 'shiny', 'shiny', 'shiny' ], 'D': np.random.randn(11), 'E': np.random.randn(11), 'F': np.random.randn(11) }) super(self.__class__, self).setUp()
def _wrap_result(self, result, use_codes=True, name=None, expand=None): # TODO: this was blindly copied from `strings.StringMethods._wrap_result` for noew from pandas.core.index import Index, MultiIndex # for category, we do the stuff on the categories, so blow it up # to the full series again # But for some operations, we have to do the stuff on the full values, # so make it possible to skip this step as the method already did this # before the transformation... # if use_codes and self._is_categorical: # result = take_1d(result, self._orig.cat.codes) if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): return result assert result.ndim < 3 if expand is None: # infer from ndim if expand is not specified expand = False if result.ndim == 1 else True elif expand is True and not isinstance(self._orig, Index): # required when expand=True is explicitly specified # not needed when inferred def cons_row(x): if is_list_like(x): return x else: return [x] result = [cons_row(x) for x in result] if result: # propagate nan values to match longest sequence (GH 18450) max_len = max(len(x) for x in result) result = [ x * max_len if len(x) == 0 or x[0] is np.nan else x for x in result ] if not isinstance(expand, bool): raise ValueError("expand must be True or False") if expand is False: # if expand is False, result should have the same name # as the original otherwise specified if name is None: name = getattr(result, 'name', None) if name is None: # do not use logical or, _orig may be a DataFrame # which has "name" column name = self._orig.name # Wait until we are sure result is a Series or Index before # checking attributes (GH 12180) if isinstance(self._orig, Index): # if result is a boolean np.array, return the np.array # instead of wrapping it into a boolean Index (GH 8875) if is_bool_dtype(result): return result if expand: result = list(result) out = MultiIndex.from_tuples(result, names=name) if out.nlevels == 1: # We had all tuples of length-one, which are # better represented as a regular Index. out = out.get_level_values(0) return out else: return Index(result, name=name) else: index = self._orig.index if expand: cons = self._orig._constructor_expanddim return cons(result, columns=name, index=index) else: # Must be a Series cons = self._orig._constructor return cons(result, name=name, index=index)
def _unstack_multiple(data, clocs): if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] clabels = [index.labels[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rlabels = [index.labels[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] group_index = get_group_index(clabels, shape) comp_ids, obs_ids = _compress_group_index(group_index, sort=False) recons_labels = decons_group_index(obs_ids, shape) dummy_index = MultiIndex(levels=rlevels + [obs_ids], labels=rlabels + [comp_ids], names=rnames + ['__placeholder__']) if isinstance(data, Series): dummy = Series(data.values, index=dummy_index) unstacked = dummy.unstack('__placeholder__') new_levels = clevels new_names = cnames new_labels = recons_labels else: if isinstance(data.columns, MultiIndex): raise NotImplementedError('Unstacking multiple levels with ' 'hierarchical columns not yet supported') dummy = DataFrame(data.values, index=dummy_index, columns=data.columns) unstacked = dummy.unstack('__placeholder__') if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_labels = [unstcols.labels[0]] for rec in recons_labels: new_labels.append(rec.take(unstcols.labels[-1])) new_columns = MultiIndex(levels=new_levels, labels=new_labels, names=new_names) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
def test_tz_convert_and_localize(self): l0 = date_range('20140701', periods=5, freq='D') # TODO: l1 should be a PeriodIndex for testing # after GH2106 is addressed with tm.assertRaises(NotImplementedError): period_range('20140701', periods=1).tz_convert('UTC') with tm.assertRaises(NotImplementedError): period_range('20140701', periods=1).tz_localize('UTC') # l1 = period_range('20140701', periods=5, freq='D') l1 = date_range('20140701', periods=5, freq='D') int_idx = Index(range(5)) for fn in ['tz_localize', 'tz_convert']: if fn == 'tz_convert': l0 = l0.tz_localize('UTC') l1 = l1.tz_localize('UTC') for idx in [l0, l1]: l0_expected = getattr(idx, fn)('US/Pacific') l1_expected = getattr(idx, fn)('US/Pacific') df1 = DataFrame(np.ones(5), index=l0) df1 = getattr(df1, fn)('US/Pacific') self.assertTrue(df1.index.equals(l0_expected)) # MultiIndex # GH7846 df2 = DataFrame(np.ones(5), MultiIndex.from_arrays([l0, l1])) df3 = getattr(df2, fn)('US/Pacific', level=0) self.assertFalse(df3.index.levels[0].equals(l0)) self.assertTrue(df3.index.levels[0].equals(l0_expected)) self.assertTrue(df3.index.levels[1].equals(l1)) self.assertFalse(df3.index.levels[1].equals(l1_expected)) df3 = getattr(df2, fn)('US/Pacific', level=1) self.assertTrue(df3.index.levels[0].equals(l0)) self.assertFalse(df3.index.levels[0].equals(l0_expected)) self.assertTrue(df3.index.levels[1].equals(l1_expected)) self.assertFalse(df3.index.levels[1].equals(l1)) df4 = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) df5 = getattr(df4, fn)('US/Pacific', level=1) self.assertTrue(df3.index.levels[0].equals(l0)) self.assertFalse(df3.index.levels[0].equals(l0_expected)) self.assertTrue(df3.index.levels[1].equals(l1_expected)) self.assertFalse(df3.index.levels[1].equals(l1)) # Bad Inputs for fn in ['tz_localize', 'tz_convert']: # Not DatetimeIndex / PeriodIndex with tm.assertRaisesRegexp(TypeError, 'DatetimeIndex'): df = DataFrame(index=int_idx) df = getattr(df, fn)('US/Pacific') # Not DatetimeIndex / PeriodIndex with tm.assertRaisesRegexp(TypeError, 'DatetimeIndex'): df = DataFrame(np.ones(5), MultiIndex.from_arrays([int_idx, l0])) df = getattr(df, fn)('US/Pacific', level=0) # Invalid level with tm.assertRaisesRegexp(ValueError, 'not valid'): df = DataFrame(index=l0) df = getattr(df, fn)('US/Pacific', level=1)
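# A compact usage sketch of the behaviour exercised above: tz_localize and
# tz_convert accept a ``level`` argument when the axis is a MultiIndex, and
# only that level is converted. The data below is illustrative.
import numpy as np
import pandas as pd

l0 = pd.date_range('20140701', periods=3, freq='D')
l1 = ['a', 'b', 'c']
df = pd.DataFrame(np.ones(3), index=pd.MultiIndex.from_arrays([l0, l1]))

localized = df.tz_localize('US/Pacific', level=0)
print(localized.index.levels[0].tz)  # US/Pacific; level 1 is untouched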
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None, timezones=None): """ Create empty DataFrame to assign into Parameters ---------- types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples applies to non-categorical columns. If there are only categorical columns, an empty string of None will do. size: int Number of rows to allocate cats: dict {col: labels} Location and labels for categorical columns, e.g., {1: ['mary', 'mo]} will create column index 1 (inserted amongst the numerical columns) with two possible values. If labels is an integers, `{'col': 5}`, will generate temporary labels using range. If None, or column name is missing, will assume 16-bit integers (a reasonable default). cols: list of labels assigned column names, including categorical ones. timezones: dict {col: timezone_str} for timestamp type columns, apply this timezone to the pandas series; the numpy view will be UTC. Returns ------- - dataframe with correct shape and data-types - list of numpy views, in order, of the columns of the dataframe. Assign to this. """ views = {} timezones = timezones or {} if isinstance(types, STR_TYPE): types = types.split(',') cols = cols if cols is not None else range(len(types)) def cat(col): if cats is None or col not in cats: return RangeIndex(0, 2**14) elif isinstance(cats[col], int): return RangeIndex(0, cats[col]) else: # explicit labels list return cats[col] df = OrderedDict() for t, col in zip(types, cols): if str(t) == 'category': df[str(col)] = Categorical([], categories=cat(col), fastpath=True) else: d = np.empty(0, dtype=t) if d.dtype.kind == "M" and str(col) in timezones: d = Series(d).dt.tz_localize(timezones[str(col)]) df[str(col)] = d df = DataFrame(df) if not index_types: index = RangeIndex(size) elif len(index_types) == 1: t, col = index_types[0], index_names[0] if col is None: raise ValueError('If using an index, must give an index name') if str(t) == 'category': c = Categorical([], categories=cat(col), fastpath=True) vals = np.zeros(size, dtype=c.codes.dtype) index = CategoricalIndex(c) index._data._codes = vals views[col] = vals views[col + '-catdef'] = index._data else: d = np.empty(size, dtype=t) # if d.dtype.kind == "M" and str(col) in timezones: # d = Series(d).dt.tz_localize(timezones[str(col)]) index = Index(d) views[col] = index.values else: index = MultiIndex([[]], [[]]) # index = MultiIndex.from_arrays(indexes) index._levels = list() index._labels = list() for i, col in enumerate(index_names): if str(index_types[i]) == 'category': c = Categorical([], categories=cat(col), fastpath=True) z = CategoricalIndex(c) z._data._codes = c.categories._data z._set_categories = c._set_categories index._levels.append(z) vals = np.zeros(size, dtype=c.codes.dtype) index._labels.append(vals) views[col] = index._labels[i] views[col + '-catdef'] = index._levels[i] else: d = np.empty(size, dtype=index_types[i]) # if d.dtype.kind == "M" and str(col) in timezones: # d = Series(d).dt.tz_localize(timezones[str(col)]) index._levels.append(Index(d)) index._labels.append(np.arange(size, dtype=int)) views[col] = index._levels[i]._data axes = [df._data.axes[0], index] # allocate and create blocks blocks = [] for block in df._data.blocks: if block.is_categorical: categories = block.values.categories code = np.zeros(shape=size, dtype=block.values.codes.dtype) values = Categorical(values=code, categories=categories, fastpath=True) new_block = block.make_block_same_class(values=values) elif getattr(block.dtype, 'tz', None): new_shape = (size, ) 
values = np.empty(shape=new_shape, dtype=block.values.values.dtype) new_block = block.make_block_same_class(values=values, dtype=block.values.dtype) else: new_shape = (block.values.shape[0], size) values = np.empty(shape=new_shape, dtype=block.values.dtype) new_block = block.make_block_same_class(values=values) blocks.append(new_block) # create block manager df = DataFrame(BlockManager(blocks, axes)) # create views for block in df._data.blocks: dtype = block.dtype inds = block.mgr_locs.indexer if isinstance(inds, slice): inds = list(range(inds.start, inds.stop, inds.step)) for i, ind in enumerate(inds): col = df.columns[ind] if is_categorical_dtype(dtype): views[col] = block.values._codes views[col + '-catdef'] = block.values elif getattr(block.dtype, 'tz', None): views[col] = block.values.values else: views[col] = block.values[i] if index_names: df.index.names = [ None if re.match(r'__index_level_\d+__', n) else n for n in index_names ] return df, views
def get_chunk(self, rows=None): if rows is not None and self.skip_footer: raise ValueError('skip_footer not supported for iteration') try: content = self._get_lines(rows) except StopIteration: if self._first_chunk: content = [] else: raise # done with first read, next time raise StopIteration self._first_chunk = False if len(content) == 0: # pragma: no cover if self.index_col is not None: if np.isscalar(self.index_col): index = Index([], name=self.index_name) else: index = MultiIndex.from_arrays([[]] * len(self.index_col), names=self.index_name) else: index = Index([]) return DataFrame(index=index, columns=self.columns) zipped_content = list(lib.to_object_array(content).T) if not self._has_complex_date_col and self.index_col is not None: index = self._get_simple_index(zipped_content) index = self._agg_index(index) else: index = Index(np.arange(len(content))) col_len, zip_len = len(self.columns), len(zipped_content) if col_len != zip_len: row_num = -1 for (i, l) in enumerate(content): if len(l) != col_len: break footers = 0 if self.skip_footer: footers = self.skip_footer row_num = self.pos - (len(content) - i + footers) msg = ('Expecting %d columns, got %d in row %d' % (col_len, zip_len, row_num)) raise ValueError(msg) data = dict((k, v) for k, v in izip(self.columns, zipped_content)) # apply converters for col, f in self.converters.iteritems(): if isinstance(col, int) and col not in self.columns: col = self.columns[col] data[col] = lib.map_infer(data[col], f) columns = list(self.columns) if self.parse_dates is not None: data, columns = self._process_date_conversion(data) data = _convert_to_ndarrays(data, self.na_values, self.verbose) df = DataFrame(data=data, columns=columns, index=index) if self._has_complex_date_col and self.index_col is not None: if not self._name_processed: self.index_name = self._get_index_name(list(columns)) self._name_processed = True data = dict(((k, v) for k, v in df.iteritems())) index = self._get_complex_date_index(data, col_names=columns, parse_dates=False) index = self._agg_index(index, False) data = dict(((k, v.values) for k, v in data.iteritems())) df = DataFrame(data=data, columns=columns, index=index) if self.squeeze and len(df.columns) == 1: return df[df.columns[0]] return df
class TestMultiIndex(unittest.TestCase): def setUp(self): major_axis = Index(['foo', 'bar', 'baz', 'qux']) minor_axis = Index(['one', 'two']) major_labels = np.array([0, 0, 1, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 1, 0, 1]) self.index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels], names=['first', 'second']) def test_constructor_single_level(self): single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]], names=['first']) self.assert_(isinstance(single_level, Index)) self.assert_(not isinstance(single_level, MultiIndex)) self.assert_(single_level.name == 'first') single_level = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]]) self.assert_(single_level.name is None) def test_constructor_no_levels(self): self.assertRaises(Exception, MultiIndex, levels=[], labels=[]) def test_duplicate_names(self): self.index.names = ['foo', 'foo'] self.assertRaises(Exception, self.index._get_level_number, 'foo') def test_from_arrays(self): arrays = [] for lev, lab in zip(self.index.levels, self.index.labels): arrays.append(np.asarray(lev).take(lab)) result = MultiIndex.from_arrays(arrays) self.assertEquals(list(result), list(self.index)) def test_append(self): result = self.index[:3].append(self.index[3:]) self.assert_(result.equals(self.index)) foos = [self.index[:1], self.index[1:3], self.index[3:]] result = foos[0].append(foos[1:]) self.assert_(result.equals(self.index)) # empty result = self.index.append([]) self.assert_(result.equals(self.index)) def test_get_level_values(self): result = self.index.get_level_values(0) expected = ['foo', 'foo', 'bar', 'baz', 'qux', 'qux'] self.assert_(np.array_equal(result, expected)) result = self.index.get_level_values('first') expected = self.index.get_level_values(0) self.assert_(np.array_equal(result, expected)) def test_nlevels(self): self.assertEquals(self.index.nlevels, 2) def test_iter(self): result = list(self.index) expected = [('foo', 'one'), ('foo', 'two'), ('bar', 'one'), ('baz', 'two'), ('qux', 'one'), ('qux', 'two')] self.assert_(result == expected) def test_pickle(self): pickled = pickle.dumps(self.index) unpickled = pickle.loads(pickled) self.assert_(self.index.equals(unpickled)) def test_legacy_pickle(self): import os def curpath(): pth, _ = os.path.split(os.path.abspath(__file__)) return pth ppath = os.path.join(curpath(), 'data/multiindex_v1.pickle') obj = pickle.load(open(ppath, 'r')) self.assert_(obj._is_legacy_format) obj2 = MultiIndex.from_tuples(obj.values) self.assert_(obj.equals(obj2)) res = obj.get_indexer(obj2[::-1]) exp = obj.get_indexer(obj[::-1]) exp2 = obj2.get_indexer(obj2[::-1]) assert_almost_equal(res, exp) assert_almost_equal(exp, exp2) def test_contains(self): self.assert_(('foo', 'two') in self.index) self.assert_(('bar', 'two') not in self.index) self.assert_(None not in self.index) def test_is_all_dates(self): self.assert_(not self.index.is_all_dates) def test_getitem(self): # scalar self.assertEquals(self.index[2], ('bar', 'one')) # slice result = self.index[2:5] expected = self.index[[2, 3, 4]] self.assert_(result.equals(expected)) # boolean result = self.index[[True, False, True, False, True, True]] result2 = self.index[np.array([True, False, True, False, True, True])] expected = self.index[[0, 2, 4, 5]] self.assert_(result.equals(expected)) self.assert_(result2.equals(expected)) def test_getitem_group_select(self): sorted_idx, _ = self.index.sortlevel(0) self.assertEquals(sorted_idx.get_loc('baz'), slice(3, 4)) 
self.assertEquals(sorted_idx.get_loc('foo'), slice(0, 2)) def test_get_loc(self): self.assert_(self.index.get_loc(('foo', 'two')) == 1) self.assert_(self.index.get_loc(('baz', 'two')) == 3) self.assertRaises(KeyError, self.index.get_loc, ('bar', 'two')) self.assertRaises(KeyError, self.index.get_loc, 'quux') # 3 levels index = MultiIndex( levels=[Index(range(4)), Index(range(4)), Index(range(4))], labels=[ np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array([0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0]) ]) self.assertRaises(KeyError, index.get_loc, (1, 1)) self.assert_(index.get_loc((2, 0)) == slice(3, 5)) def test_get_loc_duplicates(self): index = Index([2, 2, 2, 2]) self.assertRaises(Exception, index.get_loc, 2) def test_slice_locs(self): df = tm.makeTimeDataFrame() stacked = df.stack() idx = stacked.index slob = slice(*idx.slice_locs(df.index[5], df.index[15])) sliced = stacked[slob] expected = df[5:16].stack() tm.assert_almost_equal(sliced.values, expected.values) slob = slice(*idx.slice_locs(df.index[5] + timedelta(seconds=30), df.index[15] - timedelta(seconds=30))) sliced = stacked[slob] expected = df[6:15].stack() tm.assert_almost_equal(sliced.values, expected.values) def test_slice_locs_not_sorted(self): index = MultiIndex( levels=[Index(range(4)), Index(range(4)), Index(range(4))], labels=[ np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array([0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0]) ]) self.assertRaises(Exception, index.slice_locs, (1, 0, 1), (2, 1, 0)) # works sorted_index, _ = index.sortlevel(0) result = sorted_index.slice_locs((1, 0, 1), (2, 1, 0)) def test_slice_locs_partial(self): sorted_idx, _ = self.index.sortlevel(0) result = sorted_idx.slice_locs(('foo', 'two'), ('qux', 'one')) self.assertEquals(result, (1, 5)) result = sorted_idx.slice_locs(None, ('qux', 'one')) self.assertEquals(result, (0, 5)) result = sorted_idx.slice_locs(('foo', 'two'), None) self.assertEquals(result, (1, len(sorted_idx))) result = sorted_idx.slice_locs('bar', 'baz') self.assertEquals(result, (2, 4)) def test_slice_locs_not_contained(self): # some searchsorted action index = MultiIndex(levels=[[0, 2, 4, 6], [0, 2, 4]], labels=[[0, 0, 0, 1, 1, 2, 3, 3, 3], [0, 1, 2, 1, 2, 2, 0, 1, 2]], sortorder=0) result = index.slice_locs((1, 0), (5, 2)) self.assertEquals(result, (3, 6)) result = index.slice_locs(1, 5) self.assertEquals(result, (3, 6)) result = index.slice_locs((2, 2), (5, 2)) self.assertEquals(result, (3, 6)) result = index.slice_locs(2, 5) self.assertEquals(result, (3, 6)) result = index.slice_locs((1, 0), (6, 3)) self.assertEquals(result, (3, 8)) result = index.slice_locs(-1, 10) self.assertEquals(result, (0, len(index))) def test_consistency(self): # need to construct an overflow major_axis = range(70000) minor_axis = range(10) major_labels = np.arange(70000) minor_labels = np.repeat(range(10), 7000) # the fact that is works means it's consistent index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) # inconsistent major_labels = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 1, 1, 0, 1, 0, 1]) index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) self.assertRaises(Exception, getattr, index, 'indexMap') def test_truncate(self): major_axis = Index(range(4)) minor_axis = Index(range(2)) major_labels = np.array([0, 0, 1, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 1, 0, 1]) index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) result = 
index.truncate(before=1) self.assert_('foo' not in result.levels[0]) self.assert_(1 in result.levels[0]) result = index.truncate(after=1) self.assert_(2 not in result.levels[0]) self.assert_(1 in result.levels[0]) result = index.truncate(before=1, after=2) self.assertEqual(len(result.levels[0]), 2) # after < before self.assertRaises(ValueError, index.truncate, 3, 1) def test_get_indexer(self): major_axis = Index(range(4)) minor_axis = Index(range(2)) major_labels = np.array([0, 0, 1, 2, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 0, 1, 0, 1]) index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) idx1 = index[:5] idx2 = index[[1, 3, 5]] r1 = idx1.get_indexer(idx2) assert_almost_equal(r1, [1, 3, -1]) r1 = idx2.get_indexer(idx1, method='pad') assert_almost_equal(r1, [-1, 0, 0, 1, 1]) rffill1 = idx2.get_indexer(idx1, method='ffill') assert_almost_equal(r1, rffill1) r1 = idx2.get_indexer(idx1, method='backfill') assert_almost_equal(r1, [0, 0, 1, 1, 2]) rbfill1 = idx2.get_indexer(idx1, method='bfill') assert_almost_equal(r1, rbfill1) # pass non-MultiIndex r1 = idx1.get_indexer(idx2.get_tuple_index()) rexp1 = idx1.get_indexer(idx2) assert_almost_equal(r1, rexp1) # self.assertRaises(Exception, idx1.get_indexer, # list(list(zip(*idx2.get_tuple_index()))[0])) def test_format(self): self.index.format() self.index[:0].format() def test_format_integer_names(self): index = MultiIndex(levels=[[0, 1], [0, 1]], labels=[[0, 0, 1, 1], [0, 1, 0, 1]], names=[0, 1]) index.format(names=True) def test_bounds(self): self.index._bounds def test_equals(self): self.assert_(self.index.equals(self.index)) self.assert_(self.index.equal_levels(self.index)) self.assert_(not self.index.equals(self.index[:-1])) self.assert_(self.index.equals(self.index.get_tuple_index())) # different number of levels index = MultiIndex( levels=[Index(range(4)), Index(range(4)), Index(range(4))], labels=[ np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array([0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0]) ]) index2 = MultiIndex(levels=index.levels[:-1], labels=index.labels[:-1]) self.assert_(not index.equals(index2)) self.assert_(not index.equal_levels(index2)) # levels are different major_axis = Index(range(4)) minor_axis = Index(range(2)) major_labels = np.array([0, 0, 1, 2, 2, 3]) minor_labels = np.array([0, 1, 0, 0, 1, 0]) index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) self.assert_(not self.index.equals(index)) self.assert_(not self.index.equal_levels(index)) # some of the labels are different major_axis = Index(['foo', 'bar', 'baz', 'qux']) minor_axis = Index(['one', 'two']) major_labels = np.array([0, 0, 2, 2, 3, 3]) minor_labels = np.array([0, 1, 0, 1, 0, 1]) index = MultiIndex(levels=[major_axis, minor_axis], labels=[major_labels, minor_labels]) self.assert_(not self.index.equals(index)) def test_union(self): piece1 = self.index[:5][::-1] piece2 = self.index[3:] the_union = piece1 | piece2 tups = sorted(self.index.get_tuple_index()) expected = MultiIndex.from_tuples(tups) self.assert_(the_union.equals(expected)) # corner case, pass self or empty thing: the_union = self.index.union(self.index) self.assert_(the_union is self.index) the_union = self.index.union(self.index[:0]) self.assert_(the_union is self.index) # won't work in python 3 # tuples = self.index.get_tuple_index() # result = self.index[:4] | tuples[4:] # self.assert_(result.equals(tuples)) # not valid for python 3 # def test_union_with_regular_index(self): # other = Index(['A', 'B', 'C']) # 
result = other.union(self.index) # self.assert_(('foo', 'one') in result) # self.assert_('B' in result) # result2 = self.index.union(other) # self.assert_(result.equals(result2)) def test_intersection(self): piece1 = self.index[:5][::-1] piece2 = self.index[3:] the_int = piece1 & piece2 tups = sorted(self.index[3:5].get_tuple_index()) expected = MultiIndex.from_tuples(tups) self.assert_(the_int.equals(expected)) # corner case, pass self the_int = self.index.intersection(self.index) self.assert_(the_int is self.index) # empty intersection: disjoint empty = self.index[:2] & self.index[2:] expected = self.index[:0] self.assert_(empty.equals(expected)) # can't do in python 3 # tuples = self.index.get_tuple_index() # result = self.index & tuples # self.assert_(result.equals(tuples)) def test_diff(self): first = self.index result = first - self.index[-3:] expected = MultiIndex.from_tuples(sorted(self.index[:-3].values), sortorder=0, names=self.index.names) self.assert_(isinstance(result, MultiIndex)) self.assert_(result.equals(expected)) self.assertEqual(result.names, self.index.names) # empty difference: reflexive result = self.index - self.index expected = self.index[:0] self.assert_(result.equals(expected)) self.assertEqual(result.names, self.index.names) # empty difference: superset result = self.index[-3:] - self.index expected = self.index[:0] self.assert_(result.equals(expected)) self.assertEqual(result.names, self.index.names) # empty difference: degenerate result = self.index[:0] - self.index expected = self.index[:0] self.assert_(result.equals(expected)) self.assertEqual(result.names, self.index.names) # names not the same chunklet = self.index[-3:] chunklet.names = ['foo', 'baz'] result = first - chunklet self.assertEqual(result.names, [None, None]) # empty, but non-equal result = self.index - self.index.sortlevel(1)[0] self.assert_(len(result) == 0) # raise Exception called with non-MultiIndex self.assertRaises(Exception, first.diff, first.get_tuple_index()) def test_from_tuples(self): self.assertRaises(Exception, MultiIndex.from_tuples, []) def test_argsort(self): result = self.index.argsort() expected = self.index.get_tuple_index().argsort() self.assert_(np.array_equal(result, expected)) def test_sortlevel(self): import random tuples = list(self.index) random.shuffle(tuples) index = MultiIndex.from_tuples(tuples) sorted_idx, _ = index.sortlevel(0) expected = MultiIndex.from_tuples(sorted(tuples)) self.assert_(sorted_idx.equals(expected)) sorted_idx, _ = index.sortlevel(0, ascending=False) self.assert_(sorted_idx.equals(expected[::-1])) sorted_idx, _ = index.sortlevel(1) by1 = sorted(tuples, key=lambda x: (x[1], x[0])) expected = MultiIndex.from_tuples(by1) self.assert_(sorted_idx.equals(expected)) sorted_idx, _ = index.sortlevel(1, ascending=False) self.assert_(sorted_idx.equals(expected[::-1])) def test_dims(self): pass def test_drop(self): dropped = self.index.drop([('foo', 'two'), ('qux', 'one')]) index = MultiIndex.from_tuples([('foo', 'two'), ('qux', 'one')]) dropped2 = self.index.drop(index) expected = self.index[[0, 2, 3, 5]] self.assert_(dropped.equals(expected)) self.assert_(dropped2.equals(expected)) dropped = self.index.drop(['bar']) expected = self.index[[0, 1, 3, 4, 5]] self.assert_(dropped.equals(expected)) index = MultiIndex.from_tuples([('bar', 'two')]) self.assertRaises(Exception, self.index.drop, [('bar', 'two')]) self.assertRaises(Exception, self.index.drop, index) # mixed partial / full drop dropped = self.index.drop(['foo', ('qux', 'one')]) expected = 
self.index[[2, 3, 5]] self.assert_(dropped.equals(expected)) def test_droplevel_with_names(self): index = self.index[self.index.get_loc('foo')] dropped = index.droplevel(0) self.assertEqual(dropped.name, 'second') index = MultiIndex( levels=[Index(range(4)), Index(range(4)), Index(range(4))], labels=[ np.array([0, 0, 1, 2, 2, 2, 3, 3]), np.array([0, 1, 0, 0, 0, 1, 0, 1]), np.array([1, 0, 1, 1, 0, 0, 1, 0]) ], names=['one', 'two', 'three']) dropped = index.droplevel(0) self.assertEqual(dropped.names, ['two', 'three']) def test_insert(self): # key contained in all levels new_index = self.index.insert(0, ('bar', 'two')) self.assert_(new_index.equal_levels(self.index)) self.assert_(new_index[0] == ('bar', 'two')) # key not contained in all levels new_index = self.index.insert(0, ('abc', 'three')) self.assert_( np.array_equal(new_index.levels[0], list(self.index.levels[0]) + ['abc'])) self.assert_( np.array_equal(new_index.levels[1], list(self.index.levels[1]) + ['three'])) self.assert_(new_index[0] == ('abc', 'three')) # key wrong length self.assertRaises(Exception, self.index.insert, 0, ('foo2', )) def test_take_preserve_name(self): taken = self.index.take([3, 0, 1]) self.assertEqual(taken.names, self.index.names) def test_join_level(self): def _check_how(other, how): join_index, lidx, ridx = other.join(self.index, how=how, level='second', return_indexers=True) exp_level = other.join(self.index.levels[1], how=how) self.assert_(join_index.levels[0].equals(self.index.levels[0])) self.assert_(join_index.levels[1].equals(exp_level)) # pare down levels mask = np.array([x[1] in exp_level for x in self.index], dtype=bool) exp_values = self.index.values[mask] self.assert_(np.array_equal(join_index.values, exp_values)) if how in ('outer', 'inner'): join_index2, ridx2, lidx2 = \ self.index.join(other, how=how, level='second', return_indexers=True) self.assert_(join_index.equals(join_index2)) self.assert_(np.array_equal(lidx, lidx2)) self.assert_(np.array_equal(ridx, ridx2)) self.assert_(np.array_equal(join_index2.values, exp_values)) def _check_all(other): _check_how(other, 'outer') _check_how(other, 'inner') _check_how(other, 'left') _check_how(other, 'right') _check_all(Index(['three', 'one', 'two'])) _check_all(Index(['one'])) _check_all(Index(['one', 'three'])) # some corner cases idx = Index(['three', 'one', 'two']) result = idx.join(self.index, level='second') self.assert_(isinstance(result, MultiIndex)) self.assertRaises(Exception, self.index.join, self.index, level=1) def test_reindex(self): result, indexer = self.index.reindex(list(self.index[:4])) self.assert_(isinstance(result, MultiIndex)) result, indexer = self.index.reindex(list(self.index)) self.assert_(isinstance(result, MultiIndex)) self.assert_(indexer is None) def test_reindex_level(self): idx = Index(['one']) target, indexer = self.index.reindex(idx, level='second') target2, indexer2 = idx.reindex(self.index, idx, level='second') exp_index = self.index.join(idx, level='second', how='left') self.assert_(target.equals(exp_index)) self.assert_(target2.equals(exp_index)) def test_has_duplicates(self): self.assert_(not self.index.has_duplicates) self.assert_(self.index.append(self.index).has_duplicates) index = MultiIndex(levels=[[0, 1], [0, 1, 2]], labels=[[0, 0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 0, 1, 2]]) self.assert_(index.has_duplicates)
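For reference, a minimal sketch (not part of the test suite) of how the setUp fixture above decodes into tuples; it assumes a modern pandas where the constructor's labels argument has been renamed to codes.

import pandas as pd

# Hypothetical standalone rebuild of the fixture from TestMultiIndex.setUp;
# `codes` is the newer name for what the tests above call `labels`.
idx = pd.MultiIndex(
    levels=[["foo", "bar", "baz", "qux"], ["one", "two"]],
    codes=[[0, 0, 1, 2, 3, 3], [0, 1, 0, 1, 0, 1]],
    names=["first", "second"],
)
# Each code is a position into its level, so the index materializes as the
# tuples asserted in test_iter, and lookups behave as in test_get_loc.
print(list(idx))                     # [('foo', 'one'), ('foo', 'two'), ...]
print(idx.get_loc(("foo", "two")))   # 1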
def _stack_multi_columns(frame, level=-1, dropna=True): this = frame.copy() # this makes life much simpler if level != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level, frame.columns.nlevels - 1): roll_columns = roll_columns.swaplevel(i, i + 1) this.columns = roll_columns if not this.columns.is_lexsorted(): this = this.sortlevel(0, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list( zip(*[ lev.take(lab) for lev, lab in zip(this.columns.levels[:-1], this.columns.labels[:-1]) ])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] levsize = len(level_vals) drop_cols = [] for key in unique_groups: loc = this.columns.get_loc(key) slice_len = loc.stop - loc.start # can make more efficient? if slice_len == 0: drop_cols.append(key) continue elif slice_len != levsize: chunk = this.ix[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals).values else: if frame._is_mixed_type: value_slice = this.ix[:, this.columns[loc]].values else: value_slice = this.values[:, loc] new_data[key] = value_slice.ravel() if len(drop_cols) > 0: new_columns = new_columns - drop_cols N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_labels = [lab.repeat(levsize) for lab in this.index.labels] else: new_levels = [this.index] new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(frame.columns.levels[level]) new_labels.append(np.tile(np.arange(levsize), N)) new_names.append(frame.columns.names[level]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) result = DataFrame(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how='all') return result
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list( zip(*[ lev.take(level_codes) for lev, level_codes in zip( this.columns.levels[:-1], this.columns.codes[:-1]) ])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) level_vals_used = level_vals[level_codes] levsize = len(level_codes) drop_cols = [] for key in unique_groups: try: loc = this.columns.get_loc(key) except KeyError: drop_cols.append(key) continue # can make more efficient? # we almost always return a slice # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): slice_len = len(loc) else: slice_len = loc.stop - loc.start if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if (frame._is_homogeneous_type and is_extension_array_dtype(frame.dtypes.iloc[0])): dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( [x._values for _, x in subset.iteritems()]) N, K = this.shape idx = np.arange(N * K).reshape(K, N).T.ravel() value_slice = value_slice.take(idx) elif frame._is_mixed_type: value_slice = this[this.columns[loc]].values else: value_slice = this.values[:, loc] if value_slice.ndim > 1: # i.e. not extension value_slice = value_slice.ravel() new_data[key] = value_slice if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: new_levels = [this.index] new_codes = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? 
new_levels.append(level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how='all') return result
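To make the stacking path above concrete, a small hedged usage sketch (a recent pandas is assumed; the level names "upper" and "lower" are invented for illustration):

import numpy as np
import pandas as pd

# Stacking the innermost column level of a MultiIndex-column frame is the
# case _stack_multi_columns handles.
columns = pd.MultiIndex.from_product([["A", "B"], ["x", "y"]],
                                     names=["upper", "lower"])
df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=columns)
stacked = df.stack()                 # moves "lower" from the columns into the rows
print(stacked.index.names)           # [None, 'lower']
print(stacked.columns.tolist())      # ['A', 'B']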
def test_rename_mi(self): df = DataFrame([11, 21, 31], index=MultiIndex.from_tuples([ ("A", x) for x in ["a", "B", "c"] ])) result = df.rename(str.lower) # smoke test: renaming a MultiIndex-indexed frame with a callable should simply not raise

def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sortlevel(level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list( zip(*[ lev.take(lab) for lev, lab in zip(this.columns.levels[:-1], this.columns.labels[:-1]) ])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_labels = sorted(set(this.columns.labels[-1])) level_vals_used = level_vals[level_labels] levsize = len(level_labels) drop_cols = [] for key in unique_groups: loc = this.columns.get_loc(key) slice_len = loc.stop - loc.start # can make more efficient? if slice_len == 0: drop_cols.append(key) continue elif slice_len != levsize: chunk = this.ix[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_mixed_type: value_slice = this.ix[:, this.columns[loc]].values else: value_slice = this.values[:, loc] new_data[key] = value_slice.ravel() if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_labels = [lab.repeat(levsize) for lab in this.index.labels] else: new_levels = [this.index] new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(frame.columns.levels[level_num]) new_labels.append(np.tile(level_labels, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) result = DataFrame(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how='all') return result
def _flex_binary_moment(arg1, arg2, f, pairwise=False): if not (isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame))): raise TypeError("arguments to moment function must be of type " "np.ndarray/Series/DataFrame") if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( arg2, (np.ndarray, ABCSeries)): X, Y = _prep_binary(arg1, arg2) return f(X, Y) elif isinstance(arg1, ABCDataFrame): from pandas import DataFrame def dataframe_from_int_dict(data, frame_template): result = DataFrame(data, index=frame_template.index) if len(result.columns) > 0: result.columns = frame_template.columns[result.columns] return result results = {} if isinstance(arg2, ABCDataFrame): if pairwise is False: if arg1 is arg2: # special case in order to handle duplicate column names for i, col in enumerate(arg1.columns): results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) return dataframe_from_int_dict(results, arg1) else: if not arg1.columns.is_unique: raise ValueError("'arg1' columns are not unique") if not arg2.columns.is_unique: raise ValueError("'arg2' columns are not unique") with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) X, Y = arg1.align(arg2, join="outer") X = X + 0 * Y Y = Y + 0 * X with warnings.catch_warnings(record=True): warnings.simplefilter("ignore", RuntimeWarning) res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) for i, k1 in enumerate(arg1.columns): for j, k2 in enumerate(arg2.columns): if j < i and arg2 is arg1: # Symmetric case results[i][j] = results[j][i] else: results[i][j] = f( *_prep_binary(arg1.iloc[:, i], arg2.iloc[:, j])) from pandas import concat result_index = arg1.index.union(arg2.index) if len(result_index): # construct result frame result = concat( [ concat( [ results[i][j] for j, c in enumerate(arg2.columns) ], ignore_index=True, ) for i, c in enumerate(arg1.columns) ], ignore_index=True, axis=1, ) result.columns = arg1.columns # set the index and reorder if arg2.columns.nlevels > 1: result.index = MultiIndex.from_product( arg2.columns.levels + [result_index]) result = result.reorder_levels([2, 0, 1]).sort_index() else: result.index = MultiIndex.from_product([ range(len(arg2.columns)), range(len(result_index)) ]) result = result.swaplevel(1, 0).sort_index() result.index = MultiIndex.from_product([result_index] + [arg2.columns]) else: # empty result result = DataFrame( index=MultiIndex(levels=[arg1.index, arg2.columns], codes=[[], []]), columns=arg2.columns, dtype="float64", ) # reset our index names to arg1 names # reset our column names to arg2 names # careful not to mutate the original names result.columns = result.columns.set_names(arg1.columns.names) result.index = result.index.set_names(result_index.names + arg2.columns.names) return result else: raise ValueError("'pairwise' is not True/False") else: results = { i: f(*_prep_binary(arg1.iloc[:, i], arg2)) for i, col in enumerate(arg1.columns) } return dataframe_from_int_dict(results, arg1) else: return _flex_binary_moment(arg2, arg1, f)
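A hedged sketch of the pairwise branch above in terms of the public rolling API (column names arbitrary; a recent pandas is assumed):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"])
# pairwise=True routes through _flex_binary_moment and assembles a result
# indexed by a (row label, column) MultiIndex, as built in the code above.
pairwise = df.rolling(window=5).corr(pairwise=True)
print(pairwise.index.nlevels)        # 2
print(pairwise.columns.tolist())     # ['a', 'b', 'c']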
def to_frame(self, filter_observations=True): """ Transform wide format into long (stacked) format as DataFrame whose columns are the Panel's items and whose index is a MultiIndex formed of the Panel's major and minor axes. Parameters ---------- filter_observations : boolean, default True Drop (major, minor) pairs without a complete set of observations across all the items Returns ------- y : DataFrame """ _, N, K = self.shape if filter_observations: # shaped like the return DataFrame mask = com.notnull(self.values).all(axis=0) # size = mask.sum() selector = mask.ravel() else: # size = N * K selector = slice(None, None) data = {} for item in self.items: data[item] = self[item].values.ravel()[selector] def construct_multi_parts(idx, n_repeat, n_shuffle=1): axis_idx = idx.to_hierarchical(n_repeat, n_shuffle) labels = [x[selector] for x in axis_idx.labels] levels = axis_idx.levels names = axis_idx.names return labels, levels, names def construct_index_parts(idx, major=True): levels = [idx] if major: labels = [np.arange(N).repeat(K)[selector]] names = idx.name or 'major' else: labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)] labels = [labels.ravel()[selector]] names = idx.name or 'minor' names = [names] return labels, levels, names if isinstance(self.major_axis, MultiIndex): major_labels, major_levels, major_names = construct_multi_parts( self.major_axis, n_repeat=K) else: major_labels, major_levels, major_names = construct_index_parts( self.major_axis) if isinstance(self.minor_axis, MultiIndex): minor_labels, minor_levels, minor_names = construct_multi_parts( self.minor_axis, n_repeat=N, n_shuffle=K) else: minor_labels, minor_levels, minor_names = construct_index_parts( self.minor_axis, major=False) levels = major_levels + minor_levels labels = major_labels + minor_labels names = major_names + minor_names index = MultiIndex(levels=levels, labels=labels, names=names, verify_integrity=False) return DataFrame(data, index=index, columns=self.items)
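A usage sketch of to_frame, hedged because Panel was removed in pandas 1.0; this assumes an older pandas that still ships it:

import numpy as np
import pandas as pd

# Assumes pandas < 0.25 (Panel still available); illustration only.
wp = pd.Panel(np.random.randn(2, 3, 4),
              items=["i1", "i2"],
              major_axis=pd.date_range("2000-01-01", periods=3),
              minor_axis=["a", "b", "c", "d"])
long_df = wp.to_frame()              # rows: (major, minor) MultiIndex; columns: items
print(long_df.index.names)           # ['major', 'minor']
print(long_df.columns.tolist())      # ['i1', 'i2']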
def pivot_table( data, values=None, index=None, columns=None, aggfunc="mean", fill_value=None, margins=False, dropna=True, margins_name="All", observed=False, ): index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table( data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, dropna=dropna, margins_name=margins_name, observed=observed, ) pieces.append(table) keys.append(getattr(func, "__name__", func)) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) grouped = data.groupby(keys, observed=observed) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: if (v in data and is_integer_dtype(data[v]) and v in agged and not is_integer_dtype(agged[v])): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: if table.index.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) if table.columns.nlevels > 1: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast="infer") if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins( table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value, ) # discard the top level if (values_passed and not values_multi and not table.empty and (table.columns.nlevels > 1)): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how="all", axis=1) return table
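A short, hedged usage sketch of the function above via the public pandas.pivot_table entry point (the frame and its column names are invented):

import pandas as pd

df = pd.DataFrame({
    "A": ["foo", "foo", "bar", "bar"],
    "B": ["one", "two", "one", "two"],
    "D": [1, 2, 3, 4],
})
# margins=True appends the "All" subtotals; fill_value patches missing cells
table = pd.pivot_table(df, values="D", index="A", columns="B",
                       aggfunc="sum", fill_value=0, margins=True)
print(table)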
def __init__( self, partitions=None, index=None, columns=None, row_lengths=None, column_widths=None, dtypes=None, op=None, index_cols=None, uses_rowid=False, force_execution_mode=None, ): assert dtypes is not None self.id = str(type(self)._next_id[0]) type(self)._next_id[0] += 1 if index is not None: index = ensure_index(index) columns = ensure_index(columns) self._op = op self._index_cols = index_cols self._partitions = partitions self._index_cache = index self._columns_cache = columns self._row_lengths_cache = row_lengths self._column_widths_cache = column_widths if self._op is None: self._op = FrameNode(self) self._table_cols = columns.tolist() if self._index_cols is not None: self._table_cols = self._index_cols + self._table_cols assert len(dtypes) == len( self._table_cols ), f"unaligned dtypes ({dtypes}) and table columns ({self._table_cols})" if isinstance(dtypes, list): if self._index_cols is not None: # Table stores both index and data columns but those are accessed # differently if we have a MultiIndex for columns. To unify access # to dtype we extend index column names to tuples to have a MultiIndex # of dtypes. if isinstance(columns, MultiIndex): tail = [""] * (columns.nlevels - 1) index_tuples = [(col, *tail) for col in self._index_cols] dtype_index = MultiIndex.from_tuples(index_tuples).append( columns) self._dtypes = pd.Series(dtypes, index=dtype_index) else: self._dtypes = pd.Series(dtypes, index=self._table_cols) else: self._dtypes = pd.Series(dtypes, index=columns) else: self._dtypes = dtypes if partitions is not None: self._filter_empties() # This frame uses encoding for column names to support exotic # (e.g. non-string and reserved words) column names. Encoded # names are used in OmniSci tables and corresponding Arrow tables. # If we import Arrow table, we have to rename its columns for # proper processing. if self._has_arrow_table() and self._partitions.size > 0: assert self._partitions.size == 1 table = self._partitions[0][0].get() if table.column_names[0] != f"F_{self._table_cols[0]}": new_names = [f"F_{col}" for col in table.column_names] new_table = table.rename_columns(new_names) self._partitions[0][ 0] = self._frame_mgr_cls._partition_class.put_arrow( new_table) self._uses_rowid = uses_rowid # Tests use forced execution mode to take control over frame # execution process. Supported values: # "lazy" - RuntimeError is raised if execution is triggered for the frame # "arrow" - RuntimeError is raised if execution is triggered, but we cannot # execute it using Arrow API (have to use OmniSci for execution) self._force_execution_mode = force_execution_mode
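A small sketch of the dtype-index construction used above when the frame has MultiIndex columns; the column and index-column names are hypothetical:

import pandas as pd

columns = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b")])
index_cols = ["__index__"]                      # hypothetical index column
# Pad index-column names to the columns' nlevels so a single MultiIndex can
# carry dtypes for both index and data columns, as in the constructor above.
tail = [""] * (columns.nlevels - 1)
dtype_index = pd.MultiIndex.from_tuples(
    [(col, *tail) for col in index_cols]).append(columns)
dtypes = pd.Series(["int64", "float64", "float64"], index=dtype_index)
print(dtypes)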
def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the second level of the resulting hierarchical index Returns ------- stacked : Series """ def factorize(index): if index.is_unique: return index, np.arange(len(index)) codes, categories = _factorize_from_iterable(index) return categories, codes N, K = frame.shape # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] clev, clab = factorize(frame.columns) new_levels.append(clev) new_codes.append(np.tile(clab, N).ravel()) new_names = list(frame.index.names) new_names.append(frame.columns.name) new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) else: levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() new_index = MultiIndex(levels=levels, codes=codes, names=[frame.index.name, frame.columns.name], verify_integrity=False) if frame._is_homogeneous_type: # For homogeneous EAs, frame.values will coerce to object. So # we concatenate instead. dtypes = list(frame.dtypes.values) dtype = dtypes[0] if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() new_values = arr._concat_same_type( [col._values for _, col in frame.iteritems()]) new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA new_values = frame.values.ravel() else: # non-homogeneous new_values = frame.values.ravel() if dropna: mask = notna(new_values) new_values = new_values[mask] new_index = new_index[mask] return frame._constructor_sliced(new_values, index=new_index)
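A minimal usage sketch of stack() above with plain (non-hierarchical) columns, assuming a recent pandas:

import pandas as pd

df = pd.DataFrame({"x": [1, 2], "y": [3, 4]}, index=["r1", "r2"])
s = df.stack()   # Series indexed by a (row label, column label) MultiIndex
print(s)
# r1  x    1
#     y    3
# r2  x    2
#     y    4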
def _make_hierarchical_index(index, names): return MultiIndex.from_tuples(index, names=names)
def _unstack_multiple(data, clocs): from pandas.core.groupby import decons_obs_group_ids if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] clabels = [index.labels[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rlabels = [index.labels[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] group_index = get_group_index(clabels, shape, sort=False, xnull=False) comp_ids, obs_ids = _compress_group_index(group_index, sort=False) recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels, xnull=False) dummy_index = MultiIndex(levels=rlevels + [obs_ids], labels=rlabels + [comp_ids], names=rnames + ['__placeholder__'], verify_integrity=False) if isinstance(data, Series): dummy = Series(data.values, index=dummy_index) unstacked = dummy.unstack('__placeholder__') new_levels = clevels new_names = cnames new_labels = recons_labels else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val) clocs = [val if i > val else val - 1 for val in clocs] return result dummy = DataFrame(data.values, index=dummy_index, columns=data.columns) unstacked = dummy.unstack('__placeholder__') if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_labels = [unstcols.labels[0]] for rec in recons_labels: new_labels.append(rec.take(unstcols.labels[-1])) new_columns = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
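A hedged sketch of what the helper above enables: unstacking several index levels in one call (level names invented):

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], ["x", "y"], [1, 2]],
                                 names=["l1", "l2", "l3"])
s = pd.Series(np.arange(8), index=idx)
# Passing a list of levels dispatches to _unstack_multiple
wide = s.unstack(["l2", "l3"])
print(wide.columns.nlevels)   # 2
print(wide.shape)             # (2, 4)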
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function, default numpy.mean, or list of functions If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN rows : kwarg only alias of index [deprecated] cols : kwarg only alias of columns [deprecated] Examples -------- >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 two 6 NaN bar one 5 4 two 6 7 Returns ------- table : DataFrame """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if isinstance(values, (list, tuple)): values_multi = True else: values_multi = False values = [values] else: values = list(data.columns.drop(keys)) if values_passed: to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [ agged.index.names[i] or i for i in range(len(index), len(keys)) ] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels)) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) else: table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc) # discard the top level if values_passed and not values_multi: table = table[values[0]] if len(index) == 0 
and len(columns) > 0: table = table.T return table
def get_chunk(self, rows=None): if rows is not None and self.skip_footer: raise ValueError('skip_footer not supported for iteration') try: content = self._get_lines(rows) except StopIteration: if self._first_chunk: content = [] else: raise # done with first read, next time raise StopIteration self._first_chunk = False if len(content) == 0: # pragma: no cover if self.index_col is not None: if np.isscalar(self.index_col): index = Index([], name=self.index_name) else: index = MultiIndex.from_arrays([[]] * len(self.index_col), names=self.index_name) else: index = Index([]) return DataFrame(index=index, columns=self.columns) zipped_content = list(lib.to_object_array(content).T) # no index column specified, so infer that's what is wanted if self.index_col is not None: if np.isscalar(self.index_col): index = zipped_content.pop(self.index_col) else: # given a list of index index = [] for idx in self.index_col: index.append(zipped_content[idx]) # remove index items from content and columns, don't pop in loop for i in reversed(sorted(self.index_col)): zipped_content.pop(i) if np.isscalar(self.index_col): if self.parse_dates: index = lib.try_parse_dates(index, parser=self.date_parser) index, na_count = _convert_types(index, self.na_values) index = Index(index, name=self.index_name) if self.verbose and na_count: print 'Found %d NA values in the index' % na_count else: arrays = [] for arr in index: if self.parse_dates: arr = lib.try_parse_dates(arr, parser=self.date_parser) arr, _ = _convert_types(arr, self.na_values) arrays.append(arr) index = MultiIndex.from_arrays(arrays, names=self.index_name) else: index = Index(np.arange(len(content))) if not index._verify_integrity(): dups = index.get_duplicates() err_msg = 'Tried columns 1-X as index but found duplicates %s' raise Exception(err_msg % str(dups)) if len(self.columns) != len(zipped_content): raise Exception('wrong number of columns') data = dict((k, v) for k, v in izip(self.columns, zipped_content)) # apply converters for col, f in self.converters.iteritems(): if isinstance(col, int) and col not in self.columns: col = self.columns[col] data[col] = lib.map_infer(data[col], f) data = _convert_to_ndarrays(data, self.na_values, self.verbose) return DataFrame(data=data, columns=self.columns, index=index)
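A hedged sketch of the same behavior through the public read_csv API on a modern pandas (the CSV text is invented): a list of index columns yields a MultiIndex, and chunked iteration yields DataFrames.

import io
import pandas as pd

csv = io.StringIO("k1,k2,v\na,1,10\na,2,20\nb,1,30\n")
# index_col with several columns corresponds to the "given a list of index"
# branch above; chunksize gives an iterator of DataFrame chunks.
for chunk in pd.read_csv(csv, index_col=["k1", "k2"], chunksize=2):
    print(chunk.index)   # MultiIndex of (k1, k2)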
def _make_concat_multiindex(indexes, keys, levels=None, names=None): if ((levels is None and isinstance(keys[0], tuple)) or (levels is not None and len(levels) > 1)): zipped = zip(*keys) if names is None: names = [None] * len(zipped) if levels is None: levels = [Factor(zp).levels for zp in zipped] else: levels = [_ensure_index(x) for x in levels] else: zipped = [keys] if names is None: names = [None] if levels is None: levels = [_ensure_index(keys)] else: levels = [_ensure_index(x) for x in levels] if not _all_indexes_same(indexes): label_list = [] # things are potentially different sizes, so compute the exact labels # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): i = level.get_loc(key) to_concat.append(np.repeat(i, len(index))) label_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) label_list.extend(concat_index.labels) else: factor = Factor(concat_index) levels.append(factor.levels) label_list.append(factor.labels) # also copies names = names + _get_consensus_names(indexes) return MultiIndex(levels=levels, labels=label_list, names=names) new_index = indexes[0] n = len(new_index) kpieces = len(indexes) # also copies new_names = list(names) new_levels = list(levels) # construct labels new_labels = [] # do something a bit more speedy for hlevel, level in zip(zipped, levels): mapped = level.get_indexer(hlevel) new_labels.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) new_labels.extend([np.tile(lab, kpieces) for lab in new_index.labels]) new_names.extend(new_index.names) else: new_levels.append(new_index) new_names.append(new_index.name) new_labels.append(np.tile(np.arange(n), kpieces)) return MultiIndex(levels=new_levels, labels=new_labels, names=new_names)
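A usage sketch of what the helper above constructs: concat with keys produces a hierarchical index on the result (the frames are invented):

import pandas as pd

a = pd.DataFrame({"v": [1, 2]}, index=["x", "y"])
b = pd.DataFrame({"v": [3, 4]}, index=["x", "z"])
# keys become the outermost level; names labels the resulting levels
out = pd.concat([a, b], keys=["first", "second"], names=["source", None])
print(out.index.tolist())
# [('first', 'x'), ('first', 'y'), ('second', 'x'), ('second', 'z')]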