def test_from_frame_level1_unsorted(self):
    """Check minor_xs on a panel built from a frame with unsorted level-1 labels."""
    # Level-1 labels are deliberately out of order before the round trip.
    pairs = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2), ('AAPL', 1), ('MSFT', 1)]
    index = MultiIndex.from_tuples(pairs)
    frame = DataFrame(np.random.rand(5, 4), index=index)
    panel = frame.to_panel()
    expected = frame.xs(2, level=1).sort_index()
    assert_frame_equal(panel.minor_xs(2), expected)
def test_to_panel_na_handling(self):
    """Unbalanced frames should produce NaN in the missing panel cells."""
    # Entity 1 has no observations for minor labels 0 and 1.
    major = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
    minor = [0, 1, 2, 3, 4, 5, 2, 3, 4, 5]
    frame = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)),
                      index=[major, minor])
    result = frame.to_panel()
    self.assert_(isnull(result[0].ix[1, [0, 1]]).all())
def test_from_frame_level1_unsorted(self):
    """Round trip through to_panel when the level-1 index is not sorted."""
    data = np.random.rand(5, 4)
    labels = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2),
              ('AAPL', 1), ('MSFT', 1)]
    df = DataFrame(data, index=MultiIndex.from_tuples(labels))
    wp = df.to_panel()
    # minor_xs must agree with a sorted cross-section of the source frame
    assert_frame_equal(wp.minor_xs(2), df.xs(2, level=1).sort_index())
def test_to_frame(self):
    """Exercise Panel.to_frame: filtering, round trips, and name preservation."""
    # filtered: the default drops rows containing any missing value
    filtered = self.panel.to_frame()
    expected = self.panel.to_frame().dropna(how="any")
    assert_frame_equal(filtered, expected)

    # unfiltered frame round-trips back to the original panel
    unfiltered = self.panel.to_frame(filter_observations=False)
    assert_panel_equal(unfiltered.to_panel(), self.panel)

    # default MultiIndex level names
    self.assertEqual(unfiltered.index.names, ["major", "minor"])

    # a shuffled long frame still round-trips
    long_frame = self.panel.to_frame(filter_observations=False)
    shuffled = long_frame.take(np.random.permutation(len(long_frame)))
    assert_panel_equal(shuffled.to_panel(), self.panel)

    # original index and column names survive the round trip
    named = DataFrame(
        np.random.randn(6, 2),
        index=[["a", "a", "b", "b", "c", "c"], [0, 1, 0, 1, 0, 1]],
        columns=["one", "two"])
    named.index.names = ["foo", "bar"]
    named.columns.name = "baz"
    round_tripped = named.to_panel().to_frame()
    self.assertEqual(round_tripped.index.names, named.index.names)
    self.assertEqual(round_tripped.columns.names, named.columns.names)
def _read_panel_table(self, group, where=None):
    """Read a long-format HDF5 table node and reshape it into a wide Panel.

    Parameters
    ----------
    group : PyTables group holding a ``table`` node plus kind metadata
        in ``_v_attrs`` (``fields``, ``columns_kind``, ``index_kind``).
    where : selection criteria forwarded to ``Selection``, optional

    Returns
    -------
    wp : Panel reconstructed from the (index, column) long format,
        optionally restricted to ``sel.column_filter`` on the minor axis.
    """
    from pandas.core.common import _asarray_tuplesafe

    table = getattr(group, 'table')

    # create the selection
    sel = Selection(table, where)
    sel.select()
    fields = table._v_attrs.fields

    # decode stored index/column values back to their original kinds
    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)

    # reconstruct a long frame indexed by (index, column)
    long_index = MultiIndex.from_arrays([index, columns])
    lp = DataFrame(sel.values['values'], index=long_index, columns=fields)

    if not long_index.has_duplicates:
        # clean case: sort and pivot straight to a Panel
        lp = lp.sortlevel(level=0)
        wp = lp.to_panel()
    else:
        # duplicate (index, column) pairs: keep only the most recently
        # appended row for each unique pair
        if not self._quiet:  # pragma: no cover
            print ('Duplicate entries in table, taking most recently '
                   'appended')

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        index_map = lib.map_indices_object(tuple_index)

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # map each unique tuple back to its (last) position in the table
        indexer = lib.merge_indexer_object(unique_tuples, index_map)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        # restrict the minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)
    return wp
def _read_panel_table(self, group, where=None):
    """Read a long-format HDF5 table node and pivot it into a wide Panel.

    Parameters
    ----------
    group : PyTables group with a ``table`` node; ``_v_attrs`` supplies
        ``fields``, ``columns_kind`` and ``index_kind`` metadata.
    where : selection criteria passed through to ``Selection``, optional

    Returns
    -------
    wp : Panel, minor axis optionally filtered by ``sel.column_filter``.
    """
    from pandas.core.common import _asarray_tuplesafe

    table = getattr(group, 'table')

    # create the selection
    sel = Selection(table, where)
    sel.select()
    fields = table._v_attrs.fields

    # decode stored values back into their original index/column kinds
    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)

    # reconstruct the long frame keyed by (index, column)
    long_index = MultiIndex.from_arrays([index, columns])
    lp = DataFrame(sel.values['values'], index=long_index, columns=fields)

    if not long_index.has_duplicates:
        # no duplicates: sort then pivot directly
        lp = lp.sortlevel(level=0)
        wp = lp.to_panel()
    else:
        # duplicates: retain only the most recently appended row per
        # unique (index, column) pair before pivoting
        if not self._quiet:  # pragma: no cover
            print(
                'Duplicate entries in table, taking most recently '
                'appended')

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        index_map = lib.map_indices_object(tuple_index)

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        indexer = lib.merge_indexer_object(unique_tuples, index_map)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        # honor any column filter on the minor axis
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)
    return wp
def _read_panel_table(self, group, where=None):
    """Read a long-format HDF5 table node and build a wide Panel.

    Fast path: when all (major, minor) keys are unique, the values are
    group-sorted and assembled directly into a 3-d block, skipping the
    intermediate long DataFrame. Slow path: duplicates are resolved by
    keeping the most recently appended row per key, then pivoting.

    Parameters
    ----------
    group : PyTables group with a ``table`` node and kind metadata
    where : selection criteria forwarded to ``Selection``, optional

    Returns
    -------
    wp : Panel, minor axis optionally filtered by ``sel.column_filter``.
    """
    table = getattr(group, 'table')
    fields = table._v_attrs.fields

    # create the selection
    sel = Selection(table, where, table._v_attrs.index_kind)
    sel.select()
    # NOTE(review): fields is re-read here; redundant with the read above
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    values = sel.values['values']

    # factorize both axes to integer labels
    major = Factor.from_array(index)
    minor = Factor.from_array(columns)

    J, K = len(major.levels), len(minor.levels)
    # composite key uniquely identifies a (major, minor) cell
    key = major.labels * K + minor.labels

    if len(unique(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
        sorter = com._ensure_platform_int(sorter)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.ref_items, major.levels,
                                     minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print(
                'Duplicate entries in table, taking most recently '
                'appended')

        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index._tuple_index

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # positions of the surviving (last-appended) rows
        indexer = match(unique_tuples, tuple_index)
        indexer = com._ensure_platform_int(indexer)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        # restrict minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
def test_from_frame_level1_unsorted(self):
    """to_panel round trip with an unsorted level-1 MultiIndex."""
    ticker_pairs = [("MSFT", 3), ("MSFT", 2), ("AAPL", 2),
                    ("AAPL", 1), ("MSFT", 1)]
    frame = DataFrame(np.random.rand(5, 4),
                      index=MultiIndex.from_tuples(ticker_pairs))
    panel = frame.to_panel()
    # NOTE(review): other variants of this test use frame.xs(2, level=1);
    # confirm .ix[:, 2] selects the intended cross-section here.
    assert_frame_equal(panel.minor_xs(2), frame.ix[:, 2].sort_index())
def _read_panel_table(self, group, where=None):
    """Read a long-format HDF5 table node and build a wide Panel.

    Uses a direct block assembly when all (major, minor) keys are unique;
    otherwise falls back to deduplicating via the long DataFrame, keeping
    the most recently appended row for each key.

    Parameters
    ----------
    group : PyTables group holding a ``table`` node and kind metadata
    where : selection criteria forwarded to ``Selection``, optional

    Returns
    -------
    wp : Panel, minor axis optionally filtered by ``sel.column_filter``.
    """
    table = getattr(group, 'table')
    fields = table._v_attrs.fields

    # create the selection
    sel = Selection(table, where, table._v_attrs.index_kind)
    sel.select()
    # NOTE(review): duplicate read of fields; kept as-is
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values['column'],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values['index'],
                           table._v_attrs.index_kind)
    values = sel.values['values']

    # factorize both axes into integer labels
    major = Factor(index)
    minor = Factor(columns)

    J, K = len(major.levels), len(minor.levels)
    # composite integer key identifies each (major, minor) cell
    key = major.labels * K + minor.labels

    if len(unique(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(key, J * K)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.items, major.levels,
                                     minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print ('Duplicate entries in table, taking most recently '
                   'appended')

        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # positions of the surviving rows, one per unique key
        indexer = match(unique_tuples, tuple_index)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        # restrict minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp
class PanelData(object):
    """
    Abstraction to handle alternative formats for panel data

    Parameters
    ----------
    x : {ndarray, Series, DataFrame, Panel, DataArray}
        Input data
    var_name : str, optional
        Variable name to use when naming variables in NumPy arrays or
        xarray DataArrays
    convert_dummies : bool, optional
        Flag indicating whether pandas categoricals or string input data
        should be converted to dummy variables
    drop_first : bool, optional
        Flag indicating to drop first dummy category when converting

    Notes
    -----
    Data can be either 2- or 3-dimensional. The three key dimensions are

    * nvar - number of variables
    * nobs - number of time periods
    * nentity - number of entities

    All 3-d inputs should be in the form (nvar, nobs, nentity). With one
    exception, 2-d inputs are treated as (nobs, nentity) so that the input
    can be treated as-if being (1, nobs, nentity).

    If the 2-d input is a pandas DataFrame with a 2-level MultiIndex then
    the input is treated differently. Index level 0 is assumed to be
    entity. Index level 1 is time. The columns are the variables. This is
    the most precise format to use since pandas Panels do not preserve all
    variable type information across transformations between Panel and
    MultiIndex DataFrame. MultiIndex Series are also accepted and treated
    as single column MultiIndex DataFrames.

    Raises
    ------
    TypeError
        If the input type is not supported
    ValueError
        If the input has the wrong number of dimensions or a MultiIndex
        DataFrame does not have 2 levels
    """

    def __init__(self, x, var_name='x', convert_dummies=True,
                 drop_first=True):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        # Unwrap nested PanelData so the canonical storage is a DataFrame
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        # xarray input: convert to the equivalent pandas object first
        if isinstance(x, DataArray):
            if x.ndim not in (2, 3):
                raise ValueError('Only 2-d or 3-d DataArrays are supported')
            x = x.to_pandas()

        # A MultiIndex Series is promoted to a one-column frame
        if isinstance(x, Series) and isinstance(x.index, pd.MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                'Series can only be used with a 2-level MultiIndex')

        if isinstance(x, (Panel, DataFrame)):
            if isinstance(x, DataFrame):
                if isinstance(x.index, pd.MultiIndex):
                    if len(x.index.levels) != 2:
                        raise ValueError('DataFrame input must have a '
                                         'MultiIndex with 2 levels')
                    self._frame = x.copy()
                else:
                    # Plain (nobs, nentity) frame: stack into long format,
                    # keeping missing observations
                    self._frame = DataFrame(
                        {var_name: x.T.stack(dropna=False)})
            else:
                # Panel: canonical storage is the unfiltered long frame
                self._frame = x.swapaxes(1, 2).to_frame(
                    filter_observations=False)
        elif isinstance(x, ndarray):
            if not 2 <= x.ndim <= 3:
                raise ValueError('2 or 3-d array required for numpy input')
            if x.ndim == 2:
                # treat a 2-d array as a single-variable 3-d array
                x = x[None, :, :]
            k, t, n = x.shape
            variables = [var_name] if k == 1 else [
                var_name + '.{0}'.format(i) for i in range(k)
            ]
            entities = ['entity.{0}'.format(i) for i in range(n)]
            time = list(range(t))
            x = x.astype(np.float64)
            panel = Panel(x, items=variables, major_axis=time,
                          minor_axis=entities)
            self._frame = panel.swapaxes(1, 2).to_frame(
                filter_observations=False)
        else:
            raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                            'supported.')
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64)

        time_index = Series(self._frame.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype) or
                is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError('The index on the time dimension must be either '
                             'numeric or date-like')
        # Cache dimensions from the wide (panel) view
        self._k, self._t, self._n = self.panel.shape
        self._frame.index.levels[0].name = 'entity'
        self._frame.index.levels[1].name = 'time'

    @property
    def panel(self):
        """pandas Panel view of data"""
        return self._frame.to_panel().swapaxes(1, 2)

    @property
    def dataframe(self):
        """pandas DataFrame view of data"""
        return self._frame

    @property
    def values2d(self):
        """NumPy ndarray view of dataframe"""
        return self._frame.values

    @property
    def values3d(self):
        """NumPy ndarray view of panel"""
        return self.panel.values

    def drop(self, locs):
        """
        Drop observations from the dataframe view in place.

        Parameters
        ----------
        locs : ndarray
            Boolean array indicating observations to drop with reference
            to the dataframe view of the data
        """
        self._frame = self._frame.loc[~locs.ravel()]
        self._frame = self._minimize_multiindex(self._frame)
        # Recompute cached dimensions after the drop
        self._k, self._t, self._n = self.shape

    @property
    def shape(self):
        """Shape of panel view of data"""
        return self.panel.shape

    @property
    def ndim(self):
        """Number of dimensions of panel view of data"""
        return 3

    @property
    def isnull(self):
        """Locations with missing observations"""
        return np.any(self._frame.isnull(), axis=1)

    @property
    def nobs(self):
        """Number of time observations"""
        return self._t

    @property
    def nvar(self):
        """Number of variables"""
        return self._k

    @property
    def nentity(self):
        """Number of entities"""
        return self._n

    @property
    def vars(self):
        """List of variable names"""
        return list(self._frame.columns)

    @property
    def time(self):
        """List of time index names"""
        index = self._frame.index
        return list(index.levels[1][index.labels[1]].unique())

    @property
    def entities(self):
        """List of entity index names"""
        index = self._frame.index
        return list(index.levels[0][index.labels[0]].unique())

    @property
    def entity_ids(self):
        """
        Get array containing entity group membership information

        Returns
        -------
        id : ndarray
            2d array containing entity ids corresponding dataframe view
        """
        return np.asarray(self._frame.index.labels[0])[:, None]

    @property
    def time_ids(self):
        """
        Get array containing time membership information

        Returns
        -------
        id : ndarray
            2d array containing time ids corresponding dataframe view
        """
        return np.asarray(self._frame.index.labels[1])[:, None]

    def _demean_both(self, weights):
        """
        Entity and time demean

        Parameters
        ----------
        weights : PanelData, optional
            Weights to use in demeaning
        """
        # Demean along the larger dimension; absorb the smaller one with
        # dummies so the lstsq problem stays small
        if self.nentity > self.nobs:
            group = 'entity'
            dummy = 'time'
        else:
            group = 'time'
            dummy = 'entity'
        e = self.demean(group, weights=weights)
        d = self.dummies(dummy, drop_first=True)
        d.index = e.index
        d = PanelData(d).demean(group, weights=weights)
        d = d.values2d
        e = e.values2d
        # Residual after projecting out the dummy columns
        resid = e - d @ np.linalg.lstsq(d, e)[0]
        resid = DataFrame(resid, index=self._frame.index,
                          columns=self._frame.columns)
        return PanelData(resid)

    def weighted_general_demean(self, groups, weights):
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers
        weights : PanelData
            Weights to use in the weighted demeaning

        Returns
        -------
        demeaned : PanelData
            Weighted, demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        weights = weights.values2d
        groups = groups.values2d.astype(np.int64)

        def weighted_group_mean(df, weights, root_w, level):
            # weighted mean by group at one index level
            num = (root_w * df).groupby(level=level).transform('sum')
            denom = weights.groupby(level=level).transform('sum')
            return num.values / denom.values

        def demean_pass(frame, weights, root_w):
            # one sweep removing each level's weighted group mean
            levels = groups.shape[1]
            for level in range(levels):
                mu = weighted_group_mean(frame, weights, root_w, level)
                if level == 0:
                    frame = frame - root_w * mu
                else:
                    frame -= root_w * mu
            return frame

        # Swap out the index for better performance
        init_index = pd.DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)

        root_w = np.sqrt(weights)
        weights = pd.DataFrame(weights, index=init_index.index)
        wframe = root_w * self._frame
        wframe.index = init_index.index

        previous = wframe
        current = demean_pass(previous, weights, root_w)
        # Single grouping variable converges in one pass
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        # Per-column scale for the convergence test; constant columns are
        # excluded to avoid division by ~0
        exclude = np.ptp(self._frame.values, 0) == 0
        max_rmse = np.sqrt(self._frame.values.var(0).max())
        scale = self._frame.std().values
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]
        while np.max(np.abs(current.values - previous.values) / scale) > 1e-8:
            previous = current
            current = demean_pass(previous, weights, root_w)
        current.index = self._frame.index
        return PanelData(current)

    def general_demean(self, groups):
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers

        Returns
        -------
        demeaned : PanelData
            Demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        # TODO: Consolidate with weighted version
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        groups = groups.values2d.astype(np.int64)

        def demean_pass(frame):
            # one sweep removing each level's group mean
            levels = len(frame.index.levels) if isinstance(
                frame.index, pd.MultiIndex) else 1
            for i in range(levels):
                mu = frame.groupby(level=i).transform('mean')
                if i == 0:
                    frame = frame - mu
                else:
                    frame -= mu
            return frame

        # Swap out the index for better performance
        previous = self._frame.copy()
        init_index = pd.DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)
        previous.index = init_index.index
        current = demean_pass(previous)
        # Single grouping variable converges in one pass
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        # Per-column scale for convergence; exclude ~constant columns
        exclude = np.ptp(self._frame.values, 0) == 0
        max_rmse = np.sqrt(self._frame.values.var(0).max())
        scale = self._frame.std().values
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]
        while np.max(np.abs(current.values - previous.values) / scale) > 1e-8:
            previous = current
            current = demean_pass(current)
        current.index = self._frame.index
        return PanelData(current)

    def demean(self, group='entity', weights=None):
        """
        Demeans data by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        demeaned : PanelData
            Demeaned data according to type

        Notes
        -----
        If weights are provided, the values returned will be scaled by
        sqrt(weights) so that they can be used in WLS estimation.
        """
        if group not in ('entity', 'time', 'both'):
            raise ValueError
        if group == 'both':
            return self._demean_both(weights)

        level = 0 if group == 'entity' else 1
        if weights is None:
            group_mu = self._frame.groupby(level=level).transform('mean')
            return PanelData(self._frame - group_mu)
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).transform('sum')
            # reuse the frame to accumulate the weights themselves
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).transform('sum')
            group_mu = weighted_sum / sum_weights
            return PanelData(np.sqrt(w) * (self._frame - group_mu))

    def __str__(self):
        return self.__class__.__name__ + '\n' + str(self._frame)

    def __repr__(self):
        return self.__str__(
        ) + '\n' + self.__class__.__name__ + ' object, id: ' + hex(id(self))

    def _repr_html_(self):
        return self.__class__.__name__ + '<br/>' + self._frame._repr_html_()

    def count(self, group='entity'):
        """
        Count number of observations by entity or time

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning

        Returns
        -------
        count : DataFrame
            Counts according to type. Either (entity by var) or
            (time by var)
        """
        v = self.panel.values
        axis = 1 if group == 'entity' else 2
        count = np.sum(np.isfinite(v), axis=axis)
        index = self.panel.minor_axis if group == 'entity' else \
            self.panel.major_axis
        out = DataFrame(count.T, index=index, columns=self.vars)
        # reorder rows to match the entity/time listing
        reindex = self.entities if group == 'entity' else self.time
        out = out.loc[reindex].astype(np.int64)
        return out

    @property
    def index(self):
        """Return the index of the multi-index dataframe view"""
        return self._frame.index

    def copy(self):
        """Return a deep copy"""
        return PanelData(self._frame.copy(), var_name=self._var_name,
                         convert_dummies=self._convert_dummies,
                         drop_first=self._drop_first)

    def mean(self, group='entity', weights=None):
        """
        Compute data mean by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        mean : DataFrame
            Data mean according to type. Either (entity by var) or
            (time by var)
        """
        level = 0 if group == 'entity' else 1
        if weights is None:
            mu = self._frame.groupby(level=level).mean()
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).sum()
            # reuse the frame to accumulate the weights themselves
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).sum()
            mu = weighted_sum / sum_weights
        # reorder rows to match the entity/time listing
        reindex = self.entities if group == 'entity' else self.time
        out = mu.loc[reindex]
        return out

    def first_difference(self):
        """
        Compute first differences of variables

        Returns
        -------
        diffs : PanelData
            Differenced values
        """
        diffs = self.panel.values
        diffs = diffs[:, 1:] - diffs[:, :-1]
        diffs = Panel(diffs, items=self.panel.items,
                      major_axis=self.panel.major_axis[1:],
                      minor_axis=self.panel.minor_axis)
        diffs = diffs.swapaxes(1, 2).to_frame(filter_observations=False)
        # drop rows that have no difference (first period per entity)
        diffs = diffs.reindex(self._frame.index).dropna(how='any')
        return PanelData(diffs)

    @staticmethod
    def _minimize_multiindex(df):
        # Rebuild the MultiIndex after row drops so unused levels are
        # removed; temporarily rename index levels to avoid clashes with
        # column names during reset_index/set_index
        index_cols = list(df.index.names)
        orig_names = index_cols[:]
        for i, col in enumerate(index_cols):
            col = ensure_unique_column(col, df)
            index_cols[i] = col
        df.index.names = index_cols
        df = df.reset_index()
        df = df.set_index(index_cols)
        df.index.names = orig_names
        return df

    def dummies(self, group='entity', drop_first=False):
        """
        Generate entity or time dummies

        Parameters
        ----------
        group : {'entity', 'time'}, optional
            Type of dummies to generate
        drop_first : bool, optional
            Flag indicating that the dummy column corresponding to the
            first entity or time period should be dropped

        Returns
        -------
        dummies : DataFrame
            Dummy variables
        """
        if group not in ('entity', 'time'):
            raise ValueError
        axis = 0 if group == 'entity' else 1
        labels = self._frame.index.labels
        levels = self._frame.index.levels
        cat = pd.Categorical(levels[axis][labels[axis]])
        dummies = pd.get_dummies(cat, drop_first=drop_first)
        # order columns to match the entity/time listing
        cols = self.entities if group == 'entity' else self.time
        return dummies[[c for c in cols if c in dummies]].astype(np.float64)
# Exploration script: Series/DataFrame positional indexing semantics and the
# (long-deprecated) pandas Panel container built from Yahoo! Finance quotes.
# NOTE(review): uses .ix and pd.Panel, so it targets an old pandas release.
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import pandas_datareader.data as web

ser = Series(np.arange(3.))
print(ser)

ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
# Positional -1 is allowed here because the index is non-integer
print(ser2[-1])
print(ser.ix[:1])

ser3 = Series(range(3), index=[-5, 1, 3])
# iloc is strictly positional regardless of the integer index labels
print(ser3.iloc[2])
print(ser3.iloc[-1])

frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
print(frame.iloc[0])

# Panel of OHLC data keyed by ticker (performs network I/O)
pdata = pd.Panel({stk: web.get_data_yahoo(stk, '1/1/2010', '1/30/2010')
                  for stk in ['AAPL', 'IBM', 'MSFT', 'GOOG']})
print(pdata)
print(pdata.ix[:, '1/5/2010', :])

frame = pdata.ix[:, '1/5/2010':, :].to_frame()
print(frame)
print(frame.to_panel())
print('finish')  # fixed typo: previously printed 'finsih'
def _read_panel_table(self, group, where=None):
    """Read a long-format HDF5 table node and build a wide Panel.

    Fast path: when all (major, minor) keys are unique, assemble a 3-d
    block directly. Slow path: deduplicate via the long DataFrame,
    keeping the most recently appended row per key, then pivot.

    Parameters
    ----------
    group : PyTables group holding a ``table`` node and kind metadata
    where : selection criteria forwarded to ``Selection``, optional

    Returns
    -------
    wp : Panel, minor axis optionally filtered by ``sel.column_filter``.
    """
    from pandas.core.index import unique_int64, Factor
    from pandas.core.common import _asarray_tuplesafe
    from pandas.core.internals import BlockManager
    from pandas.core.reshape import block2d_to_block3d

    table = getattr(group, "table")

    # create the selection
    sel = Selection(table, where)
    sel.select()
    fields = table._v_attrs.fields

    columns = _maybe_convert(sel.values["column"],
                             table._v_attrs.columns_kind)
    index = _maybe_convert(sel.values["index"],
                           table._v_attrs.index_kind)
    values = sel.values["values"]

    # factorize both axes into integer labels
    major = Factor(index)
    minor = Factor(columns)

    J, K = len(major.levels), len(minor.levels)
    # composite integer key identifies each (major, minor) cell
    key = major.labels * K + minor.labels

    if len(unique_int64(key)) == len(key):
        sorter, _ = lib.groupsort_indexer(key, J * K)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields, (J, K),
                                   major_labels, minor_labels)

        mgr = BlockManager([block], [block.items, major.levels,
                                     minor.levels])
        wp = Panel(mgr)
    else:
        if not self._quiet:  # pragma: no cover
            print ("Duplicate entries in table, taking most recently "
                   "appended")

        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index.get_tuple_index()
        index_map = lib.map_indices_object(tuple_index)

        unique_tuples = lib.fast_unique(tuple_index)
        unique_tuples = _asarray_tuplesafe(unique_tuples)

        # map each unique tuple back to its surviving table position
        indexer = lib.merge_indexer_object(unique_tuples, index_map)

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()

    if sel.column_filter:
        # restrict minor axis to the requested columns
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)

    return wp