def _repr_html_(self, *args, **kwargs):
    """Return the HTML shown for this object in an IPython notebook.

    Only the header line is assembled here.  The table markup itself is
    produced by the wrapped pandas object's own ``_repr_html_``, to which
    ``*args`` and ``**kwargs`` are forwarded unchanged.
    """
    frame = self._frame

    # The Spectrum case stores a Series; wrap it in a one-column DataFrame
    # (labelled with the specifier) so the DataFrame HTML repr can be used.
    if isinstance(frame, Series):
        frame = DataFrame(frame, columns=[self.specifier])

    # Render the table first, then prepend the header.
    table_html = frame._repr_html_(*args, **kwargs)
    header_html = '<h4>%s</h4>' % ''.join(self._header_html)
    return header_html + '<br>' + table_html
class Repr(object):
    """Benchmarks timing the text and HTML reprs of large DataFrames."""

    goal_time = 0.2

    def setup(self):
        """Create the frames rendered by the timing methods."""
        n_rows = 10000
        values = np.random.randn(n_rows, 10)
        # Three index-level arrays, each 100 random values tiled 100 times.
        level_arrays = np.tile(np.random.randn(3, int(n_rows / 100)), 100)
        multi_index = MultiIndex.from_arrays(level_arrays)
        self.df3 = DataFrame(values, index=multi_index)
        self.df4 = DataFrame(values, index=np.random.randn(n_rows))
        self.df_tall = DataFrame(np.random.randn(n_rows, 10))
        self.df_wide = DataFrame(np.random.randn(10, n_rows))

    def time_html_repr_trunc_mi(self):
        """HTML repr of the MultiIndex-ed frame."""
        self.df3._repr_html_()

    def time_html_repr_trunc_si(self):
        """HTML repr of the single-level float-indexed frame."""
        self.df4._repr_html_()

    def time_repr_tall(self):
        """Text repr of the tall (many rows) frame."""
        repr(self.df_tall)

    def time_frame_repr_wide(self):
        """Text repr of the wide (many columns) frame."""
        repr(self.df_wide)
def spectra_to_html(spectra, *args, **kwargs):
    """Build the IPython-notebook HTML for a Spectra or Spectrum object.

    The result is an ``<h4>`` header (name, coloured shape, variation and
    spectral units, coloured iunit) followed by ``<br>`` and the HTML table
    produced by the underlying pandas ``DataFrame._repr_html_``.  ``*args``
    and ``**kwargs`` are forwarded to that pandas call.
    """
    delim = ' ' * 8

    # The shape is rendered in blue; objects with ndim <= 1 use the one-slot form.
    shape_template = ('<font color="#0000CD">(%s X %s)</font>'
                      if spectra.ndim > 1
                      else '<font color="#0000CD"> (%s)</font>')
    colorshape = shape_template % (spectra.shape)

    # Orange when an iunit is set, green when it is not.
    iunit_template = ('Iunit: <font color="#FF3300">%s</font>'
                      if spectra.iunit
                      else 'Iunit: <font color="#197519">%s</font>')
    countstring = iunit_template % spectra.full_iunit

    ftunit = getattr(spectra, 'full_varunit', 'None')
    spunit = getattr(spectra, 'full_specunit', 'None')

    header_fields = (spectra.name, colorshape, delim, ftunit, spunit,
                     delim, countstring)
    outline = "%s %s%s [%s X %s] %s %s\n" % header_fields

    # Spectrum stores a Series; wrap it so the DataFrame HTML repr applies.
    table_source = spectra._frame
    if isinstance(table_source, Series):
        table_source = DataFrame(table_source, columns=[spectra.specifier])

    table_html = table_source._repr_html_(*args, **kwargs)
    return '<h4>%s</h4>' % outline + '<br>' + table_html
def test_to_html_border_option(self): df = DataFrame({'A': [1, 2]}) with option_context('display.html.border', 0): result = df.to_html() assert 'border="0"' in result assert 'border="0"' in df._repr_html_()
def test_to_html_truncate_multi_index_sparse_off(self):
    """Compare the truncated HTML repr of a MultiIndex frame, sparsify off.

    The frame has the same two-level labels on rows and columns (8 x 8);
    ``display.max_rows`` and ``display.max_columns`` are set to 7 and
    ``display.multi_sparse`` to False before ``_repr_html_`` is compared
    with the expected markup.
    """
    # The test is skipped unconditionally, so nothing below this call runs.
    pytest.skip("unreliable on travis")
    arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
              ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
    df = DataFrame(index=arrays, columns=arrays)
    # NOTE(review): these set_option calls are not undone inside this test;
    # presumably a fixture resets display options -- confirm before un-skipping.
    fmt.set_option('display.max_rows', 7)
    fmt.set_option('display.max_columns', 7)
    fmt.set_option('display.multi_sparse', False)
    result = df._repr_html_()
    # NOTE(review): the literal below looks whitespace-collapsed (the markup
    # is all on one line); verify it against real to_html output before
    # relying on the equality check.
    expected = '''\ <div{0}> <table border="1" class="dataframe"> <thead> <tr> <th></th> <th></th> <th>bar</th> <th>bar</th> <th>baz</th> <th>...</th> <th>foo</th> <th>qux</th> <th>qux</th> </tr> <tr> <th></th> <th></th> <th>one</th> <th>two</th> <th>one</th> <th>...</th> <th>two</th> <th>one</th> <th>two</th> </tr> </thead> <tbody> <tr> <th>bar</th> <th>one</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>bar</th> <th>two</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>baz</th> <th>one</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>foo</th> <th>two</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>qux</th> <th>one</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>qux</th> <th>two</th> <td>NaN</td> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> <td>NaN</td> </tr> </tbody> </table> <p>8 rows × 8 columns</p> </div>'''.format(div_style)
    # On Python 2 the literal is a byte string; decode it before comparing.
    if compat.PY2:
        expected = expected.decode('utf-8')
    assert result == expected
def test_to_html_truncate(self):
    """Compare the truncated HTML repr of a 20 x 20 date-indexed frame.

    ``display.max_rows`` is set to 8 and ``display.max_columns`` to 4 before
    ``_repr_html_`` is compared with the expected markup.
    """
    # The test is skipped unconditionally, so nothing below this call runs.
    pytest.skip("unreliable on travis")
    index = pd.DatetimeIndex(start='20010101', freq='D', periods=20)
    df = DataFrame(index=index, columns=range(20))
    # NOTE(review): these set_option calls are not undone inside this test;
    # presumably a fixture resets display options -- confirm before un-skipping.
    fmt.set_option('display.max_rows', 8)
    fmt.set_option('display.max_columns', 4)
    result = df._repr_html_()
    # NOTE(review): the literal below looks whitespace-collapsed (the markup
    # is all on one line); verify it against real to_html output before
    # relying on the equality check.
    expected = '''\ <div{0}> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>0</th> <th>1</th> <th>...</th> <th>18</th> <th>19</th> </tr> </thead> <tbody> <tr> <th>2001-01-01</th> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2001-01-02</th> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2001-01-03</th> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2001-01-04</th> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>...</th> <td>...</td> <td>...</td> <td>...</td> <td>...</td> <td>...</td> </tr> <tr> <th>2001-01-17</th> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2001-01-18</th> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2001-01-19</th> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> </tr> <tr> <th>2001-01-20</th> <td>NaN</td> <td>NaN</td> <td>...</td> <td>NaN</td> <td>NaN</td> </tr> </tbody> </table> <p>20 rows × 20 columns</p> </div>'''.format(div_style)
    # On Python 2 the literal is a byte string; decode it before comparing.
    if compat.PY2:
        expected = expected.decode('utf-8')
    assert result == expected
def test_to_html_border_option(self): df = DataFrame({'A': [1, 2]}) with pd.option_context('html.border', 0): result = df.to_html() self.assertTrue('border="0"' in result) self.assertTrue('border="0"' in df._repr_html_())
def test_to_html_border_option(self): df = DataFrame({'A': [1, 2]}) with pd.option_context('display.html.border', 0): result = df.to_html() assert 'border="0"' in result assert 'border="0"' in df._repr_html_()
def _save_html(dst_path: str, df: pandas.DataFrame) -> None:
    """Write the notebook-style HTML rendering of ``df`` to ``dst_path``.

    The HTML is rendered before the destination is opened, so a rendering
    failure does not leave a truncated file behind.

    Args:
      dst_path: Destination path, opened for writing through
        ``tf.io.gfile.GFile``.
      df: The DataFrame to render.
    """
    html = df._repr_html_()  # pylint: disable=protected-access
    if html is None:
        # _repr_html_ returns None when the pandas option
        # display.notebook_repr_html is off; fall back to the plain table
        # so that write() is never handed None.
        html = df.to_html()
    with tf.io.gfile.GFile(dst_path, 'w') as f:
        f.write(html)
def pandas_df_to_html(df: DataFrame) -> Optional[str]:
    """Provide HTML formatting for pandas.DataFrame with rf_types.Tile in the columns.

    Columns are classified by the type of the value in their first row:
    Tile columns are rendered with ``tile_to_html``, geometry columns as
    (possibly truncated) WKT, and bytearray columns with ``binary_to_html``.
    Values in such a column that are not of the detected type fall back to
    their ``repr``.

    Returns None when the pandas option ``display.notebook_repr_html`` is
    off, and the plain pandas HTML repr when ``df`` has no rows.  The
    ``display.max_colwidth`` option is lifted only while rendering and is
    restored afterwards, even if rendering raises.
    """
    import pandas as pd

    # honor the existing options on display
    if not pd.get_option("display.notebook_repr_html"):
        return None

    if len(df) == 0:
        return df._repr_html_()

    # Captured up front because the geometry formatter truncates to the
    # user's width while the option itself is lifted during rendering.
    max_colwidth = pd.get_option('display.max_colwidth')

    tile_cols = []
    geom_cols = []
    bytearray_cols = []
    first_row = df.iloc[0]  # looked up once instead of once per column
    for c in df.columns:
        first_value = first_row[c]
        if isinstance(first_value, pyrasterframes.rf_types.Tile):
            tile_cols.append(c)
        elif isinstance(first_value, BaseGeometry):
            geom_cols.append(c)
        elif isinstance(first_value, bytearray):
            bytearray_cols.append(c)

    def _safe_tile_to_html(t):
        if isinstance(t, pyrasterframes.rf_types.Tile):
            return tile_to_html(t, fig_size=(2, 2))
        # handles case where objects in a column are not all Tile type
        return t.__repr__()

    def _safe_geom_to_html(g):
        if isinstance(g, BaseGeometry):
            wkt = g.wkt
            # max_colwidth may be None (no limit); only truncate when a
            # numeric limit is configured.
            if max_colwidth is not None and len(wkt) > max_colwidth:
                return wkt[:max_colwidth - 3] + '...'
            return wkt
        return g.__repr__()

    def _safe_bytearray_to_html(b):
        if isinstance(b, bytearray):
            return binary_to_html(b)
        return b.__repr__()

    # dict keyed by column with custom rendering function
    formatter = {c: _safe_tile_to_html for c in tile_cols}
    formatter.update({c: _safe_geom_to_html for c in geom_cols})
    formatter.update({c: _safe_bytearray_to_html for c in bytearray_cols})

    # Lifting max_colwidth avoids the tile markup being cut off as
    # `<img src="only up to fifty char...`; option_context guarantees the
    # previous value is put back.
    with pd.option_context('display.max_colwidth', None):
        return df.to_html(
            escape=False,  # keep the generated `<img` tags from becoming `&lt;img`
            formatters=formatter,  # apply custom format to columns
            render_links=True,  # common in raster frames
            notebook=True,
            max_rows=pd.get_option("display.max_rows"),  # retain existing options
            max_cols=pd.get_option("display.max_columns"),
            show_dimensions=pd.get_option("display.show_dimensions"),
        )
class PanelData(object):
    """
    Abstraction to handle alternative formats for panel data

    Parameters
    ----------
    x : {ndarray, Series, DataFrame, Panel, DataArray}
        Input data
    var_name : str, optional
        Variable name to use when naming variables in NumPy arrays or
        xarray DataArrays
    convert_dummies : bool, optional
        Flag indicating whether pandas categoricals or string input data
        should be converted to dummy variables
    drop_first : bool, optional
        Flag indicating to drop first dummy category when converting

    Notes
    -----
    Data can be either 2- or 3-dimensional. The three key dimensions are

      * nvar - number of variables
      * nobs - number of time periods
      * nentity - number of entities

    All 3-d inputs should be in the form (nvar, nobs, nentity). With one
    exception, 2-d inputs are treated as (nobs, nentity) so that the input
    can be treated as-if being (1, nobs, nentity).

    If the 2-d input is a pandas DataFrame with a 2-level MultiIndex then the
    input is treated differently.  Index level 0 is assumed to be entity.
    Index level 1 is time.  The columns are the variables.  This is the most
    precise format to use since pandas Panels do not preserve all variable
    type information across transformations between Panel and MultiIndex
    DataFrame. MultiIndex Series are also accepted and treated as single
    column MultiIndex DataFrames.

    Raises
    ------
    TypeError
        If the input type is not supported
    ValueError
        If the input has the wrong number of dimensions or a MultiIndex
        DataFrame does not have 2 levels
    """

    def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        # Unwrap an existing PanelData so the frame handling below applies.
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        # xarray is imported only when the input is not one of the directly
        # supported container types.
        if not isinstance(x, (Series, DataFrame, Panel, ndarray)):
            from xarray import DataArray
            if isinstance(x, DataArray):
                if x.ndim not in (2, 3):
                    raise ValueError(
                        'Only 2-d or 3-d DataArrays are supported')
                x = x.to_pandas()

        # A MultiIndex Series is promoted to a single-column DataFrame.
        if isinstance(x, Series) and isinstance(x.index, pd.MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                'Series can only be used with a 2-level MultiIndex')

        if isinstance(x, (Panel, DataFrame)):
            if isinstance(x, DataFrame):
                if isinstance(x.index, pd.MultiIndex):
                    if len(x.index.levels) != 2:
                        raise ValueError('DataFrame input must have a '
                                         'MultiIndex with 2 levels')
                    self._frame = x.copy()
                else:
                    # Plain 2-d frame: stack it into one variable named
                    # var_name.
                    self._frame = DataFrame(
                        {var_name: x.T.stack(dropna=False)})
            else:
                self._frame = x.swapaxes(1, 2).to_frame(filter_observations=False)
        elif isinstance(x, ndarray):
            if x.ndim not in (2, 3):
                raise ValueError('2 or 3-d array required for numpy input')
            if x.ndim == 2:
                # Add a leading axis so 2-d input is handled as 3-d.
                x = x[None, :, :]

            k, t, n = x.shape
            # Generated labels are zero-padded; the pad width is derived
            # from log10 of the number of variables / entities.
            var_str = var_name + '.{0:0>' + str(int(np.log10(k) + .01)) + '}'
            variables = [var_name] if k == 1 else [
                var_str.format(i) for i in range(k)
            ]
            entity_str = 'entity.{0:0>' + str(int(np.log10(n) + .01)) + '}'
            entities = [entity_str.format(i) for i in range(n)]
            time = list(range(t))
            x = x.astype(np.float64)
            panel = _Panel.from_array(x, items=variables, major_axis=time, minor_axis=entities)
            self._fake_panel = panel
            self._frame = panel.to_frame()
        else:
            raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                            'supported.')
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64)

        time_index = Series(self._frame.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype) or
                is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError('The index on the time dimension must be either '
                             'numeric or date-like')
        self._k, self._t, self._n = self.panel.shape
        self._frame.index.levels[0].name = 'entity'
        self._frame.index.levels[1].name = 'time'

    @property
    def panel(self):
        """pandas Panel view of data"""
        # Built from the frame on every access (no caching here).
        return _Panel(self._frame)

    @property
    def dataframe(self):
        """pandas DataFrame view of data"""
        return self._frame

    @property
    def values2d(self):
        """NumPy ndarray view of dataframe"""
        return self._frame.values

    @property
    def values3d(self):
        """NumPy ndarray view of panel"""
        return self.panel.values

    def drop(self, locs):
        """
        Drop observations from the data

        Parameters
        ----------
        locs : ndarray
            Boolean array indicating observations to drop with reference to
            the dataframe view of the data
        """
        self._frame = self._frame.loc[~locs.ravel()]
        self._frame = self._minimize_multiindex(self._frame)
        # The stored dimensions are refreshed after rows are removed.
        self._k, self._t, self._n = self.shape

    @property
    def shape(self):
        """Shape of panel view of data"""
        return self.panel.shape

    @property
    def ndim(self):
        """Number of dimensions of panel view of data"""
        return 3

    @property
    def isnull(self):
        """Locations with missing observations"""
        return np.any(self._frame.isnull(), axis=1)

    @property
    def nobs(self):
        """Number of time observations"""
        return self._t

    @property
    def nvar(self):
        """Number of variables"""
        return self._k

    @property
    def nentity(self):
        """Number of entities"""
        return self._n

    @property
    def vars(self):
        """List of variable names"""
        return list(self._frame.columns)

    @property
    def time(self):
        """List of time index names"""
        index = self._frame.index
        return list(index.levels[1][index.labels[1]].unique())

    @property
    def entities(self):
        """List of entity index names"""
        index = self._frame.index
        return list(index.levels[0][index.labels[0]].unique())

    @property
    def entity_ids(self):
        """
        Get array containing entity group membership information

        Returns
        -------
        id : ndarray
            2d array containing entity ids corresponding dataframe view
        """
        return np.asarray(self._frame.index.labels[0])[:, None]

    @property
    def time_ids(self):
        """
        Get array containing time membership information

        Returns
        -------
        id : ndarray
            2d array containing time ids corresponding dataframe view
        """
        return np.asarray(self._frame.index.labels[1])[:, None]

    def _demean_both(self, weights):
        """
        Entity and time demean

        Parameters
        ----------
        weights : PanelData, optional
            Weights to use in demeaning
        """
        # Group-demean along the dimension with more categories and build
        # dummies for the other one.
        if self.nentity > self.nobs:
            group = 'entity'
            dummy = 'time'
        else:
            group = 'time'
            dummy = 'entity'
        e = self.demean(group, weights=weights)
        d = self.dummies(dummy, drop_first=True)
        d.index = e.index
        d = PanelData(d).demean(group, weights=weights)
        d = d.values2d
        e = e.values2d
        # Residual of the demeaned data after a least-squares projection on
        # the demeaned dummies.
        resid = e - d @ np.linalg.lstsq(d, e)[0]
        resid = DataFrame(resid, index=self._frame.index, columns=self._frame.columns)

        return PanelData(resid)

    def general_demean(self, groups, weights=None):
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers
        weights : PanelData, optional
            Weights to use in the weighted demeaning

        Returns
        -------
        demeaned : PanelData
            Weighted, demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        if weights is None:
            # Default to unit weights.
            weights = PanelData(
                pd.DataFrame(np.ones((self._frame.shape[0], 1)),
                             index=self.index,
                             columns=['weights']))
        weights = weights.values2d
        groups = groups.values2d.astype(np.int64)

        # Cache of per-level weight sums, filled on first use of each level.
        weight_sum = {}

        def weighted_group_mean(df, weights, root_w, level):
            num = (root_w * df).groupby(level=level).transform('sum')
            if level in weight_sum:
                denom = weight_sum[level]
            else:
                denom = weights.groupby(level=level).transform('sum')
                weight_sum[level] = denom
            return num.values / denom.values

        def demean_pass(frame, weights, root_w):
            levels = groups.shape[1]
            for level in range(levels):
                mu = weighted_group_mean(frame, weights, root_w, level)
                # The first level creates a new frame; later levels update
                # that new frame in place.
                if level == 0:
                    frame = frame - root_w * mu
                else:
                    frame -= root_w * mu

            return frame

        # Swap out the index for better performance
        init_index = pd.DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)

        root_w = np.sqrt(weights)
        weights = pd.DataFrame(weights, index=init_index.index)
        wframe = root_w * self._frame
        wframe.index = init_index.index

        previous = wframe
        current = demean_pass(previous, weights, root_w)
        # With a single grouping column one pass is returned directly.
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        # Per-column scale for the convergence check; constant or
        # near-constant columns use max(scale, 1) instead.
        exclude = np.ptp(self._frame.values, 0) == 0
        max_rmse = np.sqrt(self._frame.values.var(0).max())
        scale = self._frame.std().values
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]

        # Repeat passes until the largest scaled change is at most 1e-8.
        while np.max(np.abs(current.values - previous.values) / scale) > 1e-8:
            previous = current
            current = demean_pass(previous, weights, root_w)
        current.index = self._frame.index

        return PanelData(current)

    def demean(self, group='entity', weights=None):
        """
        Demeans data by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time', 'both'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        demeaned : PanelData
            Demeaned data according to type

        Raises
        ------
        ValueError
            If group is not one of 'entity', 'time' or 'both'

        Notes
        -----
        If weights are provided, the values returned will be scaled by
        sqrt(weights) so that they can be used in WLS estimation.
        """
        if group not in ('entity', 'time', 'both'):
            raise ValueError
        if group == 'both':
            return self._demean_both(weights)

        level = 0 if group == 'entity' else 1
        if weights is None:
            group_mu = self._frame.groupby(level=level).transform('mean')
            return PanelData(self._frame - group_mu)
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).transform('sum')
            # Reuse the frame (and its index) to hold the weights so they
            # can be summed by the same groups.
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).transform('sum')
            group_mu = weighted_sum / sum_weights
            return PanelData(np.sqrt(w) * (self._frame - group_mu))

    def __str__(self):
        return self.__class__.__name__ + '\n' + str(self._frame)

    def __repr__(self):
        return self.__str__(
        ) + '\n' + self.__class__.__name__ + ' object, id: ' + hex(id(self))

    def _repr_html_(self):
        # Class name followed by the wrapped frame's own HTML repr.
        return self.__class__.__name__ + '<br/>' + self._frame._repr_html_()

    def count(self, group='entity'):
        """
        Count number of observations by entity or time

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in counting

        Returns
        -------
        count : DataFrame
            Counts according to type. Either (entity by var) or (time by var)
        """
        v = self.panel.values
        axis = 1 if group == 'entity' else 2
        # Only finite values are counted.
        count = np.sum(np.isfinite(v), axis=axis)

        index = self.panel.minor_axis if group == 'entity' else self.panel.major_axis
        out = DataFrame(count.T, index=index, columns=self.vars)
        reindex = self.entities if group == 'entity' else self.time
        out = out.loc[reindex].astype(np.int64)
        out.index.name = group
        return out

    @property
    def index(self):
        """Return the index of the multi-index dataframe view"""
        return self._frame.index

    def copy(self):
        """Return a deep copy"""
        return PanelData(self._frame.copy(), var_name=self._var_name,
                         convert_dummies=self._convert_dummies,
                         drop_first=self._drop_first)

    def mean(self, group='entity', weights=None):
        """
        Compute data mean by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in computing the mean
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        mean : DataFrame
            Data mean according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == 'entity' else 1
        if weights is None:
            mu = self._frame.groupby(level=level).mean()
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).sum()
            # Reuse the frame to hold the weights so they share its index.
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).sum()
            mu = weighted_sum / sum_weights

        reindex = self.entities if group == 'entity' else self.time
        out = mu.loc[reindex]

        return out

    def first_difference(self):
        """
        Compute first differences of variables

        Returns
        -------
        diffs : PanelData
            Differenced values
        """
        diffs = self.panel.values
        # Difference along the second axis of the panel values.
        diffs = diffs[:, 1:] - diffs[:, :-1]
        diffs = Panel(diffs, items=self.panel.items,
                      major_axis=self.panel.major_axis[1:],
                      minor_axis=self.panel.minor_axis)
        diffs = diffs.swapaxes(1, 2).to_frame(filter_observations=False)
        # Align to the original index and drop rows with any missing value.
        diffs = diffs.reindex(self._frame.index).dropna(how='any')
        return PanelData(diffs)

    @staticmethod
    def _minimize_multiindex(df):
        # Round-trips the index through reset_index/set_index using
        # temporary names from ensure_unique_column, then restores the
        # original names.
        # NOTE(review): presumably this rebuilds the MultiIndex so that its
        # levels only hold values still present -- confirm.
        index_cols = list(df.index.names)
        orig_names = index_cols[:]
        for i, col in enumerate(index_cols):
            col = ensure_unique_column(col, df)
            index_cols[i] = col
        df.index.names = index_cols
        df = df.reset_index()
        df = df.set_index(index_cols)
        df.index.names = orig_names
        return df

    def dummies(self, group='entity', drop_first=False):
        """
        Generate entity or time dummies

        Parameters
        ----------
        group : {'entity', 'time'}, optional
            Type of dummies to generate
        drop_first : bool, optional
            Flag indicating that the dummy column corresponding to the first
            entity or time period should be dropped

        Returns
        -------
        dummies : DataFrame
            Dummy variables

        Raises
        ------
        ValueError
            If group is not 'entity' or 'time'
        """
        if group not in ('entity', 'time'):
            raise ValueError
        axis = 0 if group == 'entity' else 1
        labels = self._frame.index.labels
        levels = self._frame.index.levels
        cat = pd.Categorical(levels[axis][labels[axis]])
        dummies = pd.get_dummies(cat, drop_first=drop_first)
        cols = self.entities if group == 'entity' else self.time
        # Keep only columns that survived drop_first, in entities/time order.
        return dummies[[c for c in cols if c in dummies]].astype(np.float64)
class PanelData(object):
    """
    Abstraction to handle alternative formats for panel data

    Parameters
    ----------
    x : {ndarray, Series, DataFrame, DataArray}
        Input data
    var_name : str, optional
        Variable name to use when naming variables in NumPy arrays or
        xarray DataArrays
    convert_dummies : bool, optional
        Flag indicating whether pandas categoricals or string input data
        should be converted to dummy variables
    drop_first : bool, optional
        Flag indicating to drop first dummy category when converting
    copy: bool, optional
        Flag indicating whether to copy the input. Only has an effect when
        x is a DataFrame

    Notes
    -----
    Data can be either 2- or 3-dimensional. The three key dimensions are

      * nvar - number of variables
      * nobs - number of time periods
      * nentity - number of entities

    All 3-d inputs should be in the form (nvar, nobs, nentity). With one
    exception, 2-d inputs are treated as (nobs, nentity) so that the input
    can be treated as-if being (1, nobs, nentity).

    If the 2-d input is a pandas DataFrame with a 2-level MultiIndex then the
    input is treated differently.  Index level 0 is assumed to be entity.
    Index level 1 is time.  The columns are the variables.  MultiIndex Series
    are also accepted and treated as single column MultiIndex DataFrames.

    Raises
    ------
    TypeError
        If the input type is not supported
    ValueError
        If the input has the wrong number of dimensions or a MultiIndex
        DataFrame does not have 2 levels
    """

    def __init__(
        self,
        x: "PanelDataLike",
        var_name: str = "x",
        convert_dummies: bool = True,
        drop_first: bool = True,
        copy: bool = True,
    ):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        # Lazily-built caches; see the panel and shape properties.
        self._panel: Optional[_Panel] = None
        self._shape: Optional[Tuple[int, int, int]] = None
        index_names = ["entity", "time"]
        # Unwrap an existing PanelData so the frame handling below applies.
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        if not isinstance(x, (Series, DataFrame, np.ndarray)):
            # xarray is optional: when it cannot be imported the input falls
            # through to the type checks below.
            try:
                from xarray import DataArray

                if isinstance(x, DataArray):
                    if x.ndim not in (2, 3):
                        raise ValueError(
                            "Only 2-d or 3-d DataArrays are supported")
                    if x.ndim == 2:
                        x = x.to_pandas()
                    else:
                        items: List[Hashable] = np.asarray(
                            x.coords[x.dims[0]]).tolist()
                        major: List[Hashable] = np.asarray(
                            x.coords[x.dims[1]]).tolist()
                        minor: List[Hashable] = np.asarray(
                            x.coords[x.dims[2]]).tolist()
                        values = x.values
                        x = panel_to_frame(values, items, major, minor, True)
            except ImportError:
                pass

        # A MultiIndex Series is promoted to a single-column DataFrame.
        if isinstance(x, Series) and isinstance(x.index, MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                "Series can only be used with a 2-level MultiIndex")

        if isinstance(x, DataFrame):
            if isinstance(x.index, MultiIndex):
                if len(x.index.levels) != 2:
                    raise ValueError("DataFrame input must have a "
                                     "MultiIndex with 2 levels")
                # Keep the caller's level names when present, otherwise use
                # the defaults "entity" / "time".
                if isinstance(self._original, (DataFrame, PanelData, Series)):
                    for i in range(2):
                        index_names[
                            i] = x.index.levels[i].name or index_names[i]
                self._frame = x
                if copy:
                    self._frame = self._frame.copy()
            else:
                # Plain 2-d frame: stack it into one variable named var_name.
                self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
        elif isinstance(x, np.ndarray):
            if x.ndim not in (2, 3):
                raise ValueError("2 or 3-d array required for numpy input")
            if x.ndim == 2:
                # Add a leading axis so 2-d input is handled as 3-d.
                x = x[None, :, :]

            k, t, n = x.shape
            # Generated labels are zero-padded; the pad width is derived
            # from log10 of the number of variables / entities.
            var_str = var_name + ".{0:0>" + str(int(np.log10(k) + 0.01)) + "}"
            variables = [var_name] if k == 1 else [
                var_str.format(i) for i in range(k)
            ]
            entity_str = "entity.{0:0>" + str(int(np.log10(n) + 0.01)) + "}"
            entities = [entity_str.format(i) for i in range(n)]
            time = list(range(t))
            assert isinstance(x, np.ndarray)
            x = x.astype(np.float64, copy=False)
            panel = _Panel.from_array(x,
                                      items=variables,
                                      major_axis=time,
                                      minor_axis=entities)
            self._fake_panel = panel
            self._frame = panel.to_frame()
        else:
            raise TypeError("Only ndarrays, DataFrames or DataArrays are "
                            "supported")
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64, copy=False)

        time_index = Series(self.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype)
                or is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError("The index on the time dimension must be either "
                             "numeric or date-like")
        # self._k, self._t, self._n = self.panel.shape
        self._k, self._t, self._n = self.shape
        self._frame.index.set_names(index_names, inplace=True)

    @property
    def panel(self) -> _Panel:
        """pandas Panel view of data"""
        # Built on first access and cached until drop() resets it.
        if self._panel is None:
            self._panel = _Panel(self._frame)
        assert self._panel is not None
        return self._panel

    @property
    def dataframe(self) -> DataFrame:
        """pandas DataFrame view of data"""
        return self._frame

    @property
    def values2d(self) -> NDArray:
        """NumPy ndarray view of dataframe"""
        return np.asarray(self._frame)

    @property
    def values3d(self) -> NDArray:
        """NumPy ndarray view of panel"""
        return self.panel.values

    def drop(self, locs: Union[Series, NDArray]) -> None:
        """
        Drop observations from the panel.

        Parameters
        ----------
        locs : ndarray
            Boolean array indicating observations to drop with reference to
            the dataframe view of the data
        """
        self._frame = self._frame.loc[~locs.ravel()]
        self._frame = self._minimize_multiindex(self._frame)
        # Reset panel and shape after a drop
        self._panel = self._shape = None
        self._k, self._t, self._n = self.shape

    @property
    def shape(self) -> Tuple[int, int, int]:
        """Shape of panel view of data"""
        # Computed from the frame (columns, unique level-1 values, unique
        # level-0 values) and cached until drop() resets it.
        if self._shape is None:
            k = self._frame.shape[1]
            index: Index = self._frame.index
            t = index.get_level_values(1).unique().shape[0]
            n = index.get_level_values(0).unique().shape[0]
            self._shape = k, t, n
        return self._shape

    @property
    def ndim(self) -> int:
        """Number of dimensions of panel view of data"""
        return 3

    @property
    def isnull(self) -> Series:
        """Locations with missing observations"""
        return self._frame.isnull().any(axis=1)

    @property
    def nobs(self) -> int:
        """Number of time observations"""
        return self._t

    @property
    def nvar(self) -> int:
        """Number of variables"""
        return self._k

    @property
    def nentity(self) -> int:
        """Number of entities"""
        return self._n

    @property
    def vars(self) -> List[Label]:
        """List of variable names"""
        return list(self._frame.columns)

    @property
    def time(self) -> List[Label]:
        """List of time index names"""
        index = self.index
        return list(index.levels[1][get_codes(index)[1]].unique())

    @property
    def entities(self) -> List[Label]:
        """List of entity index names"""
        index = self.index
        return list(index.levels[0][get_codes(index)[0]].unique())

    @property
    def entity_ids(self) -> NDArray:
        """
        Get array containing entity group membership information

        Returns
        -------
        ndarray
            2d array containing entity ids corresponding dataframe view
        """
        index = self.index
        return np.asarray(get_codes(index)[0])[:, None]

    @property
    def time_ids(self) -> NDArray:
        """
        Get array containing time membership information

        Returns
        -------
        ndarray
            2d array containing time ids corresponding dataframe view
        """
        index = self.index
        return np.asarray(get_codes(index)[1])[:, None]

    def _demean_both_low_mem(self,
                             weights: Optional["PanelData"]) -> "PanelData":
        # Two-way demeaning through general_demean, using the entity and
        # time ids as the two grouping columns.
        groups = PanelData(
            DataFrame(np.c_[self.entity_ids, self.time_ids],
                      index=self._frame.index),
            convert_dummies=False,
            copy=False,
        )
        return self.general_demean(groups, weights=weights)

    def _demean_both(self, weights: Optional["PanelData"]) -> "PanelData":
        """
        Entity and time demean

        Parameters
        ----------
        weights : PanelData, optional
            Weights to use in demeaning
        """
        # Group-demean along the dimension with more categories and build
        # dummies for the other one.
        if self.nentity > self.nobs:
            group = "entity"
            dummy = "time"
        else:
            group = "time"
            dummy = "entity"
        e = self.demean(group, weights=weights)
        d = self.dummies(dummy, drop_first=True)
        d.index = e.index
        d = PanelData(d).demean(group, weights=weights)
        d = d.values2d
        e = e.values2d
        # Residual of the demeaned data after a least-squares projection on
        # the demeaned dummies.
        resid = e - d @ lstsq(d, e, rcond=None)[0]
        resid = DataFrame(resid,
                          index=self._frame.index,
                          columns=self._frame.columns)

        return PanelData(resid)

    def general_demean(self,
                       groups: "PanelDataLike",
                       weights: Optional["PanelData"] = None) -> "PanelData":
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers
        weights : PanelData, optional
            Weights to use in the weighted demeaning

        Returns
        -------
        PanelData
            Weighted, demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        if weights is None:
            # Default to unit weights.
            weights = PanelData(
                DataFrame(
                    np.ones((self._frame.shape[0], 1)),
                    index=self.index,
                    columns=["weights"],
                ))
        weights = weights.values2d
        groups = groups.values2d.astype(np.int64, copy=False)

        # Cache of per-level weight sums, filled on first use of each level.
        weight_sum: Dict[int, Series] = {}

        def weighted_group_mean(df: DataFrame, weights: DataFrame,
                                root_w: NDArray, level: int) -> NDArray:
            num = (root_w * df).groupby(level=level).transform("sum")
            if level in weight_sum:
                denom = weight_sum[level]
            else:
                denom = weights.groupby(level=level).transform("sum")
                weight_sum[level] = denom
            return np.asarray(num) / np.asarray(denom)

        def demean_pass(frame: DataFrame, weights: DataFrame,
                        root_w: NDArray) -> DataFrame:
            levels = groups.shape[1]
            for level in range(levels):
                mu = weighted_group_mean(frame, weights, root_w, level)
                # The first level creates a new frame; later levels update
                # that new frame in place.
                if level == 0:
                    frame = frame - root_w * mu
                else:
                    frame -= root_w * mu

            return frame

        # Swap out the index for better performance
        init_index = DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)

        root_w = np.sqrt(weights)
        weights = DataFrame(weights, index=init_index.index)
        wframe = root_w * self._frame
        wframe.index = init_index.index

        previous = wframe
        current = demean_pass(previous, weights, root_w)
        # With a single grouping column one pass is returned directly.
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        # Per-column scale for the convergence check; constant or
        # near-constant columns use max(scale, 1) instead.
        exclude = np.ptp(np.asarray(self._frame), 0) == 0
        max_rmse = np.sqrt(np.asarray(self._frame).var(0).max())
        scale = np.asarray(self._frame.std())
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]

        # Repeat passes until the largest scaled change is at most 1e-8.
        while np.max(
                np.abs(np.asarray(current) - np.asarray(previous)) /
                scale) > 1e-8:
            previous = current
            current = demean_pass(previous, weights, root_w)
        current.index = self._frame.index

        return PanelData(current)

    def demean(
        self,
        group: str = "entity",
        weights: Optional["PanelData"] = None,
        return_panel: bool = True,
        low_memory: bool = False,
    ) -> Union["PanelData", np.ndarray]:
        """
        Demeans data by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time', 'both'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging
        return_panel : bool
            Flag indicating to return a PanelData object. If False, a 2-d
            NumPy representation of the panel is returned. Not consulted
            when group is 'both', which always returns a PanelData
        low_memory : bool
            Flag indicating whether to use a low memory implementation
            that avoids constructing dummy variables. Only relevant when
            group is 'both'

        Returns
        -------
        PanelData
            Demeaned data according to type

        Raises
        ------
        ValueError
            If group is not one of 'entity', 'time' or 'both'

        Notes
        -----
        If weights are provided, the values returned will be scaled by
        the square root of the weights so that they can be used in WLS
        estimation.
        """
        if group not in ("entity", "time", "both"):
            raise ValueError
        if group == "both":
            if not low_memory:
                return self._demean_both(weights)
            else:
                return self._demean_both_low_mem(weights)

        level = 0 if group == "entity" else 1
        if weights is None:
            group_mu = self._frame.groupby(level=level).transform("mean")
            out = self._frame - group_mu
            if not return_panel:
                return np.asarray(out)
            return PanelData(out)
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).transform("sum")
            # Reuse the frame (and its index) to hold the weights so they
            # can be summed by the same groups.
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).transform("sum")
            group_mu = weighted_sum / sum_weights
            out = np.sqrt(w) * (self._frame - group_mu)
            if not return_panel:
                return np.asarray(out)
            return PanelData(out)

    def __str__(self) -> str:
        return self.__class__.__name__ + "\n" + str(self._frame)

    def __repr__(self) -> str:
        return (self.__str__() + "\n" + self.__class__.__name__ +
                " object, id: " + hex(id(self)))

    def _repr_html_(self) -> str:
        # Class name followed by the wrapped frame's own HTML repr.
        return self.__class__.__name__ + "<br/>" + self._frame._repr_html_()

    def count(self, group: str = "entity") -> DataFrame:
        """
        Count number of observations by entity or time

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to count

        Returns
        -------
        DataFrame
            Counts according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == "entity" else 1
        reindex = self.entities if group == "entity" else self.time
        out = self._frame.groupby(level=level).count()

        return out.reindex(reindex)

    @property
    def index(self) -> MultiIndex:
        """Return the index of the multi-index dataframe view"""
        index = self._frame.index
        assert isinstance(index, MultiIndex)
        return index

    def copy(self) -> "PanelData":
        """Return a deep copy"""
        return PanelData(
            self._frame.copy(),
            var_name=self._var_name,
            convert_dummies=self._convert_dummies,
            drop_first=self._drop_first,
        )

    def mean(self,
             group: str = "entity",
             weights: Optional["PanelData"] = None) -> DataFrame:
        """
        Compute data mean by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in computing the mean
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        DataFrame
            Data mean according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == "entity" else 1
        if weights is None:
            mu = self._frame.groupby(level=level).mean()
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).sum()
            # Reuse the frame to hold the weights so they share its index.
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).sum()
            mu = weighted_sum / sum_weights

        reindex = self.entities if group == "entity" else self.time
        out = mu.reindex(reindex)

        return out

    def first_difference(self) -> "PanelData":
        """
        Compute first differences of variables

        Returns
        -------
        PanelData
            Differenced values
        """
        diffs = self.panel.values
        # Difference along the second axis of the panel values.
        diffs = diffs[:, 1:] - diffs[:, :-1]
        diffs = panel_to_frame(
            diffs,
            self.panel.items,
            self.panel.major_axis[1:],
            self.panel.minor_axis,
            True,
        )
        # Align to the original index and drop rows with any missing value.
        diffs = diffs.reindex(self._frame.index).dropna(how="any")
        return PanelData(diffs)

    @staticmethod
    def _minimize_multiindex(df: DataFrame) -> DataFrame:
        # Round-trips the index through reset_index/set_index using
        # temporary names from ensure_unique_column, then restores the
        # original names.
        # NOTE(review): presumably this rebuilds the MultiIndex so that its
        # levels only hold values still present -- confirm.
        index_cols = list(df.index.names)
        orig_names = index_cols[:]
        for i, col in enumerate(index_cols):
            col = ensure_unique_column(col, df)
            index_cols[i] = col
        df.index.names = index_cols
        df = df.reset_index()
        df = df.set_index(index_cols)
        df.index.names = orig_names
        return df

    def dummies(self,
                group: str = "entity",
                drop_first: bool = False) -> DataFrame:
        """
        Generate entity or time dummies

        Parameters
        ----------
        group : {'entity', 'time'}, optional
            Type of dummies to generate
        drop_first : bool, optional
            Flag indicating that the dummy column corresponding to the first
            entity or time period should be dropped

        Returns
        -------
        DataFrame
            Dummy variables

        Raises
        ------
        ValueError
            If group is not 'entity' or 'time'
        """
        if group not in ("entity", "time"):
            raise ValueError
        axis = 0 if group == "entity" else 1
        labels = get_codes(self.index)
        levels = self.index.levels
        cat = Categorical(levels[axis][labels[axis]])
        dummies = get_dummies(cat, drop_first=drop_first)
        cols = self.entities if group == "entity" else self.time
        # Keep only columns that survived drop_first, in entities/time order.
        return dummies[[c for c in cols if c in dummies]].astype(np.float64,
                                                                 copy=False)