def _repr_html_(self, *args, **kwargs):
    """ Ipython Notebook HTML appearance basically.  This only generates
    the colored header; under the hood, self._frame._repr_html_ calculates
    the table, including the proper size for optimal viewing and so on.
    """
    # Change output based on Spectra vs. Spectrum
    obj = self._frame
    
    # Series doesn't define _repr_html_, so render it through a one-column DataFrame
    if isinstance(obj, Series):
        obj = DataFrame(obj, columns=[self.specifier])

    # Call DataFrame _repr_html_
    dfhtml = obj._repr_html_(*args, **kwargs)
    return ('<h4>%s</h4>' % ''.join(self._header_html)) +'<br>'+ dfhtml
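The pattern above (prepend a styled header, then delegate table rendering to pandas) can be sketched as a small standalone class; the wrapper name and its attributes below are illustrative, not the original API:

from pandas import DataFrame, Series

class LabeledFrame(object):
    """Hypothetical wrapper showing the header-plus-table _repr_html_ pattern."""

    def __init__(self, frame, header='My data', specifier='values'):
        self._frame = frame
        self._header_html = header
        self.specifier = specifier

    def _repr_html_(self):
        obj = self._frame
        # Series lacks _repr_html_, so promote it to a one-column DataFrame
        if isinstance(obj, Series):
            obj = obj.to_frame(name=self.specifier)
        return '<h4>%s</h4><br>%s' % (self._header_html, obj._repr_html_())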
Example #2
class Repr(object):

    goal_time = 0.2

    def setup(self):
        nrows = 10000
        data = np.random.randn(nrows, 10)
        arrays = np.tile(np.random.randn(3, int(nrows / 100)), 100)
        idx = MultiIndex.from_arrays(arrays)
        self.df3 = DataFrame(data, index=idx)
        self.df4 = DataFrame(data, index=np.random.randn(nrows))
        self.df_tall = DataFrame(np.random.randn(nrows, 10))
        self.df_wide = DataFrame(np.random.randn(10, nrows))

    def time_html_repr_trunc_mi(self):
        self.df3._repr_html_()

    def time_html_repr_trunc_si(self):
        self.df4._repr_html_()

    def time_repr_tall(self):
        repr(self.df_tall)

    def time_frame_repr_wide(self):
        repr(self.df_wide)
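This class follows the airspeed velocity (asv) benchmark convention: setup() builds the frames once and each time_* method is what gets timed. A rough standalone equivalent using only the standard library, with sizes taken from the snippet above, might look like:

import timeit

import numpy as np
from pandas import DataFrame, MultiIndex

nrows = 10000
data = np.random.randn(nrows, 10)
arrays = np.tile(np.random.randn(3, nrows // 100), 100)
df_mi = DataFrame(data, index=MultiIndex.from_arrays(arrays))
df_tall = DataFrame(np.random.randn(nrows, 10))

print(timeit.timeit(df_mi._repr_html_, number=10))      # truncated HTML repr, MultiIndex
print(timeit.timeit(lambda: repr(df_tall), number=10))   # plain text repr, tall frame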
Example #3
def spectra_to_html(spectra, *args, **kwargs):
   """ HTML representation used for Spectra and Spectrum for ipython notebooks"""

   delim = '&nbsp;' * 8

   if spectra.ndim > 1:
      colorshape = '<font color="#0000CD">(%s X %s)</font>' % (spectra.shape)
   else:
      colorshape = '<font color="#0000CD"> (%s)</font>' % (spectra.shape)

   # Color iunit: green if no reference unit is set, orange otherwise
   if not spectra.iunit:
      countstring = 'Iunit:&nbsp;<font color="#197519">%s</font>' % spectra.full_iunit
   else:  # orange
      countstring = 'Iunit:&nbsp;<font color="#FF3300">%s</font>' % spectra.full_iunit

   ftunit = getattr(spectra, 'full_varunit', 'None')
   spunit = getattr(spectra, 'full_specunit', 'None')

   outline = "%s&nbsp%s%s [%s X %s] %s %s\n" % \
      (spectra.name, 
       colorshape,
       delim,
       ftunit,
       spunit,
       delim,
       countstring)        

   # Change output based on Spectra vs. Spectrum
   obj = spectra._frame
   if isinstance(obj, Series):
      obj = DataFrame(obj, columns=[spectra.specifier])

   # Call DataFrame _repr_html_
   dfhtml = obj._repr_html_(*args, **kwargs)
   return ('<h4>%s</h4>' % ''.join(outline)) +'<br>'+ dfhtml
Example #4
 def test_to_html_border_option(self):
     df = DataFrame({'A': [1, 2]})
     with option_context('display.html.border', 0):
         result = df.to_html()
         assert 'border="0"' in result
         assert 'border="0"' in df._repr_html_()
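The same option can be exercised outside a test harness; a minimal sketch (whether the border attribute appears in the output depends on the pandas version):

import pandas as pd

df = pd.DataFrame({'A': [1, 2]})
with pd.option_context('display.html.border', 0):
    html = df._repr_html_()  # table markup generated with the border option set to 0
# outside the context manager the default border option is restored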
Example #5
    def test_to_html_truncate_multi_index_sparse_off(self):
        pytest.skip("unreliable on travis")
        arrays = [['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
                  ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]
        df = DataFrame(index=arrays, columns=arrays)
        fmt.set_option('display.max_rows', 7)
        fmt.set_option('display.max_columns', 7)
        fmt.set_option('display.multi_sparse', False)
        result = df._repr_html_()
        expected = '''\
<div{0}>
<table border="1" class="dataframe">
  <thead>
    <tr>
      <th></th>
      <th></th>
      <th>bar</th>
      <th>bar</th>
      <th>baz</th>
      <th>...</th>
      <th>foo</th>
      <th>qux</th>
      <th>qux</th>
    </tr>
    <tr>
      <th></th>
      <th></th>
      <th>one</th>
      <th>two</th>
      <th>one</th>
      <th>...</th>
      <th>two</th>
      <th>one</th>
      <th>two</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>bar</th>
      <th>one</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>bar</th>
      <th>two</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>baz</th>
      <th>one</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>foo</th>
      <th>two</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>qux</th>
      <th>one</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>qux</th>
      <th>two</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
  </tbody>
</table>
<p>8 rows × 8 columns</p>
</div>'''.format(div_style)
        if compat.PY2:
            expected = expected.decode('utf-8')
        assert result == expected
Example #6
    def test_to_html_truncate(self):
        pytest.skip("unreliable on travis")
        index = pd.DatetimeIndex(start='20010101', freq='D', periods=20)
        df = DataFrame(index=index, columns=range(20))
        fmt.set_option('display.max_rows', 8)
        fmt.set_option('display.max_columns', 4)
        result = df._repr_html_()
        expected = '''\
<div{0}>
<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th></th>
      <th>0</th>
      <th>1</th>
      <th>...</th>
      <th>18</th>
      <th>19</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>2001-01-01</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>2001-01-02</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>2001-01-03</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>2001-01-04</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>...</th>
      <td>...</td>
      <td>...</td>
      <td>...</td>
      <td>...</td>
      <td>...</td>
    </tr>
    <tr>
      <th>2001-01-17</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>2001-01-18</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>2001-01-19</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
    <tr>
      <th>2001-01-20</th>
      <td>NaN</td>
      <td>NaN</td>
      <td>...</td>
      <td>NaN</td>
      <td>NaN</td>
    </tr>
  </tbody>
</table>
<p>20 rows × 20 columns</p>
</div>'''.format(div_style)
        if compat.PY2:
            expected = expected.decode('utf-8')
        assert result == expected
Example #7
 def test_to_html_border_option(self):
     df = DataFrame({'A': [1, 2]})
     with pd.option_context('html.border', 0):
         result = df.to_html()
         self.assertTrue('border="0"' in result)
         self.assertTrue('border="0"' in df._repr_html_())
Example #10
 def test_to_html_border_option(self):
     df = DataFrame({'A': [1, 2]})
     with pd.option_context('display.html.border', 0):
         result = df.to_html()
         assert 'border="0"' in result
         assert 'border="0"' in df._repr_html_()
Example #11
def _save_html(dst_path: str, df: pandas.DataFrame) -> None:
  with tf.io.gfile.GFile(dst_path, 'w') as f:
    f.write(df._repr_html_())  # pylint: disable=protected-access
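A hedged equivalent that writes to the local filesystem instead of going through tf.io.gfile (which additionally supports remote paths such as GCS):

import pandas

def save_html_local(dst_path: str, df: pandas.DataFrame) -> None:
  # Same idea without the TensorFlow dependency; local paths only.
  with open(dst_path, 'w') as f:
    f.write(df._repr_html_())  # pylint: disable=protected-access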
Example #12
def pandas_df_to_html(df: DataFrame) -> Optional[str]:
    """Provide HTML formatting for pandas.DataFrame with rf_types.Tile in the columns.  """
    import pandas as pd
    # honor the existing options on display
    if not pd.get_option("display.notebook_repr_html"):
        return None

    default_max_colwidth = pd.get_option(
        'display.max_colwidth')  # we'll try to politely put it back

    if len(df) == 0:
        return df._repr_html_()

    tile_cols = []
    geom_cols = []
    bytearray_cols = []
    for c in df.columns:
        if isinstance(df.iloc[0][c], pyrasterframes.rf_types.Tile):
            # if the first is a Tile try formatting
            tile_cols.append(c)
        elif isinstance(df.iloc[0][c], BaseGeometry):
            # if the first is a Geometry try formatting
            geom_cols.append(c)
        elif isinstance(df.iloc[0][c], bytearray):
            bytearray_cols.append(c)

    def _safe_tile_to_html(t):
        if isinstance(t, pyrasterframes.rf_types.Tile):
            return tile_to_html(t, fig_size=(2, 2))
        else:
            # handles case where objects in a column are not all Tile type
            return t.__repr__()

    def _safe_geom_to_html(g):
        if isinstance(g, BaseGeometry):
            wkt = g.wkt
            if len(wkt) > default_max_colwidth:
                return wkt[:default_max_colwidth - 3] + '...'
            else:
                return wkt
        else:
            return g.__repr__()

    def _safe_bytearray_to_html(b):
        if isinstance(b, bytearray):
            return binary_to_html(b)
        else:
            return b.__repr__()

    # dict keyed by column with custom rendering function
    formatter = {c: _safe_tile_to_html for c in tile_cols}
    formatter.update({c: _safe_geom_to_html for c in geom_cols})
    formatter.update({c: _safe_bytearray_to_html for c in bytearray_cols})

    # This is needed to avoid our tile being rendered as `<img src="only up to fifty char...`
    pd.set_option('display.max_colwidth', None)
    return_html = df.to_html(
        escape=False,  # means our `< img` does not get changed to `&lt; img`
        formatters=formatter,  # apply custom format to columns
        render_links=True,  # common in raster frames
        notebook=True,
        max_rows=pd.get_option("display.max_rows"),  # retain existing options
        max_cols=pd.get_option("display.max_columns"),
        show_dimensions=pd.get_option("display.show_dimensions"),
    )
    pd.set_option('display.max_colwidth', default_max_colwidth)
    return return_html
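The core mechanism the function relies on, per-column formatter callables combined with escape=False so the returned HTML survives, can be shown in isolation; the column names below are made up:

import pandas as pd

df = pd.DataFrame({'name': ['a', 'b'], 'value': [0.1234, 5.6789]})
html = df.to_html(
    escape=False,                                        # keep raw HTML emitted by the formatter
    formatters={'value': lambda v: '<b>%.2f</b>' % v},   # custom rendering for one column
)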
Example #13
class PanelData(object):
    """
    Abstraction to handle alternative formats for panel data

    Parameters
    ----------
    x : {ndarray, Series, DataFrame, Panel, DataArray}
       Input data
    var_name : str, optional
        Variable name to use when naming variables in NumPy arrays or
        xarray DataArrays
    convert_dummies : bool, optional
        Flag indicating whether pandas categoricals or string input data
        should be converted to dummy variables
    drop_first : bool, optional
        Flag indicating to drop first dummy category when converting

    Notes
    -----
    Data can be either 2- or 3-dimensional. The three key dimensions are

    * nvar - number of variables
    * nobs - number of time periods
    * nentity - number of entities

    All 3-d inputs should be in the form (nvar, nobs, nentity). With one
    exception, 2-d inputs are treated as (nobs, nentity) so that the input
    can be treated as-if being (1, nobs, nentity).

    If the 2-d input is a pandas DataFrame with a 2-level MultiIndex then the
    input is treated differently.  Index level 0 is assumed to be entity.
    Index level 1 is time.  The columns are the variables.  This is the most
    precise format to use since pandas Panels do not preserve all variable
    type information across transformations between Panel and MultiIndex
    DataFrame. MultiIndex Series are also accepted and treated as single
    column MultiIndex DataFrames.

    Raises
    ------
    TypeError
        If the input type is not supported
    ValueError
        If the input has the wrong number of dimensions or a MultiIndex
        DataFrame does not have 2 levels
    """
    def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        if not isinstance(x, (Series, DataFrame, Panel, ndarray)):
            from xarray import DataArray
            if isinstance(x, DataArray):
                if x.ndim not in (2, 3):
                    raise ValueError(
                        'Only 2-d or 3-d DataArrays are supported')
                x = x.to_pandas()

        if isinstance(x, Series) and isinstance(x.index, pd.MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                'Series can only be used with a 2-level MultiIndex')

        if isinstance(x, (Panel, DataFrame)):
            if isinstance(x, DataFrame):
                if isinstance(x.index, pd.MultiIndex):
                    if len(x.index.levels) != 2:
                        raise ValueError('DataFrame input must have a '
                                         'MultiIndex with 2 levels')
                    self._frame = x.copy()
                else:
                    self._frame = DataFrame(
                        {var_name: x.T.stack(dropna=False)})
            else:
                self._frame = x.swapaxes(1,
                                         2).to_frame(filter_observations=False)
        elif isinstance(x, ndarray):
            if x.ndim not in (2, 3):
                raise ValueError('2 or 3-d array required for numpy input')
            if x.ndim == 2:
                x = x[None, :, :]

            k, t, n = x.shape
            var_str = var_name + '.{0:0>' + str(int(np.log10(k) + .01)) + '}'
            variables = [var_name] if k == 1 else [
                var_str.format(i) for i in range(k)
            ]
            entity_str = 'entity.{0:0>' + str(int(np.log10(n) + .01)) + '}'
            entities = [entity_str.format(i) for i in range(n)]
            time = list(range(t))
            x = x.astype(np.float64)
            panel = _Panel.from_array(x,
                                      items=variables,
                                      major_axis=time,
                                      minor_axis=entities)
            self._fake_panel = panel
            self._frame = panel.to_frame()
        else:
            raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                            'supported.')
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64)

        time_index = Series(self._frame.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype)
                or is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError('The index on the time dimension must be either '
                             'numeric or date-like')
        self._k, self._t, self._n = self.panel.shape
        self._frame.index.levels[0].name = 'entity'
        self._frame.index.levels[1].name = 'time'

    @property
    def panel(self):
        """pandas Panel view of data"""
        return _Panel(self._frame)

    @property
    def dataframe(self):
        """pandas DataFrame view of data"""
        return self._frame

    @property
    def values2d(self):
        """NumPy ndarray view of dataframe"""
        return self._frame.values

    @property
    def values3d(self):
        """NumPy ndarray view of panel"""
        return self.panel.values

    def drop(self, locs):
        """
        Parameters
        ----------
        locs : ndarray
            Boolean array indicating observations to drop with reference to
            the dataframe view of the data
        """
        self._frame = self._frame.loc[~locs.ravel()]
        self._frame = self._minimize_multiindex(self._frame)
        self._k, self._t, self._n = self.shape

    @property
    def shape(self):
        """Shape of panel view of data"""
        return self.panel.shape

    @property
    def ndim(self):
        """Number of dimensions of panel view of data"""
        return 3

    @property
    def isnull(self):
        """Locations with missing observations"""
        return np.any(self._frame.isnull(), axis=1)

    @property
    def nobs(self):
        """Number of time observations"""
        return self._t

    @property
    def nvar(self):
        """Number of variables"""
        return self._k

    @property
    def nentity(self):
        """Number of entities"""
        return self._n

    @property
    def vars(self):
        """List of variable names"""
        return list(self._frame.columns)

    @property
    def time(self):
        """List of time index names"""
        index = self._frame.index
        return list(index.levels[1][index.labels[1]].unique())

    @property
    def entities(self):
        """List of entity index names"""
        index = self._frame.index
        return list(index.levels[0][index.labels[0]].unique())

    @property
    def entity_ids(self):
        """
        Get array containing entity group membership information

        Returns
        -------
        id : ndarray
            2d array containing entity ids corresponding dataframe view
        """
        return np.asarray(self._frame.index.labels[0])[:, None]

    @property
    def time_ids(self):
        """
        Get array containing time membership information

        Returns
        -------
        id : ndarray
            2d array containing time ids corresponding dataframe view
        """
        return np.asarray(self._frame.index.labels[1])[:, None]

    def _demean_both(self, weights):
        """
        Entity and time demean

        Parameters
        ----------
        weights : PanelData, optional
             Weights to use in demeaning
        """
        if self.nentity > self.nobs:
            group = 'entity'
            dummy = 'time'
        else:
            group = 'time'
            dummy = 'entity'
        e = self.demean(group, weights=weights)
        d = self.dummies(dummy, drop_first=True)
        d.index = e.index
        d = PanelData(d).demean(group, weights=weights)
        d = d.values2d
        e = e.values2d
        resid = e - d @ np.linalg.lstsq(d, e)[0]
        resid = DataFrame(resid,
                          index=self._frame.index,
                          columns=self._frame.columns)

        return PanelData(resid)

    def general_demean(self, groups, weights=None):
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers
        weights : PanelData, optional
            Weights to use in the weighted demeaning

        Returns
        -------
        demeaned : PanelData
            Weighted, demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        if weights is None:
            weights = PanelData(
                pd.DataFrame(np.ones((self._frame.shape[0], 1)),
                             index=self.index,
                             columns=['weights']))
        weights = weights.values2d
        groups = groups.values2d.astype(np.int64)

        weight_sum = {}

        def weighted_group_mean(df, weights, root_w, level):
            num = (root_w * df).groupby(level=level).transform('sum')
            if level in weight_sum:
                denom = weight_sum[level]
            else:
                denom = weights.groupby(level=level).transform('sum')
                weight_sum[level] = denom
            return num.values / denom.values

        def demean_pass(frame, weights, root_w):
            levels = groups.shape[1]
            for level in range(levels):
                mu = weighted_group_mean(frame, weights, root_w, level)
                if level == 0:
                    frame = frame - root_w * mu
                else:
                    frame -= root_w * mu

            return frame

        # Swap out the index for better performance
        init_index = pd.DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)

        root_w = np.sqrt(weights)
        weights = pd.DataFrame(weights, index=init_index.index)
        wframe = root_w * self._frame
        wframe.index = init_index.index

        previous = wframe
        current = demean_pass(previous, weights, root_w)
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        exclude = np.ptp(self._frame.values, 0) == 0
        max_rmse = np.sqrt(self._frame.values.var(0).max())
        scale = self._frame.std().values
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]

        while np.max(np.abs(current.values - previous.values) / scale) > 1e-8:
            previous = current
            current = demean_pass(previous, weights, root_w)
        current.index = self._frame.index

        return PanelData(current)

    def demean(self, group='entity', weights=None):
        """
        Demeans data by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        demeaned : PanelData
            Demeaned data according to type

        Notes
        -----
        If weights are provided, the values returned will be scaled by
        sqrt(weights) so that they can be used in WLS estimation.
        """
        if group not in ('entity', 'time', 'both'):
            raise ValueError
        if group == 'both':
            return self._demean_both(weights)

        level = 0 if group == 'entity' else 1
        if weights is None:
            group_mu = self._frame.groupby(level=level).transform('mean')
            return PanelData(self._frame - group_mu)
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).transform('sum')
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).transform('sum')
            group_mu = weighted_sum / sum_weights
            return PanelData(np.sqrt(w) * (self._frame - group_mu))

    def __str__(self):
        return self.__class__.__name__ + '\n' + str(self._frame)

    def __repr__(self):
        return self.__str__(
        ) + '\n' + self.__class__.__name__ + ' object, id: ' + hex(id(self))

    def _repr_html_(self):
        return self.__class__.__name__ + '<br/>' + self._frame._repr_html_()

    def count(self, group='entity'):
        """
        Count number of observations by entity or time

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning

        Returns
        -------
        count : DataFrame
            Counts according to type. Either (entity by var) or (time by var)
        """
        v = self.panel.values
        axis = 1 if group == 'entity' else 2
        count = np.sum(np.isfinite(v), axis=axis)

        index = self.panel.minor_axis if group == 'entity' else self.panel.major_axis
        out = DataFrame(count.T, index=index, columns=self.vars)
        reindex = self.entities if group == 'entity' else self.time
        out = out.loc[reindex].astype(np.int64)
        out.index.name = group
        return out

    @property
    def index(self):
        """Return the index of the multi-index dataframe view"""
        return self._frame.index

    def copy(self):
        """Return a deep copy"""
        return PanelData(self._frame.copy(),
                         var_name=self._var_name,
                         convert_dummies=self._convert_dummies,
                         drop_first=self._drop_first)

    def mean(self, group='entity', weights=None):
        """
        Compute data mean by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        mean : DataFrame
            Data mean according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == 'entity' else 1
        if weights is None:
            mu = self._frame.groupby(level=level).mean()
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).sum()
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).sum()
            mu = weighted_sum / sum_weights

        reindex = self.entities if group == 'entity' else self.time
        out = mu.loc[reindex]

        return out

    def first_difference(self):
        """
        Compute first differences of variables

        Returns
        -------
        diffs : PanelData
            Differenced values
        """
        diffs = self.panel.values
        diffs = diffs[:, 1:] - diffs[:, :-1]
        diffs = Panel(diffs,
                      items=self.panel.items,
                      major_axis=self.panel.major_axis[1:],
                      minor_axis=self.panel.minor_axis)
        diffs = diffs.swapaxes(1, 2).to_frame(filter_observations=False)
        diffs = diffs.reindex(self._frame.index).dropna(how='any')
        return PanelData(diffs)

    @staticmethod
    def _minimize_multiindex(df):
        index_cols = list(df.index.names)
        orig_names = index_cols[:]
        for i, col in enumerate(index_cols):
            col = ensure_unique_column(col, df)
            index_cols[i] = col
        df.index.names = index_cols
        df = df.reset_index()
        df = df.set_index(index_cols)
        df.index.names = orig_names
        return df

    def dummies(self, group='entity', drop_first=False):
        """
        Generate entity or time dummies

        Parameters
        ----------
        group : {'entity', 'time'}, optional
            Type of dummies to generate
        drop_first : bool, optional
            Flag indicating that the dummy column corresponding to the first
            entity or time period should be dropped

        Returns
        -------
        dummies : DataFrame
            Dummy variables
        """
        if group not in ('entity', 'time'):
            raise ValueError
        axis = 0 if group == 'entity' else 1
        labels = self._frame.index.labels
        levels = self._frame.index.levels
        cat = pd.Categorical(levels[axis][labels[axis]])
        dummies = pd.get_dummies(cat, drop_first=drop_first)
        cols = self.entities if group == 'entity' else self.time
        return dummies[[c for c in cols if c in dummies]].astype(np.float64)
Example #14
class PanelData(object):
    """
    Abstraction to handle alternative formats for panel data

    Parameters
    ----------
    x : {ndarray, Series, DataFrame, DataArray}
       Input data
    var_name : str, optional
        Variable name to use when naming variables in NumPy arrays or
        xarray DataArrays
    convert_dummies : bool, optional
        Flag indicating whether pandas categoricals or string input data
        should be converted to dummy variables
    drop_first : bool, optional
        Flag indicating to drop first dummy category when converting
    copy : bool, optional
        Flag indicating whether to copy the input. Only has an effect when
        x is a DataFrame.

    Notes
    -----
    Data can be either 2- or 3-dimensional. The three key dimensions are

    * nvar - number of variables
    * nobs - number of time periods
    * nentity - number of entities

    All 3-d inputs should be in the form (nvar, nobs, nentity). With one
    exception, 2-d inputs are treated as (nobs, nentity) so that the input
    can be treated as-if being (1, nobs, nentity).

    If the 2-d input is a pandas DataFrame with a 2-level MultiIndex then the
    input is treated differently.  Index level 0 is assumed to be entity.
    Index level 1 is time.  The columns are the variables.  MultiIndex Series
    are also accepted and treated as single column MultiIndex DataFrames.

    Raises
    ------
    TypeError
        If the input type is not supported
    ValueError
        If the input has the wrong number of dimensions or a MultiIndex
        DataFrame does not have 2 levels
    """
    def __init__(
        self,
        x: "PanelDataLike",
        var_name: str = "x",
        convert_dummies: bool = True,
        drop_first: bool = True,
        copy: bool = True,
    ):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        self._panel: Optional[_Panel] = None
        self._shape: Optional[Tuple[int, int, int]] = None
        index_names = ["entity", "time"]
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        if not isinstance(x, (Series, DataFrame, np.ndarray)):
            try:
                from xarray import DataArray

                if isinstance(x, DataArray):
                    if x.ndim not in (2, 3):
                        raise ValueError(
                            "Only 2-d or 3-d DataArrays are supported")
                    if x.ndim == 2:
                        x = x.to_pandas()
                    else:
                        items: List[Hashable] = np.asarray(
                            x.coords[x.dims[0]]).tolist()
                        major: List[Hashable] = np.asarray(
                            x.coords[x.dims[1]]).tolist()
                        minor: List[Hashable] = np.asarray(
                            x.coords[x.dims[2]]).tolist()
                        values = x.values
                        x = panel_to_frame(values, items, major, minor, True)
            except ImportError:
                pass

        if isinstance(x, Series) and isinstance(x.index, MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                "Series can only be used with a 2-level MultiIndex")

        if isinstance(x, DataFrame):
            if isinstance(x.index, MultiIndex):
                if len(x.index.levels) != 2:
                    raise ValueError("DataFrame input must have a "
                                     "MultiIndex with 2 levels")
                if isinstance(self._original, (DataFrame, PanelData, Series)):
                    for i in range(2):
                        index_names[
                            i] = x.index.levels[i].name or index_names[i]
                self._frame = x
                if copy:
                    self._frame = self._frame.copy()
            else:
                self._frame = DataFrame({var_name: x.T.stack(dropna=False)})
        elif isinstance(x, np.ndarray):
            if x.ndim not in (2, 3):
                raise ValueError("2 or 3-d array required for numpy input")
            if x.ndim == 2:
                x = x[None, :, :]

            k, t, n = x.shape
            var_str = var_name + ".{0:0>" + str(int(np.log10(k) + 0.01)) + "}"
            variables = [var_name] if k == 1 else [
                var_str.format(i) for i in range(k)
            ]
            entity_str = "entity.{0:0>" + str(int(np.log10(n) + 0.01)) + "}"
            entities = [entity_str.format(i) for i in range(n)]
            time = list(range(t))
            assert isinstance(x, np.ndarray)
            x = x.astype(np.float64, copy=False)
            panel = _Panel.from_array(x,
                                      items=variables,
                                      major_axis=time,
                                      minor_axis=entities)
            self._fake_panel = panel
            self._frame = panel.to_frame()
        else:
            raise TypeError("Only ndarrays, DataFrames or DataArrays are "
                            "supported")
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64, copy=False)

        time_index = Series(self.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype)
                or is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError("The index on the time dimension must be either "
                             "numeric or date-like")
        # self._k, self._t, self._n = self.panel.shape
        self._k, self._t, self._n = self.shape
        self._frame.index.set_names(index_names, inplace=True)

    @property
    def panel(self) -> _Panel:
        """pandas Panel view of data"""
        if self._panel is None:
            self._panel = _Panel(self._frame)
        assert self._panel is not None
        return self._panel

    @property
    def dataframe(self) -> DataFrame:
        """pandas DataFrame view of data"""
        return self._frame

    @property
    def values2d(self) -> NDArray:
        """NumPy ndarray view of dataframe"""
        return np.asarray(self._frame)

    @property
    def values3d(self) -> NDArray:
        """NumPy ndarray view of panel"""
        return self.panel.values

    def drop(self, locs: Union[Series, NDArray]) -> None:
        """
        Drop observations from the panel.

        Parameters
        ----------
        locs : ndarray
            Boolean array indicating observations to drop with reference to
            the dataframe view of the data
        """
        self._frame = self._frame.loc[~locs.ravel()]
        self._frame = self._minimize_multiindex(self._frame)
        # Reset panel and shape after a drop
        self._panel = self._shape = None
        self._k, self._t, self._n = self.shape

    @property
    def shape(self) -> Tuple[int, int, int]:
        """Shape of panel view of data"""
        if self._shape is None:
            k = self._frame.shape[1]
            index: Index = self._frame.index
            t = index.get_level_values(1).unique().shape[0]
            n = index.get_level_values(0).unique().shape[0]
            self._shape = k, t, n
        return self._shape

    @property
    def ndim(self) -> int:
        """Number of dimensions of panel view of data"""
        return 3

    @property
    def isnull(self) -> Series:
        """Locations with missing observations"""
        return self._frame.isnull().any(axis=1)

    @property
    def nobs(self) -> int:
        """Number of time observations"""
        return self._t

    @property
    def nvar(self) -> int:
        """Number of variables"""
        return self._k

    @property
    def nentity(self) -> int:
        """Number of entities"""
        return self._n

    @property
    def vars(self) -> List[Label]:
        """List of variable names"""
        return list(self._frame.columns)

    @property
    def time(self) -> List[Label]:
        """List of time index names"""
        index = self.index
        return list(index.levels[1][get_codes(index)[1]].unique())

    @property
    def entities(self) -> List[Label]:
        """List of entity index names"""
        index = self.index
        return list(index.levels[0][get_codes(index)[0]].unique())

    @property
    def entity_ids(self) -> NDArray:
        """
        Get array containing entity group membership information

        Returns
        -------
        ndarray
            2d array containing entity ids corresponding dataframe view
        """
        index = self.index
        return np.asarray(get_codes(index)[0])[:, None]

    @property
    def time_ids(self) -> NDArray:
        """
        Get array containing time membership information

        Returns
        -------
        ndarray
            2d array containing time ids corresponding dataframe view
        """
        index = self.index
        return np.asarray(get_codes(index)[1])[:, None]

    def _demean_both_low_mem(self,
                             weights: Optional["PanelData"]) -> "PanelData":
        groups = PanelData(
            DataFrame(np.c_[self.entity_ids, self.time_ids],
                      index=self._frame.index),
            convert_dummies=False,
            copy=False,
        )
        return self.general_demean(groups, weights=weights)

    def _demean_both(self, weights: Optional["PanelData"]) -> "PanelData":
        """
        Entity and time demean

        Parameters
        ----------
        weights : PanelData, optional
             Weights to use in demeaning
        """
        if self.nentity > self.nobs:
            group = "entity"
            dummy = "time"
        else:
            group = "time"
            dummy = "entity"
        e = self.demean(group, weights=weights)
        d = self.dummies(dummy, drop_first=True)
        d.index = e.index
        d = PanelData(d).demean(group, weights=weights)
        d = d.values2d
        e = e.values2d
        resid = e - d @ lstsq(d, e, rcond=None)[0]
        resid = DataFrame(resid,
                          index=self._frame.index,
                          columns=self._frame.columns)

        return PanelData(resid)

    def general_demean(self,
                       groups: "PanelDataLike",
                       weights: Optional["PanelData"] = None) -> "PanelData":
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers
        weights : PanelData, optional
            Weights to use in the weighted demeaning

        Returns
        -------
        PanelData
            Weighted, demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        if weights is None:
            weights = PanelData(
                DataFrame(
                    np.ones((self._frame.shape[0], 1)),
                    index=self.index,
                    columns=["weights"],
                ))
        weights = weights.values2d
        groups = groups.values2d.astype(np.int64, copy=False)

        weight_sum: Dict[int, Series] = {}

        def weighted_group_mean(df: DataFrame, weights: DataFrame,
                                root_w: NDArray, level: int) -> NDArray:
            num = (root_w * df).groupby(level=level).transform("sum")
            if level in weight_sum:
                denom = weight_sum[level]
            else:
                denom = weights.groupby(level=level).transform("sum")
                weight_sum[level] = denom
            return np.asarray(num) / np.asarray(denom)

        def demean_pass(frame: DataFrame, weights: DataFrame,
                        root_w: NDArray) -> DataFrame:
            levels = groups.shape[1]
            for level in range(levels):
                mu = weighted_group_mean(frame, weights, root_w, level)
                if level == 0:
                    frame = frame - root_w * mu
                else:
                    frame -= root_w * mu

            return frame

        # Swap out the index for better performance
        init_index = DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)

        root_w = np.sqrt(weights)
        weights = DataFrame(weights, index=init_index.index)
        wframe = root_w * self._frame
        wframe.index = init_index.index

        previous = wframe
        current = demean_pass(previous, weights, root_w)
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        exclude = np.ptp(np.asarray(self._frame), 0) == 0
        max_rmse = np.sqrt(np.asarray(self._frame).var(0).max())
        scale = np.asarray(self._frame.std())
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]

        while np.max(
                np.abs(np.asarray(current) - np.asarray(previous)) /
                scale) > 1e-8:
            previous = current
            current = demean_pass(previous, weights, root_w)
        current.index = self._frame.index

        return PanelData(current)

    def demean(
        self,
        group: str = "entity",
        weights: Optional["PanelData"] = None,
        return_panel: bool = True,
        low_memory: bool = False,
    ) -> Union["PanelData", np.ndarray]:
        """
        Demeans data by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time', 'both'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging
        return_panel : bool
            Flag indicating to return a PanelData object. If False, a 2-d
            NumPy representation of the panel is returned
        low_memory : bool
            Flag indicating whether to use a low memory implementation
            that avoids constructing dummy variables. Only relevant when
            group is 'both'

        Returns
        -------
        PanelData
            Demeaned data according to type

        Notes
        -----
        If weights are provided, the values returned will be scaled by
        the square root of the weights so that they can be used in WLS
        estimation.
        """
        if group not in ("entity", "time", "both"):
            raise ValueError
        if group == "both":
            if not low_memory:
                return self._demean_both(weights)
            else:
                return self._demean_both_low_mem(weights)

        level = 0 if group == "entity" else 1
        if weights is None:
            group_mu = self._frame.groupby(level=level).transform("mean")
            out = self._frame - group_mu
            if not return_panel:
                return np.asarray(out)
            return PanelData(out)
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).transform("sum")
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).transform("sum")
            group_mu = weighted_sum / sum_weights
            out = np.sqrt(w) * (self._frame - group_mu)
            if not return_panel:
                return np.asarray(out)
            return PanelData(out)

    def __str__(self) -> str:
        return self.__class__.__name__ + "\n" + str(self._frame)

    def __repr__(self) -> str:
        return (self.__str__() + "\n" + self.__class__.__name__ +
                " object, id: " + hex(id(self)))

    def _repr_html_(self) -> str:
        return self.__class__.__name__ + "<br/>" + self._frame._repr_html_()

    def count(self, group: str = "entity") -> DataFrame:
        """
        Count number of observations by entity or time

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to count

        Returns
        -------
        DataFrame
            Counts according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == "entity" else 1
        reindex = self.entities if group == "entity" else self.time
        out = self._frame.groupby(level=level).count()

        return out.reindex(reindex)

    @property
    def index(self) -> MultiIndex:
        """Return the index of the multi-index dataframe view"""
        index = self._frame.index
        assert isinstance(index, MultiIndex)
        return index

    def copy(self) -> "PanelData":
        """Return a deep copy"""
        return PanelData(
            self._frame.copy(),
            var_name=self._var_name,
            convert_dummies=self._convert_dummies,
            drop_first=self._drop_first,
        )

    def mean(self,
             group: str = "entity",
             weights: Optional["PanelData"] = None) -> DataFrame:
        """
        Compute data mean by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        DataFrame
            Data mean according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == "entity" else 1
        if weights is None:
            mu = self._frame.groupby(level=level).mean()
        else:
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).sum()
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).sum()
            mu = weighted_sum / sum_weights

        reindex = self.entities if group == "entity" else self.time
        out = mu.reindex(reindex)

        return out

    def first_difference(self) -> "PanelData":
        """
        Compute first differences of variables

        Returns
        -------
        PanelData
            Differenced values
        """
        diffs = self.panel.values
        diffs = diffs[:, 1:] - diffs[:, :-1]
        diffs = panel_to_frame(
            diffs,
            self.panel.items,
            self.panel.major_axis[1:],
            self.panel.minor_axis,
            True,
        )
        diffs = diffs.reindex(self._frame.index).dropna(how="any")
        return PanelData(diffs)

    @staticmethod
    def _minimize_multiindex(df: DataFrame) -> DataFrame:
        index_cols = list(df.index.names)
        orig_names = index_cols[:]
        for i, col in enumerate(index_cols):
            col = ensure_unique_column(col, df)
            index_cols[i] = col
        df.index.names = index_cols
        df = df.reset_index()
        df = df.set_index(index_cols)
        df.index.names = orig_names
        return df

    def dummies(self,
                group: str = "entity",
                drop_first: bool = False) -> DataFrame:
        """
        Generate entity or time dummies

        Parameters
        ----------
        group : {'entity', 'time'}, optional
            Type of dummies to generate
        drop_first : bool, optional
            Flag indicating that the dummy column corresponding to the first
            entity or time period should be dropped

        Returns
        -------
        DataFrame
            Dummy variables
        """
        if group not in ("entity", "time"):
            raise ValueError
        axis = 0 if group == "entity" else 1
        labels = get_codes(self.index)
        levels = self.index.levels
        cat = Categorical(levels[axis][labels[axis]])
        dummies = get_dummies(cat, drop_first=drop_first)
        cols = self.entities if group == "entity" else self.time
        return dummies[[c for c in cols if c in dummies]].astype(np.float64,
                                                                 copy=False)
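A hedged sketch of the 2-level (entity, time) MultiIndex layout described in the Notes above; the entity and variable names here are illustrative:

import numpy as np
import pandas as pd

entities = ['firm_a', 'firm_b']
periods = [2000, 2001, 2002]  # the time level must be numeric or date-like
index = pd.MultiIndex.from_product([entities, periods], names=['entity', 'time'])
df = pd.DataFrame(np.random.randn(len(index), 2), index=index, columns=['x1', 'x2'])
# PanelData(df) would report nvar=2, nobs=3, nentity=2 for this layout.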