Beispiel #1
0
 def test_from_frame_level1_unsorted(self):
     """Round-trip a frame with unsorted level-1 labels through a Panel."""
     pairs = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2), ('AAPL', 1), ('MSFT', 1)]
     frame = DataFrame(np.random.rand(5, 4),
                       index=MultiIndex.from_tuples(pairs))
     panel = frame.to_panel()
     # minor_xs(2) must agree with the sorted level-1 cross-section
     assert_frame_equal(panel.minor_xs(2),
                        frame.xs(2, level=1).sort_index())
Beispiel #2
0
    def test_to_panel_na_handling(self):
        """Missing (major, minor) pairs must surface as NaN in the Panel."""
        values = np.random.randint(0, 10, size=20).reshape((10, 2))
        labels = [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
                  [0, 1, 2, 3, 4, 5, 2, 3, 4, 5]]
        frame = DataFrame(values, index=labels)

        wide = frame.to_panel()
        # level-0 group 1 has no minor labels 0 or 1 -> those cells are null
        self.assert_(isnull(wide[0].ix[1, [0, 1]]).all())
Beispiel #3
0
    def test_to_panel_na_handling(self):
        # Build a long frame whose level-0 group 1 covers only minor labels
        # 2-5; widening to a Panel must fill the uncovered cells with NaN.
        df = DataFrame(np.random.randint(0, 10, size=20).reshape((10, 2)),
                       index=[[0, 0, 0, 0, 0, 0, 1, 1, 1, 1],
                              [0, 1, 2, 3, 4, 5, 2, 3, 4, 5]])

        panel = df.to_panel()
        # group 1 lacks minor labels 0 and 1 -> those positions must be null
        self.assert_(isnull(panel[0].ix[1, [0, 1]]).all())
Beispiel #4
0
 def test_from_frame_level1_unsorted(self):
     # to_panel() on a frame whose level-1 labels arrive unsorted: the
     # panel's minor cross-section must equal the sorted level-1 xs.
     tuples = [('MSFT', 3), ('MSFT', 2), ('AAPL', 2), ('AAPL', 1),
               ('MSFT', 1)]
     midx = MultiIndex.from_tuples(tuples)
     df = DataFrame(np.random.rand(5, 4), index=midx)
     p = df.to_panel()
     assert_frame_equal(p.minor_xs(2), df.xs(2, level=1).sort_index())
Beispiel #5
0
    def test_to_frame(self):
        """Exercise Panel.to_frame / DataFrame.to_panel round trips."""
        # filtered: default to_frame drops rows with any missing value
        filtered = self.panel.to_frame()
        expected = self.panel.to_frame().dropna(how="any")
        assert_frame_equal(filtered, expected)

        # unfiltered: keeping all observations makes the round trip lossless
        unfiltered = self.panel.to_frame(filter_observations=False)
        assert_panel_equal(unfiltered.to_panel(), self.panel)

        # names: to_frame assigns the default MultiIndex level names
        self.assertEqual(unfiltered.index.names, ["major", "minor"])

        # unsorted, round trip: to_panel must not require a sorted index
        df = self.panel.to_frame(filter_observations=False)
        unsorted = df.take(np.random.permutation(len(df)))
        pan = unsorted.to_panel()
        assert_panel_equal(pan, self.panel)

        # preserve original index names
        df = DataFrame(
            np.random.randn(6, 2), index=[["a", "a", "b", "b", "c", "c"], [0, 1, 0, 1, 0, 1]], columns=["one", "two"]
        )
        df.index.names = ["foo", "bar"]
        df.columns.name = "baz"

        # custom level/column names must survive to_panel().to_frame()
        rdf = df.to_panel().to_frame()
        self.assertEqual(rdf.index.names, df.index.names)
        self.assertEqual(rdf.columns.names, df.columns.names)
Beispiel #6
0
    def _read_panel_table(self, group, where=None):
        """Read a stored long-format table back into a wide Panel.

        Parameters
        ----------
        group : PyTables group holding a ``table`` node
        where : selection criterion, optional
            Forwarded to ``Selection`` to restrict the rows read.

        Returns
        -------
        wp : Panel
        """
        from pandas.core.common import _asarray_tuplesafe

        table = getattr(group, 'table')

        # create the selection
        sel = Selection(table, where)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'],
                               table._v_attrs.index_kind)
        # reconstruct the long (MultiIndex) frame before pivoting to a panel
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(sel.values['values'], index=long_index,
                       columns=fields)

        if not long_index.has_duplicates:
            lp = lp.sortlevel(level=0)
            wp = lp.to_panel()
        else:
            if not self._quiet:  # pragma: no cover
                print ('Duplicate entries in table, taking most recently '
                       'appended')

            # need a better algorithm
            # NOTE(review): "most recently appended" relies on
            # map_indices_object mapping each tuple to its last position
            # in the table -- confirm against the lib implementation.
            tuple_index = long_index.get_tuple_index()
            index_map = lib.map_indices_object(tuple_index)

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = lib.merge_indexer_object(unique_tuples, index_map)

            # keep one row per unique (index, column) pair
            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            # restrict the minor axis to the selected columns, sorted
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Beispiel #7
0
    def _read_panel_table(self, group, where=None):
        """Rebuild a wide Panel from a long-format HDF table.

        Parameters
        ----------
        group : PyTables group holding a ``table`` node
        where : selection criterion, optional

        Returns
        -------
        wp : Panel
            When duplicate (index, column) pairs exist, only one row per
            pair is kept.
        """
        from pandas.core.common import _asarray_tuplesafe

        table = getattr(group, 'table')

        # create the selection
        sel = Selection(table, where)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
        # reconstruct the long frame: (index, column) MultiIndex over values
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(sel.values['values'], index=long_index, columns=fields)

        if not long_index.has_duplicates:
            # clean case: sort by level 0 and pivot straight to a panel
            lp = lp.sortlevel(level=0)
            wp = lp.to_panel()
        else:
            if not self._quiet:  # pragma: no cover
                print(
                    'Duplicate entries in table, taking most recently '
                    'appended')

            # need a better algorithm
            # NOTE(review): de-duplication order depends on
            # map_indices_object keeping the last occurrence -- confirm.
            tuple_index = long_index.get_tuple_index()
            index_map = lib.map_indices_object(tuple_index)

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = lib.merge_indexer_object(unique_tuples, index_map)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            # restrict the minor axis to the selected columns, sorted
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Beispiel #8
0
    def _read_panel_table(self, group, where=None):
        """Read a long-format HDF table back into a wide Panel.

        Parameters
        ----------
        group : PyTables group containing a ``table`` node
        where : selection criterion, optional
            Forwarded to ``Selection`` to restrict the rows read.

        Returns
        -------
        wp : Panel
            Wide panel; when the stored (index, column) pairs contain
            duplicates, only one row per pair is kept.
        """
        table = getattr(group, 'table')

        # create the selection
        sel = Selection(table, where, table._v_attrs.index_kind)
        sel.select()
        # (removed a redundant duplicate read of this attribute above)
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor.from_array(index)
        minor = Factor.from_array(columns)

        J, K = len(major.levels), len(minor.levels)
        # encode each (major, minor) pair as a single integer key
        key = major.labels * K + minor.labels

        if len(unique(key)) == len(key):
            # fast path: no duplicates; group-sort then build the 3-d block
            sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
            sorter = com._ensure_platform_int(sorter)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block],
                               [block.ref_items, major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print(
                    'Duplicate entries in table, taking most recently '
                    'appended')

            # reconstruct the long frame, then de-duplicate row-wise
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index._tuple_index

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = match(unique_tuples, tuple_index)
            indexer = com._ensure_platform_int(indexer)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            # restrict the minor axis to the selected columns, sorted
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Beispiel #9
0
 def test_from_frame_level1_unsorted(self):
     """Panel built from an unsorted level-1 frame still aligns minor_xs."""
     pairs = [("MSFT", 3), ("MSFT", 2), ("AAPL", 2), ("AAPL", 1), ("MSFT", 1)]
     frame = DataFrame(np.random.rand(5, 4),
                       index=MultiIndex.from_tuples(pairs))
     panel = frame.to_panel()
     # compare against a sorted level-1 cross-section of the source frame
     assert_frame_equal(panel.minor_xs(2), frame.ix[:, 2].sort_index())
Beispiel #10
0
    def _read_panel_table(self, group, where=None):
        """Read a long-format HDF table back into a wide Panel.

        Parameters
        ----------
        group : PyTables group containing a ``table`` node
        where : selection criterion, optional
            Forwarded to ``Selection`` to restrict the rows read.

        Returns
        -------
        wp : Panel
            Wide panel; when the stored (index, column) pairs contain
            duplicates, only one row per pair is kept.
        """
        table = getattr(group, 'table')

        # create the selection
        sel = Selection(table, where, table._v_attrs.index_kind)
        sel.select()
        # (removed a redundant duplicate read of this attribute above)
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'],
                               table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor(index)
        minor = Factor(columns)

        J, K = len(major.levels), len(minor.levels)
        # encode each (major, minor) pair as a single integer key
        key = major.labels * K + minor.labels

        if len(unique(key)) == len(key):
            # fast path: no duplicates; group-sort then build the 3-d block
            sorter, _ = lib.groupsort_indexer(key, J * K)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block], [block.items,
                                         major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print('Duplicate entries in table, taking most recently '
                      'appended')

            # reconstruct the long frame, then de-duplicate row-wise
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index.get_tuple_index()

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = match(unique_tuples, tuple_index)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            # restrict the minor axis to the selected columns, sorted
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Beispiel #11
0
class PanelData(object):
    """
    Abstraction to handle alternative formats for panel data

    Parameters
    ----------
    x : {ndarray, Series, DataFrame, Panel, DataArray}
       Input data
    var_name : str, optional
        Variable name to use when naming variables in NumPy arrays or
        xarray DataArrays
    convert_dummies : bool, optional
        Flag indicating whether pandas categoricals or string input data
        should be converted to dummy variables
    drop_first : bool, optional
        Flag indicating to drop first dummy category when converting

    Notes
    -----
    Data can be either 2- or 3-dimensional. The three key dimensions are

      * nvar - number of variables
      * nobs - number of time periods
      * nentity - number of entities

    All 3-d inputs should be in the form (nvar, nobs, nentity). With one
    exception, 2-d inputs are treated as (nobs, nentity) so that the input
    can be treated as-if being (1, nobs, nentity).

    If the 2-d input is a pandas DataFrame with a 2-level MultiIndex then the
    input is treated differently.  Index level 0 is assumed to be entity.
    Index level 1 is time.  The columns are the variables.  This is the most
    precise format to use since pandas Panels do not preserve all variable
    type information across transformations between Panel and MultiIndex
    DataFrame. MultiIndex Series are also accepted and treated as single
    column MultiIndex DataFrames.

    Raises
    ------
    TypeError
        If the input type is not supported
    ValueError
        If the input has the wrong number of dimensions or a MultiIndex
        DataFrame does not have 2 levels
    """
    def __init__(self, x, var_name='x', convert_dummies=True, drop_first=True):
        self._var_name = var_name
        self._convert_dummies = convert_dummies
        self._drop_first = drop_first
        # Unwrap nested PanelData so canonical storage is always the frame
        if isinstance(x, PanelData):
            x = x.dataframe
        self._original = x

        if isinstance(x, DataArray):
            if x.ndim not in (2, 3):
                raise ValueError('Only 2-d or 3-d DataArrays are supported')
            x = x.to_pandas()

        # A MultiIndex Series is treated as a one-column MultiIndex frame
        if isinstance(x, Series) and isinstance(x.index, pd.MultiIndex):
            x = DataFrame(x)
        elif isinstance(x, Series):
            raise ValueError(
                'Series can only be used with a 2-level MultiIndex')

        if isinstance(x, (Panel, DataFrame)):
            if isinstance(x, DataFrame):
                if isinstance(x.index, pd.MultiIndex):
                    if len(x.index.levels) != 2:
                        raise ValueError('DataFrame input must have a '
                                         'MultiIndex with 2 levels')
                    self._frame = x.copy()
                else:
                    # Plain 2-d frame: single variable shaped
                    # (nobs, nentity); stacking the transpose yields the
                    # (entity, time) MultiIndex layout
                    self._frame = DataFrame(
                        {var_name: x.T.stack(dropna=False)})
            else:
                # Panel arrives as (nvar, nobs, nentity); swap axes so the
                # resulting MultiIndex is (entity, time)
                self._frame = x.swapaxes(1,
                                         2).to_frame(filter_observations=False)
        elif isinstance(x, ndarray):
            if not 2 <= x.ndim <= 3:
                raise ValueError('2 or 3-d array required for numpy input')
            if x.ndim == 2:
                # Promote (nobs, nentity) to (1, nobs, nentity)
                x = x[None, :, :]

            k, t, n = x.shape
            # Synthesize variable/entity names for unlabeled array input
            variables = [var_name] if k == 1 else [
                var_name + '.{0}'.format(i) for i in range(k)
            ]
            entities = ['entity.{0}'.format(i) for i in range(n)]
            time = list(range(t))
            x = x.astype(np.float64)
            panel = Panel(x,
                          items=variables,
                          major_axis=time,
                          minor_axis=entities)
            self._frame = panel.swapaxes(1,
                                         2).to_frame(filter_observations=False)
        else:
            raise TypeError('Only ndarrays, DataFrames, Panels or DataArrays '
                            'supported.')
        if convert_dummies:
            self._frame = expand_categoricals(self._frame, drop_first)
            self._frame = self._frame.astype(np.float64)

        # The time level must be orderable for lags/differences to make sense
        time_index = Series(self._frame.index.levels[1])
        if not (is_numeric_dtype(time_index.dtype)
                or is_datetime64_any_dtype(time_index.dtype)):
            raise ValueError('The index on the time dimension must be either '
                             'numeric or date-like')
        self._k, self._t, self._n = self.panel.shape
        # Standardize the MultiIndex level names
        self._frame.index.levels[0].name = 'entity'
        self._frame.index.levels[1].name = 'time'

    @property
    def panel(self):
        """pandas Panel view of data"""
        # Frame is (entity, time) x var; swap back to (var, time, entity)
        return self._frame.to_panel().swapaxes(1, 2)

    @property
    def dataframe(self):
        """pandas DataFrame view of data"""
        return self._frame

    @property
    def values2d(self):
        """NumPy ndarray view of dataframe"""
        return self._frame.values

    @property
    def values3d(self):
        """NumPy ndarray view of panel"""
        return self.panel.values

    def drop(self, locs):
        """
        Drop observations from the dataframe view

        Parameters
        ----------
        locs : ndarray
            Boolean array indicating observations to drop with reference to
            the dataframe view of the data
        """
        self._frame = self._frame.loc[~locs.ravel()]
        # Rebuild the index so unused levels do not linger after dropping
        self._frame = self._minimize_multiindex(self._frame)
        self._k, self._t, self._n = self.shape

    @property
    def shape(self):
        """Shape of panel view of data"""
        return self.panel.shape

    @property
    def ndim(self):
        """Number of dimensions of panel view of data"""
        return 3

    @property
    def isnull(self):
        """Locations with missing observations"""
        # True for any row of the dataframe view with at least one NaN
        return np.any(self._frame.isnull(), axis=1)

    @property
    def nobs(self):
        """Number of time observations"""
        return self._t

    @property
    def nvar(self):
        """Number of variables"""
        return self._k

    @property
    def nentity(self):
        """Number of entities"""
        return self._n

    @property
    def vars(self):
        """List of variable names"""
        return list(self._frame.columns)

    @property
    def time(self):
        """List of observed time index values, in order of appearance"""
        index = self._frame.index
        return list(index.levels[1][index.labels[1]].unique())

    @property
    def entities(self):
        """List of observed entity index values, in order of appearance"""
        index = self._frame.index
        return list(index.levels[0][index.labels[0]].unique())

    @property
    def entity_ids(self):
        """
        Get array containing entity group membership information

        Returns
        -------
        id : ndarray
            2d array containing entity ids corresponding to the dataframe
            view
        """
        return np.asarray(self._frame.index.labels[0])[:, None]

    @property
    def time_ids(self):
        """
        Get array containing time membership information

        Returns
        -------
        id : ndarray
            2d array containing time ids corresponding to the dataframe view
        """
        return np.asarray(self._frame.index.labels[1])[:, None]

    def _demean_both(self, weights):
        """
        Entity and time demean

        Parameters
        ----------
        weights : PanelData, optional
             Weights to use in demeaning
        """
        # Demean along the larger dimension; absorb the smaller one with
        # dummies so the dummy matrix stays as small as possible
        if self.nentity > self.nobs:
            group = 'entity'
            dummy = 'time'
        else:
            group = 'time'
            dummy = 'entity'
        e = self.demean(group, weights=weights)
        d = self.dummies(dummy, drop_first=True)
        d.index = e.index
        d = PanelData(d).demean(group, weights=weights)
        d = d.values2d
        e = e.values2d
        # Residual of e after projecting onto the demeaned dummies
        resid = e - d @ np.linalg.lstsq(d, e)[0]
        resid = DataFrame(resid,
                          index=self._frame.index,
                          columns=self._frame.columns)

        return PanelData(resid)

    def weighted_general_demean(self, groups, weights):
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers
        weights : PanelData
            Weights to use in the weighted demeaning

        Returns
        -------
        demeaned : PanelData
            Weighted, demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        weights = weights.values2d
        groups = groups.values2d.astype(np.int64)

        def weighted_group_mean(df, weights, root_w, level):
            # Weighted mean by group, broadcast back to row shape
            num = (root_w * df).groupby(level=level).transform('sum')
            denom = weights.groupby(level=level).transform('sum')
            return num.values / denom.values

        def demean_pass(frame, weights, root_w):
            # One sweep removing each group level's weighted mean in turn
            levels = groups.shape[1]
            for level in range(levels):
                mu = weighted_group_mean(frame, weights, root_w, level)
                if level == 0:
                    frame = frame - root_w * mu
                else:
                    frame -= root_w * mu

            return frame

        # Swap out the index for better performance
        init_index = pd.DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)

        root_w = np.sqrt(weights)
        weights = pd.DataFrame(weights, index=init_index.index)
        wframe = root_w * self._frame
        wframe.index = init_index.index

        previous = wframe
        current = demean_pass(previous, weights, root_w)
        # With a single grouping variable one pass is exact
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        # Build a per-column scale for the convergence test; constant or
        # near-constant columns get a floor so they cannot dominate it
        exclude = np.ptp(self._frame.values, 0) == 0
        max_rmse = np.sqrt(self._frame.values.var(0).max())
        scale = self._frame.std().values
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]

        # Iterate sweeps until the largest scaled change is below tolerance
        while np.max(np.abs(current.values - previous.values) / scale) > 1e-8:
            previous = current
            current = demean_pass(previous, weights, root_w)
        current.index = self._frame.index

        return PanelData(current)

    def general_demean(self, groups):
        """
        Multi-way demeaning using only groupby

        Parameters
        ----------
        groups : PanelData
            Arrays with the same size containing group identifiers

        Returns
        -------
        demeaned : PanelData
            Demeaned data according to groups

        Notes
        -----
        Iterates until convergence
        """
        # TODO: Consolidate with weighted version
        if not isinstance(groups, PanelData):
            groups = PanelData(groups)
        groups = groups.values2d.astype(np.int64)

        def demean_pass(frame):
            # One sweep removing each index level's group mean in turn
            levels = len(frame.index.levels) if isinstance(
                frame.index, pd.MultiIndex) else 1
            for i in range(levels):
                mu = frame.groupby(level=i).transform('mean')
                if i == 0:
                    frame = frame - mu
                else:
                    frame -= mu
            return frame

        # Swap out the index for better performance
        previous = self._frame.copy()
        init_index = pd.DataFrame(groups)
        init_index.set_index(list(init_index.columns), inplace=True)
        previous.index = init_index.index
        current = demean_pass(previous)

        # With a single grouping variable one pass is exact
        if groups.shape[1] == 1:
            current.index = self._frame.index
            return PanelData(current)

        # Per-column scale for the convergence test, floored for constant
        # or near-constant columns
        exclude = np.ptp(self._frame.values, 0) == 0
        max_rmse = np.sqrt(self._frame.values.var(0).max())
        scale = self._frame.std().values
        exclude = exclude | (scale < 1e-14 * max_rmse)
        replacement = np.maximum(scale, 1)
        scale[exclude] = replacement[exclude]
        scale = scale[None, :]

        # Iterate sweeps until the largest scaled change is below tolerance
        while np.max(np.abs(current.values - previous.values) / scale) > 1e-8:
            previous = current
            current = demean_pass(current)
        current.index = self._frame.index

        return PanelData(current)

    def demean(self, group='entity', weights=None):
        """
        Demeans data by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        demeaned : PanelData
            Demeaned data according to type

        Notes
        -----
        If weights are provided, the values returned will be scaled by
        sqrt(weights) so that they can be used in WLS estimation.
        """
        if group not in ('entity', 'time', 'both'):
            raise ValueError
        if group == 'both':
            return self._demean_both(weights)

        # entity lives on index level 0, time on level 1
        level = 0 if group == 'entity' else 1
        if weights is None:
            group_mu = self._frame.groupby(level=level).transform('mean')
            return PanelData(self._frame - group_mu)
        else:
            # Weighted group mean: sum(w * x) / sum(w) per group
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).transform('sum')
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).transform('sum')
            group_mu = weighted_sum / sum_weights
            return PanelData(np.sqrt(w) * (self._frame - group_mu))

    def __str__(self):
        return self.__class__.__name__ + '\n' + str(self._frame)

    def __repr__(self):
        return self.__str__(
        ) + '\n' + self.__class__.__name__ + ' object, id: ' + hex(id(self))

    def _repr_html_(self):
        return self.__class__.__name__ + '<br/>' + self._frame._repr_html_()

    def count(self, group='entity'):
        """
        Count number of observations by entity or time

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning

        Returns
        -------
        count : DataFrame
            Counts according to type. Either (entity by var) or (time by var)
        """
        v = self.panel.values
        # panel values are (var, time, entity): counting over axis 1 sums
        # across time (per entity); axis 2 sums across entities (per time)
        axis = 1 if group == 'entity' else 2
        count = np.sum(np.isfinite(v), axis=axis)

        index = self.panel.minor_axis if group == 'entity' else self.panel.major_axis
        out = DataFrame(count.T, index=index, columns=self.vars)
        # Reorder to match the observed entity/time ordering
        reindex = self.entities if group == 'entity' else self.time
        out = out.loc[reindex].astype(np.int64)
        return out

    @property
    def index(self):
        """Return the index of the multi-index dataframe view"""
        return self._frame.index

    def copy(self):
        """Return a deep copy"""
        return PanelData(self._frame.copy(),
                         var_name=self._var_name,
                         convert_dummies=self._convert_dummies,
                         drop_first=self._drop_first)

    def mean(self, group='entity', weights=None):
        """
        Compute data mean by either entity or time group

        Parameters
        ----------
        group : {'entity', 'time'}
            Group to use in demeaning
        weights : PanelData, optional
            Weights to implement weighted averaging

        Returns
        -------
        mean : DataFrame
            Data mean according to type. Either (entity by var) or (time by var)
        """
        level = 0 if group == 'entity' else 1
        if weights is None:
            mu = self._frame.groupby(level=level).mean()
        else:
            # Weighted group mean: sum(w * x) / sum(w) per group
            w = weights.values2d
            frame = self._frame.copy()
            frame = w * frame
            weighted_sum = frame.groupby(level=level).sum()
            frame.iloc[:, :] = w
            sum_weights = frame.groupby(level=level).sum()
            mu = weighted_sum / sum_weights

        # Reorder to match the observed entity/time ordering
        reindex = self.entities if group == 'entity' else self.time
        out = mu.loc[reindex]

        return out

    def first_difference(self):
        """
        Compute first differences of variables

        Returns
        -------
        diffs : PanelData
            Differenced values
        """
        # Difference along the time axis of the (var, time, entity) panel
        diffs = self.panel.values
        diffs = diffs[:, 1:] - diffs[:, :-1]
        diffs = Panel(diffs,
                      items=self.panel.items,
                      major_axis=self.panel.major_axis[1:],
                      minor_axis=self.panel.minor_axis)
        diffs = diffs.swapaxes(1, 2).to_frame(filter_observations=False)
        # Align back to the original rows and drop incomplete differences
        diffs = diffs.reindex(self._frame.index).dropna(how='any')
        return PanelData(diffs)

    @staticmethod
    def _minimize_multiindex(df):
        # Round-trip the MultiIndex through columns so that levels no longer
        # referenced by any row are discarded; index names are preserved.
        # Temporarily unique-ified names avoid collisions with data columns.
        index_cols = list(df.index.names)
        orig_names = index_cols[:]
        for i, col in enumerate(index_cols):
            col = ensure_unique_column(col, df)
            index_cols[i] = col
        df.index.names = index_cols
        df = df.reset_index()
        df = df.set_index(index_cols)
        df.index.names = orig_names
        return df

    def dummies(self, group='entity', drop_first=False):
        """
        Generate entity or time dummies

        Parameters
        ----------
        group : {'entity', 'time'}, optional
            Type of dummies to generate
        drop_first : bool, optional
            Flag indicating that the dummy column corresponding to the first
            entity or time period should be dropped

        Returns
        -------
        dummies : DataFrame
            Dummy variables
        """
        if group not in ('entity', 'time'):
            raise ValueError
        axis = 0 if group == 'entity' else 1
        labels = self._frame.index.labels
        levels = self._frame.index.levels
        cat = pd.Categorical(levels[axis][labels[axis]])
        dummies = pd.get_dummies(cat, drop_first=drop_first)
        # Reorder columns to the observed entity/time ordering, keeping only
        # categories that survived drop_first
        cols = self.entities if group == 'entity' else self.time
        return dummies[[c for c in cols if c in dummies]].astype(np.float64)
Beispiel #12
0
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import pandas_datareader.data as web

# Basic Series indexing: default integer index vs. labeled index.
ser = Series(np.arange(3.))
print(ser)

ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])
print(ser2[-1])  # positional fallback on a label-indexed Series

# NOTE(review): .ix is deprecated (removed in modern pandas); on a default
# integer index this label slice is endpoint-inclusive -- .loc[:1] is the
# modern equivalent.
print(ser.ix[:1])

ser3 = Series(range(3), index=[-5, 1, 3])
print(ser3.iloc[2])   # purely positional access
print(ser3.iloc[-1])

frame = DataFrame(np.arange(6).reshape(3, 2), index=[2, 0, 1])
print(frame.iloc[0])  # first row by position, regardless of index labels

# Fetch a month of quotes for four tickers into a 3-d Panel
# (items=ticker, major=date, minor=field). Requires network access.
pdata = pd.Panel(
    dict((stk, web.get_data_yahoo(stk, '1/1/2010', '1/30/2010')) for stk in ['AAPL', 'IBM', 'MSFT', 'GOOG']))
print(pdata)
print(pdata.ix[:, '1/5/2010', :])
# Slice from 1/5 onward and flatten to a long-format frame, then round-trip
frame = pdata.ix[:, '1/5/2010':, :].to_frame()
print(frame)
print(frame.to_panel())
print('finish')  # fixed typo: was 'finsih'
Beispiel #13
0
    def _read_panel_table(self, group, where=None):
        """Read a long-format HDF table back into a wide Panel.

        Parameters
        ----------
        group : PyTables group containing a ``table`` node
        where : selection criterion, optional
            Forwarded to ``Selection`` to restrict the rows read.

        Returns
        -------
        wp : Panel
        """
        from pandas.core.index import unique_int64, Factor
        from pandas.core.common import _asarray_tuplesafe
        from pandas.core.internals import BlockManager
        from pandas.core.reshape import block2d_to_block3d

        table = getattr(group, "table")

        # create the selection
        sel = Selection(table, where)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values["column"], table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values["index"], table._v_attrs.index_kind)
        values = sel.values["values"]

        major = Factor(index)
        minor = Factor(columns)

        J, K = len(major.levels), len(minor.levels)
        # encode each (major, minor) pair as a single integer key
        key = major.labels * K + minor.labels

        if len(unique_int64(key)) == len(key):
            # fast path: no duplicate pairs; group-sort then build 3-d block
            sorter, _ = lib.groupsort_indexer(key, J * K)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K), major_labels, minor_labels)

            mgr = BlockManager([block], [block.items, major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print ("Duplicate entries in table, taking most recently " "appended")

            # reconstruct the long frame, then de-duplicate row-wise
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            # NOTE(review): "most recently appended" relies on
            # map_indices_object keeping the last occurrence -- confirm.
            tuple_index = long_index.get_tuple_index()
            index_map = lib.map_indices_object(tuple_index)

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = lib.merge_indexer_object(unique_tuples, index_map)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            # restrict the minor axis to the selected columns, sorted
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp