Ejemplo n.º 1
0
    def equals(self, other):
        """
        Compare this object with another index.

        Returns True immediately when `other` is this very object, False
        when `other` is not an Index, and otherwise whatever Index.equals
        reports for an Index view of this object against `other`.
        """
        if self is other:
            return True

        if isinstance(other, Index):
            # Compare through a plain Index view of this object.
            return Index.equals(self.view(Index), other)

        return False
Ejemplo n.º 2
0
 def test_equals(self):
     """Check that equals() holds both ways against an object-dtype copy."""
     # assertTrue is used instead of assert_, which is a deprecated alias
     # in unittest (assumes self is a unittest.TestCase -- TODO confirm).
     same_values = Index(self.index, dtype=object)
     self.assertTrue(self.index.equals(same_values))
     self.assertTrue(same_values.equals(self.index))
Ejemplo n.º 3
0
class DataMatrix(DataFrame):
    """
    Matrix version of DataFrame, optimized for cross-section operations,
    numerical computation, and other operations that do not require the
    frame to change size.

    Parameters
    ----------
    data : numpy ndarray or dict of sequence-like objects
        Dict can contain Series, arrays, or list-like objects
        Constructor can understand various kinds of inputs
    index : Index or array-like
        Index to use for resulting frame (optional if provided dict of Series)
    columns : Index or array-like
        Required if data is ndarray
    dtype : dtype, default None (infer)
        Data type to force

    Notes
    -----
    Transposing is much faster in this regime, as is calling getXS, so please
    take note of this.
    """
    objects = None

    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 dtype=None,
                 objects=None):
        """
        Initialize values, index, columns and objects from the given input.

        Parameters
        ----------
        data : dict, ndarray, list, DataFrame or None
            A non-empty dict is handled by _initDict, an ndarray or list by
            _initMatrix, and a DataFrame is converted with toDataMatrix()
            when it is not already a DataMatrix. None or any other empty
            input yields a NaN-filled array sized from index and columns.
        index : Index or array-like, optional
        columns : Index or array-like, optional
        dtype : dtype, optional
            Passed on to _initDict / _initMatrix, or used for the NaN-filled
            array in the empty case.
        objects : DataMatrix or DataMatrix-constructor input, optional
            Stored as self.objects. In the DataFrame branch this argument
            (like index and columns) is replaced by the attributes of data.

        Raises
        ------
        Exception
            If data matches none of the supported input kinds.
        """

        if isinstance(data, dict) and len(data) > 0:
            (index, columns, values,
             objects) = self._initDict(data, index, columns, objects, dtype)
        elif isinstance(data, (np.ndarray, list)):
            (index, columns,
             values) = self._initMatrix(data, index, columns, dtype)

            # Bring the objects frame onto the same index as the values.
            if objects is not None:
                if isinstance(objects, DataMatrix):
                    if not objects.index.equals(index):
                        objects = objects.reindex(index)
                else:
                    objects = DataMatrix(objects, index=index)
        elif isinstance(data, DataFrame):
            # Everything is taken from the frame itself; the index, columns
            # and objects arguments are overwritten below.
            if not isinstance(data, DataMatrix):
                data = data.toDataMatrix()
            values = data.values
            index = data.index
            columns = data.columns
            objects = data.objects
        elif data is None or len(data) == 0:
            # this is a touch convoluted...
            if objects is not None:
                if isinstance(objects, DataMatrix):
                    # Identity (not equality) decides whether to reindex here.
                    if index is not None and objects.index is not index:
                        objects = objects.reindex(index)
                else:
                    objects = DataMatrix(objects, index=index)

                # The objects frame's index becomes the index of this matrix.
                index = objects.index

            if index is None:
                N = 0
                index = NULL_INDEX
            else:
                N = len(index)

            if columns is None:
                K = 0
                columns = NULL_INDEX
            else:
                K = len(columns)

            # No data supplied: an (N, K) array filled entirely with NaN.
            values = np.empty((N, K), dtype=dtype)
            values[:] = NaN
        else:
            raise Exception('DataMatrix constructor not properly called!')

        # values is assigned first because the index and columns setters
        # validate their lengths against self.values.shape.
        self.values = values
        self.index = index
        self.columns = columns
        self.objects = objects

    def _initDict(self, data, index, columns, objects, dtype):
        """
        Segregate Series based on type and coerce into matrices.

        Each dict entry is aligned to a common index and turned into a
        Series. Entries whose dtype is a bool, float or int type become
        columns of the values matrix; all other entries are collected as
        object columns.

        Needs to handle a lot of exceptional cases.

        Somehow this got outrageously complicated

        Returns
        -------
        (index, columns, values, objects)
        """
        # pre-filter out columns if we passed it
        if columns is not None:
            colset = set(columns)
            data = dict((k, v) for k, v in data.iteritems() if k in colset)

        index = _extract_index(data, index)

        # Object columns passed in as a dict are merged with those found in
        # data below.
        objectDict = {}
        if objects is not None and isinstance(objects, dict):
            objectDict.update(objects)

        valueDict = {}
        for k, v in data.iteritems():
            if isinstance(v, Series):
                if v.index is not index:
                    # Forces alignment. No need to copy data since we
                    # are putting it into an ndarray later
                    v = v.reindex(index)
            else:
                if isinstance(v, dict):
                    # Look up every index label, NaN where it is missing.
                    v = [v.get(i, NaN) for i in index]
                else:
                    assert (len(v) == len(index))

                # Try the requested dtype first; fall back to letting
                # Series pick one when that fails.
                try:
                    v = Series(v, dtype=dtype, index=index)
                except Exception:
                    v = Series(v, index=index)

            if issubclass(v.dtype.type, (np.bool_, float, int)):
                valueDict[k] = v
            else:
                objectDict[k] = v

        if columns is None:
            columns = Index(_try_sort(valueDict))
            objectColumns = Index(_try_sort(objectDict))
        else:
            # Keep the caller's column order, split by kind.
            objectColumns = Index([c for c in columns if c in objectDict])
            columns = Index([c for c in columns if c not in objectDict])

        if len(valueDict) == 0:
            # Only object columns: they become the values matrix itself.
            # NOTE(review): `objects` is returned as passed in here, so a
            # dict argument would come back as a plain dict -- confirm this
            # is intended.
            dtype = np.object_
            valueDict = objectDict
            columns = objectColumns
        else:
            dtypes = set(v.dtype for v in valueDict.values())

            # Mixed value dtypes are stored as float; a single dtype is kept.
            if len(dtypes) > 1:
                dtype = np.float_
            else:
                dtype = list(dtypes)[0]

            if len(objectDict) > 0:
                new_objects = DataMatrix(objectDict,
                                         dtype=np.object_,
                                         index=index,
                                         columns=objectColumns)
                if isinstance(objects, DataMatrix):
                    objects = objects.join(new_objects, how='left')
                else:
                    objects = new_objects

        values = np.empty((len(index), len(columns)), dtype=dtype)

        # Requested columns with no data are filled with NaN.
        for i, col in enumerate(columns):
            if col in valueDict:
                values[:, i] = valueDict[col]
            else:
                values[:, i] = np.NaN

        return index, columns, values, objects

    def _initMatrix(self, values, index, columns, dtype):
        """
        Normalize array-like input into (index, columns, values).

        Non-ndarray input is converted with np.array (string data is stored
        with object dtype). One-dimensional input becomes a single column,
        or an array of shape (0, 0) when it is empty. A failed cast to
        `dtype` is ignored. A missing index or columns defaults to
        np.arange over that axis, or NULL_INDEX when the axis is empty.
        """
        if not isinstance(values, np.ndarray):
            converted = np.array(values)
            if issubclass(converted.dtype.type, basestring):
                # Rebuild string data as an array of Python objects.
                converted = np.array(values, dtype=object, copy=True)
            values = converted

        if values.ndim == 1:
            n_items = values.shape[0]
            n_cols = 1 if n_items else 0
            values = values.reshape((n_items, n_cols))

        if dtype is not None:
            # Best effort: keep the data unchanged if the cast fails.
            try:
                values = values.astype(dtype)
            except Exception:
                pass

        n_rows, n_cols = values.shape

        if index is None:
            index = np.arange(n_rows) if n_rows else NULL_INDEX

        if columns is None:
            columns = np.arange(n_cols) if n_cols else NULL_INDEX

        return index, columns, values

    @property
    def _constructor(self):
        """Return the DataMatrix class itself."""
        return DataMatrix

    # Because of DataFrame property
    values = None

    def __array__(self):
        """Return self.values as this object's array representation."""
        return self.values

    def __array_wrap__(self, result):
        """
        Wrap `result` in a DataMatrix using this object's index and columns.

        The objects frame is not passed on to the new DataMatrix.
        """
        return DataMatrix(result, index=self.index, columns=self.columns)

#-------------------------------------------------------------------------------
# DataMatrix-specific implementation of private API

    def _join_on(self, other, on):
        """
        Join `other` onto this frame, using column `on` as the lookup keys.

        The values in self[on] are matched against other's index through
        tseries.getMergeVec. Rows of `other` are taken in that order, rows
        where the returned mask is False are set to NaN, and the result is
        left-joined to this frame.

        Parameters
        ----------
        other : DataFrame-like
            Must provide index, columns and values; an `objects` attribute
            is used when present.
        on : column label in this frame

        Returns
        -------
        self, unchanged, when other has an empty index; otherwise the
        result of self.join(...).

        Raises
        ------
        Exception
            If `on` is not a column of this frame.
        """
        if len(other.index) == 0:
            return self

        if on not in self:
            raise Exception('%s column not contained in this frame!' % on)

        fillVec, mask = tseries.getMergeVec(self[on], other.index.indexMap)

        # `~` instead of unary minus: NumPy rejects `-` on boolean arrays.
        # Assumes mask is a boolean array -- TODO confirm against getMergeVec.
        unmatched = ~mask

        tmpMatrix = other.values.take(fillVec, axis=0)
        tmpMatrix[unmatched] = NaN

        seriesDict = dict(
            (col, tmpMatrix[:, j]) for j, col in enumerate(other.columns))

        # Default of None so that frames without an `objects` attribute do
        # not raise AttributeError here.
        objects = getattr(other, 'objects', None)
        if objects:
            tmpMat = objects.values.take(fillVec, axis=0)
            tmpMat[unmatched] = NaN
            objDict = dict(
                (col, tmpMat[:, j]) for j, col in enumerate(objects.columns))

            seriesDict.update(objDict)

        filledFrame = DataFrame(data=seriesDict, index=self.index)

        return self.join(filledFrame, how='left')

    def _reindex_index(self, index, method):
        """
        Return a new DataMatrix conformed to `index` along the rows.

        Parameters
        ----------
        index : Index or array-like
            New row labels; wrapped in an Index if necessary.
        method : passed through to common.get_indexer

        Returns
        -------
        DataMatrix
            A copy of self when `index` is the very same object as
            self.index. Integer and boolean data is cast to float when any
            new label has no match (mask is False) before those rows are
            nulled out.
        """
        if index is self.index:
            return self.copy()

        if not isinstance(index, Index):
            index = Index(index)

        if len(self.index) == 0:
            return DataMatrix(index=index, columns=self.columns)

        indexer, mask = common.get_indexer(self.index, index, method)
        mat = self.values.take(indexer, axis=0)

        # `~` instead of unary minus: NumPy rejects `-` on boolean arrays.
        notmask = ~mask
        if len(index) > 0 and notmask.any():
            # Presumably null_out_axis writes NaN, which int and bool
            # arrays cannot hold -- hence the cast to float.
            if issubclass(mat.dtype.type, (np.int_, np.bool_)):
                mat = mat.astype(float)

            common.null_out_axis(mat, notmask, 0)

        if self.objects is not None and len(self.objects.columns) > 0:
            newObjects = self.objects.reindex(index)
        else:
            newObjects = None

        return DataMatrix(mat,
                          index=index,
                          columns=self.columns,
                          objects=newObjects)

    def _reindex_columns(self, columns):
        """
        Return a new DataMatrix conformed to `columns`.

        Requested labels that belong to the objects frame are split off and
        reindexed there; the remaining labels are looked up in self.columns.
        Integer and boolean data is cast to float when any requested label
        has no match (mask is False) before those columns are nulled out.

        Parameters
        ----------
        columns : Index or array-like
            An empty selection returns a DataMatrix that keeps only the
            index.
        """
        if len(columns) == 0:
            return DataMatrix(index=self.index)

        if not isinstance(columns, Index):
            columns = Index(columns)

        if self.objects is not None:
            object_columns = columns.intersection(self.objects.columns)
            columns = columns - object_columns

            objects = self.objects._reindex_columns(object_columns)
        else:
            objects = None

        if len(columns) > 0 and len(self.columns) == 0:
            # No value columns to take from: build the frame from labels.
            return DataMatrix(index=self.index,
                              columns=columns,
                              objects=objects)

        indexer, mask = common.get_indexer(self.columns, columns, None)
        mat = self.values.take(indexer, axis=1)

        # `~` instead of unary minus: NumPy rejects `-` on boolean arrays.
        notmask = ~mask
        if len(mask) > 0 and notmask.any():
            # Presumably null_out_axis writes NaN, which int and bool
            # arrays cannot hold -- hence the cast to float.
            if issubclass(mat.dtype.type, (np.int_, np.bool_)):
                mat = mat.astype(float)

            common.null_out_axis(mat, notmask, 1)

        return DataMatrix(mat,
                          index=self.index,
                          columns=columns,
                          objects=objects)

    def _rename_columns_inplace(self, mapper):
        """
        Replace every column label with mapper(label), in place.

        The same mapper is applied to the objects frame when there is one.
        """
        renamed = []
        for label in self.columns:
            renamed.append(mapper(label))
        self.columns = renamed

        objects = self.objects
        if objects is not None:
            objects._rename_columns_inplace(mapper)

    def _combineFrame(self, other, func):
        """
        Combine this frame with another frame using a binary function.

        Steps
        - Work out the result index (shared index, or the union)
        - Handle empty operands: an empty frame when both are empty,
          otherwise the non-empty operand multiplied by NaN
        - Work out the result columns (shared columns, or the union)
        - Reindex both operands when either axis differs
        - Apply func to the two value arrays

        The objects frames are not carried into the result.
        """
        same_index = self.index.equals(other.index)
        if same_index:
            newIndex = self.index
        else:
            newIndex = self.index.union(other.index)

        if not self:
            if not other:
                return DataMatrix(index=newIndex)
            return other * NaN
        if not other:
            return self * NaN

        same_columns = self.columns.equals(other.columns)
        if same_columns:
            newColumns = self.columns
        else:
            newColumns = self.columns.union(other.columns)

        if same_index and same_columns:
            left, right = self, other
        else:
            left = self.reindex(index=newIndex, columns=newColumns)
            right = other.reindex(index=newIndex, columns=newColumns)

        combined = func(left.values, right.values)

        return DataMatrix(combined, index=newIndex, columns=newColumns)

    def _combineSeries(self, other, func):
        """
        Combine this frame with a Series using a binary function.

        When both this index and the Series index report _allDates, the
        Series is matched against the rows and func is applied to the
        transposed values. Otherwise the Series is matched against the
        columns. A frame of length zero yields an empty frame with the same
        index, columns and objects.
        """
        if len(self) == 0:
            # Ambiguous case
            return DataMatrix(index=self.index,
                              columns=self.columns,
                              objects=self.objects)

        if not (self.index._allDates and other.index._allDates):
            # Operate column-wise
            if len(other) == 0:
                return self * NaN

            newCols = self.columns.union(other.index)
            aligned = self.reindex(columns=newCols)
            series_vals = other.reindex(newCols).values

            # TODO: deal with objects
            return DataMatrix(func(aligned.values, series_vals),
                              index=self.index,
                              columns=newCols)

        # Operate row-wise
        if self.index.equals(other.index):
            newIndex = self.index
            series_vals = other.values
            frame_vals = self.values
        else:
            newIndex = self.index + other.index

            # Reindex only the operand whose index differs from the result.
            if other.index.equals(newIndex):
                series_vals = other.values
            else:
                series_vals = other.reindex(newIndex).values

            if self.index.equals(newIndex):
                frame_vals = self.values
            else:
                frame_vals = self.reindex(newIndex).values

        # TODO: deal with objects
        return DataMatrix(func(frame_vals.T, series_vals).T,
                          index=newIndex,
                          columns=self.columns)

    def _combineFunc(self, other, func):
        """
        Combine this DataMatrix with a frame, a Series or a constant.

        This is the core method used for the overloaded arithmetic methods.

        Result hierarchy
        ----------------
        DataMatrix + DataFrame --> DataMatrix
        DataMatrix + DataMatrix --> DataMatrix
        DataMatrix + Series --> DataMatrix
        DataMatrix + constant --> DataMatrix

        The result is 'upcast' to a DataMatrix because a successful
        operation implies the input DataFrame was homogeneous.
        """
        if isinstance(other, DataFrame):
            return self._combineFrame(other, func)

        if isinstance(other, Series):
            return self._combineSeries(other, func)

        # Constant of some kind
        if not self:
            return self

        # TODO: deal with objects
        return DataMatrix(func(self.values, other),
                          index=self.index,
                          columns=self.columns)

#-------------------------------------------------------------------------------
# Properties for index and columns

    _columns = None

    def _get_columns(self):
        """Getter for the `columns` property: return self._columns."""
        return self._columns

    def _set_columns(self, cols):
        """
        Setter for the `columns` property.

        Raises Exception when the number of labels differs from the number
        of columns in self.values; stores the labels as an Index.
        """
        n_given = len(cols)
        n_expected = self.values.shape[1]
        if n_given != n_expected:
            raise Exception('Columns length %d did not match values %d!' %
                            (n_given, n_expected))

        self._columns = cols if isinstance(cols, Index) else Index(cols)

    columns = property(fget=_get_columns, fset=_set_columns)

    def _set_index(self, index):
        """
        Setter for the `index` property.

        A non-empty index must have as many labels as self.values has rows,
        otherwise Exception is raised; an empty index is accepted without
        that check. The labels are stored as an Index and also written to
        the objects frame's _index when there is one.
        """
        n_labels = len(index)
        if n_labels > 0 and n_labels != self.values.shape[0]:
            raise Exception('Index length %d did not match values %d!' %
                            (n_labels, self.values.shape[0]))

        if not isinstance(index, Index):
            index = Index(index)

        self._index = index

        objects = self.objects
        if objects is not None:
            objects._index = index

    def _get_index(self):
        """Getter for the `index` property: return self._index."""
        return self._index

    index = property(fget=_get_index, fset=_set_index)

    #-------------------------------------------------------------------------------
    # "Magic methods"

    def __getstate__(self):
        """
        Return the pickle state as (matrix_state, objects_state).

        objects_state is None when there is no objects frame; otherwise it
        is that frame's matrix state without its index.
        """
        objects = self.objects
        if objects is None:
            objects_state = None
        else:
            objects_state = objects._matrix_state(pickle_index=False)

        return (self._matrix_state(), objects_state)

    def _matrix_state(self, pickle_index=True):
        """
        Return (values, index, columns) for pickling.

        Index and columns go through _pickle_array; the index slot is None
        when pickle_index is false.
        """
        columns = _pickle_array(self.columns)
        index = _pickle_array(self.index) if pickle_index else None

        return self.values, index, columns

    def __setstate__(self, state):
        """
        Restore the object from the tuple produced by __getstate__.

        The objects frame, when present in the state, is rebuilt on the
        restored index.
        """
        matrix_state, object_state = state
        vals, idx, cols = matrix_state

        # values first: the index and columns setters check against it.
        self.values = vals
        self.index = _unpickle_array(idx)
        self.columns = _unpickle_array(cols)

        if not object_state:
            self.objects = None
            return

        ovals, _unused_index, ocols = object_state
        self.objects = DataMatrix(ovals,
                                  index=self.index,
                                  columns=_unpickle_array(ocols))

    def __nonzero__(self):
        """
        Truth value of the frame.

        True when the values matrix has at least one row and one column.
        Otherwise the answer comes from the objects frame, or is False when
        there is none.
        """
        n_rows, n_cols = self.values.shape
        if n_rows and n_cols:
            return True

        if self.objects is None:
            return False

        return self.objects.__nonzero__()

    def __neg__(self):
        """Return a copy of this frame whose values array is negated."""
        negated = self.copy()
        negated.values = -negated.values
        return negated

    def __repr__(self):
        """
        Return a string representation of this DataMatrix.

        Frames without columns get a short 'Empty DataMatrix' message,
        frames with 1-499 rows and fewer than 10 value columns are printed
        in full through toString, and everything else gets the summary
        produced by info.
        """
        out = StringIO()

        if len(self.cols()) == 0:
            out.write('Empty DataMatrix\nIndex: %s' % repr(self.index))
            return out.getvalue()

        small = 0 < len(self.index) < 500 and self.values.shape[1] < 10
        if small:
            self.toString(buffer=out)
        else:
            print >> out, str(self.__class__)
            self.info(buffer=out)

        return out.getvalue()

    def __getitem__(self, item):
        """
        Retrieve a column, a slice, or a subset from the DataMatrix.

        Possible inputs
        ---------------
        single value : retrieve a column as a Series
        slice : reindex to the index labels selected by the slice
        ndarray (e.g. boolean vector) : reindex to the index labels it
          selects; must have the same length as the index

        Examples
        --------
        column = dm['A']

        dmSlice = dm[:20] # First 20 rows

        dmSelect = dm[dm.count(axis=1) > 10]

        Notes
        -----
        This is a magic method. Do NOT call explicitly.
        """
        if isinstance(item, slice):
            return self.reindex(self.index[item])

        if isinstance(item, np.ndarray):
            if len(item) != len(self.index):
                raise Exception('Item wrong length %d instead of %d!' %
                                (len(item), len(self.index)))
            return self.reindex(self.index[item])

        # Column lookup: object columns are checked before value columns.
        objects = self.objects
        if objects is not None and item in objects:
            return objects[item]

        return self._getSeries(item)

    _dataTypes = [np.float_, np.bool_, np.int_]

    def __setitem__(self, key, value):
        """
        Store `value` as column `key` of the DataMatrix.

        A Series/TimeSeries is conformed to the DataMatrix's index. Any
        other iterable must have the same length as the index (checked with
        an assert) and is converted to an ndarray, with string data stored
        as objects. A value without __iter__ is repeated for every row.
        """
        if not hasattr(value, '__iter__'):
            value = np.repeat(value, len(self.index))
        elif isinstance(value, Series):
            if not value.index.equals(self.index):
                value = value.reindex(self.index)
            # no need to copy
            value = value.values
        else:
            assert (len(value) == len(self.index))

            if not isinstance(value, np.ndarray):
                value = np.array(value)
                if value.dtype.type == np.str_:
                    value = np.array(value, dtype=object)

        # Insertion strategy depends on the dtype of the existing values.
        if self.values.dtype == np.object_:
            insert = self._insert_object_dtype
        else:
            insert = self._insert_float_dtype

        insert(key, value)

    def _insert_float_dtype(self, key, value):
        """
        Insert or overwrite column `key` when the values matrix is not of
        object dtype.

        An existing value column is overwritten in place. A new column
        whose dtype is not one of self._dataTypes goes to the objects frame
        (created on demand). Any other new column is added to the values
        matrix at the position given by searchsorted on the columns, or
        appended at the end when searchsorted raises TypeError.

        Parameters
        ----------
        key : column label
        value : ndarray with one entry per row
        """
        isObject = value.dtype not in self._dataTypes

        if key in self.columns:
            loc = self.columns.indexMap[key]
            self.values[:, loc] = value
        elif isObject:
            if self.objects is None:
                self.objects = DataMatrix({key: value}, index=self.index)
            else:
                self.objects[key] = value
        elif len(self.columns) == 0:
            # Builtin float replaces the `np.float` alias, which newer
            # NumPy releases no longer provide; the resulting dtype is the
            # same. values must be set before columns because the columns
            # setter checks its length against self.values.
            self.values = value.reshape((len(value), 1)).astype(float)
            self.columns = Index([key])
        else:
            try:
                loc = self.columns.searchsorted(key)
            except TypeError:
                loc = len(self.columns)

            if loc == self.values.shape[1]:
                newValues = np.c_[self.values, value]
                newColumns = Index(np.concatenate((self.columns, [key])))
            elif loc == 0:
                newValues = np.c_[value, self.values]
                newColumns = Index(np.concatenate(([key], self.columns)))
            else:
                newValues = np.c_[self.values[:, :loc], value,
                                  self.values[:, loc:]]
                toConcat = (self.columns[:loc], [key], self.columns[loc:])
                newColumns = Index(np.concatenate(toConcat))
            self.values = newValues
            self.columns = newColumns

    def _insert_object_dtype(self, key, value):
        """
        Insert or overwrite column `key` when the values matrix is of
        object dtype.

        An existing column is overwritten in place. A new column is added
        at the position given by searchsorted on the columns, or appended
        at the end when searchsorted raises TypeError.
        """
        if key in self.columns:
            self.values[:, self.columns.indexMap[key]] = value
            return

        if len(self.columns) == 0:
            # values before columns: the columns setter checks the width.
            self.values = value.reshape((len(value), 1)).copy()
            self.columns = Index([key])
            return

        try:
            pos = self.columns.searchsorted(key)
        except TypeError:
            pos = len(self.columns)

        if pos == self.values.shape[1]:
            pieces = (self.values, value)
            labels = (self.columns, [key])
        elif pos == 0:
            pieces = (value, self.values)
            labels = ([key], self.columns)
        else:
            pieces = (self.values[:, :pos], value, self.values[:, pos:])
            labels = (self.columns[:pos], [key], self.columns[pos:])

        widened = np.c_[pieces]
        relabeled = Index(np.concatenate(labels))

        self.values = widened
        self.columns = relabeled

    def __delitem__(self, key):
        """
        Delete a column from the DataMatrix.

        Value columns are removed from the values matrix; otherwise the key
        is deleted from the objects frame. Raises KeyError when the key is
        in neither.
        """
        if key not in self.columns:
            objects = self.objects
            if objects is None or key not in objects:
                raise KeyError('%s' % key)
            del objects[key]
            return

        loc = self.columns.indexMap[key]
        if loc == self.values.shape[1] - 1:
            # Last column: slicing it off is enough.
            remaining = self.values[:, :loc]
            labels = self.columns[:loc]
        else:
            remaining = np.c_[self.values[:, :loc], self.values[:, loc + 1:]]
            labels = Index(
                np.concatenate((self.columns[:loc], self.columns[loc + 1:])))

        self.values = remaining
        self.columns = labels

    def __iter__(self):
        """Iterate over the labels in self.columns (object columns are not included)."""
        return iter(self.columns)

    def __contains__(self, key):
        """True if `key` is a value column or a column of the objects frame."""
        if key in self.columns:
            return True

        return self.objects is not None and key in self.objects

    def iteritems(self):
        """Iterate over the (label, Series) pairs of the dict returned by self._series."""
        return self._series.iteritems()

#-------------------------------------------------------------------------------
# Helper methods

# For DataFrame compatibility

    def _getSeries(self, item=None, loc=None):
        """
        Return one value column as a Series on this frame's index.

        The column is addressed by position `loc` when given, otherwise by
        label `item`. Raises Exception when the label is not a value column.
        """
        position = loc
        if position is None:
            try:
                position = self.columns.indexMap[item]
            except KeyError:
                raise Exception('%s not here!' % item)

        column = self.values[:, position]
        return Series(column, index=self.index)

    def _getSeriesDict(self):
        """
        Return a dict mapping every column label to its Series.

        Value columns are added first and the columns of the objects frame
        second, so an object column wins when a label occurs in both.
        """
        result = dict((label, self._getSeries(loc=pos))
                      for pos, label in enumerate(self.columns))

        objects = self.objects
        if objects is not None:
            result.update((label, objects._getSeries(loc=pos))
                          for pos, label in enumerate(objects.columns))

        return result

    _series = property(_getSeriesDict)

    #-------------------------------------------------------------------------------
    # Outputting

    def toString(self,
                 buffer=sys.stdout,
                 columns=None,
                 colSpace=15,
                 nanRep='NaN',
                 formatters=None,
                 float_format=None):
        """
        Write a text table of this DataMatrix to `buffer`.

        Parameters
        ----------
        buffer : object with a write method, default sys.stdout
        columns : list-like, optional
            Columns to print; labels not contained in the frame are
            dropped. By default all value columns are printed, followed by
            the object columns.
        colSpace : int, default 15
            Width passed to the cell formatter for every data column.
        nanRep : string, default 'NaN'
            Passed to the cell formatter.
        formatters : dict, optional
            Maps a column label to a function applied to each cell before
            it is written.
        float_format : passed to the cell formatter
        """
        _pf = common._pfixed
        formatters = formatters or {}

        if columns is None:
            columns = self.columns
            values = self.values
            if self.objects:
                columns = list(columns) + list(self.objects.columns)
                values = np.column_stack(
                    (values.astype(object), self.objects.values))
        else:
            columns = [c for c in columns if c in self]
            values = self.asMatrix(columns)

        ident = lambda x: x

        # With an empty index there are no labels to measure. Fall back to
        # a label width of 0 rather than letting max() raise ValueError on
        # an empty sequence.
        label_widths = [len(str(idx)) for idx in self.index]
        idxSpace = max(label_widths or [0]) + 4

        if len(self.cols()) == 0:
            buffer.write('DataMatrix is empty!\n')
            buffer.write(repr(self.index))
        else:
            # Header row: blank index cell followed by the column labels.
            buffer.write(_pf('', idxSpace))
            for h in columns:
                buffer.write(_pf(h, colSpace))
            buffer.write('\n')

            for i, idx in enumerate(self.index):
                buffer.write(_pf(idx, idxSpace))
                for j, col in enumerate(columns):
                    formatter = formatters.get(col, ident)
                    buffer.write(
                        _pf(formatter(values[i, j]),
                            colSpace,
                            float_format=float_format,
                            nanRep=nanRep))
                buffer.write('\n')

    def info(self, buffer=sys.stdout):
        """
        Write a concise summary of the DataMatrix to `buffer`.

        Used by __repr__ for frames that are not printed in full. The
        summary has the index range, one non-null count per column, and the
        dtype(s) of the value matrix and the objects frame.
        """
        print >> buffer, 'Index: %s entries' % len(self.index),
        if len(self.index) > 0:
            print >> buffer, ', %s to %s' % (self.index[0], self.index[-1])
        else:
            print >> buffer, ''

        if len(self.columns) == 0:
            print >> buffer, 'DataMatrix is empty!'
            print >> buffer, repr(self.index)
            return

        print >> buffer, 'Data columns:'

        all_cols = self.cols()
        width = max([len(str(label)) for label in all_cols]) + 4

        counts = self.count()
        assert (len(all_cols) == len(counts))

        lines = ['%s%d  non-null values' %
                 (common._pfixed(label, width), n_valid)
                 for label, n_valid in counts.iteritems()]

        n_value_cols = len(self.columns)
        value_dtype = self.values.dtype

        if self.objects is None:
            dtype_line = '\ndtype: %s(%d)' % (value_dtype, n_value_cols)
        else:
            dtype_line = '\ndtypes: %s(%d), %s(%d)' % (
                value_dtype, n_value_cols,
                self.objects.values.dtype, len(self.objects.columns))

        buffer.write('\n'.join(lines) + dtype_line)


#-------------------------------------------------------------------------------
# Public methods

    def apply(self, func, axis=0):
        """
        Apply func to the DataMatrix.

        A NumPy ufunc is applied to the whole values array at once and the
        result is returned as a DataMatrix (axis is not used in that case).
        Any other function is handed to DataFrame.apply, which returns
        either a DataMatrix (if the function produces another series) or a
        Series indexed on the column names (if it produces a value).

        Parameters
        ----------
        func : function
            Function to apply to each column
        axis : passed on to DataFrame.apply

        Examples
        --------

            >>> df.apply(numpy.sqrt) --> DataMatrix
            >>> df.apply(numpy.sum) --> Series

        N.B.: Do NOT use functions that might toy with the index.
        """
        if len(self.cols()) == 0:
            return self

        if not isinstance(func, np.ufunc):
            return DataFrame.apply(self, func, axis=axis)

        return DataMatrix(data=func(self.values),
                          index=self.index,
                          columns=self.columns,
                          objects=self.objects)

    def applymap(self, func):
        """
        Apply a function elementwise to the values of the DataMatrix, i.e.
        like doing
            map(func, series) for each series in the DataMatrix

        Parameters
        ----------
        func : function
            Python function, returns a single value from a single value

        Note : try to avoid using this function if you can, very slow.
        """
        elementwise = np.frompyfunc(func, 1, 1)
        mapped = elementwise(self.values)

        # Best effort: go back to the original dtype when the cast works.
        try:
            mapped = mapped.astype(self.values.dtype)
        except Exception:
            pass

        return DataMatrix(mapped, index=self.index, columns=self.columns)

    def append(self, other):
        """
        Glue together DataFrame objects having non-overlapping indices

        When `other` is a DataMatrix with equal columns, the value arrays
        are stacked directly; every other case goes through the parent
        class's append. An empty operand yields a copy of the other one.

        Parameters
        ----------
        other : DataFrame
        """
        if not other:
            return self.copy()

        if not self:
            return other.copy()

        stackable = (isinstance(other, DataMatrix)
                     and self.columns.equals(other.columns))
        if not stackable:
            return super(DataMatrix, self).append(other)

        idx = Index(np.concatenate([self.index, other.index]))
        mat = np.vstack((self.values, other.values))

        mine, theirs = self.objects, other.objects
        if theirs is None:
            objects = mine
        elif mine is None:
            objects = theirs
        else:
            objects = mine.append(theirs)

        if objects:
            objects = objects.reindex(idx)

        return DataMatrix(mat, idx, self.columns, objects=objects)

    def asMatrix(self, columns=None):
        """
        Convert the DataMatrix to its Numpy-array matrix representation

        Without `columns`, a copy of the values is returned with the object
        columns stacked to the right when there are any. With `columns`,
        the requested columns are selected through _reorder_columns.

        Parameters
        ----------
        columns : list-like
            columns to use in producing matrix, must all be contained

        Returns
        -------
        ndarray
        """
        if columns is None:
            values = self.values.copy()

            if self.objects:
                values = np.column_stack((values, self.objects.values))

            return values

        if not isinstance(columns, Index):
            columns = Index(columns)

        values = self.values
        order = self.columns

        if self.objects:
            # Stack every object column so that the labels in `order` line
            # up one-to-one with the columns of `values`. Stacking only the
            # requested object columns while listing all object labels
            # leaves labels and data out of step.
            values = np.column_stack((values, self.objects.values))
            order = Index(np.concatenate((order, self.objects.columns)))

        # now put in the right order
        return _reorder_columns(values, order, columns)

    def cols(self):
        """
        Return list of the frame's columns.

        When a non-empty object block is attached, the result is the union
        of the numeric columns and the object columns.
        """
        extra = self.objects
        if extra is None or len(extra.columns) == 0:
            return list(self.columns)
        return list(self.columns.union(extra.columns))

    def copy(self):
        """
        Return a new DataMatrix with copied values.

        The values array (and the object block, when it is truthy) are
        copied; the index and columns objects are passed through as-is.
        """
        objects = self.objects.copy() if self.objects else None
        values = self.values.copy()
        return DataMatrix(values, index=self.index, columns=self.columns,
                          objects=objects)

    def cumsum(self, axis=0):
        """
        Return DataMatrix of cumulative sums over requested axis.

        For non-integer data, NaN entries are treated as zero while summing;
        positions that have not yet seen any non-NaN observation along the
        axis are reported as NaN.

        Parameters
        ----------
        axis : {0, 1}
            0 for row-wise, 1 for column-wise

        Returns
        -------
        y : DataMatrix
        """
        y = np.array(self.values, subok=True)
        if not issubclass(y.dtype.type, np.int_):
            mask = np.isnan(self.values)
            y[mask] = 0
            result = y.cumsum(axis)
            # Use `~` for logical negation: unary minus on boolean arrays
            # raises TypeError on NumPy >= 1.13.
            has_obs = (~mask).astype(int).cumsum(axis) > 0
            result[~has_obs] = np.nan
        else:
            result = y.cumsum(axis)

        return DataMatrix(result,
                          index=self.index,
                          columns=self.columns,
                          objects=self.objects)

    def fill(self, value=None, method='pad'):
        """
        Fill NaN values, either with a constant or using the given method.

        When ``value`` is None each member Series is filled separately via
        its own ``fill``; otherwise null entries of the values matrix are
        overwritten with ``value``.

        Parameters
        ----------
        value : any kind (should be same type as array)
            Value to use to fill holes (e.g. 0)

        method : {'backfill', 'pad', None}
            Method to use for filling holes in new index

        Returns
        -------
        y : DataMatrix

        See also
        --------
        DataMatrix.reindex, DataMatrix.asfreq
        """
        if value is not None:
            # Constant fill on the values matrix
            if len(self.columns) == 0:
                return self

            filled = self.values.copy()
            filled.flat[common.isnull(filled.ravel())] = value

            objects = None if self.objects is None else self.objects.copy()

            return DataMatrix(filled, index=self.index,
                              columns=self.columns, objects=objects)

        # Method-based fill, column by column
        result = dict((col, s.fill(method=method, value=value))
                      for col, s in self._series.iteritems())

        return DataMatrix(result, index=self.index, objects=self.objects)

    def xs(self, key):
        """
        Return one row (cross-section) of the DataMatrix as a Series.

        Parameters
        ----------
        key : some index contained in the index

        Returns
        -------
        Series
            Indexed by the columns; entries from the object block are
            appended when that block has columns.

        Raises
        ------
        Exception
            If ``key`` is not in the index.
        """
        if key not in self.index:
            raise Exception('No cross-section for %s' % key)

        row = self.values[self.index.indexMap[key], :].copy()
        result = Series(row, index=self.columns)

        objects = self.objects
        if objects is None or len(objects.columns) == 0:
            return result

        return result.append(objects.getXS(key))

    @property
    def T(self):
        """
        Transposed DataMatrix (rows become columns and vice versa).

        When an object block is attached its transposed values are stacked
        below the transposed numeric values.
        """
        if self.objects is None:
            return DataMatrix(data=self.values.T,
                              index=self.columns,
                              columns=self.index)

        stacked = np.concatenate((self.values.T, self.objects.values.T),
                                 axis=0)
        labels = Index(np.concatenate((self.columns, self.objects.columns)))

        return DataMatrix(stacked, index=labels, columns=self.index)

    def shift(self, periods, offset=None, timeRule=None):
        """
        Shift the data by the given number of periods.

        Without an offset the values move relative to a fixed index and the
        vacated rows become NaN. With an offset (or a time rule naming one)
        the index itself is shifted and the values are copied unchanged.

        Parameters
        ----------
        periods : int (+ or -)
            Number of periods to move
        offset : DateOffset, optional
            Increment to use from datetools module
        timeRule : string
            Time rule to use by name

        Returns
        -------
        DataMatrix
        """
        if periods == 0:
            return self

        if offset is None and timeRule is not None:
            offset = datetools.getOffset(timeRule)

        nrows = len(self)

        if offset is not None:
            newIndex = self.index.shift(periods, offset)
            newValues = self.values.copy()
        else:
            newIndex = self.index
            take_from = np.zeros(nrows, dtype=int)
            if periods > 0:
                take_from[periods:] = np.arange(nrows - periods)
                vacated = slice(None, periods)
            else:
                take_from[:periods] = np.arange(-periods, nrows)
                vacated = slice(periods, None)

            newValues = self.values.take(take_from, axis=0)
            newValues[vacated] = NaN

        shifted_objects = None
        if self.objects is not None:
            shifted_objects = self.objects.shift(periods,
                                                 offset=offset,
                                                 timeRule=timeRule)
            shifted_objects.index = newIndex

        return DataMatrix(data=newValues,
                          index=newIndex,
                          columns=self.columns,
                          objects=shifted_objects)

    def cap(self, threshold):
        """
        Replace values greater than threshold with threshold

        Returns
        -------
        DataMatrix
        """
        values = self.values
        capped = np.where(values > threshold, threshold, values)
        return DataMatrix(capped, index=self.index, columns=self.columns,
                          objects=self.objects)

    def floor(self, threshold):
        """
        Replace values less than threshold with threshold

        Returns
        -------
        DataMatrix
        """
        values = self.values
        floored = np.where(values < threshold, threshold, values)
        return DataMatrix(floored, index=self.index, columns=self.columns,
                          objects=self.objects)

    def min(self, axis=0):
        """
        Return array or Series of minimums over requested axis.

        Non-finite entries (NaN and +/-inf) are replaced by +inf on a copy
        of the values before the minimum is taken, so they never win.

        Parameters
        ----------
        axis : {0, 1}
            0 for row-wise, 1 for column-wise

        Returns
        -------
        Series or TimeSeries
        """
        values = self.values.copy()
        # `~` is logical negation; unary minus on a boolean array raises
        # TypeError on NumPy >= 1.13.
        np.putmask(values, ~np.isfinite(values), np.inf)
        return Series(values.min(axis), index=self._get_agg_axis(axis))

    def max(self, axis=0):
        """
        Return array or Series of maximums over requested axis.

        Non-finite entries (NaN and +/-inf) are replaced by -inf on a copy
        of the values before the maximum is taken, so they never win.

        Parameters
        ----------
        axis : {0, 1}
            0 for row-wise, 1 for column-wise

        Returns
        -------
        Series or TimeSeries
        """
        values = self.values.copy()
        # `~` is logical negation; unary minus on a boolean array raises
        # TypeError on NumPy >= 1.13.
        np.putmask(values, ~np.isfinite(values), -np.inf)
        return Series(values.max(axis), index=self._get_agg_axis(axis))
Ejemplo n.º 4
0
 def test_equals(self):
     """An object-dtype Index with the same values compares equal both ways."""
     same_values = Index(self.index, dtype=object)
     # assertTrue replaces the deprecated `assert_` alias
     self.assertTrue(self.index.equals(same_values))
     self.assertTrue(same_values.equals(self.index))
Ejemplo n.º 5
0
class DataMatrix(DataFrame):
    """
    Matrix version of DataFrame, optimized for cross-section operations,
    numerical computation, and other operations that do not require the
    frame to change size.

    Parameters
    ----------
    data : numpy ndarray or dict of sequence-like objects
        Dict can contain Series, arrays, or list-like objects
        Constructor can understand various kinds of inputs
    index : Index or array-like
        Index to use for resulting frame (optional if provided dict of Series)
    columns : Index or array-like
        Required if data is ndarray
    dtype : dtype, default None (infer)
        Data type to force

    Notes
    -----
    Transposing is much faster in this regime, as is calling getXS, so please
    take note of this.
    """
    objects = None
    def __init__(self, data=None, index=None, columns=None, dtype=None,
                 objects=None):
        # Build the values matrix, index, columns and optional object block
        # from one of the supported input kinds: a non-empty dict, an
        # ndarray/list, another DataFrame, or nothing at all.

        if isinstance(data, dict) and len(data) > 0:
            # Dict input: _initDict also builds the object block
            (index, columns,
             values, objects) = self._initDict(data, index, columns, objects,
                                               dtype)
        elif isinstance(data, (np.ndarray, list)):
            (index, columns, values) = self._initMatrix(data, index,
                                                        columns, dtype)

            # Conform any supplied objects to the index of the matrix
            if objects is not None:
                if isinstance(objects, DataMatrix):
                    if not objects.index.equals(index):
                        objects = objects.reindex(index)
                else:
                    objects = DataMatrix(objects, index=index)
        elif isinstance(data, DataFrame):
            # Frame input: take over its pieces; the index, columns, dtype
            # and objects arguments are not consulted on this path
            if not isinstance(data, DataMatrix):
                data = data.toDataMatrix()
            values = data.values
            index = data.index
            columns = data.columns
            objects = data.objects
        elif data is None or len(data) == 0:
            # this is a touch convoluted...
            if objects is not None:
                if isinstance(objects, DataMatrix):
                    # Identity check (not equality) decides whether to reindex
                    if index is not None and objects.index is not index:
                        objects = objects.reindex(index)
                else:
                    objects = DataMatrix(objects, index=index)

                # The object block's index becomes the frame's index
                index = objects.index

            if index is None:
                N = 0
                index = NULL_INDEX
            else:
                N = len(index)

            if columns is None:
                K = 0
                columns = NULL_INDEX
            else:
                K = len(columns)

            # No data: an (N, K) matrix filled with NaN
            values = np.empty((N, K), dtype=dtype)
            values[:] = NaN
        else:
            raise Exception('DataMatrix constructor not properly called!')

        # values must be assigned first: the index and columns setters
        # validate their lengths against self.values.shape
        self.values = values
        self.index = index
        self.columns = columns
        self.objects = objects

    def _initDict(self, data, index, columns, objects, dtype):
        """
        Segregate Series based on type and coerce into matrices.

        Entries whose dtype is bool, float or int go into the values
        matrix; everything else goes into the object block.

        Needs to handle a lot of exceptional cases.

        Somehow this got outrageously complicated

        Returns
        -------
        (index, columns, values, objects)
        """
        # pre-filter out columns if we passed it
        if columns is not None:
            colset = set(columns)
            data = dict((k, v) for k, v in data.iteritems() if k in colset)

        index = _extract_index(data, index)

        # Seed the object columns from `objects` when it is given as a dict
        objectDict = {}
        if objects is not None and isinstance(objects, dict):
            objectDict.update(objects)

        valueDict = {}
        for k, v in data.iteritems():
            if isinstance(v, Series):
                if v.index is not index:
                    # Forces alignment. No need to copy data since we
                    # are putting it into an ndarray later
                    v = v.reindex(index)
            else:
                if isinstance(v, dict):
                    # Missing keys become NaN
                    v = [v.get(i, NaN) for i in index]
                else:
                    assert(len(v) == len(index))

                # Try the requested dtype first; fall back to inference
                try:
                    v = Series(v, dtype=dtype, index=index)
                except Exception:
                    v = Series(v, index=index)

            # Route by dtype: numeric/bool vs. everything else
            if issubclass(v.dtype.type, (np.bool_, float, int)):
                valueDict[k] = v
            else:
                objectDict[k] = v

        if columns is None:
            columns = Index(_try_sort(valueDict))
            objectColumns = Index(_try_sort(objectDict))
        else:
            # Keep the caller's column order, split by destination
            objectColumns = Index([c for c in columns if c in objectDict])
            columns = Index([c for c in columns if c not in objectDict])

        if len(valueDict) == 0:
            # Nothing numeric: the object columns become the values matrix
            dtype = np.object_
            valueDict = objectDict
            columns = objectColumns
        else:
            dtypes = set(v.dtype for v in valueDict.values())

            # Mixed numeric dtypes are stored as float
            if len(dtypes) > 1:
                dtype = np.float_
            else:
                dtype = list(dtypes)[0]

            if len(objectDict) > 0:
                new_objects = DataMatrix(objectDict,
                                         dtype=np.object_,
                                         index=index,
                                         columns=objectColumns)
                if isinstance(objects, DataMatrix):
                    objects = objects.join(new_objects, how='left')
                else:
                    objects = new_objects

        values = np.empty((len(index), len(columns)), dtype=dtype)

        # Columns without data are filled with NaN
        for i, col in enumerate(columns):
            if col in valueDict:
                values[:, i] = valueDict[col]
            else:
                values[:, i] = np.NaN

        return index, columns, values, objects

    def _initMatrix(self, values, index, columns, dtype):
        """
        Normalize array-like input into a 2-D ndarray plus default labels.

        One-dimensional input becomes a single column (or zero columns when
        it is empty). A failed cast to ``dtype`` is ignored and the values
        are left as they were. Missing index / columns default to
        ``np.arange`` of the matching dimension, or NULL_INDEX when that
        dimension is empty.

        Returns
        -------
        (index, columns, values)
        """
        if not isinstance(values, np.ndarray):
            converted = np.array(values)
            # String data is stored with object dtype
            if issubclass(converted.dtype.type, basestring):
                converted = np.array(values, dtype=object, copy=True)

            values = converted

        if values.ndim == 1:
            nrows = values.shape[0]
            values = values.reshape((nrows, 0 if nrows == 0 else 1))

        if dtype is not None:
            try:
                values = values.astype(dtype)
            except Exception:
                # best effort: keep the un-cast values
                pass

        N, K = values.shape

        if index is None:
            index = NULL_INDEX if N == 0 else np.arange(N)

        if columns is None:
            columns = NULL_INDEX if K == 0 else np.arange(K)

        return index, columns, values

    @property
    def _constructor(self):
        """Class used to build new frames of this type (DataMatrix)."""
        return DataMatrix

    # Because of DataFrame property
    values = None

    def __array__(self):
        """Return the values matrix itself (no copy is made here)."""
        return self.values

    def __array_wrap__(self, result):
        """Wrap ``result`` in a DataMatrix with this frame's index and columns."""
        return DataMatrix(result, index=self.index, columns=self.columns)

#-------------------------------------------------------------------------------
# DataMatrix-specific implementation of private API

    def _join_on(self, other, on):
        """
        Join ``other`` onto this frame using the values of column ``on``
        as lookup keys into ``other``'s index.

        Rows flagged False in the mask returned by ``tseries.getMergeVec``
        are set to NaN in the looked-up data.

        Raises
        ------
        Exception
            If ``on`` is not a column of this frame.
        """
        if len(other.index) == 0:
            return self

        if on not in self:
            raise Exception('%s column not contained in this frame!' % on)

        fillVec, mask = tseries.getMergeVec(self[on],
                                            other.index.indexMap)

        # mask is assumed to be a boolean array; `~` is its logical
        # negation (unary minus on booleans raises on NumPy >= 1.13)
        unmatched = ~mask

        tmpMatrix = other.values.take(fillVec, axis=0)
        tmpMatrix[unmatched] = NaN

        seriesDict = dict((col, tmpMatrix[:, j])
                           for j, col in enumerate(other.columns))

        # `other` may be a frame without an object block attribute at all
        objects = getattr(other, 'objects', None)
        if objects:
            tmpMat = objects.values.take(fillVec, axis=0)
            tmpMat[unmatched] = NaN
            objDict = dict((col, tmpMat[:, j])
                           for j, col in enumerate(objects.columns))

            seriesDict.update(objDict)

        filledFrame = DataFrame(data=seriesDict, index=self.index)

        return self.join(filledFrame, how='left')

    def _reindex_index(self, index, method):
        """
        Conform the rows of this frame to ``index``.

        Rows flagged False in the mask from ``common.get_indexer`` are
        nulled out; integer and boolean data is cast to float first so
        that it can hold the nulls. The object block, when it has columns,
        is reindexed as well.

        Returns
        -------
        DataMatrix
        """
        if index is self.index:
            return self.copy()

        if not isinstance(index, Index):
            index = Index(index)

        if len(self.index) == 0:
            return DataMatrix(index=index, columns=self.columns)

        indexer, mask = common.get_indexer(self.index, index, method)
        mat = self.values.take(indexer, axis=0)

        # mask is assumed to be a boolean array; `~` is its logical
        # negation (unary minus on booleans raises on NumPy >= 1.13)
        notmask = ~mask
        if len(index) > 0:
            if notmask.any():
                if issubclass(mat.dtype.type, (np.int_, np.bool_)):
                    mat = mat.astype(float)

                common.null_out_axis(mat, notmask, 0)

        if self.objects is not None and len(self.objects.columns) > 0:
            newObjects = self.objects.reindex(index)
        else:
            newObjects = None

        return DataMatrix(mat, index=index, columns=self.columns,
                          objects=newObjects)

    def _reindex_columns(self, columns):
        """
        Conform the columns of this frame to ``columns``.

        Requested columns that live in the object block are routed there;
        the remainder is taken from the values matrix. Columns flagged
        False in the mask from ``common.get_indexer`` are nulled out, with
        integer and boolean data cast to float first.

        Returns
        -------
        DataMatrix
        """
        if len(columns) == 0:
            return DataMatrix(index=self.index)

        if not isinstance(columns, Index):
            columns = Index(columns)

        if self.objects is not None:
            object_columns = columns.intersection(self.objects.columns)
            columns = columns - object_columns

            objects = self.objects._reindex_columns(object_columns)
        else:
            objects = None

        if len(columns) > 0 and len(self.columns) == 0:
            return DataMatrix(index=self.index, columns=columns,
                              objects=objects)

        indexer, mask = common.get_indexer(self.columns, columns, None)
        mat = self.values.take(indexer, axis=1)

        # mask is assumed to be a boolean array; `~` is its logical
        # negation (unary minus on booleans raises on NumPy >= 1.13)
        notmask = ~mask
        if len(mask) > 0:
            if notmask.any():
                if issubclass(mat.dtype.type, (np.int_, np.bool_)):
                    mat = mat.astype(float)

                common.null_out_axis(mat, notmask, 1)

        return DataMatrix(mat, index=self.index, columns=columns,
                          objects=objects)

    def _rename_columns_inplace(self, mapper):
        """Apply ``mapper`` to every column label here and in the object block."""
        renamed = []
        for label in self.columns:
            renamed.append(mapper(label))
        self.columns = renamed

        objects = self.objects
        if objects is not None:
            objects._rename_columns_inplace(mapper)

    def _combineFrame(self, other, func):
        """
        Combine this frame with another frame element-wise using ``func``.

        Outline
        - Work out the result index and columns (unions when they differ)
        - If either frame is empty, short-circuit
        - Reindex both frames to the result labels when needed
        - Apply ``func`` to the two aligned value matrices

        Object blocks are not carried into the result.
        """
        index_matches = self.index.equals(other.index)
        if index_matches:
            newIndex = self.index
        else:
            newIndex = self.index.union(other.index)

        # Empty operands: nothing meaningful to compute
        if not self:
            if not other:
                return DataMatrix(index=newIndex)
            return other * NaN
        if not other:
            return self * NaN

        columns_match = self.columns.equals(other.columns)
        if columns_match:
            newColumns = self.columns
        else:
            newColumns = self.columns.union(other.columns)

        if index_matches and columns_match:
            left, right = self, other
        else:
            left = self.reindex(index=newIndex, columns=newColumns)
            right = other.reindex(index=newIndex, columns=newColumns)

        return DataMatrix(func(left.values, right.values),
                          index=newIndex, columns=newColumns)

    def _combineSeries(self, other, func):
        """
        Combine this frame with a Series using ``func``.

        When both indexes are flagged ``_allDates`` the Series is matched
        against the rows; otherwise it is matched against the columns.
        Object blocks are not carried into the result, except in the
        zero-length case where the frame's pieces are passed straight
        through.
        """
        if len(self) == 0:
            # Ambiguous case
            return DataMatrix(index=self.index, columns=self.columns,
                              objects=self.objects)

        newIndex = self.index
        newCols = self.columns

        if self.index._allDates and other.index._allDates:
            # Operate row-wise
            if self.index.equals(other.index):
                other_vals = other.values
                values = self.values
            else:
                newIndex = self.index + other.index

                if other.index.equals(newIndex):
                    other_vals = other.values
                else:
                    other_vals = other.reindex(newIndex).values

                if self.index.equals(newIndex):
                    values = self.values
                else:
                    values = self.reindex(newIndex).values

            # Transpose so the Series lines up with the row axis
            resultMatrix = func(values.T, other_vals).T
        else:
            if len(other) == 0:
                return self * NaN

            # Operate column-wise
            newCols = self.columns.union(other.index)

            aligned = self.reindex(columns=newCols)
            series_vals = other.reindex(newCols).values

            resultMatrix = func(aligned.values, series_vals)

        # TODO: deal with objects
        return DataMatrix(resultMatrix, index=newIndex, columns=newCols)

    def _combineFunc(self, other, func):
        """
        Combine this DataMatrix with a frame, a Series or a constant.

        This is the dispatch point used by the overloaded arithmetic
        methods.

        Result hierarchy
        ----------------
        DataMatrix + DataFrame --> DataMatrix
        DataMatrix + DataMatrix --> DataMatrix
        DataMatrix + Series --> DataMatrix
        DataMatrix + constant --> DataMatrix

        The result is 'upcast' to DataMatrix on the reasoning that if the
        operation succeeded, the input frame was homogeneous.
        """
        if isinstance(other, DataFrame):
            return self._combineFrame(other, func)

        if isinstance(other, Series):
            return self._combineSeries(other, func)

        # Constant of some kind
        if not self:
            return self

        # TODO: deal with objects
        return DataMatrix(func(self.values, other),
                          index=self.index, columns=self.columns)

#-------------------------------------------------------------------------------
# Properties for index and columns

    _columns = None
    def _get_columns(self):
        """Return the stored column labels."""
        return self._columns

    def _set_columns(self, cols):
        """Store ``cols`` as an Index after checking it matches the values width."""
        given = len(cols)
        expected = self.values.shape[1]
        if given != expected:
            raise Exception('Columns length %d did not match values %d!' %
                            (given, expected))

        self._columns = cols if isinstance(cols, Index) else Index(cols)

    columns = property(fget=_get_columns, fset=_set_columns)

    def _set_index(self, index):
        """
        Store ``index`` as an Index, sharing it with the object block.

        A non-empty index must have as many entries as the values matrix
        has rows; an empty one is accepted without that check.
        """
        nrows = len(index)
        if nrows > 0 and nrows != self.values.shape[0]:
            raise Exception('Index length %d did not match values %d!' %
                            (nrows, self.values.shape[0]))

        if not isinstance(index, Index):
            index = Index(index)

        self._index = index

        objects = self.objects
        if objects is not None:
            objects._index = index

    def _get_index(self):
        """Return the stored row labels."""
        return self._index

    index = property(fget=_get_index, fset=_set_index)

#-------------------------------------------------------------------------------
# "Magic methods"

    def __getstate__(self):
        """
        Return the pickle state as ``(matrix_state, objects_state)``.

        The object block's state omits its index (it is rebuilt from the
        frame's index on unpickling); it is None when there is no block.
        """
        objects = None
        if self.objects is not None:
            objects = self.objects._matrix_state(pickle_index=False)

        return (self._matrix_state(), objects)

    def _matrix_state(self, pickle_index=True):
        """
        Return ``(values, index, columns)`` with the labels in pickled form.

        ``index`` is None when ``pickle_index`` is false.
        """
        columns = _pickle_array(self.columns)
        index = _pickle_array(self.index) if pickle_index else None

        return self.values, index, columns

    def __setstate__(self, state):
        """Restore the frame from the tuple produced by ``__getstate__``."""
        matrix_state, object_state = state
        vals, idx, cols = matrix_state

        # values first: the index/columns setters validate against its shape
        self.values = vals
        self.index = _unpickle_array(idx)
        self.columns = _unpickle_array(cols)

        if not object_state:
            self.objects = None
            return

        ovals, _, ocols = object_state
        # The object block shares the frame's freshly restored index
        self.objects = DataMatrix(ovals,
                                  index=self.index,
                                  columns=_unpickle_array(ocols))

    def __nonzero__(self):
        """
        Truth value of the frame.

        True when the values matrix has at least one row and one column;
        otherwise defers to the object block (False when there is none).
        """
        nrows, ncols = self.values.shape
        if nrows != 0 and ncols != 0:
            return True

        if self.objects is None:
            return False

        return self.objects.__nonzero__()

    def __neg__(self):
        """Return a copy of this frame with the values matrix negated."""
        negated = self.copy()
        negated.values = -negated.values
        return negated

    def __repr__(self):
        """
        Return a string representation for a particular DataMatrix

        Frames with no columns get a short "empty" message. Frames with
        fewer than 500 rows (but at least one) and fewer than 10 value
        columns are printed in full via ``toString``; anything else gets
        the class name followed by the ``info`` summary.
        """
        buf = StringIO()

        if len(self.cols()) == 0:
            buf.write('Empty DataMatrix\nIndex: %s' % repr(self.index))
        elif 0 < len(self.index) < 500 and self.values.shape[1] < 10:
            self.toString(buffer=buf)
        else:
            # Plain write instead of the Python-2-only `print >> file`
            # form; emits the same text (class name plus newline).
            buf.write(str(self.__class__) + '\n')
            self.info(buffer=buf)

        return buf.getvalue()

    def __getitem__(self, item):
        """
        Retrieve column, slice, or subset from DataMatrix.

        Possible inputs
        ---------------
        single value : retrieve a column as a Series
        slice : reindex to indices specified by slice
        boolean vector : like slice but more general, reindex to indices
          where the input vector is True

        Examples
        --------
        column = dm['A']

        dmSlice = dm[:20] # First 20 rows

        dmSelect = dm[dm.count(axis=1) > 10]

        Notes
        -----
        This is a magic method. Do NOT call explicitly.
        """
        if isinstance(item, slice):
            return self.reindex(self.index[item])

        if isinstance(item, np.ndarray):
            if len(item) != len(self.index):
                raise Exception('Item wrong length %d instead of %d!' %
                                (len(item), len(self.index)))
            return self.reindex(self.index[item])

        # Column lookup: the object block takes precedence when it has it
        if self.objects is not None and item in self.objects:
            return self.objects[item]

        return self._getSeries(item)

    _dataTypes = [np.float_, np.bool_, np.int_]
    def __setitem__(self, key, value):
        """
        Add series to DataMatrix in specified column.

        A Series/TimeSeries is conformed to the DataMatrix's index. Any
        other iterable must have the same length as the index. A value
        without ``__iter__`` is repeated once per row.
        """
        if not hasattr(value, '__iter__'):
            # Scalar: broadcast down the column
            value = np.repeat(value, len(self.index))
        elif isinstance(value, Series):
            if not value.index.equals(self.index):
                value = value.reindex(self.index)
            # no need to copy
            value = value.values
        else:
            assert(len(value) == len(self.index))

            if not isinstance(value, np.ndarray):
                value = np.array(value)
                # Strings are stored with object dtype
                if value.dtype.type == np.str_:
                    value = np.array(value, dtype=object)

        if self.values.dtype == np.object_:
            self._insert_object_dtype(key, value)
        else:
            self._insert_float_dtype(key, value)

    def _insert_float_dtype(self, key, value):
        """
        Store ``value`` under ``key`` in a frame whose values matrix is
        not object-typed.

        An existing column is overwritten in place. A new column whose
        dtype is not one of ``_dataTypes`` goes to the object block. Any
        other new column is inserted into the values matrix at the
        position given by ``searchsorted`` on the columns (or at the end
        if the key cannot be compared).
        """
        isObject = value.dtype not in self._dataTypes

        if key in self.columns:
            loc = self.columns.indexMap[key]
            self.values[:, loc] = value
        elif isObject:
            if self.objects is None:
                self.objects = DataMatrix({key : value},
                                          index=self.index)
            else:
                self.objects[key] = value
        elif len(self.columns) == 0:
            # Builtin float is what the removed `np.float` alias pointed to
            self.values = value.reshape((len(value), 1)).astype(float)
            self.columns = Index([key])
        else:
            try:
                loc = self.columns.searchsorted(key)
            except TypeError:
                loc = len(self.columns)

            if loc == self.values.shape[1]:
                newValues = np.c_[self.values, value]
                newColumns = Index(np.concatenate((self.columns, [key])))
            elif loc == 0:
                newValues = np.c_[value, self.values]
                newColumns = Index(np.concatenate(([key], self.columns)))
            else:
                newValues = np.c_[self.values[:, :loc], value,
                                  self.values[:, loc:]]
                toConcat = (self.columns[:loc], [key], self.columns[loc:])
                newColumns = Index(np.concatenate(toConcat))
            # values before columns: the columns setter checks the width
            self.values = newValues
            self.columns = newColumns

    def _insert_object_dtype(self, key, value):
        """
        Store ``value`` under ``key`` in an object-typed values matrix.

        An existing column is overwritten in place; a new one is inserted
        at the position given by ``searchsorted`` on the columns (or at
        the end if the key cannot be compared).
        """
        cols = self.columns

        if key in cols:
            self.values[:, cols.indexMap[key]] = value
            return

        if len(cols) == 0:
            self.values = value.reshape((len(value), 1)).copy()
            self.columns = Index([key])
            return

        try:
            loc = cols.searchsorted(key)
        except TypeError:
            loc = len(cols)

        matrix = self.values
        if loc == matrix.shape[1]:
            # append on the right
            newValues = np.c_[matrix, value]
            newColumns = Index(np.concatenate((cols, [key])))
        elif loc == 0:
            # prepend on the left
            newValues = np.c_[value, matrix]
            newColumns = Index(np.concatenate(([key], cols)))
        else:
            newValues = np.c_[matrix[:, :loc], value, matrix[:, loc:]]
            pieces = (cols[:loc], [key], cols[loc:])
            newColumns = Index(np.concatenate(pieces))

        # values before columns: the columns setter checks the width
        self.values = newValues
        self.columns = newColumns


    def __delitem__(self, key):
        """
        Delete column from DataMatrix

        Looks in the values columns first, then in the object block.

        Raises
        ------
        KeyError
            If ``key`` is in neither.
        """
        if key not in self.columns:
            if self.objects is not None and key in self.objects:
                del self.objects[key]
                return
            raise KeyError('%s' % key)

        loc = self.columns.indexMap[key]
        if loc == self.values.shape[1] - 1:
            # Last column: plain slices suffice
            newValues = self.values[:, :loc]
            newColumns = self.columns[:loc]
        else:
            after = loc + 1
            newValues = np.c_[self.values[:, :loc], self.values[:, after:]]
            newColumns = Index(np.concatenate((self.columns[:loc],
                                               self.columns[after:])))

        # values before columns: the columns setter checks the width
        self.values = newValues
        self.columns = newColumns

    def __iter__(self):
        """Iterate over columns of the frame."""
        # Only self.columns is iterated; object-block columns are not included
        return iter(self.columns)

    def __contains__(self, key):
        """True if ``key`` is a values column or an object-block column"""
        if key in self.columns:
            return True

        return self.objects is not None and key in self.objects

    def iteritems(self):
        """Return `iteritems()` of the column -> Series dict (`_series`)."""
        seriesByColumn = self._series
        return seriesByColumn.iteritems()

#-------------------------------------------------------------------------------
# Helper methods

    # For DataFrame compatibility
    def _getSeries(self, item=None, loc=None):
        """
        Build a Series from one column of `self.values`.

        Parameters
        ----------
        item : column label, optional
            Looked up in `self.columns.indexMap` when `loc` is not given
        loc : int, optional
            Position of the column in `self.values`; takes precedence
            over `item`

        Returns
        -------
        Series indexed by `self.index`

        Raises
        ------
        Exception
            If `loc` is None and `item` is not in the column index map
        """
        if loc is not None:
            position = loc
        else:
            try:
                position = self.columns.indexMap[item]
            except KeyError:
                raise Exception('%s not here!' % item)
        return Series(self.values[:, position], index=self.index)

    def _getSeriesDict(self):
        series = {}
        for i, col in enumerate(self.columns):
            series[col] = self._getSeries(loc=i)
        if self.objects is not None:
            for i, col in enumerate(self.objects.columns):
                series[col] = self.objects._getSeries(loc=i)

        return series
    _series = property(_getSeriesDict)

#-------------------------------------------------------------------------------
# Outputting

    def toString(self, buffer=sys.stdout, columns=None, colSpace=15,
                 nanRep='NaN', formatters=None, float_format=None):
        """
        Write a tabular text rendering of this DataMatrix to `buffer`.

        Parameters
        ----------
        buffer : file-like, default sys.stdout
            Object with a `write` method that receives the output
        columns : list-like, optional
            Columns to print; labels not contained in the frame are
            dropped. If None, all of `self.columns` are printed, followed
            by the columns of `self.objects` when present
        colSpace : int, default 15
            Passed to `common._pfixed` for every header and cell
        nanRep : string, default 'NaN'
            Passed through to `common._pfixed` as `nanRep`
        formatters : dict, optional
            Maps a column label to a one-argument function applied to each
            value of that column before it is written
        float_format : optional
            Passed through to `common._pfixed` as `float_format`
        """
        _pf = common._pfixed
        formatters = formatters or {}

        if columns is None:
            columns = self.columns
            values = self.values
            if self.objects:
                columns = list(columns) + list(self.objects.columns)
                values = np.column_stack((values.astype(object),
                                          self.objects.values))
        else:
            columns = [c for c in columns if c in self]
            values = self.asMatrix(columns)

        ident = lambda x: x

        # max() raises ValueError on an empty sequence, so fall back to a
        # zero-width label when the index has no entries
        labelWidths = [len(str(idx)) for idx in self.index] or [0]
        idxSpace = max(labelWidths) + 4

        if len(self.cols()) == 0:
            buffer.write('DataMatrix is empty!\n')
            buffer.write(repr(self.index))
        else:
            # Header row: blank cell above the index, then column labels
            buffer.write(_pf('', idxSpace))
            for h in columns:
                buffer.write(_pf(h, colSpace))
            buffer.write('\n')

            # One line per index entry
            for i, idx in enumerate(self.index):
                buffer.write(_pf(idx, idxSpace))
                for j, col in enumerate(columns):
                    formatter = formatters.get(col, ident)
                    buffer.write(_pf(formatter(values[i, j]), colSpace,
                                     float_format=float_format,
                                     nanRep=nanRep))
                buffer.write('\n')

    def info(self, buffer=sys.stdout):
        """
        Concise summary of a DataMatrix, used in __repr__ when very large.

        Writes the number of index entries (and the first/last index label
        when the index is non-empty), then one line per counted column with
        its count, then a dtype summary line.

        Parameters
        ----------
        buffer : file-like, default sys.stdout
            Destination of the summary text
        """
        # Python 2 print-chevron syntax; the trailing comma keeps the
        # range (or the bare newline) on the same output line
        print >> buffer, 'Index: %s entries' % len(self.index),
        if len(self.index) > 0:
            print >> buffer, ', %s to %s' % (self.index[0], self.index[-1])
        else:
            print >> buffer, ''

        # Nothing more to report when self.columns is empty
        if len(self.columns) == 0:
            print >> buffer, 'DataMatrix is empty!'
            print >> buffer, repr(self.index)
            return

        print >> buffer, 'Data columns:'
        # Width for the label field: longest column label plus padding
        space = max([len(str(k)) for k in self.cols()]) + 4

        # NOTE(review): count() is defined elsewhere; presumably the number
        # of non-null values per column -- confirm
        counts = self.count()

        cols = self.cols()
        assert(len(cols) == len(counts))

        columns = []
        for col, count in counts.iteritems():
            columns.append('%s%d  non-null values' %
                           (common._pfixed(col, space), count))

        # Placeholder; both branches below assign the final value
        dtypeLine = ''

        # Column count and dtype of the self.values block
        nf = len(self.columns)
        df = self.values.dtype

        if self.objects is not None:
            # Same two figures for the objects block
            no = len(self.objects.columns)
            do = self.objects.values.dtype
            dtypeLine = '\ndtypes: %s(%d), %s(%d)' % (df, nf, do, no)
        else:
            dtypeLine = '\ndtype: %s(%d)' % (df, nf)

        buffer.write('\n'.join(columns) + dtypeLine)


#-------------------------------------------------------------------------------
# Public methods

    def apply(self, func, axis=0):
        """
        Apply `func` to this DataMatrix.

        A NumPy ufunc is evaluated directly on `self.values` and the result
        is wrapped in a new DataMatrix with the same index, columns and
        objects. Any other callable is handed to `DataFrame.apply`.

        Parameters
        ----------
        func : function
            Function to apply to each column
        axis : int, default 0
            Forwarded to `DataFrame.apply`; not used for ufuncs

        Examples
        --------

            >>> df.apply(numpy.sqrt) --> DataMatrix
            >>> df.apply(numpy.sum) --> Series

        N.B.: Do NOT use functions that might toy with the index.
        """
        if len(self.cols()) == 0:
            # Nothing to apply to; hand back the same object
            return self

        if not isinstance(func, np.ufunc):
            return DataFrame.apply(self, func, axis=axis)

        transformed = func(self.values)
        return DataMatrix(data=transformed, index=self.index,
                          columns=self.columns, objects=self.objects)

    def applymap(self, func):
        """
        Apply a Python function to every element of `self.values`.

        Parameters
        ----------
        func : function
            Python function, returns a single value from a single value

        Returns
        -------
        DataMatrix
            Built from the mapped values with the same index and columns;
            `self.objects` is not passed along

        Note : try to avoid using this function if you can, very slow.
        """
        elementwise = np.frompyfunc(func, 1, 1)
        mapped = elementwise(self.values)
        try:
            # Try to restore the dtype of the original values
            mapped = mapped.astype(self.values.dtype)
        except Exception:
            # Best effort only: keep the result as produced above
            pass

        return DataMatrix(mapped, index=self.index, columns=self.columns)

    def append(self, other):
        """
        Glue together DataFrame objects having non-overlapping indices

        If either operand is falsy, a copy of the other is returned. When
        `other` is a DataMatrix whose columns equal this frame's columns,
        the value blocks are stacked directly; otherwise the work is
        delegated to the parent class implementation.

        Parameters
        ----------
        other : DataFrame
        """
        if not other:
            return self.copy()

        if not self:
            return other.copy()

        sameLayout = (isinstance(other, DataMatrix) and
                      self.columns.equals(other.columns))
        if not sameLayout:
            return super(DataMatrix, self).append(other)

        combinedIndex = Index(np.concatenate([self.index, other.index]))
        stacked = np.vstack((self.values, other.values))

        # Pick whichever side has objects, or append them if both do
        if other.objects is None:
            objects = self.objects
        elif self.objects is None:
            objects = other.objects
        else:
            objects = self.objects.append(other.objects)

        if objects:
            objects = objects.reindex(combinedIndex)

        return DataMatrix(stacked, combinedIndex, self.columns,
                          objects=objects)

    def asMatrix(self, columns=None):
        """
        Convert the DataMatrix to its NumPy array representation.

        Parameters
        ----------
        columns : list-like, optional
            Labels of the columns to use in producing the matrix. If None,
            a copy of `self.values` is returned, with the values of
            `self.objects` stacked to its right when objects are present.

        Returns
        -------
        ndarray
        """
        if columns is None:
            values = self.values.copy()

            if self.objects:
                values = np.column_stack((values, self.objects.values))

            return values

        if not isinstance(columns, Index):
            columns = Index(columns)

        values = self.values
        order = self.columns

        if self.objects:
            objColumns = self.objects.columns
            idxMap = objColumns.indexMap

            # Positions of the requested object columns, in request order.
            # The explicit int dtype keeps an empty selection valid as an
            # argument to take().
            indexer = np.array([idxMap[col] for col in columns
                                if col in idxMap], dtype=int)

            obj_values = self.objects.values.take(indexer, axis=1)
            values = np.column_stack((values, obj_values))

            # Label exactly the object columns that were stacked, in the
            # order they were stacked, so `order` has one label per column
            # of `values`. Listing every object column here would misalign
            # labels and data whenever only a subset (or a different
            # order) was requested.
            taken = np.asarray(objColumns).take(indexer)
            order = Index(np.concatenate((order, taken)))

        # now put in the right order
        # NOTE(review): _reorder_columns is defined elsewhere; presumably it
        # picks the columns of `values` labelled by `order` in the sequence
        # given by `columns` -- confirm
        values = _reorder_columns(values, order, columns)

        return values

    def cols(self):
        """
        Return the frame's column labels as a list.

        When `self.objects` has columns, the result is the union of
        `self.columns` with them; otherwise it is `self.columns` alone.
        """
        objects = self.objects
        if objects is None or len(objects.columns) == 0:
            return list(self.columns)
        return list(self.columns.union(objects.columns))

    def copy(self):
        """
        Return a new DataMatrix built from a copy of `self.values` and,
        when objects are present, a copy of `self.objects`.

        The index and columns are passed to the new frame as they are.
        """
        objects = self.objects.copy() if self.objects else None
        duplicated = self.values.copy()
        return DataMatrix(duplicated, index=self.index,
                          columns=self.columns, objects=objects)

    def cumsum(self, axis=0):
        """
        Return DataMatrix of cumulative sums over requested axis.

        For non-integer data, NaN entries are treated as zero while
        accumulating; positions that have not yet seen any non-NaN value
        along the axis are NaN in the result.

        Parameters
        ----------
        axis : {0, 1}
            Axis handed to `ndarray.cumsum`: 0 accumulates down each
            column, 1 accumulates across each row

        Returns
        -------
        y : DataMatrix
            Same index, columns and objects as this frame
        """
        y = np.array(self.values, subok=True)
        if issubclass(y.dtype.type, np.integer):
            # Integer data cannot hold NaN, so no masking is required
            result = y.cumsum(axis)
        else:
            mask = np.isnan(self.values)
            y[mask] = 0
            result = y.cumsum(axis)
            # `~` is the boolean complement; unary minus on boolean arrays
            # is rejected by current NumPy
            has_obs = (~mask).astype(int).cumsum(axis) > 0
            result[~has_obs] = np.nan

        return DataMatrix(result, index=self.index,
                          columns=self.columns, objects=self.objects)

    def fill(self, value=None, method='pad'):
        """
        Fill NaN values, either with a constant or using a fill method.

        With `value` given, null entries of `self.values` are replaced by
        it in a copy. Without it, each member Series is filled separately
        through its own `fill` method.

        Parameters
        ----------
        value : any kind (should be same type as array)
            Value to use to fill holes (e.g. 0)

        method : {'backfill', 'pad', None}
            Method to use for filling holes; forwarded to each Series
            when `value` is None

        Returns
        -------
        y : DataMatrix

        See also
        --------
        DataMatrix.reindex, DataMatrix.asfreq
        """
        if value is not None:
            # Constant fill on the values block
            if len(self.columns) == 0:
                return self

            filled = self.values.copy()
            filled.flat[common.isnull(filled.ravel())] = value

            if self.objects is None:
                objects = None
            else:
                objects = self.objects.copy()

            return DataMatrix(filled, index=self.index, columns=self.columns,
                              objects=objects)

        # Method-based fill, one Series at a time
        result = {}
        for col, s in self._series.iteritems():
            result[col] = s.fill(method=method, value=value)

        return DataMatrix(result, index=self.index, objects=self.objects)

    def xs(self, key):
        """
        Return one row of the DataMatrix as a Series.

        Parameters
        ----------
        key : some index contained in the index

        Returns
        -------
        Series
            Indexed by `self.columns`, with the matching row of
            `self.objects` appended when objects have columns

        Raises
        ------
        Exception
            If `key` is not in `self.index`
        """
        if key not in self.index:
            raise Exception('No cross-section for %s' % key)

        position = self.index.indexMap[key]
        row = Series(self.values[position, :].copy(), index=self.columns)

        objects = self.objects
        if objects is None or len(objects.columns) == 0:
            return row
        return row.append(objects.getXS(key))

    @property
    def T(self):
        """
        Return a DataMatrix with rows and columns switched.

        When objects are present, their transposed values are placed below
        the transposed numeric values and their column labels are added
        after `self.columns` in the new index.
        """
        if self.objects is None:
            return DataMatrix(data=self.values.T, index=self.columns,
                              columns=self.index)

        transposedObjects = self.objects.values.T
        transposedValues = self.values.T
        stacked = np.concatenate((transposedValues, transposedObjects),
                                 axis=0)
        labels = Index(np.concatenate((self.columns,
                                       self.objects.columns)))

        return DataMatrix(stacked, index=labels, columns=self.index)

    def shift(self, periods, offset=None, timeRule=None):
        """
        Shift the data of the DataMatrix by a number of periods.

        Without an offset the index is kept and the rows of values are
        moved, with the vacated rows set to NaN. With an offset the values
        are copied unchanged and the index itself is shifted.

        Parameters
        ----------
        periods : int (+ or -)
            Number of periods to move
        offset : DateOffset, optional
            Increment to use from datetools module
        timeRule : string
            Time rule to use by name; only consulted when `offset` is None

        Returns
        -------
        DataMatrix
            `self` itself when `periods` is 0
        """
        if periods == 0:
            return self

        if offset is None and timeRule is not None:
            offset = datetools.getOffset(timeRule)

        N = len(self)

        if offset is not None:
            newIndex = self.index.shift(periods, offset)
            newValues = self.values.copy()
        else:
            newIndex = self.index
            # Row i of the result is taken from row indexer[i]
            indexer = np.zeros(N, dtype=int)
            if periods > 0:
                indexer[periods:] = np.arange(N - periods)
                newValues = self.values.take(indexer, axis=0)
                # Leading rows have no source row
                newValues[:periods] = NaN
            else:
                indexer[:periods] = np.arange(-periods, N)
                newValues = self.values.take(indexer, axis=0)
                # Trailing rows have no source row
                newValues[periods:] = NaN

        shifted_objects = None
        if self.objects is not None:
            shifted_objects = self.objects.shift(periods, offset=offset,
                                                 timeRule=timeRule)
            shifted_objects.index = newIndex

        return DataMatrix(data=newValues, index=newIndex, columns=self.columns,
                          objects=shifted_objects)

    def cap(self, threshold):
        """
        Replace every value greater than `threshold` with `threshold`.

        Returns
        -------
        DataMatrix
            Same index, columns and objects as this frame
        """
        current = self.values
        capped = np.where(current > threshold, threshold, current)
        return DataMatrix(capped, index=self.index, columns=self.columns,
                          objects=self.objects)

    def floor(self, threshold):
        """
        Replace every value less than `threshold` with `threshold`.

        Returns
        -------
        DataMatrix
            Same index, columns and objects as this frame
        """
        current = self.values
        floored = np.where(current < threshold, threshold, current)
        return DataMatrix(floored, index=self.index, columns=self.columns,
                          objects=self.objects)

    def min(self, axis=0):
        """
        Return Series of minimums over requested axis.

        Non-finite entries (NaN and +/-inf) are replaced by +inf in a copy
        of the values before reducing, so they never win the minimum; a
        slice without any finite value therefore yields +inf.

        Parameters
        ----------
        axis : {0, 1}
            Axis handed to `ndarray.min`: 0 reduces down each column,
            1 reduces across each row

        Returns
        -------
        Series or TimeSeries
            Indexed by `self._get_agg_axis(axis)`
        """
        values = self.values.copy()
        # `~` is the boolean complement; unary minus on boolean arrays is
        # rejected by current NumPy
        np.putmask(values, ~np.isfinite(values), np.inf)
        return Series(values.min(axis), index=self._get_agg_axis(axis))

    def max(self, axis=0):
        """
        Return Series of maximums over requested axis.

        Non-finite entries (NaN and +/-inf) are replaced by -inf in a copy
        of the values before reducing, so they never win the maximum; a
        slice without any finite value therefore yields -inf.

        Parameters
        ----------
        axis : {0, 1}
            Axis handed to `ndarray.max`: 0 reduces down each column,
            1 reduces across each row

        Returns
        -------
        Series or TimeSeries
            Indexed by `self._get_agg_axis(axis)`
        """
        values = self.values.copy()
        # `~` is the boolean complement; unary minus on boolean arrays is
        # rejected by current NumPy
        np.putmask(values, ~np.isfinite(values), -np.inf)
        return Series(values.max(axis), index=self._get_agg_axis(axis))