Example #1
    def union_many(self, others):
        """
        A bit of a hack to accelerate unioning a collection of indexes
        """
        this = self

        for other in others:
            if not isinstance(this, DatetimeIndex):
                this = Index.union(this, other)
                continue

            if not isinstance(other, DatetimeIndex):
                try:
                    other = DatetimeIndex(other)
                except TypeError:
                    pass

            this, other = this._maybe_utc_convert(other)

            if this._can_fast_union(other):
                this = this._fast_union(other)
            else:
                tz = this.tz
                this = Index.union(this, other)
                if isinstance(this, DatetimeIndex):
                    this.tz = tz

        if this.freq is None:
            this.offset = to_offset(this.inferred_freq)
        return this
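
For comparison, a hedged sketch of the same folding loop through the modern public API (the dates are illustrative):

    import pandas as pd

    parts = [pd.date_range("2021-01-01", periods=3, freq="D"),
             pd.date_range("2021-01-04", periods=3, freq="D"),
             pd.date_range("2021-01-07", periods=3, freq="D")]

    # fold the collection into a single index, as union_many does internally
    combined = parts[0]
    for other in parts[1:]:
        combined = combined.union(other)

    print(combined.freq)  # contiguous pieces keep (or re-infer) the daily freq
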
Example #2
    def _reindex_columns(self, columns):
        if len(columns) == 0:
            return DataMatrix(index=self.index)

        if not isinstance(columns, Index):
            columns = Index(columns)

        if self.objects is not None:
            object_columns = columns.intersection(self.objects.columns)
            columns = columns - object_columns

            objects = self.objects._reindex_columns(object_columns)
        else:
            objects = None

        if len(columns) > 0 and len(self.columns) == 0:
            return DataMatrix(index=self.index, columns=columns,
                              objects=objects)

        indexer, mask = common.get_indexer(self.columns, columns, None)
        mat = self.values.take(indexer, axis=1)

        notmask = ~mask
        if len(mask) > 0:
            if notmask.any():
                if issubclass(mat.dtype.type, np.int_):
                    mat = mat.astype(float)
                elif issubclass(mat.dtype.type, np.bool_):
                    mat = mat.astype(float)

                common.null_out_axis(mat, notmask, 1)

        return DataMatrix(mat, index=self.index, columns=columns,
                          objects=objects)
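
The public counterpart today is DataFrame.reindex; a small illustrative sketch (column names made up) of the NaN fill and the int-to-float upcast that the code above handles explicitly:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})  # int64 columns
    out = df.reindex(columns=["a", "c"])           # "c" does not exist

    # the missing column is filled with NaN, so "a" is upcast to float64
    print(out.dtypes)
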
Example #3
    def union(self, other):
        """
        Specialized union for DateRange objects. When combining
        overlapping ranges with the same DateOffset, this is much
        faster than Index.union

        Parameters
        ----------
        other : DateRange or array-like

        Returns
        -------
        y : Index or DateRange
        """
        if not isinstance(other, DateRange) or other.offset != self.offset:
            return Index.union(self.view(Index), other)

        offset = self.offset

        # to make our life easier, "sort" the two ranges
        if self[0] <= other[0]:
            left, right = self, other
        else:
            left, right = other, self

        left_end = left[-1]
        right_start = right[0]

        # Only need to "adjoin", not overlap
        if (left_end + offset) >= right_start:
            return left._fast_union(right)
        else:
            return Index.union(self, other)
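
A minimal sketch of the adjoining-range fast path using today's public API (dates are illustrative):

    import pandas as pd

    left = pd.date_range("2021-01-01", "2021-01-05", freq="D")
    right = pd.date_range("2021-01-06", "2021-01-10", freq="D")

    # left[-1] + offset == right[0]: the ranges adjoin, so the union can be
    # built by extending the endpoints instead of running a set operation
    merged = left.union(right)
    print(merged.freq)  # the shared daily offset is kept
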
Example #4
    def get_loc(self, key, method=None, tolerance=None):
        """
        Get integer location for requested label

        Returns
        -------
        loc : int
        """
        if tolerance is not None:
            # try converting tolerance now, so errors don't get swallowed by
            # the try/except clauses below
            tolerance = self._convert_tolerance(tolerance)

        if _is_convertible_to_td(key):
            key = Timedelta(key)
            return Index.get_loc(self, key, method, tolerance)

        try:
            return Index.get_loc(self, key, method, tolerance)
        except (KeyError, ValueError, TypeError):
            try:
                return self._get_string_slice(key)
            except (TypeError, KeyError, ValueError):
                pass

            try:
                stamp = Timedelta(key)
                return Index.get_loc(self, stamp, method, tolerance)
            except (KeyError, ValueError):
                raise KeyError(key)
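
A quick usage sketch against the public TimedeltaIndex (keys are illustrative); strings convertible to Timedelta are coerced before the lookup:

    import pandas as pd

    tdi = pd.to_timedelta(["1 days", "2 days", "3 days"])
    print(tdi.get_loc("2 days"))                # 1, via Timedelta coercion
    print(tdi.get_loc(pd.Timedelta("3 days")))  # 2
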
Example #5
    def test_format_with_name_time_info(self):
        # bug I fixed 12/20/2011
        inc = timedelta(hours=4)
        dates = Index([dt + inc for dt in self.dateIndex], name='something')

        formatted = dates.format(name=True)
        self.assert_(formatted[0] == 'something')
Example #6
    def rename_axis(self, mapper, axis=1):
        new_axis = Index([mapper(x) for x in self.axes[axis]])
        new_axis._verify_integrity()

        new_axes = list(self.axes)
        new_axes[axis] = new_axis
        return BlockManager(self.blocks, new_axes)
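
The public analogue is DataFrame.rename, which likewise builds a fresh column Index by applying the mapper to every label; a tiny sketch:

    import pandas as pd

    df = pd.DataFrame({"a": [1], "b": [2]})
    print(df.rename(columns=str.upper).columns)  # Index(['A', 'B'], dtype='object')
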
Example #7
    def test_join_non_int_index(self):
        other = Index([3, 6, 7, 8, 10], dtype=object)

        outer = self.index.join(other, how='outer')
        outer2 = other.join(self.index, how='outer')
        expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14,
                          16, 18], dtype=object)
        self.assert_(outer.equals(outer2))
        self.assert_(outer.equals(expected))

        inner = self.index.join(other, how='inner')
        inner2 = other.join(self.index, how='inner')
        expected = Index([6, 8, 10], dtype=object)
        self.assert_(inner.equals(inner2))
        self.assert_(inner.equals(expected))

        left = self.index.join(other, how='left')
        self.assert_(left.equals(self.index))

        left2 = other.join(self.index, how='left')
        self.assert_(left2.equals(other))

        right = self.index.join(other, how='right')
        self.assert_(right.equals(other))

        right2 = other.join(self.index, how='right')
        self.assert_(right2.equals(self.index))
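
For reference, a minimal sketch of Index.join itself (the values are made up):

    import pandas as pd

    left = pd.Index([0, 2, 4, 6], dtype=object)
    right = pd.Index([3, 6, 7], dtype=object)

    print(left.join(right, how="outer"))  # sorted union of both sides
    print(left.join(right, how="inner"))  # only the shared labels: [6]
    print(left.join(right, how="left"))   # keeps the calling index as-is
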
Example #8
    def get_loc(self, key, method=None):
        """
        Get integer location for requested label

        Returns
        -------
        loc : int
        """
        if _is_convertible_to_td(key):
            key = Timedelta(key)
            return Index.get_loc(self, key, method=method)

        try:
            return Index.get_loc(self, key, method=method)
        except (KeyError, ValueError, TypeError):
            try:
                return self._get_string_slice(key)
            except (TypeError, KeyError, ValueError):
                pass

            try:
                stamp = Timedelta(key)
                return Index.get_loc(self, stamp, method=method)
            except (KeyError, ValueError):
                raise KeyError(key)
Example #9
    def _make_labels(self):
        if self._was_factor:  # pragma: no cover
            raise Exception('Should not call this method grouping by level')
        else:
            values = self.grouper
            if values.dtype != np.object_:
                values = values.astype('O')

            # khash
            rizer = lib.Factorizer(len(values))
            labels, counts = rizer.factorize(values, sort=False)

            uniques = Index(rizer.uniques, name=self.name)
            if self.sort and len(counts) > 0:
                sorter = uniques.argsort()
                reverse_indexer = np.empty(len(sorter), dtype=np.int32)
                reverse_indexer.put(sorter, np.arange(len(sorter)))

                mask = labels < 0
                labels = reverse_indexer.take(labels)
                np.putmask(labels, mask, -1)

                uniques = uniques.take(sorter)
                counts = counts.take(sorter)

            self._labels = labels
            self._group_index = uniques
            self._counts = counts
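
The khash-based factorizer above is exposed today as pd.factorize; a compact sketch of the codes/uniques split with the sort step included:

    import numpy as np
    import pandas as pd

    values = np.array(["b", "a", "c", "b"], dtype=object)
    codes, uniques = pd.factorize(values, sort=True)

    print(codes)    # [1 0 2 1] -- labels remapped into the sorted uniques
    print(uniques)  # ['a' 'b' 'c']
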
Example #10
    def test_format_datetime_with_time(self):
        t = Index([datetime(2012, 2, 7), datetime(2012, 2, 7, 23)])

        result = t.format()
        expected = ['2012-02-07 00:00:00', '2012-02-07 23:00:00']
        self.assert_(len(result) == 2)
        self.assertEquals(result, expected)
Example #11
    def test_format(self):
        self._check_method_works(Index.format)

        index = Index([datetime.now()])
        formatted = index.format()
        expected = str(index[0])
        self.assertEquals(formatted, expected)
Example #12
    def test_intersect_str_dates(self):
        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

        i1 = Index(dt_dates, dtype=object)
        i2 = Index(["aa"], dtype=object)
        res = i2.intersection(i1)

        self.assert_(len(res) == 0)
Example #13
    def test_intersection(self):
        other = Index([1, 2, 3, 4, 5])
        result = self.index.intersection(other)
        expected = np.sort(np.intersect1d(self.index.values, other.values))
        self.assert_(np.array_equal(result, expected))

        result = other.intersection(self.index)
        expected = np.sort(np.intersect1d(self.index.values, other.values))
        self.assert_(np.array_equal(result, expected))
Example #14
    def test_reindex_level(self):
        idx = Index(["one"])

        target, indexer = self.index.reindex(idx, level="second")
        target2, indexer2 = idx.reindex(self.index, level="second")

        exp_index = self.index.join(idx, level="second", how="left")
        self.assert_(target.equals(exp_index))
        self.assert_(target2.equals(exp_index))
Example #15
    def test_reindex_level(self):
        idx = Index(['one'])

        target, indexer = self.index.reindex(idx, level='second')
        target2, indexer2 = idx.reindex(self.index, level='second')

        exp_index = self.index.join(idx, level='second', how='left')
        self.assert_(target.equals(exp_index))
        self.assert_(target2.equals(exp_index))
Example #16
    def intersection(self, other):
        """
        Specialized intersection for DatetimeIndex objects. May be much faster
        than Index.intersection

        Parameters
        ----------
        other : DatetimeIndex or array-like

        Returns
        -------
        y : Index or DatetimeIndex
        """
        if not isinstance(other, DatetimeIndex):
            try:
                other = DatetimeIndex(other)
            except TypeError:
                pass
            result = Index.intersection(self, other)
            if isinstance(result, DatetimeIndex):
                if result.freq is None:
                    result.offset = to_offset(result.inferred_freq)
            return result

        elif (
            other.offset is None
            or self.offset is None
            or other.offset != self.offset
            or not other.offset.isAnchored()
            or (not self.is_monotonic or not other.is_monotonic)
        ):
            result = Index.intersection(self, other)
            if isinstance(result, DatetimeIndex):
                if result.freq is None:
                    result.offset = to_offset(result.inferred_freq)
            return result

        if len(self) == 0:
            return self
        if len(other) == 0:
            return other
        # to make our life easier, "sort" the two ranges
        if self[0] <= other[0]:
            left, right = self, other
        else:
            left, right = other, self

        end = min(left[-1], right[-1])
        start = right[0]

        if end < start:
            return type(self)(data=[])
        else:
            lslice = slice(*left.slice_locs(start, end))
            left_chunk = left.values[lslice]
            return self._view_like(left_chunk)
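
A short sketch of the monotonic fast path through the public API (dates are illustrative):

    import pandas as pd

    a = pd.date_range("2021-01-01", periods=10, freq="D")
    b = pd.date_range("2021-01-05", periods=10, freq="D")

    # same anchored frequency and both sides monotonic, so the overlap can
    # be carved out with two slice locations instead of a set intersection
    print(a.intersection(b))  # 2021-01-05 through 2021-01-10
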
Example #17
def _sort_labels(uniques, left, right):
    if not isinstance(uniques, np.ndarray):
        # tuplesafe
        uniques = Index(uniques).values

    sorter = uniques.argsort()

    reverse_indexer = np.empty(len(sorter), dtype=np.int32)
    reverse_indexer.put(sorter, np.arange(len(sorter)))
    return reverse_indexer.take(left), reverse_indexer.take(right)
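
The argsort/put pair builds an inverse permutation; a self-contained numpy sketch of the relabelling trick used above:

    import numpy as np

    uniques = np.array(["b", "c", "a"], dtype=object)
    sorter = uniques.argsort()            # positions that would sort uniques

    reverse_indexer = np.empty(len(sorter), dtype=np.intp)
    reverse_indexer.put(sorter, np.arange(len(sorter)))

    left = np.array([0, 2, 1])            # codes into the unsorted uniques
    print(reverse_indexer.take(left))     # [1 0 2]: codes into the sorted uniques
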
Example #18
    def test_append_multiple(self):
        index = Index(["a", "b", "c", "d", "e", "f"])

        foos = [index[:2], index[2:4], index[4:]]
        result = foos[0].append(foos[1:])
        self.assert_(result.equals(index))

        # empty
        result = index.append([])
        self.assert_(result.equals(index))
Example #19
    def test_slice_locs(self):
        idx = Index([0, 1, 2, 5, 6, 7, 9, 10])
        n = len(idx)

        self.assertEquals(idx.slice_locs(start=2), (2, n))
        self.assertEquals(idx.slice_locs(start=3), (3, n))
        self.assertEquals(idx.slice_locs(3, 8), (3, 6))
        self.assertEquals(idx.slice_locs(5, 10), (3, n))
        self.assertEquals(idx.slice_locs(end=8), (0, 6))
        self.assertEquals(idx.slice_locs(end=9), (0, 7))
Example #20
    def test_append_multiple(self):
        index = Index(['a', 'b', 'c', 'd', 'e', 'f'])

        foos = [index[:2], index[2:4], index[4:]]
        result = foos[0].append(foos[1:])
        self.assert_(result.equals(index))

        # empty
        result = index.append([])
        self.assert_(result.equals(index))
Example #21
    def test_union_noncomparable(self):
        from datetime import datetime, timedelta
        # corner case, non-Int64Index
        now = datetime.now()
        other = Index([now + timedelta(i) for i in xrange(4)])
        result = self.index.union(other)
        expected = np.concatenate((self.index, other))
        self.assert_(np.array_equal(result, expected))

        result = other.union(self.index)
        expected = np.concatenate((other, self.index))
        self.assert_(np.array_equal(result, expected))
Example #22
    def test_append_empty_preserve_name(self):
        left = Index([], name="foo")
        right = Index([1, 2, 3], name="foo")

        result = left.append(right)
        self.assert_(result.name == "foo")

        left = Index([], name="foo")
        right = Index([1, 2, 3], name="bar")

        result = left.append(right)
        self.assert_(result.name is None)
Example #23
    def rename_items(self, mapper, copydata=True):
        new_items = Index([mapper(x) for x in self.items])
        new_items._verify_integrity()

        new_blocks = []
        for block in self.blocks:
            newb = block.copy(deep=copydata)
            newb.set_ref_items(new_items, maybe_rename=True)
            new_blocks.append(newb)
        new_axes = list(self.axes)
        new_axes[0] = new_items
        return BlockManager(new_blocks, new_axes)
Example #24
    def union(self, other):
        if isinstance(other, DateRange) and other.offset == self.offset:
            # overlap condition
            if self[-1] >= other[0] and other[-1] >= self[0]:
                start = min(self[0], other[0])
                end = max(self[-1], other[-1])

                return DateRange(start, end, offset=self.offset)
            else:
                return Index.union(self, other)
        else:
            return Index.union(self, other)
Example #25
    def _set_grouper(self, obj, sort=False):
        """
        Given an object and the specifications, set up the internal
        grouper for this particular specification

        Parameters
        ----------
        obj : the subject object

        """

        # NOTE: the following code is based on the base Grouper class with
        #       additional hook to specify custom sorter

        if self.key is not None and self.level is not None:
            raise ValueError(
                "The Grouper cannot specify both a key and a level!")

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            if key not in obj._info_axis:
                raise KeyError("The grouper name {0} is not found".format(key))
            ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax.get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(
                            "The level {0} is not valid".format(level))

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic:
            # The following line is different from the base class for
            # possible extension.
            ax, indexer = self._make_sorter(ax)
            self.indexer = indexer
            obj = obj.take(indexer, axis=self.axis, convert=False,
                           is_copy=False)

        self.obj = obj
        self.grouper = ax
        return self.grouper
Example #26
    def test_isin(self):
        values = ['foo', 'bar']

        idx = Index(['qux', 'baz', 'foo', 'bar'])
        result = idx.isin(values)
        expected = np.array([False, False, True, True])
        self.assert_(np.array_equal(result, expected))

        # empty, return dtype bool
        idx = Index([])
        result = idx.isin(values)
        self.assert_(len(result) == 0)
        self.assert_(result.dtype == np.bool_)
Example #27
    def test_slice_locs(self):
        idx = Index([0, 1, 2, 5, 6, 7, 9, 10])
        n = len(idx)

        self.assertEquals(idx.slice_locs(start=2), (2, n))
        self.assertEquals(idx.slice_locs(start=3), (3, n))
        self.assertEquals(idx.slice_locs(3, 8), (3, 6))
        self.assertEquals(idx.slice_locs(5, 10), (3, n))
        self.assertEquals(idx.slice_locs(end=8), (0, 6))
        self.assertEquals(idx.slice_locs(end=9), (0, 7))

        idx2 = idx[::-1]
        self.assertRaises(KeyError, idx2.slice_locs, 8, 2)
        self.assertRaises(KeyError, idx2.slice_locs, 7, 3)
Example #28
    def __setstate__(self, aug_state):
        """Necessary for making this object picklable"""
        index_state = aug_state[:1]
        offset = aug_state[1]

        # for backwards compatibility
        if len(aug_state) > 2:
            tzinfo = aug_state[2]
        else: # pragma: no cover
            tzinfo = None

        self.offset = offset
        self.tzinfo = tzinfo
        Index.__setstate__(self, *index_state)
Example #29
    def _init_dict(self, data, index, columns, dtype=None):
        # pre-filter out columns if we passed it
        if columns is not None:
            columns = ensure_index(columns)
            data = {k: v for k, v in compat.iteritems(data) if k in columns}
        else:
            keys = com.dict_keys_to_ordered_list(data)
            columns = Index(keys)

        if index is None:
            index = extract_index(list(data.values()))

        def sp_maker(x):
            return SparseArray(x, kind=self._default_kind,
                               fill_value=self._default_fill_value,
                               copy=True, dtype=dtype)
        sdict = {}
        for k, v in compat.iteritems(data):
            if isinstance(v, Series):
                # Force alignment, no copy necessary
                if not v.index.equals(index):
                    v = v.reindex(index)

                if not isinstance(v, SparseSeries):
                    v = sp_maker(v.values)
            elif isinstance(v, SparseArray):
                v = v.copy()
            else:
                if isinstance(v, dict):
                    v = [v.get(i, np.nan) for i in index]

                v = sp_maker(v)

            if index is not None and len(v) != len(index):
                msg = "Length of passed values is {}, index implies {}"
                raise ValueError(msg.format(len(v), len(index)))
            sdict[k] = v

        if len(columns.difference(sdict)):
            # TODO: figure out how to handle this case, all nan's?
            # add in any other columns we want to have (completeness)
            nan_arr = np.empty(len(index), dtype='float64')
            nan_arr.fill(np.nan)
            nan_arr = SparseArray(nan_arr, kind=self._default_kind,
                                  fill_value=self._default_fill_value,
                                  copy=False)
            sdict.update((c, nan_arr) for c in columns if c not in sdict)

        return to_manager(sdict, columns, index)
Example #30
    def union(self, other):
        """
        Specialized union for TimedeltaIndex objects. When combining
        overlapping ranges with the same DateOffset, this is much
        faster than Index.union

        Parameters
        ----------
        other : TimedeltaIndex or array-like

        Returns
        -------
        y : Index or TimedeltaIndex
        """
        self._assert_can_do_setop(other)
        if not isinstance(other, TimedeltaIndex):
            try:
                other = TimedeltaIndex(other)
            except (TypeError, ValueError):
                pass
        this, other = self, other

        if this._can_fast_union(other):
            return this._fast_union(other)
        else:
            result = Index.union(this, other)
            if isinstance(result, TimedeltaIndex):
                if result.freq is None:
                    result.freq = to_offset(result.inferred_freq)
            return result
Example #31
    def shift(self, n, offset=None):
        """
        Specialized shift which produces a DateRange

        Parameters
        ----------
        n : int
            Periods to shift by
        offset : DateOffset or timedelta-like, optional

        Returns
        -------
        shifted : DateRange
        """
        if offset is not None and offset != self.offset:
            return Index.shift(self, n, offset)

        if n == 0:
            # immutable so OK
            return self

        start = self[0] + n * self.offset
        end = self[-1] + n * self.offset
        return DateRange(start, end, offset=self.offset, name=self.name)
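
The same endpoint arithmetic survives in the public DatetimeIndex.shift; an illustrative sketch:

    import pandas as pd

    rng = pd.date_range("2021-01-01", periods=3, freq="D")
    # shifting by whole periods only moves the endpoints; no values are scanned
    print(rng.shift(2))  # 2021-01-03, 2021-01-04, 2021-01-05
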
Example #32
    def get_value(self, series, key):
        """
        Fast lookup of value from 1-dimensional ndarray. Only use this if you
        know what you're doing
        """
        try:
            return Index.get_value(self, series, key)
        except KeyError:

            try:
                loc = self._get_string_slice(key)
                return series[loc]
            except (TypeError, ValueError, KeyError):
                pass

            if isinstance(key, time):
                locs = self._indices_at_time(key)
                return series.take(locs)

            stamp = Timestamp(key)
            try:
                return self._engine.get_value(series, stamp)
            except KeyError:
                raise KeyError(stamp)
Example #33
    def get_value(self, series, key):
        """
        Fast lookup of value from 1-dimensional ndarray. Only use this if you
        know what you're doing
        """

        if _is_convertible_to_td(key):
            key = Timedelta(key)
            return self.get_value_maybe_box(series, key)

        try:
            return _maybe_box(self, Index.get_value(self, series, key), series,
                              key)
        except KeyError:
            try:
                loc = self._get_string_slice(key)
                return series[loc]
            except (TypeError, ValueError, KeyError):
                pass

            try:
                return self.get_value_maybe_box(series, key)
            except (TypeError, ValueError, KeyError):
                raise KeyError(key)
Example #34
    def shift(self, n, freq=None):
        """
        Specialized shift which produces a DatetimeIndex

        Parameters
        ----------
        n : int
            Periods to shift by
        freq : DateOffset or timedelta-like, optional

        Returns
        -------
        shifted : DatetimeIndex
        """
        if freq is not None and freq != self.offset:
            if isinstance(freq, basestring):
                freq = to_offset(freq)
            result = Index.shift(self, n, freq)
            result.tz = self.tz

            return result

        if n == 0:
            # immutable so OK
            return self

        if self.offset is None:
            raise ValueError("Cannot shift with no offset")

        start = self[0] + n * self.offset
        end = self[-1] + n * self.offset
        return DatetimeIndex(start=start,
                             end=end,
                             freq=self.offset,
                             name=self.name,
                             tz=self.tz)
Example #35
    def _wrap_result(self, result, **kwargs):

        # leave as it is to keep extract and get_dummies results;
        # these can be merged into _wrap_result_expand in v0.17
        from pandas.core.series import Series
        from pandas.core.frame import DataFrame
        from pandas.core.index import Index

        if not hasattr(result, 'ndim'):
            return result
        name = kwargs.get('name') or getattr(result, 'name',
                                             None) or self.series.name

        if result.ndim == 1:
            if isinstance(self.series, Index):
                # if result is a boolean np.array, return the np.array
                # instead of wrapping it into a boolean Index (GH 8875)
                if is_bool_dtype(result):
                    return result
                return Index(result, name=name)
            return Series(result, index=self.series.index, name=name)
        else:
            assert result.ndim < 3
            return DataFrame(result, index=self.series.index)
Example #36
    def test_join_level(self):
        def _check_how(other, how):
            join_index, lidx, ridx = other.join(self.index,
                                                how=how,
                                                level='second',
                                                return_indexers=True)

            exp_level = other.join(self.index.levels[1], how=how)
            self.assert_(join_index.levels[0].equals(self.index.levels[0]))
            self.assert_(join_index.levels[1].equals(exp_level))

            # pare down levels
            mask = np.array([x[1] in exp_level for x in self.index],
                            dtype=bool)
            exp_values = self.index.values[mask]
            self.assert_(np.array_equal(join_index.values, exp_values))

            if how in ('outer', 'inner'):
                join_index2, ridx2, lidx2 = \
                    self.index.join(other, how=how, level='second',
                                    return_indexers=True)

                self.assert_(join_index.equals(join_index2))
                self.assert_(np.array_equal(lidx, lidx2))
                self.assert_(np.array_equal(ridx, ridx2))
                self.assert_(np.array_equal(join_index2.values, exp_values))

        def _check_all(other):
            _check_how(other, 'outer')
            _check_how(other, 'inner')
            _check_how(other, 'left')
            _check_how(other, 'right')

        _check_all(Index(['three', 'one', 'two']))
        _check_all(Index(['one']))
        _check_all(Index(['one', 'three']))

        # some corner cases
        idx = Index(['three', 'one', 'two'])
        result = idx.join(self.index, level='second')
        self.assert_(isinstance(result, MultiIndex))

        self.assertRaises(Exception, self.index.join, self.index, level=1)
Example #37
    def get_chunk(self, rows=None):
        if rows is not None and self.skip_footer:
            raise ValueError('skip_footer not supported for iteration')

        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        if len(content) == 0:  # pragma: no cover
            if self.index_col is not None:
                if np.isscalar(self.index_col):
                    index = Index([], name=self.index_name)
                else:
                    index = MultiIndex.from_arrays([[]] * len(self.index_col),
                                                   names=self.index_name)
            else:
                index = Index([])

            return DataFrame(index=index, columns=self.columns)

        zipped_content = list(lib.to_object_array(content).T)

        # pop the index column(s) out of the parsed content if one was
        # specified; otherwise fall back to a default integer index below
        if self.index_col is not None:
            if np.isscalar(self.index_col):
                index = zipped_content.pop(self.index_col)
            else:  # given a list of index
                index = []
                for idx in self.index_col:
                    index.append(zipped_content[idx])
                # remove index items from content and columns, don't pop in loop
                for i in reversed(sorted(self.index_col)):
                    zipped_content.pop(i)

            if np.isscalar(self.index_col):
                if self.parse_dates:
                    index = lib.try_parse_dates(index, parser=self.date_parser)
                index, na_count = _convert_types(index, self.na_values)
                index = Index(index, name=self.index_name)
                if self.verbose and na_count:
                    print 'Found %d NA values in the index' % na_count
            else:
                arrays = []
                for arr in index:
                    if self.parse_dates:
                        arr = lib.try_parse_dates(arr, parser=self.date_parser)
                    arr, _ = _convert_types(arr, self.na_values)
                    arrays.append(arr)
                index = MultiIndex.from_arrays(arrays, names=self.index_name)
        else:
            index = Index(np.arange(len(content)))

        if not index._verify_integrity():
            dups = index.get_duplicates()
            idx_str = 'Index' if not self.implicit_idx else 'Implicit index'
            err_msg = ('%s (columns %s) have duplicate values %s' %
                       (idx_str, self.index_col, str(dups)))
            raise Exception(err_msg)

        if len(self.columns) != len(zipped_content):
            raise Exception('wrong number of columns')

        data = dict((k, v) for k, v in izip(self.columns, zipped_content))

        # apply converters
        for col, f in self.converters.iteritems():
            if isinstance(col, int) and col not in self.columns:
                col = self.columns[col]
            data[col] = lib.map_infer(data[col], f)

        data = _convert_to_ndarrays(data, self.na_values, self.verbose)

        return DataFrame(data=data, columns=self.columns, index=index)
Example #38
def _default_index(n):
    from pandas.core.index import Index
    return Index(np.arange(n))
Example #39
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None,
          timezones=None):
    """
    Create empty DataFrame to assign into

    In the simplest case, will return a Pandas dataframe of the given size,
    with columns of the given names and types. The second return value `views`
    is a dictionary of numpy arrays into which you can assign values that
    show up in the dataframe.

    For categorical columns, you get two views to assign into: if the
    column name is "col", you get both "col" (the category codes) and
    "col-catdef" (the category labels).

    For a single categorical index, you should use the `.set_categories`
    method of the appropriate "-catdef" column, passing an Index of values

    ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)``

    Multi-indexes work a lot like categoricals, even if the types of each
    index are not themselves categories, and will also have "-catdef" entries
    in the views. However, these will be Dummy instances, providing only a
    ``.set_categories`` method, to be used as above.

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        applies to non-categorical columns. If there are only categorical
        columns, an empty string or None will do.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo']}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integer, `{'col': 5}`,
        will generate temporary labels using range. If None, or column name
        is missing, will assume 16-bit integers (a reasonable default).
    cols: list of labels
        assigned column names, including categorical ones.
    index_types: list of str
        For one or more index columns, make them have this type. See general
        description, above, for caveats about multi-indexing. If None, the
        index will be the default RangeIndex.
    index_names: list of str
        Names of the index column(s), if using
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas series;
        the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - list of numpy views, in order, of the columns of the dataframe. Assign
        to this.
    """
    views = {}
    timezones = timezones or {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def cat(col):
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            return RangeIndex(0, cats[col])
        else:  # explicit labels list
            return cats[col]

    df = OrderedDict()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[six.text_type(col)] = Categorical([], categories=cat(col),
                                                 fastpath=True)
        else:
            d = np.empty(0, dtype=t)
            if d.dtype.kind == "M" and six.text_type(col) in timezones:
                d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
            df[six.text_type(col)] = d

    df = DataFrame(df)
    if not index_types:
        index = RangeIndex(size)
    elif len(index_types) == 1:
        t, col = index_types[0], index_names[0]
        if col is None:
            raise ValueError('If using an index, must give an index name')
        if str(t) == 'category':
            c = Categorical([], categories=cat(col), fastpath=True)
            vals = np.zeros(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            index._data._codes = vals
            views[col] = vals
            views[col+'-catdef'] = index._data
        else:
            d = np.empty(size, dtype=t)
            index = Index(d)
            views[col] = index.values
    else:
        index = MultiIndex([[]], [[]])
        # index = MultiIndex.from_arrays(indexes)
        index._levels = list()
        index._labels = list()
        for i, col in enumerate(index_names):
            index._levels.append(Index([None]))

            def set_cats(values, i=i, col=col, **kwargs):
                values.name = col
                if index._levels[i][0] is None:
                    index._levels[i] = values
                elif not index._levels[i].equals(values):
                    raise RuntimeError("Different dictionaries encountered"
                                       " while building categorical")

            x = Dummy()
            x._set_categories = set_cats

            d = np.zeros(size, dtype=int)
            index._labels.append(d)
            views[col] = d
            views[col+'-catdef'] = x

    axes = [df._data.axes[0], index]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code, categories=categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            new_shape = (size, )
            values = np.empty(shape=new_shape, dtype="M8[ns]")
            new_block = block.make_block_same_class(
                    values=values, dtype=block.values.dtype)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col+'-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = np.asarray(block.values, dtype='M8[ns]')
            else:
                views[col] = block.values[i]

    if index_names:
        df.index.names = [
            None if re.match(r'__index_level_\d+__', n) else n
            for n in index_names
        ]
    return df, views
Example #40
    def asobject(self):
        from pandas.core.index import Index
        return Index(self._box_values(self.asi8), name=self.name, dtype=object)
Example #41
    def groupby_agg(self, by, axis, agg, groupby_args, **kwargs):
        # Currently we only expect 'by' to be a projection of the same frame.
        # If 'by' holds a list of columns/series, then we create such projection
        # to re-use code.
        if not isinstance(by, DFAlgQueryCompiler):
            if is_list_like(by):
                by_cols = []
                by_frames = []
                for obj in by:
                    if isinstance(obj, str):
                        by_cols.append(obj)
                    elif hasattr(obj, "_query_compiler"):
                        by_frames.append(obj._query_compiler._modin_frame)
                    else:
                        raise NotImplementedError("unsupported groupby args")
                by_cols = Index.__new__(Index,
                                        data=by_cols,
                                        dtype=self.columns.dtype)
                by_frame = self.mask(col_indices=by_cols)
                if by_frames:
                    by_frame = by_frame._concat(axis=1,
                                                other_modin_frames=by_frames,
                                                ignore_index=True)
            else:
                raise NotImplementedError("unsupported groupby args")
        else:
            by_frame = by._modin_frame

        if axis != 0:
            raise NotImplementedError("groupby is supported for axis = 0 only")

        base = by_frame._find_common_projections_base(self)
        if base is None:
            raise NotImplementedError("unsupported groupby args")

        if groupby_args["level"] is not None:
            raise NotImplementedError("levels are not supported for groupby")

        groupby_cols = by_frame.columns.tolist()
        agg_cols = [col for col in self.columns if col not in by_frame.columns]

        # Create new base where all required columns are computed. We don't allow
        # complex expressions to be a group key or an aggregate operand.
        assert isinstance(by_frame._op, TransformNode), "unexpected by_frame"
        exprs = OrderedDict(((col, by_frame.ref(col)) for col in groupby_cols))
        exprs.update(((col, self.ref(col)) for col in agg_cols))
        exprs = translate_exprs_to_base(exprs, base)
        base_cols = Index.__new__(Index,
                                  data=list(exprs.keys()),
                                  dtype=self.columns.dtype)
        base = self.__constructor__(
            columns=base_cols,
            dtypes=self._dtypes_for_exprs(exprs),
            op=TransformNode(base, exprs, fold=True),
            index_cols=None,
            force_execution_mode=self._force_execution_mode,
        )

        new_columns = []
        index_cols = None

        if groupby_args["as_index"]:
            index_cols = groupby_cols.copy()
        else:
            new_columns = groupby_cols.copy()

        new_dtypes = by_frame._dtypes[groupby_cols].tolist()

        agg_exprs = OrderedDict()
        if isinstance(agg, str):
            for col in agg_cols:
                agg_exprs[col] = AggregateExpr(agg, base.ref(col))
        else:
            assert isinstance(agg, dict), "unsupported aggregate type"
            multiindex = any(isinstance(v, list) for v in agg.values())
            for k, v in agg.items():
                if isinstance(v, list):
                    for item in v:
                        agg_exprs[(k, item)] = AggregateExpr(item, base.ref(k))
                else:
                    col_name = (k, v) if multiindex else k
                    agg_exprs[col_name] = AggregateExpr(v, base.ref(k))
        new_columns.extend(agg_exprs.keys())
        new_dtypes.extend((x._dtype for x in agg_exprs.values()))
        new_columns = Index.__new__(Index,
                                    data=new_columns,
                                    dtype=self.columns.dtype)

        new_op = GroupbyAggNode(base, groupby_cols, agg_exprs, groupby_args)
        new_frame = self.__constructor__(
            columns=new_columns,
            dtypes=new_dtypes,
            op=new_op,
            index_cols=index_cols,
            force_execution_mode=self._force_execution_mode,
        )

        return new_frame
Example #42
    def _wrap_union_result(self, other, result):
        # If we are here, _can_fast_union is false or other is not a
        # DateRange, so their union has to be an Index.
        name = self.name if self.name == other.name else None
        return Index(result, name=name)
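
The name rule is easy to confirm from the public API; a tiny sketch:

    import pandas as pd

    a = pd.Index([1, 2], name="x")
    print(a.union(pd.Index([2, 3], name="x")).name)  # 'x'  -- names agree
    print(a.union(pd.Index([2, 3], name="y")).name)  # None -- names differ
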
Example #43
    def __reduce__(self):
        """Necessary for making this object picklable"""
        a, b, state = Index.__reduce__(self)
        aug_state = state, self.offset, self.tzinfo

        return a, b, aug_state
Example #44
    def _insert_column(self, key):
        self.columns = Index(np.concatenate((self.columns, [key])))
Example #45
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """
        Apply an aggregation function or functions to groups, most likely
        yielding a Series, though in some cases a DataFrame, depending on
        the output of the aggregation function

        Parameters
        ----------
        func_or_funcs : function or list / dict of functions
            List/dict of functions will produce DataFrame with column names
            determined by the function names themselves (list) or the keys in
            the dict

        Notes
        -----
        agg is an alias for aggregate. Use it.

        Example
        -------
        >>> series
        bar    1.0
        baz    2.0
        qot    3.0
        qux    4.0

        >>> mapper = lambda x: x[0] # first letter
        >>> grouped = series.groupby(mapper)

        >>> grouped.aggregate(np.sum)
        b    3.0
        q    7.0

        >>> grouped.aggregate([np.sum, np.mean, np.std])
           mean  std  sum
        b  1.5   0.5  3
        q  3.5   0.5  7

        >>> grouped.agg({'result' : lambda x: x.mean() / x.std(),
        ...              'total' : np.sum})
           result  total
        b  2.121   3
        q  4.95    7

        See also
        --------
        apply, transform

        Returns
        -------
        Series or DataFrame
        """
        if isinstance(func_or_funcs, basestring):
            return getattr(self, func_or_funcs)(*args, **kwargs)

        if hasattr(func_or_funcs,'__iter__'):
            ret = self._aggregate_multiple_funcs(func_or_funcs)
        else:
            if len(self.groupings) > 1:
                return self._python_agg_general(func_or_funcs, *args, **kwargs)

            try:
                return self._python_agg_general(func_or_funcs, *args, **kwargs)
            except Exception:
                result = self._aggregate_named(func_or_funcs, *args, **kwargs)

            index = Index(sorted(result), name=self.groupings[0].name)
            ret = Series(result, index=index)

        if not self.as_index:  # pragma: no cover
            print 'Warning, ignoring as_index=True'

        return ret
Example #46
def to_numeric(arg, errors='raise', downcast=None):
    """
    Convert argument to a numeric type.

    Parameters
    ----------
    arg : list, tuple, 1-d array, or Series
    errors : {'ignore', 'raise', 'coerce'}, default 'raise'
        - If 'raise', then invalid parsing will raise an exception
        - If 'coerce', then invalid parsing will be set as NaN
        - If 'ignore', then invalid parsing will return the input
    downcast : {'integer', 'signed', 'unsigned', 'float'} , default None
        If not None, and if the data has been successfully cast to a
        numerical dtype (or if the data was numeric to begin with),
        downcast that resulting data to the smallest numerical dtype
        possible according to the following rules:

        - 'integer' or 'signed': smallest signed int dtype (min.: np.int8)
        - 'unsigned': smallest unsigned int dtype (min.: np.uint8)
        - 'float': smallest float dtype (min.: np.float32)

        As this behaviour is separate from the core conversion to
        numeric values, any errors raised during the downcasting
        will be surfaced regardless of the value of the 'errors' input.

        In addition, downcasting will only occur if the size
        of the resulting data's dtype is strictly larger than
        the dtype it is to be cast to, so if none of the dtypes
        checked satisfy that specification, no downcasting will be
        performed on the data.

        .. versionadded:: 0.19.0

    Returns
    -------
    ret : numeric if parsing succeeded.
        Return type depends on input.  Series if Series, otherwise ndarray

    Examples
    --------
    Take separate series and convert to numeric, coercing when told to

    >>> import pandas as pd
    >>> s = pd.Series(['1.0', '2', -3])
    >>> pd.to_numeric(s)
    0    1.0
    1    2.0
    2   -3.0
    dtype: float64
    >>> pd.to_numeric(s, downcast='float')
    0    1.0
    1    2.0
    2   -3.0
    dtype: float32
    >>> pd.to_numeric(s, downcast='signed')
    0    1
    1    2
    2   -3
    dtype: int8
    >>> s = pd.Series(['apple', '1.0', '2', -3])
    >>> pd.to_numeric(s, errors='ignore')
    0    apple
    1      1.0
    2        2
    3       -3
    dtype: object
    >>> pd.to_numeric(s, errors='coerce')
    0    NaN
    1    1.0
    2    2.0
    3   -3.0
    dtype: float64
    """
    if downcast not in (None, 'integer', 'signed', 'unsigned', 'float'):
        raise ValueError('invalid downcasting method provided')

    is_series = False
    is_index = False
    is_scalar = False

    if isinstance(arg, pd.Series):
        is_series = True
        values = arg.values
    elif isinstance(arg, pd.Index):
        is_index = True
        values = arg.asi8
        if values is None:
            values = arg.values
    elif isinstance(arg, (list, tuple)):
        values = np.array(arg, dtype='O')
    elif np.isscalar(arg):
        if is_number(arg):
            return arg
        is_scalar = True
        values = np.array([arg], dtype='O')
    elif getattr(arg, 'ndim', 1) > 1:
        raise TypeError('arg must be a list, tuple, 1-d array, or Series')
    else:
        values = arg

    try:
        if is_numeric_dtype(values):
            pass
        elif is_datetime_or_timedelta_dtype(values):
            values = values.astype(np.int64)
        else:
            values = _ensure_object(values)
            coerce_numeric = False if errors in ('ignore', 'raise') else True
            values = lib.maybe_convert_numeric(values,
                                               set(),
                                               coerce_numeric=coerce_numeric)

    except Exception:
        if errors == 'raise':
            raise

    # attempt downcast only if the data has been successfully converted
    # to a numerical dtype and if a downcast method has been specified
    if downcast is not None and is_numeric_dtype(values):
        typecodes = None

        if downcast in ('integer', 'signed'):
            typecodes = np.typecodes['Integer']
        elif downcast == 'unsigned' and np.min(values) > 0:
            typecodes = np.typecodes['UnsignedInteger']
        elif downcast == 'float':
            typecodes = np.typecodes['Float']

            # pandas support goes only to np.float32,
            # as float dtypes smaller than that are
            # extremely rare and not well supported
            float_32_char = np.dtype(np.float32).char
            float_32_ind = typecodes.index(float_32_char)
            typecodes = typecodes[float_32_ind:]

        if typecodes is not None:
            # from smallest to largest
            for dtype in typecodes:
                if np.dtype(dtype).itemsize < values.dtype.itemsize:
                    values = _possibly_downcast_to_dtype(values, dtype)

                    # successful conversion
                    if values.dtype == dtype:
                        break

    if is_series:
        return pd.Series(values, index=arg.index, name=arg.name)
    elif is_index:
        # because we want to coerce to numeric if possible,
        # do not use _shallow_copy_with_infer
        return Index(values, name=arg.name)
    elif is_scalar:
        return values[0]
    else:
        return values
Example #47
    def __init__(self,
                 obj,
                 path_or_buf=None,
                 sep=",",
                 na_rep='',
                 float_format=None,
                 cols=None,
                 header=True,
                 index=True,
                 index_label=None,
                 mode='w',
                 nanRep=None,
                 encoding=None,
                 compression=None,
                 quoting=None,
                 line_terminator='\n',
                 chunksize=None,
                 tupleize_cols=False,
                 quotechar='"',
                 date_format=None,
                 doublequote=True,
                 escapechar=None,
                 decimal='.'):

        self.obj = obj

        if path_or_buf is None:
            path_or_buf = StringIO()

        self.path_or_buf = _expand_user(_stringify_path(path_or_buf))
        self.sep = sep
        self.na_rep = na_rep
        self.float_format = float_format
        self.decimal = decimal

        self.header = header
        self.index = index
        self.index_label = index_label
        self.mode = mode
        self.encoding = encoding
        self.compression = compression

        if quoting is None:
            quoting = csvlib.QUOTE_MINIMAL
        self.quoting = quoting

        if quoting == csvlib.QUOTE_NONE:
            # prevents crash in _csv
            quotechar = None
        self.quotechar = quotechar

        self.doublequote = doublequote
        self.escapechar = escapechar

        self.line_terminator = line_terminator

        self.date_format = date_format

        self.tupleize_cols = tupleize_cols
        self.has_mi_columns = (isinstance(obj.columns, MultiIndex)
                               and not self.tupleize_cols)

        # validate mi options
        if self.has_mi_columns:
            if cols is not None:
                raise TypeError("cannot specify cols with a MultiIndex on the "
                                "columns")

        if cols is not None:
            if isinstance(cols, Index):
                cols = cols.to_native_types(na_rep=na_rep,
                                            float_format=float_format,
                                            date_format=date_format,
                                            quoting=self.quoting)
            else:
                cols = list(cols)
            self.obj = self.obj.loc[:, cols]

        # update columns to include possible multiplicity of dupes
        # and make sure cols is just a list of labels
        cols = self.obj.columns
        if isinstance(cols, Index):
            cols = cols.to_native_types(na_rep=na_rep,
                                        float_format=float_format,
                                        date_format=date_format,
                                        quoting=self.quoting)
        else:
            cols = list(cols)

        # save it
        self.cols = cols

        # preallocate data 2d list
        self.blocks = self.obj._data.blocks
        ncols = sum(b.shape[0] for b in self.blocks)
        self.data = [None] * ncols

        if chunksize is None:
            chunksize = (100000 // (len(self.cols) or 1)) or 1
        self.chunksize = int(chunksize)

        self.data_index = obj.index
        if (isinstance(self.data_index, (DatetimeIndex, PeriodIndex))
                and date_format is not None):
            self.data_index = Index([
                x.strftime(date_format) if notna(x) else ''
                for x in self.data_index
            ])

        self.nlevels = getattr(self.data_index, 'nlevels', 1)
        if not index:
            self.nlevels = 0
Example #48
    def _get_fresh_axis(self):
        return Index(np.arange(len(self._get_concat_axis())))
Example #49
    def _getitem_iterable(self, key, axis=0):
        labels = self.obj._get_axis(axis)

        def _reindex(keys, level=None):
            try:
                return self.obj.reindex_axis(keys, axis=axis, level=level)
            except AttributeError:
                # Series
                if axis != 0:
                    raise AssertionError('axis must be 0')
                return self.obj.reindex(keys, level=level)

        if com._is_bool_indexer(key):
            key = _check_bool_indexer(labels, key)
            inds, = key.nonzero()
            return self.obj.take(inds, axis=axis, convert=False)
        else:
            if isinstance(key, Index):
                # want Index objects to pass through untouched
                keyarr = key
            else:
                # asarray can be unsafe, NumPy strings are weird
                keyarr = _asarray_tuplesafe(key)

            if _is_integer_dtype(keyarr):
                if labels.inferred_type != 'integer':
                    keyarr = np.where(keyarr < 0,
                                      len(labels) + keyarr, keyarr)

                if labels.inferred_type == 'mixed-integer':
                    indexer = labels.get_indexer(keyarr)
                    if (indexer >= 0).all():
                        return self.obj.take(indexer, axis=axis, convert=True)
                    else:
                        return self.obj.take(keyarr, axis=axis)
                elif not labels.inferred_type == 'integer':

                    return self.obj.take(keyarr, axis=axis)

            # this is not the most robust, but...
            if (isinstance(labels, MultiIndex) and
                    not isinstance(keyarr[0], tuple)):
                level = 0
            else:
                level = None

            if labels.is_unique and Index(keyarr).is_unique:
                return _reindex(keyarr, level=level)
            else:
                indexer, missing = labels.get_indexer_non_unique(keyarr)
                check = indexer != -1
                result = self.obj.take(indexer[check], axis=axis, convert=False)

                # need to merge the result labels and the missing labels
                if len(missing):
                    l = np.arange(len(indexer))

                    missing = com._ensure_platform_int(missing)
                    missing_labels = keyarr.take(missing)
                    missing_indexer = com._ensure_int64(l[~check])
                    cur_labels = result._get_axis(axis).values
                    cur_indexer = com._ensure_int64(l[check])

                    new_labels = np.empty(len(indexer), dtype=object)
                    new_labels[cur_indexer] = cur_labels
                    new_labels[missing_indexer] = missing_labels
                    new_indexer = (Index(cur_indexer) + Index(missing_indexer)).values
                    new_indexer[missing_indexer] = -1

                    # reindex with the specified axis
                    ndim = self.obj.ndim
                    if axis + 1 > ndim:
                        raise AssertionError("invalid indexing with a "
                                             "non-unique index")

                    args = [None] * (2 * ndim)
                    args[2 * axis] = new_labels
                    args[2 * axis + 1] = new_indexer

                    result = result._reindex_with_indexers(*args, copy=False,
                                                           fill_value=np.nan)

                return result
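
The non-unique branch above can be exercised directly through the public
Index API; a small sketch (the labels and keys here are illustrative only):

import numpy as np
import pandas as pd

labels = pd.Index(['a', 'b', 'b', 'c'])      # non-unique axis labels
keyarr = np.array(['b', 'z'], dtype=object)  # 'z' is missing

indexer, missing = labels.get_indexer_non_unique(keyarr)
print(indexer)  # [ 1  2 -1] -- every match for 'b', -1 for the unresolved 'z'
print(missing)  # [1] -- positions in keyarr that were not found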
Example #50
0
    def is_monotonic(self):
        # return whether my group orderings are monotonic
        return Index(self.group_info[0]).is_monotonic
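
is_monotonic here is the older spelling of the public Index property; a
quick check with the current name:

import pandas as pd

print(pd.Index([1, 1, 2, 5]).is_monotonic_increasing)  # True: non-decreasing
print(pd.Index([2, 1]).is_monotonic_increasing)        # False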
Example #51
0
def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids,
                                        obs_ids,
                                        shape,
                                        ccodes,
                                        xnull=False)

    if rlocs == []:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val, fill_value=fill_value)
                # after level `val` is unstacked, the remaining level
                # numbers above it shift down by one
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(levels=new_levels,
                             codes=new_codes,
                             names=new_names,
                             verify_integrity=False)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
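
A quick usage sketch: unstacking several index levels at once is the public
entry point into the helper above.

import pandas as pd

idx = pd.MultiIndex.from_product([['x', 'y'], ['a', 'b'], [1, 2]],
                                 names=['one', 'two', 'three'])
s = pd.Series(range(8), index=idx)

# levels 'two' and 'three' move from the row index into the columns
print(s.unstack(['two', 'three']))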
Example #52
0
class SparseDataFrame(DataFrame):
    """
    DataFrame containing sparse floating point data in the form of SparseSeries
    objects

    Parameters
    ----------
    data : same types as can be passed to DataFrame
    index : array-like, optional
    columns : array-like, optional
    default_kind : {'block', 'integer'}, default 'block'
        Default sparse kind for converting Series to SparseSeries. Will not
        override SparseSeries passed into constructor
    default_fill_value : float
        Default fill_value for converting Series to SparseSeries. Will not
        override SparseSeries passed in
    """
    _verbose_info = False
    _columns = None
    _series = None
    _is_mixed_type = False
    ndim = 2

    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 default_kind='block',
                 default_fill_value=None):
        if default_fill_value is None:
            default_fill_value = np.nan

        self.default_kind = default_kind
        self.default_fill_value = default_fill_value

        if isinstance(data, dict):
            sdict, columns, index = self._init_dict(data, index, columns)
        elif isinstance(data, (np.ndarray, list)):
            sdict, columns, index = self._init_matrix(data, index, columns)
        elif isinstance(data, DataFrame):
            sdict, columns, index = self._init_dict(data, data.index,
                                                    data.columns)
        elif data is None:
            sdict = {}

            if index is None:
                index = Index([])

            if columns is None:
                columns = Index([])
            else:
                for c in columns:
                    sdict[c] = Series(np.nan, index=index)

        self._series = sdict
        self.columns = columns
        self.index = index

    def _from_axes(self, data, axes):
        columns, index = axes
        return self._constructor(data, index=index, columns=columns)

    @cache_readonly
    def _data(self):
        return _SparseMockBlockManager(self)

    def _consolidate_inplace(self):
        # do nothing when DataFrame calls this method
        pass

    def convert_objects(self):
        # XXX
        return self

    @property
    def _constructor(self):
        def wrapper(data, index=None, columns=None):
            return SparseDataFrame(data,
                                   index=index,
                                   columns=columns,
                                   default_fill_value=self.default_fill_value,
                                   default_kind=self.default_kind)

        return wrapper

    def _init_dict(self, data, index, columns, dtype=None):
        # pre-filter out columns if we passed it
        if columns is not None:
            columns = _ensure_index(columns)
            data = dict((k, v) for k, v in data.iteritems() if k in columns)
        else:
            columns = Index(_try_sort(data.keys()))

        if index is None:
            index = extract_index(data)

        sp_maker = lambda x: SparseSeries(x,
                                          index=index,
                                          kind=self.default_kind,
                                          fill_value=self.default_fill_value,
                                          copy=True)

        sdict = {}
        for k, v in data.iteritems():
            if isinstance(v, Series):
                # Force alignment, no copy necessary
                if not v.index.equals(index):
                    v = v.reindex(index)

                if not isinstance(v, SparseSeries):
                    v = sp_maker(v)
            else:
                if isinstance(v, dict):
                    v = [v.get(i, nan) for i in index]

                v = sp_maker(v)
            sdict[k] = v

        # TODO: figure out how to handle this case, all nan's?
        # add in any other columns we want to have (completeness)
        nan_vec = np.empty(len(index))
        nan_vec.fill(nan)
        for c in columns:
            if c not in sdict:
                sdict[c] = sp_maker(nan_vec)

        return sdict, columns, index

    def _init_matrix(self, data, index, columns, dtype=None):
        data = _prep_ndarray(data, copy=False)
        N, K = data.shape
        if index is None:
            index = _default_index(N)
        if columns is None:
            columns = _default_index(K)

        if len(columns) != K:
            raise Exception('Column length mismatch: %d vs. %d' %
                            (len(columns), K))
        if len(index) != N:
            raise Exception('Index length mismatch: %d vs. %d' %
                            (len(index), N))

        data = dict([(idx, data[:, i]) for i, idx in enumerate(columns)])
        return self._init_dict(data, index, columns, dtype)

    def __array_wrap__(self, result):
        return SparseDataFrame(result,
                               index=self.index,
                               columns=self.columns,
                               default_kind=self.default_kind,
                               default_fill_value=self.default_fill_value)

    def __getstate__(self):
        series = dict(
            (k, (v.sp_index, v.sp_values)) for k, v in self.iteritems())
        columns = self.columns
        index = self.index

        return (series, columns, index, self.default_fill_value,
                self.default_kind)

    def __setstate__(self, state):
        series, cols, idx, fv, kind = state

        if not isinstance(cols, Index):  # pragma: no cover
            columns = _unpickle_array(cols)
        else:
            columns = cols

        if not isinstance(idx, Index):  # pragma: no cover
            index = _unpickle_array(idx)
        else:
            index = idx

        series_dict = {}
        for col, (sp_index, sp_values) in series.iteritems():
            series_dict[col] = SparseSeries(sp_values,
                                            sparse_index=sp_index,
                                            fill_value=fv)

        self._series = series_dict
        self.index = index
        self.columns = columns
        self.default_fill_value = fv
        self.default_kind = kind

    def to_dense(self):
        """
        Convert to dense DataFrame

        Returns
        -------
        df : DataFrame
        """
        data = dict((k, v.to_dense()) for k, v in self.iteritems())
        return DataFrame(data, index=self.index)

    def astype(self, dtype):
        raise NotImplementedError

    def copy(self, deep=True):
        """
        Make a copy of this SparseDataFrame
        """
        series = dict((k, v.copy()) for k, v in self.iteritems())
        return SparseDataFrame(series,
                               index=self.index,
                               columns=self.columns,
                               default_fill_value=self.default_fill_value,
                               default_kind=self.default_kind)

    @property
    def density(self):
        """
        Ratio of non-sparse points to total (dense) data points
        represented in the frame
        """
        tot_nonsparse = sum(
            [ser.sp_index.npoints for _, ser in self.iteritems()])
        tot = len(self.index) * len(self.columns)
        return tot_nonsparse / float(tot)

    #----------------------------------------------------------------------
    # Support different internal rep'n of SparseDataFrame

    def _set_item(self, key, value):
        sp_maker = lambda x: SparseSeries(x,
                                          index=self.index,
                                          fill_value=self.default_fill_value,
                                          kind=self.default_kind)
        if hasattr(value, '__iter__'):
            if isinstance(value, Series):
                clean_series = value.reindex(self.index)
                if not isinstance(value, SparseSeries):
                    clean_series = sp_maker(clean_series)
            else:
                clean_series = sp_maker(value)

            self._series[key] = clean_series
        # Scalar
        else:
            self._series[key] = sp_maker(value)

        if key not in self.columns:
            self._insert_column(key)

    def _insert_column(self, key):
        self.columns = Index(np.concatenate((self.columns, [key])))

    def __delitem__(self, key):
        """
        Delete column from DataFrame
        """
        loc = self.columns.get_loc(key)
        del self._series[key]
        self._delete_column_index(loc)

    def _delete_column_index(self, loc):
        if loc == len(self.columns) - 1:
            new_columns = self.columns[:loc]
        else:
            new_columns = Index(
                np.concatenate((self.columns[:loc], self.columns[loc + 1:])))
        self.columns = new_columns

    _index = None

    def _set_index(self, index):
        self._index = _ensure_index(index)
        for v in self._series.values():
            v.index = self._index

    def _get_index(self):
        return self._index

    def _get_columns(self):
        return self._columns

    def _set_columns(self, cols):
        if len(cols) != len(self._series):
            raise Exception('Columns length %d did not match data %d!' %
                            (len(cols), len(self._series)))
        self._columns = _ensure_index(cols)

    index = property(fget=_get_index, fset=_set_index)
    columns = property(fget=_get_columns, fset=_set_columns)

    def __getitem__(self, item):
        """
        Retrieve column or slice from DataFrame
        """
        try:
            # unsure about how kludgy this is
            s = self._series[item]
            s.name = item
            return s
        except (TypeError, KeyError):
            if isinstance(item, slice):
                date_rng = self.index[item]
                return self.reindex(date_rng)

            elif isinstance(item, np.ndarray):
                if len(item) != len(self.index):
                    raise Exception('Item wrong length %d instead of %d!' %
                                    (len(item), len(self.index)))
                newIndex = self.index[item]
                return self.reindex(newIndex)
            else:  # pragma: no cover
                raise

    @Appender(DataFrame.get_value.__doc__, indents=0)
    def get_value(self, index, col):
        s = self._series[col]
        return s.get_value(index)

    def set_value(self, index, col, value):
        """
        Put single value at passed column and index

        Parameters
        ----------
        index : row label
        col : column label
        value : scalar value

        Notes
        -----
        This method *always* returns a new object. It is currently not
        particularly efficient (and potentially very expensive) but is provided
        for API compatibility with DataFrame

        Returns
        -------
        frame : DataFrame
        """
        dense = self.to_dense().set_value(index, col, value)
        return dense.to_sparse(kind=self.default_kind,
                               fill_value=self.default_fill_value)

    def _slice(self, slobj, axis=0):
        if axis == 0:
            new_index = self.index[slobj]
            new_columns = self.columns
        else:
            new_index = self.index
            new_columns = self.columns[slobj]

        return self.reindex(index=new_index, columns=new_columns)

    def as_matrix(self, columns=None):
        """
        Convert the frame to its Numpy-array matrix representation

        Columns are presented in sorted order unless a specific list
        of columns is provided.
        """
        if columns is None:
            columns = self.columns

        if len(columns) == 0:
            return np.zeros((len(self.index), 0), dtype=float)

        return np.array([self[col].values for col in columns]).T

    values = property(as_matrix)

    def xs(self, key, axis=0, copy=False):
        """
        Returns a row (cross-section) from the SparseDataFrame as a Series
        object.

        Parameters
        ----------
        key : some index contained in the index

        Returns
        -------
        xs : Series
        """
        if axis == 1:
            data = self[key]
            return data

        i = self.index.get_loc(key)
        series = self._series
        values = [series[k][i] for k in self.columns]
        return Series(values, index=self.columns)

    #----------------------------------------------------------------------
    # Arithmetic-related methods

    def _combine_frame(self, other, func, fill_value=None, level=None):
        this, other = self.align(other, join='outer', level=level, copy=False)
        new_index, new_columns = this.index, this.columns

        if level is not None:
            raise NotImplementedError

        if self.empty and other.empty:
            return SparseDataFrame(index=new_index)

        new_data = {}
        if fill_value is not None:
            # TODO: be a bit more intelligent here
            for col in new_columns:
                if col in this and col in other:
                    dleft = this[col].to_dense()
                    dright = other[col].to_dense()
                    result = dleft._binop(dright, func, fill_value=fill_value)
                    result = result.to_sparse(fill_value=this[col].fill_value)
                    new_data[col] = result
        else:
            for col in new_columns:
                if col in this and col in other:
                    new_data[col] = func(this[col], other[col])

        return self._constructor(data=new_data,
                                 index=new_index,
                                 columns=new_columns)

    def _combine_match_index(self, other, func, fill_value=None):
        new_data = {}

        if fill_value is not None:
            raise NotImplementedError

        new_index = self.index.union(other.index)
        this = self
        if self.index is not new_index:
            this = self.reindex(new_index)

        if other.index is not new_index:
            other = other.reindex(new_index)

        for col, series in this.iteritems():
            new_data[col] = func(series.values, other.values)

        return self._constructor(new_data,
                                 index=new_index,
                                 columns=self.columns)

    def _combine_match_columns(self, other, func, fill_value):
        # patched version of DataFrame._combine_match_columns to account for
        # NumPy circumventing __rsub__ with float64 types, e.g.: 3.0 - series,
        # where 3.0 is numpy.float64 and series is a SparseSeries. Still
        # possible for this to happen, which is bothersome

        if fill_value is not None:
            raise NotImplementedError

        new_data = {}

        union = intersection = self.columns

        if not union.equals(other.index):
            union = other.index.union(self.columns)
            intersection = other.index.intersection(self.columns)

        for col in intersection:
            new_data[col] = func(self[col], float(other[col]))

        return self._constructor(new_data, index=self.index, columns=union)

    def _combine_const(self, other, func):
        new_data = {}
        for col, series in self.iteritems():
            new_data[col] = func(series, other)

        return self._constructor(data=new_data,
                                 index=self.index,
                                 columns=self.columns)

    def _reindex_index(self,
                       index,
                       method,
                       copy,
                       level,
                       fill_value=np.nan,
                       limit=None):
        if level is not None:
            raise Exception('Reindex by level not supported for sparse')

        if self.index.equals(index):
            if copy:
                return self.copy()
            else:
                return self

        if len(self.index) == 0:
            return SparseDataFrame(index=index, columns=self.columns)

        indexer = self.index.get_indexer(index, method, limit=limit)
        indexer = com._ensure_platform_int(indexer)
        mask = indexer == -1
        need_mask = mask.any()

        new_series = {}
        for col, series in self.iteritems():
            values = series.values
            new = values.take(indexer)

            if need_mask:
                np.putmask(new, mask, fill_value)

            new_series[col] = new

        return SparseDataFrame(new_series,
                               index=index,
                               columns=self.columns,
                               default_fill_value=self.default_fill_value)

    def _reindex_columns(self, columns, copy, level, fill_value, limit=None):
        if level is not None:
            raise Exception('Reindex by level not supported for sparse')

        if com.notnull(fill_value):
            raise NotImplementedError

        if limit:
            raise NotImplementedError

        # TODO: fill value handling
        sdict = dict((k, v) for k, v in self.iteritems() if k in columns)
        return SparseDataFrame(sdict,
                               index=self.index,
                               columns=columns,
                               default_fill_value=self.default_fill_value)

    def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer,
                               copy, fill_value):
        if columns is None:
            columns = self.columns

        new_arrays = {}
        for col in columns:
            if col not in self:
                continue
            if row_indexer is not None:
                new_arrays[col] = com.take_1d(self[col].values,
                                              row_indexer,
                                              fill_value=fill_value)
            else:
                new_arrays[col] = self[col]

        return self._constructor(new_arrays, index=index, columns=columns)

    def _rename_index_inplace(self, mapper):
        self.index = [mapper(x) for x in self.index]

    def _rename_columns_inplace(self, mapper):
        new_series = {}
        new_columns = []

        for col in self.columns:
            new_col = mapper(col)
            if new_col in new_series:  # pragma: no cover
                raise Exception('Non-unique mapping!')
            new_series[new_col] = self[col]
            new_columns.append(new_col)

        self.columns = new_columns
        self._series = new_series

    def take(self, indices, axis=0):
        """
        Analogous to ndarray.take, return SparseDataFrame corresponding to
        requested indices along an axis

        Parameters
        ----------
        indices : list / array of ints
        axis : {0, 1}

        Returns
        -------
        taken : SparseDataFrame
        """
        indices = com._ensure_platform_int(indices)
        new_values = self.values.take(indices, axis=axis)
        if axis == 0:
            new_columns = self.columns
            new_index = self.index.take(indices)
        else:
            new_columns = self.columns.take(indices)
            new_index = self.index
        return self._constructor(new_values,
                                 index=new_index,
                                 columns=new_columns)

    def add_prefix(self, prefix):
        f = (('%s' % prefix) + '%s').__mod__
        return self.rename(columns=f)

    def add_suffix(self, suffix):
        f = ('%s' + ('%s' % suffix)).__mod__
        return self.rename(columns=f)

    def _join_compat(self,
                     other,
                     on=None,
                     how='left',
                     lsuffix='',
                     rsuffix='',
                     sort=False):
        if on is not None:
            raise NotImplementedError
        else:
            return self._join_index(other, how, lsuffix, rsuffix)

    def _join_index(self, other, how, lsuffix, rsuffix):
        if isinstance(other, Series):
            assert (other.name is not None)
            other = SparseDataFrame({other.name: other},
                                    default_fill_value=self.default_fill_value)

        join_index = self.index.join(other.index, how=how)

        this = self.reindex(join_index)
        other = other.reindex(join_index)

        this, other = this._maybe_rename_join(other, lsuffix, rsuffix)

        result_series = this._series
        other_series = other._series
        result_series.update(other_series)

        return self._constructor(result_series, index=join_index)

    def _maybe_rename_join(self, other, lsuffix, rsuffix):
        intersection = self.columns.intersection(other.columns)

        if len(intersection) > 0:
            if not lsuffix and not rsuffix:
                raise Exception('columns overlap: %s' % intersection)

            def lrenamer(x):
                if x in intersection:
                    return '%s%s' % (x, lsuffix)
                return x

            def rrenamer(x):
                if x in intersection:
                    return '%s%s' % (x, rsuffix)
                return x

            this = self.rename(columns=lrenamer)
            other = other.rename(columns=rrenamer)
        else:
            this = self

        return this, other

    def transpose(self):
        """
        Returns a DataFrame with the rows/columns switched.
        """
        return SparseDataFrame(self.values.T,
                               index=self.columns,
                               columns=self.index,
                               default_fill_value=self.default_fill_value,
                               default_kind=self.default_kind)

    T = property(transpose)

    @Appender(DataFrame.count.__doc__)
    def count(self, axis=0, **kwds):
        return self.apply(lambda x: x.count(), axis=axis)

    def cumsum(self, axis=0):
        """
        Return SparseDataFrame of cumulative sums over requested axis.

        Parameters
        ----------
        axis : {0, 1}
            0 for row-wise, 1 for column-wise

        Returns
        -------
        y : SparseDataFrame
        """
        return self.apply(lambda x: x.cumsum(), axis=axis)

    def shift(self, periods, freq=None, **kwds):
        """
        Analogous to DataFrame.shift
        """
        from pandas.core.series import _resolve_offset

        offset = _resolve_offset(freq, kwds)

        new_series = {}
        if offset is None:
            new_index = self.index
            for col, s in self.iteritems():
                new_series[col] = s.shift(periods)
        else:
            new_index = self.index.shift(periods, offset)
            for col, s in self.iteritems():
                new_series[col] = SparseSeries(s.sp_values,
                                               index=new_index,
                                               sparse_index=s.sp_index,
                                               fill_value=s.fill_value)

        return SparseDataFrame(new_series,
                               index=new_index,
                               columns=self.columns,
                               default_fill_value=self.default_fill_value,
                               default_kind=self.default_kind)

    def apply(self, func, axis=0, broadcast=False):
        """
        Analogous to DataFrame.apply, for SparseDataFrame

        Parameters
        ----------
        func : function
            Function to apply to each column
        axis : {0, 1}
        broadcast : bool, default False
            For aggregation functions, return object of same size with values
            propagated

        Returns
        -------
        applied : Series or SparseDataFrame
        """
        if not len(self.columns):
            return self

        if isinstance(func, np.ufunc):
            new_series = {}
            for k, v in self.iteritems():
                applied = func(v)
                applied.fill_value = func(applied.fill_value)
                new_series[k] = applied
            return SparseDataFrame(new_series,
                                   index=self.index,
                                   columns=self.columns,
                                   default_fill_value=self.default_fill_value,
                                   default_kind=self.default_kind)
        else:
            if not broadcast:
                return self._apply_standard(func, axis)
            else:
                return self._apply_broadcast(func, axis)

    def applymap(self, func):
        """
        Apply a function to a DataFrame that is intended to operate
        elementwise, i.e. like doing map(func, series) for each series in the
        DataFrame

        Parameters
        ----------
        func : function
            Python function, returns a single value from a single value

        Returns
        -------
        applied : DataFrame
        """
        return self.apply(lambda x: map(func, x))

    @Appender(DataFrame.fillna.__doc__)
    def fillna(self, value=None, method='pad', inplace=False, limit=None):
        new_series = {}
        for k, v in self.iterkv():
            new_series[k] = v.fillna(value=value, method=method, limit=limit)

        if inplace:
            self._series = new_series
            return self
        else:
            return self._constructor(new_series,
                                     index=self.index,
                                     columns=self.columns)
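
Hypothetical usage, assuming a pandas version old enough to still ship
SparseDataFrame (it was removed in pandas 1.0):

import numpy as np
import pandas as pd

dense = pd.DataFrame({'a': [0.0, 1.0, np.nan],
                      'b': [2.0, np.nan, np.nan]})
sdf = dense.to_sparse()  # one SparseSeries per column, default_kind='block'
print(sdf.density)       # stored points over dense size: 3 / 6 = 0.5
print(sdf.to_dense())    # round-trip back to a regular DataFrame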
Example #53
0
    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 default_kind=None,
                 default_fill_value=None,
                 dtype=None,
                 copy=False):

        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            if columns is None and hasattr(data, 'name'):
                columns = [data.name]
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            data = {columns[0]: data}

        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = 'block'

        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        if is_scipy_sparse(data):
            mgr = self._init_spmatrix(data,
                                      index,
                                      columns,
                                      dtype=dtype,
                                      fill_value=default_fill_value)
        elif isinstance(data, dict):
            mgr = self._init_dict(data, index, columns, dtype=dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns, dtype=dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
                                 dtype=dtype,
                                 copy=copy)
        elif isinstance(data, DataFrame):
            mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
        elif isinstance(data, Series):
            mgr = self._init_dict(data.to_frame(),
                                  data.index,
                                  columns=None,
                                  dtype=dtype)
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data,
                                 axes=dict(index=index, columns=columns),
                                 dtype=dtype,
                                 copy=copy)
        elif data is None:
            data = DataFrame()

            if index is None:
                index = Index([])
            else:
                index = ensure_index(index)

            if columns is None:
                columns = Index([])
            else:
                for c in columns:
                    data[c] = SparseArray(np.nan,
                                          index=index,
                                          kind=self._default_kind,
                                          fill_value=self._default_fill_value)
            mgr = to_manager(data, columns, index)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        else:
            msg = ('SparseDataFrame called with unknown type "{data_type}" '
                   'for data argument')
            raise TypeError(msg.format(data_type=type(data).__name__))

        generic.NDFrame.__init__(self, mgr)
Example #54
0
 def get_duplicates(self):
     values = Index.get_duplicates(self)
     return self._simple_new(values)
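
get_duplicates was later removed from the Index API; the equivalent spelling
in current pandas is:

import pandas as pd

idx = pd.Index([1, 2, 2, 3, 3, 3])
print(idx[idx.duplicated()].unique())  # each duplicated value reported once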
Example #55
0
class SparsePanel(Panel):

    """
    Sparse version of Panel

    Parameters
    ----------
    frames : dict of DataFrame objects
    items : array-like
    major_axis : array-like
    minor_axis : array-like
    default_kind : {'block', 'integer'}, default 'block'
        Default sparse kind for converting Series to SparseSeries. Will not
        override SparseSeries passed into constructor
    default_fill_value : float
        Default fill_value for converting Series to SparseSeries. Will not
        override SparseSeries passed in
    """
    ndim = 3
    _typ = 'panel'
    _subtyp = 'sparse_panel'

    def __init__(self, frames, items=None, major_axis=None, minor_axis=None,
                 default_fill_value=np.nan, default_kind='block',
                 copy=False):
        if isinstance(frames, np.ndarray):
            new_frames = {}
            for item, vals in zip(items, frames):
                new_frames[item] = \
                    SparseDataFrame(vals, index=major_axis,
                                    columns=minor_axis,
                                    default_fill_value=default_fill_value,
                                    default_kind=default_kind)
            frames = new_frames

        if not isinstance(frames, dict):
            raise AssertionError('frames must be a dict of DataFrame objects')

        self.default_fill_value = fill_value = default_fill_value
        self.default_kind = kind = default_kind

        # pre-filter, if necessary
        if items is None:
            items = Index(sorted(frames.keys()))
        items = _ensure_index(items)

        (clean_frames,
         major_axis,
         minor_axis) = _convert_frames(frames, major_axis,
                                       minor_axis, kind=kind,
                                       fill_value=fill_value)

        self._frames = clean_frames

        # do we want to fill missing ones?
        for item in items:
            if item not in clean_frames:
                raise Exception('column %s not found in data' % item)

        self._items = items
        self.major_axis = major_axis
        self.minor_axis = minor_axis

    def _consolidate_inplace(self):  # pragma: no cover
        # do nothing when DataFrame calls this method
        pass

    def __array_wrap__(self, result):
        return SparsePanel(result, items=self.items,
                           major_axis=self.major_axis,
                           minor_axis=self.minor_axis,
                           default_kind=self.default_kind,
                           default_fill_value=self.default_fill_value)

    @classmethod
    def from_dict(cls, data):
        """
        Analogous to Panel.from_dict
        """
        return SparsePanel(data)

    def to_dense(self):
        """
        Convert SparsePanel to (dense) Panel

        Returns
        -------
        dense : Panel
        """
        return Panel(self.values, self.items, self.major_axis,
                     self.minor_axis)

    def as_matrix(self):
        return self.values

    @property
    def values(self):
        # return dense values
        return np.array([self._frames[item].values
                         for item in self.items])

    # need a special property for items to make the field assignable

    _items = None

    def _get_items(self):
        return self._items

    def _set_items(self, new_items):
        new_items = _ensure_index(new_items)
        if isinstance(new_items, MultiIndex):
            raise NotImplementedError

        # need to create new frames dict

        old_frame_dict = self._frames
        old_items = self._items
        self._frames = dict((new_k, old_frame_dict[old_k])
                            for new_k, old_k in zip(new_items, old_items))
        self._items = new_items
    items = property(fget=_get_items, fset=_set_items)

    # DataFrame's index
    major_axis = SparsePanelAxis('_major_axis', 'index')

    # DataFrame's columns / "items"
    minor_axis = SparsePanelAxis('_minor_axis', 'columns')

    def _get_item_cache(self, key):
        return self._frames[key]

    def __setitem__(self, key, value):
        if isinstance(value, DataFrame):
            value = value.reindex(index=self.major_axis,
                                  columns=self.minor_axis)
            if not isinstance(value, SparseDataFrame):
                value = value.to_sparse(fill_value=self.default_fill_value,
                                        kind=self.default_kind)
        else:
            raise ValueError('only DataFrame objects can be set currently')

        self._frames[key] = value

        if key not in self.items:
            self._items = Index(list(self.items) + [key])

    def set_value(self, item, major, minor, value):
        """
        Quickly set single value at (item, major, minor) location

        Parameters
        ----------
        item : item label (panel item)
        major : major axis label (panel item row)
        minor : minor axis label (panel item column)
        value : scalar

        Notes
        -----
        This method *always* returns a new object. It is not particularly
        efficient but is provided for API compatibility with Panel

        Returns
        -------
        panel : SparsePanel
        """
        dense = self.to_dense().set_value(item, major, minor, value)
        return dense.to_sparse(kind=self.default_kind,
                               fill_value=self.default_fill_value)

    def __delitem__(self, key):
        loc = self.items.get_loc(key)
        indices = lrange(loc) + lrange(loc + 1, len(self.items))
        del self._frames[key]
        self._items = self._items.take(indices)

    def __getstate__(self):
        # pickling
        return (self._frames, com._pickle_array(self.items),
                com._pickle_array(self.major_axis),
                com._pickle_array(self.minor_axis),
                self.default_fill_value, self.default_kind)

    def __setstate__(self, state):
        frames, items, major, minor, fv, kind = state

        self.default_fill_value = fv
        self.default_kind = kind
        self._items = _ensure_index(com._unpickle_array(items))
        self._major_axis = _ensure_index(com._unpickle_array(major))
        self._minor_axis = _ensure_index(com._unpickle_array(minor))
        self._frames = frames

    def copy(self):
        """
        Make a (shallow) copy of the sparse panel

        Returns
        -------
        copy : SparsePanel
        """
        return SparsePanel(self._frames.copy(), items=self.items,
                           major_axis=self.major_axis,
                           minor_axis=self.minor_axis,
                           default_fill_value=self.default_fill_value,
                           default_kind=self.default_kind)

    def to_frame(self, filter_observations=True):
        """
        Convert SparsePanel to (dense) DataFrame

        Returns
        -------
        frame : DataFrame
        """
        if not filter_observations:
            raise TypeError('filter_observations=False not supported for '
                            'SparsePanel.to_frame')

        I, N, K = self.shape
        counts = np.zeros(N * K, dtype=int)

        d_values = {}
        d_indexer = {}

        for item in self.items:
            frame = self[item]

            values, major, minor = _stack_sparse_info(frame)

            # values are stacked column-major
            indexer = minor * N + major
            counts.put(indexer, counts.take(indexer) + 1)  # cuteness

            d_values[item] = values
            d_indexer[item] = indexer

        # have full set of observations for each item
        mask = counts == I

        # for each item, take mask values at index locations for those sparse
        # values, and use that to select values
        values = np.column_stack([d_values[item][mask.take(d_indexer[item])]
                                  for item in self.items])

        inds, = mask.nonzero()

        # still column major
        major_labels = inds % N
        minor_labels = inds // N

        index = MultiIndex(levels=[self.major_axis, self.minor_axis],
                           labels=[major_labels, minor_labels])

        df = DataFrame(values, index=index, columns=self.items)
        return df.sortlevel(level=0)

    to_long = deprecate('to_long', to_frame)
    toLong = deprecate('toLong', to_frame)

    def reindex(self, major=None, items=None, minor=None, major_axis=None,
                minor_axis=None, copy=False):
        """
        Conform / reshape panel axis labels to new input labels

        Parameters
        ----------
        major : array-like, default None
        items : array-like, default None
        minor : array-like, default None
        copy : boolean, default False
            Copy underlying SparseDataFrame objects

        Returns
        -------
        reindexed : SparsePanel
        """
        major = com._mut_exclusive(major, major_axis)
        minor = com._mut_exclusive(minor, minor_axis)

        if com._all_none(items, major, minor):
            raise ValueError('Must specify at least one axis')

        major = self.major_axis if major is None else major
        minor = self.minor_axis if minor is None else minor

        if items is not None:
            new_frames = {}
            for item in items:
                if item in self._frames:
                    new_frames[item] = self._frames[item]
                else:
                    raise NotImplementedError('Reindexing with new items not yet '
                                              'supported')
        else:
            new_frames = self._frames

        if copy:
            new_frames = dict((k, v.copy()) for k, v in compat.iteritems(new_frames))

        return SparsePanel(new_frames, items=items,
                           major_axis=major,
                           minor_axis=minor,
                           default_fill_value=self.default_fill_value,
                           default_kind=self.default_kind)

    def _combine(self, other, func, axis=0):
        if isinstance(other, DataFrame):
            return self._combineFrame(other, func, axis=axis)
        elif isinstance(other, Panel):
            return self._combinePanel(other, func)
        elif np.isscalar(other):
            new_frames = dict((k, func(v, other))
                              for k, v in compat.iteritems(self))
            return self._new_like(new_frames)

    def _combineFrame(self, other, func, axis=0):
        index, columns = self._get_plane_axes(axis)
        axis = self._get_axis_number(axis)

        other = other.reindex(index=index, columns=columns)

        if axis == 0:
            new_values = func(self.values, other.values)
        elif axis == 1:
            new_values = func(self.values.swapaxes(0, 1), other.values.T)
            new_values = new_values.swapaxes(0, 1)
        elif axis == 2:
            new_values = func(self.values.swapaxes(0, 2), other.values)
            new_values = new_values.swapaxes(0, 2)

        # TODO: make faster!
        new_frames = {}
        for item, item_slice in zip(self.items, new_values):
            old_frame = self[item]
            ofv = old_frame.default_fill_value
            ok = old_frame.default_kind
            new_frames[item] = SparseDataFrame(item_slice,
                                               index=self.major_axis,
                                               columns=self.minor_axis,
                                               default_fill_value=ofv,
                                               default_kind=ok)

        return self._new_like(new_frames)

    def _new_like(self, new_frames):
        return SparsePanel(new_frames, self.items, self.major_axis,
                           self.minor_axis,
                           default_fill_value=self.default_fill_value,
                           default_kind=self.default_kind)

    def _combinePanel(self, other, func):
        items = self.items + other.items
        major = self.major_axis + other.major_axis
        minor = self.minor_axis + other.minor_axis

        # could check that everything's the same size, but forget it

        this = self.reindex(items=items, major=major, minor=minor)
        other = other.reindex(items=items, major=major, minor=minor)

        new_frames = {}
        for item in items:
            new_frames[item] = func(this[item], other[item])

        if not isinstance(other, SparsePanel):
            new_default_fill = self.default_fill_value
        else:
            # maybe unnecessary
            new_default_fill = func(self.default_fill_value,
                                    other.default_fill_value)

        return SparsePanel(new_frames, items, major, minor,
                           default_fill_value=new_default_fill,
                           default_kind=self.default_kind)

    def major_xs(self, key):
        """
        Return slice of panel along major axis

        Parameters
        ----------
        key : object
            Major axis label

        Returns
        -------
        y : DataFrame
            index -> minor axis, columns -> items
        """
        slices = dict((k, v.xs(key)) for k, v in compat.iteritems(self))
        return DataFrame(slices, index=self.minor_axis, columns=self.items)

    def minor_xs(self, key):
        """
        Return slice of panel along minor axis

        Parameters
        ----------
        key : object
            Minor axis label

        Returns
        -------
        y : SparseDataFrame
            index -> major axis, columns -> items
        """
        slices = dict((k, v[key]) for k, v in compat.iteritems(self))
        return SparseDataFrame(slices, index=self.major_axis,
                               columns=self.items,
                               default_fill_value=self.default_fill_value,
                               default_kind=self.default_kind)
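
Hypothetical usage, assuming a pandas old enough to still ship SparsePanel
(deprecated in 0.19 and removed shortly after):

import numpy as np
import pandas as pd

frames = {'one': pd.DataFrame(np.eye(3)).to_sparse(),
          'two': pd.DataFrame(np.zeros((3, 3))).to_sparse()}
panel = pd.SparsePanel(frames)  # items inferred and sorted from the dict keys
print(panel.items)              # Index(['one', 'two'], dtype='object')
print(panel.minor_xs(0))        # SparseDataFrame: major axis rows x items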
Example #56
0
def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
    """
    Encode input values as an enumerated type or categorical variable

    Parameters
    ----------
    values : ndarray (1-d)
        Sequence
    sort : boolean, default False
        Sort by values
    order : deprecated
    na_sentinel : int, default -1
        Value to mark "not found"
    size_hint : int, optional
        Hint to the hashtable sizer

    Returns
    -------
    labels : ndarray
        Integer codes mapping each element of `values` to its position in
        `uniques`
    uniques : ndarray (1-d) or Index
        The unique values; an Index is returned when the passed values are an
        Index or Series

    Note: an array of Periods ignores `sort`, since the result is an
    always-sorted PeriodIndex
    """
    if order is not None:
        msg = "order is deprecated. See https://github.com/pydata/pandas/issues/6926"
        warn(msg, FutureWarning, stacklevel=2)

    from pandas.core.index import Index
    from pandas.core.series import Series
    vals = np.asarray(values)

    is_datetime = com.is_datetime64_dtype(vals)
    is_timedelta = com.is_timedelta64_dtype(vals)
    (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)

    table = hash_klass(size_hint or len(vals))
    uniques = vec_klass()
    labels = table.get_labels(vals, uniques, 0, na_sentinel)

    labels = com._ensure_platform_int(labels)

    uniques = uniques.to_array()

    if sort and len(uniques) > 0:
        try:
            sorter = uniques.argsort()
        except TypeError:
            # unorderable in py3 if mixed str/int
            t = hash_klass(len(uniques))
            t.map_locations(com._ensure_object(uniques))

            # order ints before strings
            ordered = np.concatenate([
                np.sort(
                    np.array([e for i, e in enumerate(uniques) if f(e)],
                             dtype=object)) for f in [
                                 lambda x: not isinstance(x, string_types),
                                 lambda x: isinstance(x, string_types)
                             ]
            ])
            sorter = com._ensure_platform_int(
                t.lookup(com._ensure_object(ordered)))

        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))

        mask = labels < 0
        labels = reverse_indexer.take(labels)
        np.putmask(labels, mask, -1)

        uniques = uniques.take(sorter)

    if is_datetime:
        uniques = uniques.astype('M8[ns]')
    elif is_timedelta:
        uniques = uniques.astype('m8[ns]')
    if isinstance(values, Index):
        uniques = values._shallow_copy(uniques, name=None)
    elif isinstance(values, Series):
        uniques = Index(uniques)
    return labels, uniques
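
Typical usage through the public API:

import pandas as pd

codes, uniques = pd.factorize(['b', 'a', 'b', 'c'], sort=True)
print(codes)    # [1 0 1 2] -- integer labels pointing into uniques
print(uniques)  # ['a' 'b' 'c']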
Example #57
0
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize : boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than counting individual values, group them into half-open
        bins (a convenience for pd.cut); only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(
                dtype) or is_period or is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
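
Typical usage through the Series method:

import pandas as pd

s = pd.Series([1, 1, 2, None])
print(s.value_counts())                # NaN dropped by default
print(s.value_counts(dropna=False))    # NaN counted as its own bin
print(s.value_counts(normalize=True))  # relative frequencies instead of counts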
Example #58
0
def simpleParser(nestedList, colNames=None, header=0, indexCol=0):
    """
    Workhorse function for processing nested list into DataFrame

    Should be replaced by np.genfromtxt
    """
    naValues = set([
        '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '1.#INF', '-1.#INF',
        '1.#INF000000', 'NA', 'NULL', 'NaN', 'nan', ''
    ])
    lines = nestedList
    data = {}
    if header is not None:
        columns = []
        for i, c in enumerate(lines[header]):
            if c == '':
                columns.append('Unnamed: ' + string.ascii_uppercase[i])
            else:
                columns.append(c)

        content = lines[header + 1:]

        colCounts = dict((col, 0) for col in columns)
        for i, col in enumerate(columns):
            if columns.count(col) > 1:
                columns[i] = col + str(colCounts[col])
                colCounts[col] += 1
    else:
        if not colNames:
            columns = string.ascii_uppercase[:len(lines[0])]
        else:
            columns = colNames
        content = lines

    for i, (c, col) in enumerate(izip(columns, izip(*content))):
        if i == indexCol:
            data[c] = col
            continue
        data[c] = []
        for val in col:
            if val in naValues:
                val = np.nan
            else:
                try:
                    tmp = val
                    val = np.float64(val)
                    if np.isinf(val):
                        val = tmp
                except Exception:
                    pass
            data[c].append(val)

    if header is not None:
        if 'date' in columns[0].lower() or 'Unnamed' in columns[0]:
            dates = []
            for s in data[columns[0]]:
                try:
                    dates.append(parser.parse(s))
                except Exception:
                    dates.append(s)
            data[columns[0]] = dates
    for c, values in data.iteritems():
        try:
            data[c] = np.array(values, dtype=np.float64)
        except Exception:
            data[c] = np.array(values, dtype=np.object_)
    if indexCol is not None:
        index = Index(data[columns[indexCol]])
        frameData = dict([(col, data[col]) for col in columns
                          if col != columns[indexCol]])
        return DataFrame(data=frameData, index=index)
    else:
        index = np.arange(len(data.values()[0]))
        frameData = dict([(col, data[col]) for col in columns])
        return DataFrame(data=frameData, index=index)
Example #59
0
    def __init__(self,
                 data=None,
                 index=None,
                 columns=None,
                 default_kind=None,
                 default_fill_value=None,
                 dtype=None,
                 copy=False):

        # pick up the defaults from the Sparse structures
        if isinstance(data, SparseDataFrame):
            if index is None:
                index = data.index
            if columns is None:
                columns = data.columns
            if default_fill_value is None:
                default_fill_value = data.default_fill_value
            if default_kind is None:
                default_kind = data.default_kind
        elif isinstance(data, (SparseSeries, SparseArray)):
            if index is None:
                index = data.index
            if default_fill_value is None:
                default_fill_value = data.fill_value
            if columns is None and hasattr(data, 'name'):
                columns = [data.name]
            if columns is None:
                raise Exception("cannot pass a series w/o a name or columns")
            data = {columns[0]: data}

        if default_fill_value is None:
            default_fill_value = np.nan
        if default_kind is None:
            default_kind = 'block'

        self._default_kind = default_kind
        self._default_fill_value = default_fill_value

        if isinstance(data, dict):
            mgr = self._init_dict(data, index, columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, (np.ndarray, list)):
            mgr = self._init_matrix(data, index, columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, SparseDataFrame):
            mgr = self._init_mgr(data._data,
                                 dict(index=index, columns=columns),
                                 dtype=dtype,
                                 copy=copy)
        elif isinstance(data, DataFrame):
            mgr = self._init_dict(data, data.index, data.columns)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        elif isinstance(data, BlockManager):
            mgr = self._init_mgr(data,
                                 axes=dict(index=index, columns=columns),
                                 dtype=dtype,
                                 copy=copy)
        elif data is None:
            data = {}

            if index is None:
                index = Index([])
            else:
                index = _ensure_index(index)

            if columns is None:
                columns = Index([])
            else:
                # pre-fill each requested column with an all-fill-value
                # sparse column
                for c in columns:
                    data[c] = SparseArray(np.nan,
                                          index=index,
                                          kind=self._default_kind,
                                          fill_value=self._default_fill_value)
            mgr = dict_to_manager(data, columns, index)
            if dtype is not None:
                mgr = mgr.astype(dtype)
        else:
            # any other input type would otherwise leave mgr unbound
            raise TypeError('cannot construct SparseDataFrame from data of '
                            'type %s' % type(data))

        NDFrame.__init__(self, mgr)
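
A short sketch of the data-is-None path in the constructor above (the index
and column labels are illustrative):

    sdf = SparseDataFrame(index=['a', 'b', 'c'], columns=['x', 'y'])
    # each requested column is pre-filled with a SparseArray of NaN,
    # stored with the defaults default_kind='block' and
    # default_fill_value=np.nan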
Example #60
0
    def __new__(cls,
                data,
                index=None,
                sparse_index=None,
                kind='block',
                fill_value=None,
                name=None,
                copy=False):

        is_sparse_array = isinstance(data, SparseArray)
        if fill_value is None:
            if is_sparse_array:
                fill_value = data.fill_value
            else:
                fill_value = np.nan

        if is_sparse_array:
            if isinstance(data, SparseSeries) and index is None:
                index = data.index
            elif index is not None:
                assert (len(index) == len(data))

            # reuse the input's sparse index and underlying values
            sparse_index = data.sp_index
            values = np.asarray(data)
        elif isinstance(data, (Series, dict)):
            if index is None:
                index = data.index

            data = Series(data)
            values, sparse_index = make_sparse(data,
                                               kind=kind,
                                               fill_value=fill_value)
        elif np.isscalar(data):  # pragma: no cover
            if index is None:
                raise Exception('must pass index!')

            values = np.empty(len(index))
            values.fill(data)

            # TODO: more efficient

            values, sparse_index = make_sparse(values,
                                               kind=kind,
                                               fill_value=fill_value)

        else:
            # array-like
            if sparse_index is None:
                values, sparse_index = make_sparse(data,
                                                   kind=kind,
                                                   fill_value=fill_value)
            else:
                values = data
                assert (len(values) == sparse_index.npoints)

        if index is None:
            index = Index(np.arange(sparse_index.length))
        index = _ensure_index(index)

        # Create array, do *not* copy data by default
        if copy:
            subarr = np.array(values, dtype=np.float64, copy=True)
        else:
            subarr = np.asarray(values, dtype=np.float64)

        if index.is_all_dates:
            # a date-only index promotes the result to the time series
            # subclass
            cls = SparseTimeSeries

        # Change the class of the array to be the subclass type.
        output = subarr.view(cls)
        output.sp_index = sparse_index
        output.fill_value = np.float64(fill_value)
        output.index = index
        output.name = name
        return output
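
A minimal usage sketch for the constructor above (values are illustrative):

    ss = SparseSeries([1.0, np.nan, np.nan, 2.0], kind='block')
    # fill_value defaults to NaN, so only the two non-NaN points are
    # stored; ss.sp_index records their positions, the index defaults to
    # np.arange(4), and the values are cast to float64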