Ejemplo n.º 1
0
    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
        """Fill or insert the join key columns in ``result``.

        For a key column already present in ``result``, rows that had no
        match on the side the key came from (indexer == -1) are patched
        with the key value taken from the other side.  A key missing from
        ``result`` is materialized from the left join keys (patched with
        right-side values where the left had no match) and inserted.
        """
        if not self.drop_keys:
            # do nothing, already found in one of the DataFrames
            return

        # insert group keys
        for i, name in enumerate(self.join_names):
            if name in result:
                key_col = result[name]

                if name in self.left:
                    # key came from the left: rows with no left match get
                    # their value from the right join key
                    na_indexer = (left_indexer == -1).nonzero()[0]
                    right_na_indexer = right_indexer.take(na_indexer)
                    key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                        right_na_indexer))
                else:
                    # key came from the right: rows with no right match get
                    # their value from the left join key.
                    # BUG FIX: positions must come from left_indexer, not
                    # right_indexer (right_indexer is -1 on exactly these
                    # rows, so taking from it would fetch garbage).
                    na_indexer = (right_indexer == -1).nonzero()[0]
                    left_na_indexer = left_indexer.take(na_indexer)
                    key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
                                                        left_na_indexer))
            else:
                # a faster way?
                key_col = com.take_1d(self.left_join_keys[i], left_indexer)
                na_indexer = (left_indexer == -1).nonzero()[0]
                right_na_indexer = right_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                    right_na_indexer))
                result.insert(i, name, key_col)
Ejemplo n.º 2
0
    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
        """Patch NA positions in join-key columns of ``result``.

        For each join key that should be filled (per ``_should_fill``):
        if the column already exists in ``result``, rows where the owning
        side had no match (indexer == -1) are overwritten with the value
        from the other side's join key; otherwise the column is built
        from the left join keys and inserted at position ``i``.
        """
        # insert group keys

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                # nothing to patch for this key
                continue

            if name in result:
                key_col = result[name]

                if name in self.left and left_indexer is not None:
                    # key column originated on the left: rows with no left
                    # match take the key from the right side
                    na_indexer = (left_indexer == -1).nonzero()[0]
                    if len(na_indexer) == 0:
                        continue

                    right_na_indexer = right_indexer.take(na_indexer)
                    key_col.put(na_indexer, com.take_1d(self.right_join_keys[i], right_na_indexer))
                elif name in self.right and right_indexer is not None:
                    # key column originated on the right: symmetric case
                    na_indexer = (right_indexer == -1).nonzero()[0]
                    if len(na_indexer) == 0:
                        continue

                    left_na_indexer = left_indexer.take(na_indexer)
                    key_col.put(na_indexer, com.take_1d(self.left_join_keys[i], left_na_indexer))
            elif left_indexer is not None:
                if name is None:
                    # anonymous key (e.g. merged on arrays): synthesize a name
                    name = "key_%d" % i

                # a faster way?
                key_col = com.take_1d(self.left_join_keys[i], left_indexer)
                na_indexer = (left_indexer == -1).nonzero()[0]
                right_na_indexer = right_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.right_join_keys[i], right_na_indexer))
                result.insert(i, name, key_col)
Ejemplo n.º 3
0
    def test_1d_bool(self):
        # take_1d on a boolean array: with only valid indices the result
        # matches ndarray.take; a -1 (fill) entry forces upcast to object.
        values = np.array([0, 1, 0], dtype=bool)

        indexer = [0, 2, 2, 1]
        self.assert_numpy_array_equal(com.take_1d(values, indexer),
                                      values.take(indexer))

        filled = com.take_1d(values, [0, 2, -1])
        self.assertEqual(filled.dtype, np.object_)
Ejemplo n.º 4
0
Archivo: ops.py Proyecto: GaoYu/pandas
    def wrapper(left, right, name=name, na_op=na_op):
        """Arithmetic wrapper for Series ops.

        Defers to DataFrame for Series-DataFrame ops, applies optional
        datetime/timedelta conversion, aligns Series operands on an outer
        join of their indexes, then applies ``na_op`` to the raw values
        and wraps the result back into ``left``'s constructor.
        """

        # Series + DataFrame is handled by DataFrame's reflected op
        if isinstance(right, pd.DataFrame):
            return NotImplemented

        time_converted = _TimeOp.maybe_convert_for_time_op(left, right, name,
                                                           na_op)

        if time_converted is None:
            # plain numeric op: operate on the inputs as-is
            lvalues, rvalues = left, right
            dtype = None
            wrap_results = lambda x: x
        elif time_converted is NotImplemented:
            return NotImplemented
        else:
            # datetime/timedelta op: use the converted operands and the
            # wrapping/na-handling supplied by the converter
            left, right = time_converted.left, time_converted.right
            lvalues, rvalues = time_converted.lvalues, time_converted.rvalues
            dtype = time_converted.dtype
            wrap_results = time_converted.wrap_results
            na_op = time_converted.na_op

        if isinstance(rvalues, pd.Series):
            rindex = getattr(rvalues, 'index', rvalues)
            name = _maybe_match_name(left, rvalues)
            lvalues = getattr(lvalues, 'values', lvalues)
            rvalues = getattr(rvalues, 'values', rvalues)
            if left.index.equals(rindex):
                # fast path: already aligned
                index = left.index
            else:
                # outer-join align and reindex both value arrays
                index, lidx, ridx = left.index.join(rindex, how='outer',
                                                    return_indexers=True)

                if lidx is not None:
                    lvalues = com.take_1d(lvalues, lidx)

                if ridx is not None:
                    rvalues = com.take_1d(rvalues, ridx)

            arr = na_op(lvalues, rvalues)

            return left._constructor(wrap_results(arr), index=index,
                                     name=name, dtype=dtype)
        else:
            # scalars
            if (hasattr(lvalues, 'values') and
                    not isinstance(lvalues, pd.DatetimeIndex)):
                lvalues = lvalues.values

            return left._constructor(wrap_results(na_op(lvalues, rvalues)),
                                     index=left.index, name=left.name,
                                     dtype=dtype)
Ejemplo n.º 5
0
    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
        """Patch NA positions in join-key columns of ``result``.

        For each join key that should be filled (per ``_should_fill``):
        if the column already exists in ``result``, rows where the owning
        side had no match (indexer == -1) are overwritten in place via
        ``iloc`` with the value from the other side's join key; otherwise
        the column is built from the left join keys and inserted at
        position ``i``.
        """
        # insert group keys

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            if name in result:
                # positional column index for in-place iloc assignment
                key_indexer = result.columns.get_loc(name)

                if left_indexer is not None and right_indexer is not None:

                    if name in self.left:
                        if len(self.left) == 0:
                            continue

                        # rows with no left match take the right-side key
                        na_indexer = (left_indexer == -1).nonzero()[0]
                        if len(na_indexer) == 0:
                            continue

                        right_na_indexer = right_indexer.take(na_indexer)
                        result.iloc[na_indexer, key_indexer] = (
                            com.take_1d(self.right_join_keys[i],
                                        right_na_indexer))
                    elif name in self.right:
                        if len(self.right) == 0:
                            continue

                        # symmetric: no right match -> take left-side key
                        na_indexer = (right_indexer == -1).nonzero()[0]
                        if len(na_indexer) == 0:
                            continue

                        left_na_indexer = left_indexer.take(na_indexer)
                        result.iloc[na_indexer, key_indexer] = (
                            com.take_1d(self.left_join_keys[i],
                                        left_na_indexer))
            elif left_indexer is not None \
                    and isinstance(self.left_join_keys[i], np.ndarray):

                if name is None:
                    # anonymous key: synthesize a name
                    name = 'key_%d' % i

                # a faster way?
                key_col = com.take_1d(self.left_join_keys[i], left_indexer)
                na_indexer = (left_indexer == -1).nonzero()[0]
                right_na_indexer = right_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                    right_na_indexer))
                result.insert(i, name, key_col)
Ejemplo n.º 6
0
        def _test_dtype(dtype, fill_value, out_dtype):
            # taking with a -1 entry fills that slot with fill_value and
            # may upcast to out_dtype; taking only valid positions keeps
            # the input dtype
            data = np.random.randint(0, 2, 4).astype(dtype)

            with_na = com.take_1d(data, [2, 1, 0, -1], fill_value=fill_value)
            assert((with_na[[0, 1, 2]] == data[[2, 1, 0]]).all())
            assert(with_na[3] == fill_value)
            assert(with_na.dtype == out_dtype)

            valid_indexer = [2, 1, 0, 1]
            no_na = com.take_1d(data, valid_indexer, fill_value=fill_value)
            assert((no_na[[0, 1, 2, 3]] == data[valid_indexer]).all())
            assert(no_na.dtype == dtype)
Ejemplo n.º 7
0
    def _reindex_indexer_items(self, new_items, indexer, fill_value):
        """Reindex the items axis by ``indexer``, returning a new manager.

        Items found in existing blocks are gathered per block; items not
        found anywhere are collected into one all-NA block filled with
        ``fill_value``.
        """
        # TODO: less efficient than I'd like

        # requested item labels in result order (-1 slots become NA labels)
        item_order = com.take_1d(self.items.values, indexer)

        # keep track of what items aren't found anywhere
        mask = np.zeros(len(item_order), dtype=bool)

        new_blocks = []
        for blk in self.blocks:
            blk_indexer = blk.items.get_indexer(item_order)
            selector = blk_indexer != -1
            # update with observed items
            mask |= selector

            if not selector.any():
                continue

            new_block_items = new_items.take(selector.nonzero()[0])
            new_values = com.take_fast(blk.values, blk_indexer[selector],
                                       None, False, axis=0)
            new_blocks.append(make_block(new_values, new_block_items,
                                         new_items))

        if not mask.all():
            # items found in no block become a single all-NA block.
            # FIX: use ~mask -- unary minus on a boolean array is
            # deprecated and raises TypeError in modern numpy.
            na_items = new_items[~mask]
            na_block = self._make_na_block(na_items, new_items,
                                           fill_value=fill_value)
            new_blocks.append(na_block)
            new_blocks = _consolidate(new_blocks, new_items)

        return BlockManager(new_blocks, [new_items] + self.axes[1:])
Ejemplo n.º 8
0
    def _wrap_result(self, result, use_codes=True, name=None):
        """Wrap a raw string-method result back into Index/Series/DataFrame."""
        from pandas.core.series import Series
        from pandas.core.frame import DataFrame
        from pandas.core.index import Index

        # for category, we do the stuff on the categories, so blow it up
        # to the full series again
        # But for some operations, we have to do the stuff on the full values,
        # so make it possible to skip this step as the method already did this before
        # the transformation...
        if use_codes and self._is_categorical:
            result = take_1d(result, self._orig.cat.codes)

        # leave as it is to keep extract and get_dummies results
        # can be merged to _wrap_result_expand in v0.17
        if not hasattr(result, 'ndim'):
            return result

        name = name or getattr(result, 'name', None) or self._orig.name

        if result.ndim != 1:
            assert result.ndim < 3
            return DataFrame(result, index=self._orig.index)

        if isinstance(self._orig, Index):
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if is_bool_dtype(result):
                return result
            return Index(result, name=name)
        return Series(result, index=self._orig.index, name=name)
Ejemplo n.º 9
0
    def reindex(self, index=None, method=None, copy=True, limit=None):
        """
        Conform SparseSeries to new Index

        See Series.reindex docstring for general behavior

        Returns
        -------
        reindexed : SparseSeries
        """
        new_index = _ensure_index(index)

        # identical index: nothing to realign, optionally copy
        if self.index.equals(new_index):
            return self.copy() if copy else self

        if len(self.index) == 0:
            # FIXME: inelegant / slow
            empty = np.empty(len(new_index), dtype=np.float64)
            empty.fill(nan)
            return SparseSeries(empty, index=new_index,
                                fill_value=self.fill_value)

        new_index, fill_vec = self.index.reindex(index, method=method,
                                                 limit=limit)
        return SparseSeries(common.take_1d(self.values, fill_vec),
                            index=new_index, fill_value=self.fill_value,
                            name=self.name)
Ejemplo n.º 10
0
    def _delegate_property_get(self, name):
        """Fetch a delegated datetimelike property and wrap it as a Series."""
        from pandas import Series

        raw = getattr(self.values, name)

        # maybe need to upcast (ints)
        if isinstance(raw, np.ndarray):
            if is_integer_dtype(raw):
                raw = raw.astype('int64')
        elif not is_list_like(raw):
            return raw

        # blow up if we operate on categories
        if self.orig is not None:
            raw = take_1d(raw, self.orig.cat.codes)

        # return the result as a Series, which is by definition a copy
        wrapped = Series(raw, index=self.index, name=self.name)

        # setting this object will show a SettingWithCopyWarning/Error
        wrapped.is_copy = ("modifications to a property of a datetimelike "
                           "object are not supported and are discarded. "
                           "Change values on the original.")

        return wrapped
Ejemplo n.º 11
0
    def _reindex_with_indexers(self, reindexers, method=None, fill_value=None, limit=None,
                               copy=False, allow_dups=False):
        """Reindex a SparseDataFrame using precomputed (axis, indexer) pairs."""
        # sparse reindexing supports neither a fill method nor a limit
        if method is not None or limit is not None:
            raise NotImplementedError("cannot reindex with a method or limit with sparse")

        if fill_value is None:
            fill_value = np.nan

        index, row_indexer = reindexers.get(0, (None, None))
        columns, col_indexer = reindexers.get(1, (None, None))

        if columns is None:
            columns = self.columns

        new_arrays = {}
        for col in columns:
            if col in self:
                if row_indexer is None:
                    new_arrays[col] = self[col]
                else:
                    new_arrays[col] = com.take_1d(self[col].get_values(),
                                                  row_indexer,
                                                  fill_value=fill_value)

        return SparseDataFrame(new_arrays, index=index, columns=columns).__finalize__(self)
Ejemplo n.º 12
0
    def __array__(self, dtype=None):
        """ The numpy array interface.

        Returns
        -------
        values : numpy array
            A numpy array of the same dtype as categorical.levels.dtype
        """
        # expand the integer codes into the corresponding level values
        level_values = self.levels.values
        return com.take_1d(level_values, self._codes)
Ejemplo n.º 13
0
    def take_nd(self, indexer, allow_fill=True, fill_value=None):
        """ Take the values by the indexer, fill with the fill_value. """
        # default fill is NaN whenever filling is allowed
        if allow_fill and fill_value is None:
            fill_value = np.nan

        taken = com.take_1d(self._codes, indexer,
                            allow_fill=allow_fill, fill_value=fill_value)
        return Categorical(values=taken, levels=self.levels,
                           ordered=self.ordered, name=self.name,
                           fastpath=True)
Ejemplo n.º 14
0
    def _maybe_add_join_keys(self, result, left_indexer, right_indexer):
        """Patch NA positions in join-key columns of ``result``.

        For each join key that should be filled (per ``_should_fill``):
        if the column already exists in ``result``, rows where the owning
        side had no match (indexer == -1) are overwritten via ``put`` with
        the value from the other side's join key; otherwise the column is
        built from the left join keys and inserted at position ``i``.
        """
        # insert group keys

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            if name in result:
                key_col = result[name]

                if left_indexer is not None and right_indexer is not None:

                    if name in self.left:
                        # rows with no left match take the right-side key
                        na_indexer = (left_indexer == -1).nonzero()[0]
                        if len(na_indexer) == 0:
                            continue

                        right_na_indexer = right_indexer.take(na_indexer)
                        key_col.put(
                            na_indexer, com.take_1d(self.right_join_keys[i],
                                                    right_na_indexer))
                    elif name in self.right:
                        # symmetric: no right match -> take left-side key
                        na_indexer = (right_indexer == -1).nonzero()[0]
                        if len(na_indexer) == 0:
                            continue

                        left_na_indexer = left_indexer.take(na_indexer)
                        key_col.put(na_indexer, com.take_1d(self.left_join_keys[i],
                                                            left_na_indexer))

            elif left_indexer is not None \
                    and isinstance(self.left_join_keys[i], np.ndarray):

                if name is None:
                    # anonymous key: synthesize a name
                    name = 'key_%d' % i

                # a faster way?
                key_col = com.take_1d(self.left_join_keys[i], left_indexer)
                na_indexer = (left_indexer == -1).nonzero()[0]
                right_na_indexer = right_indexer.take(na_indexer)
                key_col.put(na_indexer, com.take_1d(self.right_join_keys[i],
                                                    right_na_indexer))
                result.insert(i, name, key_col)
Ejemplo n.º 15
0
    def take_nd(self, indexer, allow_fill=True, fill_value=None):
        """ Take the codes by the indexer, fill with the fill_value. """

        # filling must always be None/nan here
        # but is passed thru internally
        assert isnull(fill_value)

        new_codes = com.take_1d(self._codes, indexer,
                                allow_fill=True, fill_value=-1)
        return Categorical(new_codes, levels=self.levels,
                           ordered=self.ordered, name=self.name,
                           fastpath=True)
Ejemplo n.º 16
0
def _take_new_index(obj, indexer, new_index, axis=0):
    """Take rows of ``obj`` by position and attach ``new_index``.

    Parameters
    ----------
    obj : Series or DataFrame
    indexer : ndarray of int positions (-1 marks a fill slot)
    new_index : Index for the result
    axis : int, only 0 is supported for DataFrame

    Raises
    ------
    NotImplementedError
        For ``axis == 1`` or unsupported input types.
    """
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        new_values = com.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            # message matches the sibling implementation of this helper
            raise NotImplementedError("axis 1 is not supported")
        return DataFrame(obj._data.take(indexer, new_index=new_index, axis=1))
    else:
        # bare NotImplementedError gave callers no hint what went wrong
        raise NotImplementedError("'obj' should be either a Series or a DataFrame")
Ejemplo n.º 17
0
    def _reindex_indexer(self, new_index, indexer, copy):
        """Realign values by ``indexer``; with no indexer, optionally copy."""
        if indexer is None:
            return self.copy() if copy else self

        realigned = com.take_1d(self.values.values, indexer)

        # be subclass-friendly
        return self._constructor(realigned, new_index, name=self.name)
Ejemplo n.º 18
0
        def _test_dtype(dtype, can_hold_na):
            data = np.random.randint(0, 2, 4).astype(dtype)

            # only valid positions: behaves exactly like ndarray.take
            valid_indexer = [2, 1, 0, 1]
            out = np.empty(4, dtype=dtype)
            com.take_1d(data, valid_indexer, out=out)
            tm.assert_almost_equal(out, data.take(valid_indexer))

            # a -1 entry must be filled with NaN, which only dtypes that
            # can hold NA support; others should raise
            na_indexer = [2, 1, 0, -1]
            out = np.empty(4, dtype=dtype)
            if can_hold_na:
                com.take_1d(data, na_indexer, out=out)
                expected = data.take(na_indexer)
                expected[3] = np.nan
                tm.assert_almost_equal(out, expected)
            else:
                self.assertRaises(Exception, com.take_1d, data,
                                  na_indexer, out=out)
                # no exception o/w
                data.take(na_indexer, out=out)
Ejemplo n.º 19
0
    def __array__(self, dtype=None):
        """ The numpy array interface.

        Returns
        -------
        values : numpy array
            A numpy array of either the specified dtype or, if dtype==None (default), the same
            dtype as categorical.levels.dtype
        """
        values = com.take_1d(self.levels.values, self._codes)
        # convert only when a different dtype was explicitly requested
        if dtype and dtype != self.levels.dtype:
            values = np.asarray(values, dtype)
        return values
Ejemplo n.º 20
0
def _take_new_index(obj, indexer, new_index, axis=0):
    """Take rows of a Series/DataFrame by position and attach new_index."""
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        return Series(com.take_1d(obj.values, indexer),
                      index=new_index, name=obj.name)
    if isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        new_mgr = obj._data.reindex_indexer(new_axis=new_index,
                                            indexer=indexer,
                                            axis=1)
        return DataFrame(new_mgr)
    raise ValueError("'obj' should be either a Series or a DataFrame")
Ejemplo n.º 21
0
    def _reindex_with_indexers(self, index, row_indexer, columns, col_indexer,
                               copy, fill_value):
        """Rebuild the frame column-by-column using the row indexer."""
        if columns is None:
            columns = self.columns

        new_arrays = {}
        for col in columns:
            if col not in self:
                continue
            if row_indexer is None:
                # no row realignment needed; keep the column as-is
                new_arrays[col] = self[col]
            else:
                new_arrays[col] = com.take_1d(self[col].values, row_indexer,
                                              fill_value=fill_value)

        return self._constructor(new_arrays, index=index, columns=columns)
Ejemplo n.º 22
0
    def get_values(self):
        """ Return the values.

        For internal compatibility with pandas formatting.

        Returns
        -------
        values : numpy array
            A numpy array of the same dtype as categorical.levels.dtype or dtype string if periods
        """

        # if we are a period index, return a string repr
        if isinstance(self.levels, PeriodIndex):
            str_levels = np.array(self.levels.to_native_types(),
                                  dtype=object)
            return com.take_1d(str_levels, self._codes)

        return np.array(self)
Ejemplo n.º 23
0
def _take_new_index(obj, indexer, new_index, axis=0):
    """Take rows of a Series/DataFrame by position and attach new_index."""
    from pandas.core.api import Series, DataFrame
    from pandas.core.internals import BlockManager

    if isinstance(obj, Series):
        return Series(com.take_1d(obj.values, indexer),
                      index=new_index, name=obj.name)
    if isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError
        mgr = obj._data

        # take along each block's row axis, then swap in the new index
        taken_blocks = [blk.take(indexer, axis=1) for blk in mgr.blocks]
        axes = list(mgr.axes)
        axes[1] = new_index
        return DataFrame(BlockManager(taken_blocks, axes))
    raise NotImplementedError
Ejemplo n.º 24
0
    def _wrap_result_expand(self, result, expand=False):
        """Wrap a raw string-method result, optionally expanding list-like
        elements into multiple columns (DataFrame) or a MultiIndex.

        Parameters
        ----------
        result : the raw output of the string operation
        expand : bool, whether to expand list-like elements

        Raises
        ------
        ValueError : if ``expand`` is not a bool
        """
        if not isinstance(expand, bool):
            raise ValueError("expand must be True or False")

        # for category, we do the stuff on the categories, so blow it up
        # to the full series again
        if self._is_categorical:
            result = take_1d(result, self._orig.cat.codes)

        from pandas.core.index import Index, MultiIndex
        if not hasattr(result, 'ndim'):
            # scalar or otherwise non-array result: return unchanged
            return result

        if isinstance(self._orig, Index):
            name = getattr(result, 'name', None)
            # if result is a boolean np.array, return the np.array
            # instead of wrapping it into a boolean Index (GH 8875)
            if hasattr(result, 'dtype') and is_bool_dtype(result):
                return result

            if expand:
                # tuple elements become MultiIndex levels
                result = list(result)
                return MultiIndex.from_tuples(result, names=name)
            else:
                return Index(result, name=name)
        else:
            index = self._orig.index
            if expand:

                def cons_row(x):
                    # normalize each element to a list so rows line up
                    if is_list_like(x):
                        return x
                    else:
                        return [x]

                cons = self._orig._constructor_expanddim
                data = [cons_row(x) for x in result]
                return cons(data, index=index)
            else:
                name = getattr(result, 'name', None)
                cons = self._orig._constructor
                return cons(result, name=name, index=index)
Ejemplo n.º 25
0
    def _delegate_property_get(self, name):
        """Fetch a delegated datetimelike property and wrap it as a Series."""
        from pandas import Series

        values = getattr(self.values, name)

        # maybe need to upcast (ints)
        if isinstance(values, np.ndarray):
            if is_integer_dtype(values):
                values = values.astype('int64')
        elif not is_list_like(values):
            return values

        # blow up if we operate on categories
        if self.orig is not None:
            values = take_1d(values, self.orig.cat.codes)

        # return the result as a Series, which is by definition a copy
        wrapped = Series(values, index=self.index, name=self.name)

        # setting this object will show a SettingWithCopyWarning/Error
        wrapped.is_copy = ("modifications to a property of a datetimelike object are not "
                           "supported and are discarded. Change values on the original.")

        return wrapped
Ejemplo n.º 26
0
    def _reindex_indexer_items(self, new_items, indexer, fill_value):
        """Reindex the items axis by ``indexer``, returning a new manager.

        Items found in existing blocks are gathered per block; items not
        found anywhere are collected into one all-NA block filled with
        ``fill_value``.
        """
        # TODO: less efficient than I'd like

        # requested item labels in result order (-1 slots become NA labels)
        item_order = com.take_1d(self.items.values, indexer)

        # keep track of what items aren't found anywhere
        mask = np.zeros(len(item_order), dtype=bool)

        new_blocks = []
        for blk in self.blocks:
            blk_indexer = blk.items.get_indexer(item_order)
            selector = blk_indexer != -1
            # update with observed items
            mask |= selector

            if not selector.any():
                continue

            new_block_items = new_items.take(selector.nonzero()[0])
            new_values = com.take_fast(blk.values,
                                       blk_indexer[selector],
                                       None,
                                       False,
                                       axis=0)
            new_blocks.append(
                make_block(new_values, new_block_items, new_items))

        if not mask.all():
            # items found in no block become a single all-NA block.
            # FIX: use ~mask -- unary minus on a boolean array is
            # deprecated and raises TypeError in modern numpy.
            na_items = new_items[~mask]
            na_block = self._make_na_block(na_items,
                                           new_items,
                                           fill_value=fill_value)
            new_blocks.append(na_block)
            new_blocks = _consolidate(new_blocks, new_items)

        return BlockManager(new_blocks, [new_items] + self.axes[1:])
Ejemplo n.º 27
0
        def _test_dtype(dtype, can_hold_na, writeable=True):
            data = np.random.randint(0, 2, 4).astype(dtype)
            data.flags.writeable = writeable

            # only valid positions: behaves exactly like ndarray.take
            valid_indexer = [2, 1, 0, 1]
            out = np.empty(4, dtype=dtype)
            com.take_1d(data, valid_indexer, out=out)
            tm.assert_almost_equal(out, data.take(valid_indexer))

            # a -1 entry requires NaN fill, which only NA-capable dtypes
            # allow; others must raise TypeError
            na_indexer = [2, 1, 0, -1]
            out = np.empty(4, dtype=dtype)
            if can_hold_na:
                com.take_1d(data, na_indexer, out=out)
                expected = data.take(na_indexer)
                expected[3] = np.nan
                tm.assert_almost_equal(out, expected)
            else:
                with tm.assertRaisesRegexp(TypeError, self.fill_error):
                    com.take_1d(data, na_indexer, out=out)
                # no exception o/w
                data.take(na_indexer, out=out)
Ejemplo n.º 28
0
 def take_1d_pg2_int64(self):
     # benchmark take_1d over the int64 column.
     # FIX: the original signature omitted ``self`` while the body reads
     # self.df / self.indexer, which raises NameError; the sibling
     # float64 benchmark declares (self), so match it.
     com.take_1d(self.df.int64.values, self.indexer)
Ejemplo n.º 29
0
 def take_1d_pg2_float64(self):
     # benchmark: take_1d over the float64 column with the prepared indexer
     com.take_1d(self.df.float64.values, self.indexer)
Ejemplo n.º 30
0
 def __array__(self, dtype=None):
     # numpy array interface: expand integer labels into level values
     level_values = self.levels.values
     return com.take_1d(level_values, self.labels)