Example #1
    def concatenate(self, others : List[Categorical], ordered : bool=True) -> Categorical:
        """
        Merge this Categorical with other Categorical objects, concatenating
        the underlying arrays and synchronizing the categories.

        Parameters
        ----------
        others : List[Categorical]
            The Categorical arrays to concatenate and merge with this one
        ordered : bool
            If True (default), the arrays will be appended in the
            order given. If False, array data may be interleaved
            in blocks, which can greatly improve performance but
            results in non-deterministic ordering of elements.

        Returns
        -------
        Categorical 
            The merged Categorical object
            
        Raises
        ------
        TypeError
            Raised if any object in others is not a Categorical

        Notes
        -----
        This operation can be expensive -- slower than concatenating Strings.
        """
        if isinstance(others, Categorical):
            others = [others]
        elif len(others) < 1:
            return self
        samecategories = True
        for c in others:
            if not isinstance(c, Categorical):
                raise TypeError(("Categorical: can only merge/concatenate " +
                                "with other Categoricals"))
            if (self.categories.size != c.categories.size) or not \
                                    (self.categories == c.categories).all():
                samecategories = False
        if samecategories:
            newvals = cast(pdarray, concatenate([self.codes] + [o.codes for o in others], ordered=ordered))
            return Categorical.from_codes(newvals, self.categories)
        else:
            g = GroupBy(concatenate([self.categories] + \
                                       [o.categories for o in others],
                                       ordered=False))
            newidx = g.unique_keys
            wherediditgo = zeros(newidx.size, dtype=akint64)
            wherediditgo[g.permutation] = arange(newidx.size)
            idxsizes = np.array([self.categories.size] + \
                                [o.categories.size for o in others])
            idxoffsets = np.cumsum(idxsizes) - idxsizes
            oldvals = concatenate([c + off for c, off in \
                                   zip([self.codes] + [o.codes for o in others], idxoffsets)],
                                  ordered=ordered)
            newvals = wherediditgo[oldvals]
            return Categorical.from_codes(newvals, newidx)
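
A minimal usage sketch of concatenate (assuming the usual import arkouda as ak and a running arkouda server; the category values are made up for illustration):

import arkouda as ak

ak.connect()  # assumes an arkouda server is listening on the default host/port

# Two Categoricals with overlapping but unequal category sets, so the
# slow path that synchronizes categories is exercised.
a = ak.Categorical(ak.array(['red', 'blue', 'red']))
b = ak.Categorical(ak.array(['blue', 'green']))

merged = a.concatenate([b])      # ordered append of a, then b
print(merged.size)               # 5
print(merged.categories)         # union of the two category sets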
Example #2
    def merge(self, others : List[Categorical]) -> Categorical:
        """
        Merge this Categorical with other Categorical objects, concatenating
        the underlying arrays and synchronizing the categories.

        Parameters
        ----------
        others : List[Categorical]
            The Categorical arrays to concatenate and merge with this one

        Returns
        -------
        Categorical 
            The merged Categorical object
            
        Raises
        ------
        TypeError
            Raised if any object in others is not a Categorical

        Notes
        -----
        This operation can be expensive -- slower than concatenating Strings.
        """
        if isinstance(others, Categorical):
            others = [others]
        elif len(others) < 1:
            return self
        samecategories = True
        for c in others:
            if not isinstance(c, Categorical):
                raise TypeError(("Categorical: can only merge/concatenate " +
                                "with other Categoricals"))
            if (self.categories.size != c.categories.size) or not \
                                    (self.categories == c.categories).all():
                samecategories = False
        if samecategories:
            newvals = cast(pdarray, concatenate([self.codes] + [o.codes for o in others]))
            return Categorical.from_codes(newvals, self.categories)
        else:
            g = GroupBy(concatenate([self.categories] + \
                                       [o.categories for o in others]))
            newidx = g.unique_keys
            wherediditgo = zeros(newidx.size, dtype=akint64)
            wherediditgo[g.permutation] = arange(newidx.size)
            idxsizes = np.array([self.categories.size] + \
                                [o.categories.size for o in others])
            idxoffsets = np.cumsum(idxsizes) - idxsizes
            oldvals = concatenate([c + off for c, off in zip([self.codes] \
                                    + [o.codes for o in others], idxoffsets)])
            newvals = wherediditgo[oldvals]
            return Categorical.from_codes(newvals, newidx)
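
The slow path above is easier to follow in a small, self-contained sketch. The following NumPy version (not arkouda; the variable names mirror the ones in the method) shows how offsetting each array's codes into a combined index space and then translating through the unioned category list reproduces the original values:

import numpy as np

cats_a = np.array(['red', 'blue'])
codes_a = np.array([0, 1, 0])          # -> red, blue, red
cats_b = np.array(['blue', 'green'])
codes_b = np.array([0, 1])             # -> blue, green

all_cats = np.concatenate([cats_a, cats_b])
# np.unique plays the role of the GroupBy here: its return_inverse output is
# the "wherediditgo" mapping from old slots in all_cats to new unioned slots.
new_cats, wherediditgo = np.unique(all_cats, return_inverse=True)

offsets = np.cumsum([0, cats_a.size])  # [0, 2]: start of each block in all_cats
old_codes = np.concatenate([codes_a + offsets[0], codes_b + offsets[1]])
new_codes = wherediditgo[old_codes]

print(new_cats)                        # ['blue' 'green' 'red']
print(new_cats[new_codes])             # ['red' 'blue' 'red' 'blue' 'green']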
Example #3
    def broadcast(self, values : pdarray) -> pdarray:
        """
        Fill each group's segment with a constant value.

        Parameters
        ----------
        values : pdarray
            The values to put in each group's segment

        Returns
        -------
        pdarray
            The broadcast values
            
        Raises
        ------
        TypeError
            Raised if value is not a pdarray object
        ValueError
            Raised if the values array does not have one 
            value per segment

        Notes
        -----
        This function is a sparse analog of ``np.broadcast``. If a
        GroupBy object represents a sparse matrix (tensor), then
        this function takes a (dense) column vector and replicates
        each value to the non-zero elements in the corresponding row.

        The returned array is in permuted (grouped) order. To get
        back to the order of the array on which GroupBy was called,
        the user must invert the permutation (see below).

        Examples
        --------
        >>> a = ak.array([0, 1, 0, 1, 0])
        >>> values = ak.array([3, 5])
        >>> g = ak.GroupBy(a)
        # Result is in grouped order
        >>> g.broadcast(values)
        array([3, 3, 3, 5, 5])

        >>> b = ak.zeros_like(a)
        # Result is in original order
        >>> b[g.permutation] = g.broadcast(values)
        >>> b
        array([3, 5, 3, 5, 3])
        """

        if not isinstance(values, pdarray):
            raise TypeError("Vals must be pdarray")
        if values.size != self.segments.size:
            raise ValueError("Must have one value per segment")
        temp = zeros(self.size, values.dtype)
        if values.size == 0:
            return temp
        diffs = concatenate((array([values[0]]), values[1:] - values[:-1]))
        temp[self.segments] = diffs
        return cumsum(temp)
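
The body above avoids a per-segment loop by writing the first differences of the per-segment values at each segment's start position and cumulatively summing. A NumPy sketch (not arkouda) of that trick:

import numpy as np

segments = np.array([0, 3])            # start index of each group's segment
values = np.array([3, 5])              # one value per segment
size = 5                               # total length of the grouped array

temp = np.zeros(size, dtype=values.dtype)
diffs = np.concatenate(([values[0]], np.diff(values)))  # [3, 2]
temp[segments] = diffs                 # [3, 0, 0, 2, 0]
print(np.cumsum(temp))                 # [3, 3, 3, 5, 5]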
Example #4
    def broadcast(self, values : pdarray) -> pdarray:
        """
        Fill each group's segment with a constant value.

        Parameters
        ----------
        values : pdarray
            The values to put in each group's segment

        Returns
        -------
        pdarray
            The broadcast values
            
        Raises
        ------
        TypeError
            Raised if value is not a pdarray object
        ValueError
            Raised if the values array does not have one 
            value per segment

        Notes
        -----
        This function is a sparse analog of ``np.broadcast``. If a
        GroupBy object represents a sparse matrix (tensor), then
        this function takes a (dense) column vector and replicates
        each value to the non-zero elements in the corresponding row.

        The returned array is in permuted (grouped) order. To get
        back to the order of the array on which GroupBy was called,
        the user must invert the permutation (see below).

        Examples
        --------
        >>> a = ak.array([0, 1, 0, 1, 0])
        >>> values = ak.array([3, 5])
        >>> g = ak.GroupBy(a)
        # Result is in grouped order
        >>> g.broadcast(values)
        array([3, 3, 3, 5, 5])

        >>> b = ak.zeros_like(a)
        # Result is in original order
        >>> b[g.permutation] = g.broadcast(values)
        >>> b
        array([3, 5, 3, 5, 3])
        
        >>> a = ak.randint(1,5,10)
        >>> a
        array([3, 1, 4, 4, 4, 1, 3, 3, 2, 2])
        >>> g = ak.GroupBy(a)
        >>> keys,counts = g.count()
        >>> g.broadcast(counts > 2)
        array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
        >>> g.broadcast(counts == 3)
        array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])
        >>> g.broadcast(counts < 4)
        array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
        """
        if not isinstance(values, pdarray):
            raise TypeError("values must be a pdarray")
        # If values is a boolean array, convert it to an int64 array; this is
        # needed for now because Arkouda does not support broadcasting
        # boolean arrays.
        if values.dtype == bool:
            values = 1*values
        if values.size != self.segments.size:
            raise ValueError("Must have one value per segment")
        temp = zeros(self.size, values.dtype)
        if values.size == 0:
            return temp
        diffs = concatenate((array([values[0]]), values[1:] - values[:-1]))
        temp[self.segments] = diffs
        return cumsum(temp)
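
A hedged usage sketch of the boolean case from the examples above: broadcast a per-group predicate, undo the grouping permutation, and use the result to filter the original array (assumes import arkouda as ak and a running server; variable names are illustrative):

import arkouda as ak
ak.connect()

a = ak.randint(1, 5, 10)
g = ak.GroupBy(a)
keys, counts = g.count()

grouped = g.broadcast(counts > 2)        # grouped order, 0/1 int64 values
flags = ak.zeros(a.size, dtype=ak.int64)
flags[g.permutation] = grouped           # back to the original order of a
frequent = a[flags == 1]                 # elements whose key occurs more than twice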
Example #5
    def _binop(self, other: Union[Categorical, str_scalars],
               op: str_scalars) -> pdarray:
        """
        Executes the requested binop on this Categorical instance and returns 
        the results within a pdarray object.

        Parameters
        ----------
        other : Union[Categorical,str_scalars]
            the other object is a Categorical object or string scalar
        op : str_scalars
            name of the binary operation to be performed 
      
        Returns
        -------
        pdarray
            encapsulating the results of the requested binop      

        Raises
        ------
        NotImplementedError
            Raised if the op is not in the self.BinOps set, or if other is
            neither a Categorical nor a string scalar
        ValueError
            Raised if the sizes of this and the other instance don't match
        RuntimeError
            Raised if a server-side error is thrown while executing the
            binary operation
        """
        if op not in self.BinOps:
            raise NotImplementedError("Categorical: unsupported operator: {}".\
                                      format(op))
        if np.isscalar(other) and resolve_scalar_dtype(other) == "str":
            idxresult = self.categories._binop(other, op)
            return idxresult[self.codes]
        if self.size != cast(Categorical, other).size:
            raise ValueError("Categorical {}: size mismatch {} {}".\
                             format(op, self.size, cast(Categorical,other).size))
        if isinstance(other, Categorical):
            if (self.categories.size
                    == other.categories.size) and (self.categories
                                                   == other.categories).all():
                # Because categories are identical, codes can be compared directly
                return self.codes._binop(other.codes, op)
            else:
                # Remap both codes to the union of categories
                union = unique(
                    concatenate((self.categories, other.categories),
                                ordered=False))
                newinds = arange(union.size)
                # Inds of self.categories in unioned categories
                selfnewinds = newinds[in1d(union, self.categories)]
                # Need a permutation and segments to broadcast new codes
                if self.permutation is None or self.segments is None:
                    g = GroupBy(self.codes)
                    self.permutation = g.permutation
                    self.segments = g.segments
                # Form new codes by broadcasting new indices for unioned categories
                selfnewcodes = broadcast(self.segments, selfnewinds, self.size,
                                         self.permutation)
                # Repeat for other
                othernewinds = newinds[in1d(union, other.categories)]
                if other.permutation is None or other.segments is None:
                    g = GroupBy(other.codes)
                    other.permutation = g.permutation
                    other.segments = g.segments
                othernewcodes = broadcast(other.segments, othernewinds,
                                          other.size, other.permutation)
                # selfnewcodes and othernewcodes now refer to same unioned categories
                # and can be compared directly
                return selfnewcodes._binop(othernewcodes, op)
        else:
            raise NotImplementedError(
                ("Operations between Categorical and " +
                 "non-Categorical not yet implemented. " +
                 "Consider converting operands to Categorical."))