Example 1
 def __init__(self, values, **kwargs) -> None:
     self.logger = getArkoudaLogger(name=__class__.__name__)  # type: ignore
     if 'codes' in kwargs and 'categories' in kwargs:
         # This initialization is called by Categorical.from_codes()
         # The values arg is ignored
         self.codes = kwargs['codes']
         self.categories = kwargs['categories']
         if 'permutation' in kwargs:
             self.permutation = cast(pdarray, kwargs['permutation'])
         if 'segments' in kwargs:
             self.segments = cast(pdarray, kwargs['segments'])
     else:
         # Typical initialization, called with values
         if not isinstance(values, Strings):
             raise ValueError(("Categorical: inputs other than " +
                               "Strings not yet supported"))
         g = GroupBy(values)
         self.categories = g.unique_keys
         self.codes = g.broadcast(arange(self.categories.size),
                                  permute=True)
         self.permutation = cast(pdarray, g.permutation)
         self.segments = g.segments
     # Always set these values
     self.size: int_scalars = self.codes.size
     self.nlevels = self.categories.size
     self.ndim = self.codes.ndim
     self.shape = self.codes.shape
     self.name: Optional[str] = None
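
A minimal usage sketch for this constructor (hedged: it assumes a running Arkouda server, and that ak.array accepts a list of Python strings and returns a Strings object; the sample values are illustrative):

import arkouda as ak

ak.connect()  # assumes a local Arkouda server is up
strings = ak.array(["low", "high", "low", "medium"])  # ak.Strings
cat = ak.Categorical(strings)
# categories holds the unique labels; codes index into categories
print(cat.categories)
print(cat.codes)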
Example 2
 def __init__(self, values, **kwargs):
     if 'codes' in kwargs and 'categories' in kwargs:
         # This initialization is called by Categorical.from_codes()
         # The values arg is ignored
         self.codes = kwargs['codes']
         self.categories = kwargs['categories']            
         if 'permutation' in kwargs:
             self.permutation = kwargs['permutation']
         if 'segments' in kwargs:
             self.segments = kwargs['segments']
     else:
         # Typical initialization, called with values
         if not isinstance(values, Strings):
             raise ValueError("Categorical: inputs other than Strings not yet supported")
         g = GroupBy(values)
         self.categories = g.unique_keys
         self.codes = zeros(values.size, dtype=int64)
         self.codes[g.permutation] = g.broadcast(arange(self.categories.size))
         self.permutation = g.permutation
         self.segments = g.segments
     # Always set these values
     self.size = self.codes.size
     self.nlevels = self.categories.size
     self.ndim = self.codes.ndim
     self.shape = self.codes.shape
Example 3
 def reset_categories(self):
     """
     Recompute the category labels, discarding any unused labels. This method
     is often useful after slicing or indexing a Categorical array, when the
     resulting array only contains a subset of the original categories. In
     this case, eliminating unused categories can speed up other operations.
     """
     g = GroupBy(self.codes)
     idx = self.categories[g.unique_keys]
     newvals = zeros(self.codes.size, int64)
     newvals[g.permutation] = g.broadcast(arange(idx.size))
     return Categorical.from_codes(newvals, idx, permutation=g.permutation, segments=g.segments)
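
A short sketch of when reset_categories pays off (assuming the cat object from the sketch above; per the docstring, a slice still carries the full original category set):

sub = cat[0:2]                # slice keeps every original category
sub = sub.reset_categories()  # recompute labels, dropping unused ones
print(sub.nlevels)            # now at most cat.nlevels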
Example 4
    def concatenate(self, others : List[Categorical], ordered : bool=True) -> Categorical:
        """
        Merge this Categorical with other Categorical objects,
        concatenating the arrays and synchronizing the categories.

        Parameters
        ----------
        others : List[Categorical]
            The Categorical arrays to concatenate and merge with this one
        ordered : bool
            If True (default), the arrays will be appended in the
            order given. If False, array data may be interleaved
            in blocks, which can greatly improve performance but
            results in non-deterministic ordering of elements.

        Returns
        -------
        Categorical 
            The merged Categorical object
            
        Raises
        ------
        TypeError
            Raised if any object in others is not a Categorical

        Notes
        -----
        This operation can be expensive -- slower than concatenating Strings.
        """
        if isinstance(others, Categorical):
            others = [others]
        elif len(others) < 1:
            return self
        samecategories = True
        for c in others:
            if not isinstance(c, Categorical):
                raise TypeError(("Categorical: can only merge/concatenate " +
                                "with other Categoricals"))
            if (self.categories.size != c.categories.size) or not \
                                    (self.categories == c.categories).all():
                samecategories = False
        if samecategories:
            newvals = cast(pdarray, concatenate([self.codes] + [o.codes for o in others], ordered=ordered))
            return Categorical.from_codes(newvals, self.categories)
        else:
            g = GroupBy(concatenate([self.categories] + \
                                       [o.categories for o in others],
                                       ordered=False))
            newidx = g.unique_keys
            # Map each position in the concatenated categories to its new
            # code; the grouped broadcast handles categories that repeat
            # across the inputs
            wherediditgo = g.broadcast(arange(newidx.size), permute=True)
            idxsizes = np.array([self.categories.size] + \
                                [o.categories.size for o in others])
            idxoffsets = np.cumsum(idxsizes) - idxsizes
            oldvals = concatenate([c + off for c, off in \
                                   zip([self.codes] + [o.codes for o in others], idxoffsets)],
                                  ordered=ordered)
            newvals = wherediditgo[oldvals]
            return Categorical.from_codes(newvals, newidx)
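
A usage sketch for the differing-categories path (names and values are illustrative; assumes the ak alias from the first sketch):

c1 = ak.Categorical(ak.array(["a", "b", "a"]))
c2 = ak.Categorical(ak.array(["b", "c"]))
merged = c1.concatenate([c2])  # categories synchronized to {'a', 'b', 'c'}
print(merged.size)             # 5
print(merged.categories)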
Example 5
def join_on_eq_with_dt(a1, a2, t1, t2, dt, pred, result_limit=1000):
    if not isinstance(a1, pdarray):
        raise ValueError("a1 must be pdarray")
    if a1.dtype != int64:
        raise ValueError("a1 must be int64 dtype")

    if not isinstance(a2, pdarray):
        raise ValueError("a2 must be pdarray")
    if a2.dtype != int64:
        raise ValueError("a2 must be int64 dtype")

    if not isinstance(t1, pdarray):
        raise ValueError("t1 must be pdarray")
    if t1.dtype != int64:
        raise ValueError("t1 must be int64 dtype")

    if not isinstance(t2, pdarray):
        raise ValueError("t2 must be pdarray")
    if t2.dtype != int64:
        raise ValueError("t2 must be int64 dtype")
        
    if not isinstance(dt, int):
        raise ValueError("dt must be an int")

    if pred not in predicates:
        raise ValueError("pred must be one of {}".format(list(predicates.keys())))

    if not isinstance(result_limit, int):
        raise ValueError("result_limit must be an int")

    # format numbers for request message
    dttype = resolve_scalar_dtype(dt)
    dtstr = NUMBER_FORMAT_STRINGS[dttype].format(dt)
    predtype = resolve_scalar_dtype(predicates[pred])
    predstr = NUMBER_FORMAT_STRINGS[predtype].format(predicates[pred])
    result_limittype = resolve_scalar_dtype(result_limit)
    result_limitstr = NUMBER_FORMAT_STRINGS[result_limittype].format(result_limit)
    # groupby on a2
    g2 = GroupBy(a2)
    # pass result into server joinEqWithDT operation
    repMsg = generic_msg("joinEqWithDT {} {} {} {} {} {} {} {} {}".format(a1.name,
                                                                          g2.segments.name,
                                                                          g2.unique_keys.name,
                                                                          g2.permutation.name,
                                                                          t1.name,
                                                                          t2.name,
                                                                          dtstr, predstr, result_limitstr))
    # create pdarrays for results
    resIAttr, resJAttr = repMsg.split("+")
    resI = create_pdarray(resIAttr)
    resJ = create_pdarray(resJAttr)
    return (resI, resJ)
Example 6
 def reset_categories(self) -> Categorical:
     """
     Recompute the category labels, discarding any unused labels. This
     method is often useful after slicing or indexing a Categorical array, 
     when the resulting array only contains a subset of the original 
     categories. In this case, eliminating unused categories can speed up 
     other operations.
     
     Returns
     -------
     Categorical
         A Categorical object generated from the current instance
     """
     g = GroupBy(self.codes)
     idx = self.categories[g.unique_keys]
     newvals = g.broadcast(arange(idx.size), permute=True)
     return Categorical.from_codes(newvals,
                                   idx,
                                   permutation=g.permutation,
                                   segments=g.segments)
Example 7
    def merge(self, others : List[Categorical]) -> Categorical:
        """
        Merge this Categorical with other Categorical objects,
        concatenating the arrays and synchronizing the categories.

        Parameters
        ----------
        others : List[Categorical]
            The Categorical arrays to concatenate and merge with this one

        Returns
        -------
        Categorical 
            The merged Categorical object
            
        Raises
        ------
        TypeError
            Raised if any object in others is not a Categorical

        Notes
        -----
        This operation can be expensive -- slower than concatenating Strings.
        """
        if isinstance(others, Categorical):
            others = [others]
        elif len(others) < 1:
            return self
        samecategories = True
        for c in others:
            if not isinstance(c, Categorical):
                raise TypeError(("Categorical: can only merge/concatenate " +
                                "with other Categoricals"))
            if (self.categories.size != c.categories.size) or not \
                                    (self.categories == c.categories).all():
                samecategories = False
        if samecategories:
            newvals = cast(pdarray, concatenate([self.codes] + [o.codes for o in others]))
            return Categorical.from_codes(newvals, self.categories)
        else:
            allcats = concatenate([self.categories] +
                                  [o.categories for o in others])
            g = GroupBy(allcats)
            newidx = g.unique_keys
            # Map each position in the concatenated categories to its new
            # code; the grouped broadcast handles categories that repeat
            # across the inputs
            wherediditgo = zeros(allcats.size, dtype=akint64)
            wherediditgo[g.permutation] = g.broadcast(arange(newidx.size))
            idxsizes = np.array([self.categories.size] + \
                                [o.categories.size for o in others])
            idxoffsets = np.cumsum(idxsizes) - idxsizes
            oldvals = concatenate([c + off for c, off in
                                   zip([self.codes] + [o.codes for o in others],
                                       idxoffsets)])
            newvals = wherediditgo[oldvals]
            return Categorical.from_codes(newvals, newidx)
Example 8
def join_on_eq_with_dt(a1: pdarray,
                       a2: pdarray,
                       t1: pdarray,
                       t2: pdarray,
                       dt: int,
                       pred: str,
                       result_limit: int = 1000) -> Tuple[pdarray, pdarray]:
    """
    Performs an inner-join on equality between two integer arrays where 
    the time-window predicate is also true

    Parameters
    ----------
    a1 : pdarray, int64
        pdarray to be joined
    a2 : pdarray, int64
        pdarray to be joined
    t1 : pdarray, int64
        timestamps in millis corresponding to the a1 pdarray
    t2 : pdarray, int64
        timestamps in millis corresponding to the a2 pdarray
    dt : int
        time delta
    pred : str
        time window predicate
    result_limit : int
        size limit for returned result    

    Returns
    -------
    result_array_one : pdarray, int64
        a1 indices where a1 == a2
    result_array_two : pdarray, int64
        a2 indices where a2 == a1
        
    Raises
    ------
    TypeError
        Raised if a1, a2, t1, or t2 is not a pdarray, or if dt or 
        result_limit is not an int
    ValueError
        if a1, a2, t1, or t2 dtype is not int64, pred is not 
        'true_dt', 'abs_dt', or 'pos_dt', or result_limit is < 0    
    """
    if not (a1.dtype == akint64):
        raise ValueError("a1 must be int64 dtype")

    if not (a2.dtype == akint64):
        raise ValueError("a2 must be int64 dtype")

    if not (t1.dtype == akint64):
        raise ValueError("t1 must be int64 dtype")

    if not (t2.dtype == akint64):
        raise ValueError("t2 must be int64 dtype")

    if pred not in predicates:
        raise ValueError("pred must be one of {}".format(list(predicates.keys())))

    if result_limit < 0:
        raise ValueError('result_limit must be 0 or greater')

    # format numbers for request message
    dttype = resolve_scalar_dtype(dt)
    dtstr = NUMBER_FORMAT_STRINGS[dttype].format(dt)
    predtype = resolve_scalar_dtype(predicates[pred])
    predstr = NUMBER_FORMAT_STRINGS[predtype].format(predicates[pred])
    result_limittype = resolve_scalar_dtype(result_limit)
    result_limitstr = NUMBER_FORMAT_STRINGS[result_limittype].\
                                 format(result_limit)
    # groupby on a2
    g2 = GroupBy(a2)
    # pass result into server joinEqWithDT operation
    repMsg = generic_msg("joinEqWithDT {} {} {} {} {} {} {} {} {}".\
                         format(a1.name,
                                cast(pdarray, g2.segments).name,  # type: ignore
                                cast(pdarray, g2.unique_keys).name,  # type: ignore
                                g2.permutation.name,
                                t1.name,
                                t2.name,
                                dtstr, predstr, result_limitstr))
    # create pdarrays for results
    resIAttr, resJAttr = cast(str, repMsg).split("+")
    resI = create_pdarray(resIAttr)
    resJ = create_pdarray(resJAttr)
    return (resI, resJ)
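
A call sketch (illustrative data; assumes the ak alias and a connected server). Per the docstring, 'abs_dt' keeps pairs (i, j) where a1[i] == a2[j] and |t1[i] - t2[j]| <= dt:

a1 = ak.array([1, 2, 3, 3])
a2 = ak.array([2, 3, 3])
t1 = ak.array([10, 20, 30, 40])  # millisecond timestamps
t2 = ak.array([22, 28, 33])
I, J = ak.join_on_eq_with_dt(a1, a2, t1, t2, 5, 'abs_dt')
print(I, J)  # paired indices into a1 and a2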
Example 9
    def _binop(self, other: Union[Categorical, str_scalars],
               op: str_scalars) -> pdarray:
        """
        Executes the requested binop on this Categorical instance and returns 
        the results within a pdarray object.

        Parameters
        ----------
        other : Union[Categorical,str_scalars]
            the other object is a Categorical object or string scalar
        op : str_scalars
            name of the binary operation to be performed 
      
        Returns
        -------
        pdarray
            encapsulating the results of the requested binop      

        Raises
        ------
        ValueError
            Raised if the sizes of this and the other Categorical
            instance don't match
        NotImplementedError
            Raised if the op is not in the self.BinOps set, or if other
            is neither a Categorical nor a string scalar
        RuntimeError
            Raised if a server-side error is thrown while executing the
            binary operation
        """
        if op not in self.BinOps:
            raise NotImplementedError("Categorical: unsupported operator: {}".\
                                      format(op))
        if np.isscalar(other) and resolve_scalar_dtype(other) == "str":
            idxresult = self.categories._binop(other, op)
            return idxresult[self.codes]
        if self.size != cast(Categorical, other).size:
            raise ValueError("Categorical {}: size mismatch {} {}".\
                             format(op, self.size, cast(Categorical,other).size))
        if isinstance(other, Categorical):
            if (self.categories.size
                    == other.categories.size) and (self.categories
                                                   == other.categories).all():
                # Because categories are identical, codes can be compared directly
                return self.codes._binop(other.codes, op)
            else:
                # Remap both codes to the union of categories
                union = unique(
                    concatenate((self.categories, other.categories),
                                ordered=False))
                newinds = arange(union.size)
                # Inds of self.categories in unioned categories
                selfnewinds = newinds[in1d(union, self.categories)]
                # Need a permutation and segments to broadcast new codes
                if self.permutation is None or self.segments is None:
                    g = GroupBy(self.codes)
                    self.permutation = g.permutation
                    self.segments = g.segments
                # Form new codes by broadcasting new indices for unioned categories
                selfnewcodes = broadcast(self.segments, selfnewinds, self.size,
                                         self.permutation)
                # Repeat for other
                othernewinds = newinds[in1d(union, other.categories)]
                if other.permutation is None or other.segments is None:
                    g = GroupBy(other.codes)
                    other.permutation = g.permutation
                    other.segments = g.segments
                othernewcodes = broadcast(other.segments, othernewinds,
                                          other.size, other.permutation)
                # selfnewcodes and othernewcodes now refer to same unioned categories
                # and can be compared directly
                return selfnewcodes._binop(othernewcodes, op)
        else:
            raise NotImplementedError(
                ("Operations between Categorical and " +
                 "non-Categorical not yet implemented. " +
                 "Consider converting operands to Categorical."))