def __init__(self, values, **kwargs) -> None:
    """
    Initialize a Categorical, either from precomputed codes/categories
    (the Categorical.from_codes() path) or from a Strings array.

    Parameters
    ----------
    values : Strings
        The string values to categorize; ignored when both 'codes' and
        'categories' are supplied via kwargs.
    **kwargs
        codes, categories, permutation, segments — precomputed components
        passed by Categorical.from_codes().

    Raises
    ------
    ValueError
        Raised if values is not a Strings object on the standard path.
    """
    self.logger = getArkoudaLogger(name=__class__.__name__)  # type: ignore
    if 'codes' in kwargs and 'categories' in kwargs:
        # This initialization is called by Categorical.from_codes()
        # The values arg is ignored
        self.codes = kwargs['codes']
        self.categories = kwargs['categories']
        # BUG FIX: default permutation/segments to None when not supplied.
        # Previously they were only assigned when present in kwargs, but
        # _binop tests `self.permutation is None or self.segments is None`,
        # so an unset attribute raised AttributeError instead of triggering
        # the lazy GroupBy recomputation.
        self.permutation = cast(pdarray, kwargs.get('permutation'))
        self.segments = cast(pdarray, kwargs.get('segments'))
    else:
        # Typical initialization, called with values
        if not isinstance(values, Strings):
            raise ValueError(("Categorical: inputs other than " +
                              "Strings not yet supported"))
        g = GroupBy(values)
        self.categories = g.unique_keys
        self.codes = g.broadcast(arange(self.categories.size), permute=True)
        self.permutation = cast(pdarray, g.permutation)
        self.segments = g.segments
    # Always set these values
    self.size: int_scalars = self.codes.size
    self.nlevels = self.categories.size
    self.ndim = self.codes.ndim
    self.shape = self.codes.shape
    self.name: Optional[str] = None
def __init__(self, values, **kwargs):
    """
    Build a Categorical either from precomputed codes/categories
    (the Categorical.from_codes() path) or by grouping a Strings array.
    """
    if 'codes' in kwargs and 'categories' in kwargs:
        # from_codes() path: the values argument is not used
        self.codes = kwargs['codes']
        self.categories = kwargs['categories']
        for attr in ('permutation', 'segments'):
            if attr in kwargs:
                setattr(self, attr, kwargs[attr])
    else:
        # Standard path: derive categories and codes by grouping the values
        if not isinstance(values, Strings):
            raise ValueError("Categorical: inputs other than Strings not yet supported")
        grouping = GroupBy(values)
        self.categories = grouping.unique_keys
        self.codes = zeros(values.size, dtype=int64)
        self.codes[grouping.permutation] = grouping.broadcast(arange(self.categories.size))
        self.permutation = grouping.permutation
        self.segments = grouping.segments
    # Derived metadata, set on every path
    self.size = self.codes.size
    self.nlevels = self.categories.size
    self.ndim = self.codes.ndim
    self.shape = self.codes.shape
def reset_categories(self):
    """
    Recompute the category labels, discarding any unused labels.

    Notes
    -----
    Often useful after slicing or indexing a Categorical array, when
    the result holds only a subset of the original categories; dropping
    the unused ones can speed up other operations.
    """
    grouping = GroupBy(self.codes)
    survivors = self.categories[grouping.unique_keys]
    recoded = zeros(self.codes.size, int64)
    recoded[grouping.permutation] = grouping.broadcast(arange(survivors.size))
    return Categorical.from_codes(recoded, survivors,
                                  permutation=grouping.permutation,
                                  segments=grouping.segments)
def concatenate(self, others : List[Categorical], ordered : bool=True) -> Categorical:
    """
    Merge this Categorical with other Categorical objects in the array,
    concatenating the arrays and synchronizing the categories.

    Parameters
    ----------
    others : List[Categorical]
        The Categorical arrays to concatenate and merge with this one
    ordered : bool
        If True (default), the arrays will be appended in the
        order given. If False, array data may be interleaved in blocks,
        which can greatly improve performance but results in
        non-deterministic ordering of elements.

    Returns
    -------
    Categorical
        The merged Categorical object

    Raises
    ------
    TypeError
        Raised if any others array objects are not Categorical objects

    Notes
    -----
    This operation can be expensive -- slower than concatenating Strings.
    """
    if isinstance(others, Categorical):
        others = [others]
    elif len(others) < 1:
        return self
    # Validate inputs and determine whether all category sets match
    identical = True
    for item in others:
        if not isinstance(item, Categorical):
            raise TypeError(("Categorical: can only merge/concatenate " +
                             "with other Categoricals"))
        sizes_differ = self.categories.size != item.categories.size
        if sizes_differ or not (self.categories == item.categories).all():
            identical = False
    everyone = [self] + list(others)
    if identical:
        # Same categories everywhere: codes concatenate directly
        merged = cast(pdarray,
                      concatenate([c.codes for c in everyone], ordered=ordered))
        return Categorical.from_codes(merged, self.categories)
    # Category sets differ: build the union and remap every code array
    grouping = GroupBy(concatenate([c.categories for c in everyone],
                                   ordered=False))
    unioned = grouping.unique_keys
    # Inverse of the grouping permutation: old position -> new index
    remap = zeros(unioned.size, dtype=akint64)
    remap[grouping.permutation] = arange(unioned.size)
    catcounts = np.array([c.categories.size for c in everyone])
    offsets = np.cumsum(catcounts) - catcounts
    shifted = concatenate([c.codes + off for c, off in zip(everyone, offsets)],
                          ordered=ordered)
    return Categorical.from_codes(remap[shifted], unioned)
def join_on_eq_with_dt(a1, a2, t1, t2, dt, pred, result_limit=1000):
    """
    Performs an inner-join on equality between two int64 pdarrays where
    the time-window predicate is also true.

    Parameters
    ----------
    a1 : pdarray, int64
        pdarray to be joined
    a2 : pdarray, int64
        pdarray to be joined
    t1 : pdarray, int64
        timestamps corresponding to the a1 pdarray
    t2 : pdarray, int64
        timestamps corresponding to the a2 pdarray
    dt : int
        time delta
    pred : str
        time window predicate; must be a key of `predicates`
    result_limit : int
        size limit for the returned result

    Returns
    -------
    (pdarray, pdarray)
        a1 indices and a2 indices where a1 == a2 within the time window

    Raises
    ------
    ValueError
        Raised if any array argument is not an int64 pdarray, dt or
        result_limit is not an int, or pred is not a known predicate
    """
    # Validate all four array arguments with one loop instead of four
    # copy-pasted stanzas (messages unchanged)
    for arr, label in ((a1, 'a1'), (a2, 'a2'), (t1, 't1'), (t2, 't2')):
        if not isinstance(arr, pdarray):
            raise ValueError("{} must be pdarray".format(label))
        if not (arr.dtype == int64):
            raise ValueError("{} must be int64 dtype".format(label))
    if not isinstance(dt, int):
        # BUG FIX: message previously read "dt must be an an int"
        raise ValueError("dt must be an int")
    if not (pred in predicates.keys()):
        raise ValueError("pred must be one of ", predicates.keys())
    if not isinstance(result_limit, int):
        raise ValueError("result_limit must be a scalar")
    # format numbers for request message
    dttype = resolve_scalar_dtype(dt)
    dtstr = NUMBER_FORMAT_STRINGS[dttype].format(dt)
    predtype = resolve_scalar_dtype(predicates[pred])
    predstr = NUMBER_FORMAT_STRINGS[predtype].format(predicates[pred])
    result_limittype = resolve_scalar_dtype(result_limit)
    result_limitstr = NUMBER_FORMAT_STRINGS[result_limittype].format(result_limit)
    # groupby on a2
    g2 = GroupBy(a2)
    # pass result into server joinEqWithDT operation
    repMsg = generic_msg("joinEqWithDT {} {} {} {} {} {} {} {} {}".format(
        a1.name, g2.segments.name, g2.unique_keys.name, g2.permutation.name,
        t1.name, t2.name, dtstr, predstr, result_limitstr))
    # create pdarrays for results
    resIAttr, resJAttr = repMsg.split("+")
    resI = create_pdarray(resIAttr)
    resJ = create_pdarray(resJAttr)
    return (resI, resJ)
def reset_categories(self) -> Categorical:
    """
    Recompute the category labels, discarding any unused labels.

    Returns
    -------
    Categorical
        A Categorical object generated from the current instance

    Notes
    -----
    Often useful after slicing or indexing a Categorical array, when
    the result holds only a subset of the original categories; dropping
    the unused ones can speed up other operations.
    """
    grouping = GroupBy(self.codes)
    kept = self.categories[grouping.unique_keys]
    recoded = grouping.broadcast(arange(kept.size), permute=True)
    return Categorical.from_codes(recoded, kept,
                                  permutation=grouping.permutation,
                                  segments=grouping.segments)
def merge(self, others : List[Categorical]) -> Categorical:
    """
    Merge this Categorical with other Categorical objects in the array,
    concatenating the arrays and synchronizing the categories.

    Parameters
    ----------
    others : List[Categorical]
        The Categorical arrays to concatenate and merge with this one

    Returns
    -------
    Categorical
        The merged Categorical object

    Raises
    ------
    TypeError
        Raised if any others array objects are not Categorical objects

    Notes
    -----
    This operation can be expensive -- slower than concatenating Strings.
    """
    if isinstance(others, Categorical):
        others = [others]
    elif len(others) < 1:
        return self
    samecategories = True
    for c in others:
        if not isinstance(c, Categorical):
            raise TypeError(("Categorical: can only merge/concatenate " +
                             "with other Categoricals"))
        if (self.categories.size != c.categories.size) or not \
                (self.categories == c.categories).all():
            samecategories = False
    if samecategories:
        # Identical category sets: codes concatenate directly
        newvals = cast(pdarray,
                       concatenate([self.codes] + [o.codes for o in others]))
        return Categorical.from_codes(newvals, self.categories)
    else:
        # Differing category sets: group the union and remap codes
        g = GroupBy(concatenate([self.categories] +
                                [o.categories for o in others]))
        newidx = g.unique_keys
        wherediditgo = zeros(newidx.size, dtype=akint64)
        wherediditgo[g.permutation] = arange(newidx.size)
        idxsizes = np.array([self.categories.size] +
                            [o.categories.size for o in others])
        idxoffsets = np.cumsum(idxsizes) - idxsizes
        # BUG FIX: the zip yields code pdarrays directly (self.codes and
        # o.codes), so the previous `c.codes + off` dereferenced a
        # nonexistent .codes attribute on a pdarray and raised
        # AttributeError. The sibling concatenate() correctly uses
        # `c + off`.
        oldvals = concatenate([c + off for c, off in
                               zip([self.codes] + [o.codes for o in others],
                                   idxoffsets)])
        newvals = wherediditgo[oldvals]
        return Categorical.from_codes(newvals, newidx)
def join_on_eq_with_dt(a1: pdarray, a2: pdarray, t1: pdarray,
                       t2: pdarray, dt: int, pred: str,
                       result_limit: int = 1000) -> Tuple[pdarray, pdarray]:
    """
    Performs an inner-join on equality between two integer arrays where
    the time-window predicate is also true

    Parameters
    ----------
    a1 : pdarray, int64
        pdarray to be joined
    a2 : pdarray, int64
        pdarray to be joined
    t1 : pdarray
        timestamps in millis corresponding to the a1 pdarray
    t2 : pdarray,
        timestamps in millis corresponding to the a2 pdarray
    dt : int
        time delta
    pred : str
        time window predicate
    result_limit : int
        size limit for returned result

    Returns
    -------
    result_array_one : pdarray, int64
        a1 indices where a1 == a2
    result_array_two : pdarray, int64
        a2 indices where a2 == a1

    Raises
    ------
    TypeError
        Raised if a1, a2, t1, or t2 is not a pdarray, or if dt or
        result_limit is not an int
    ValueError
        if a1, a2, t1, or t2 dtype is not int64, pred is not
        'true_dt', 'abs_dt', or 'pos_dt', or result_limit is < 0

    Notes
    -----
    Doc fixes in this revision: the second return value was previously
    also labeled result_array_one; the result_limit error message was
    missing the word "be".
    """
    if not (a1.dtype == akint64):
        raise ValueError("a1 must be int64 dtype")
    if not (a2.dtype == akint64):
        raise ValueError("a2 must be int64 dtype")
    if not (t1.dtype == akint64):
        raise ValueError("t1 must be int64 dtype")
    if not (t2.dtype == akint64):
        raise ValueError("t2 must be int64 dtype")
    if not (pred in predicates.keys()):
        raise ValueError("pred must be one of ", predicates.keys())
    if result_limit < 0:
        # BUG FIX: message previously read 'the result_limit must 0 or greater'
        raise ValueError('the result_limit must be 0 or greater')
    # format numbers for request message
    dttype = resolve_scalar_dtype(dt)
    dtstr = NUMBER_FORMAT_STRINGS[dttype].format(dt)
    predtype = resolve_scalar_dtype(predicates[pred])
    predstr = NUMBER_FORMAT_STRINGS[predtype].format(predicates[pred])
    result_limittype = resolve_scalar_dtype(result_limit)
    result_limitstr = NUMBER_FORMAT_STRINGS[result_limittype].\
        format(result_limit)
    # groupby on a2
    g2 = GroupBy(a2)
    # pass result into server joinEqWithDT operation
    repMsg = generic_msg("joinEqWithDT {} {} {} {} {} {} {} {} {}".\
        format(a1.name,
               cast(pdarray, g2.segments).name,  # type: ignore
               cast(pdarray, g2.unique_keys).name,  # type: ignore
               g2.permutation.name,
               t1.name, t2.name, dtstr, predstr, result_limitstr))
    # create pdarrays for results
    resIAttr, resJAttr = cast(str, repMsg).split("+")
    resI = create_pdarray(resIAttr)
    resJ = create_pdarray(resJAttr)
    return (resI, resJ)
def _binop(self, other: Union[Categorical, str_scalars], op: str_scalars) -> pdarray:
    """
    Executes the requested binop on this Categorical instance and returns
    the results within a pdarray object.

    Parameters
    ----------
    other : Union[Categorical, str_scalars]
        the other object is a Categorical object or string scalar
    op : str_scalars
        name of the binary operation to be performed

    Returns
    -------
    pdarray
        encapsulating the results of the requested binop

    Raises
    ------
    ValueError
        Raised if the sizes of this and the other instance don't match
    NotImplementedError
        Raised if op is not in self.BinOps, or if other is neither a
        string scalar nor a Categorical
    """
    if op not in self.BinOps:
        raise NotImplementedError("Categorical: unsupported operator: {}".\
                                  format(op))
    if np.isscalar(other) and resolve_scalar_dtype(other) == "str":
        # String scalar: apply the op to the categories once, then
        # index the per-category results by this array's codes
        idxresult = self.categories._binop(other, op)
        return idxresult[self.codes]
    if self.size != cast(Categorical, other).size:
        raise ValueError("Categorical {}: size mismatch {} {}".\
                         format(op, self.size, cast(Categorical,other).size))
    if isinstance(other, Categorical):
        if (self.categories.size == other.categories.size) and (self.categories == other.categories).all():
            # Because categories are identical, codes can be compared directly
            return self.codes._binop(other.codes, op)
        else:
            # Remap both codes to the union of categories
            union = unique(concatenate((self.categories, other.categories),
                                       ordered=False))
            newinds = arange(union.size)
            # Inds of self.categories in unioned categories
            selfnewinds = newinds[in1d(union, self.categories)]
            # Need a permutation and segments to broadcast new codes.
            # NOTE: lazily computed and cached on self here as a side effect.
            if self.permutation is None or self.segments is None:
                g = GroupBy(self.codes)
                self.permutation = g.permutation
                self.segments = g.segments
            # Form new codes by broadcasting new indices for unioned categories
            selfnewcodes = broadcast(self.segments, selfnewinds, self.size,
                                     self.permutation)
            # Repeat for other (same lazy caching side effect on other)
            othernewinds = newinds[in1d(union, other.categories)]
            if other.permutation is None or other.segments is None:
                g = GroupBy(other.codes)
                other.permutation = g.permutation
                other.segments = g.segments
            othernewcodes = broadcast(other.segments, othernewinds, other.size,
                                      other.permutation)
            # selfnewcodes and othernewcodes now refer to same unioned categories
            # and can be compared directly
            return selfnewcodes._binop(othernewcodes, op)
    else:
        raise NotImplementedError(("Operations between Categorical and " +
                                   "non-Categorical not yet implemented. " +
                                   "Consider converting operands to Categorical."))