def min(self):
    """
    Get minimum numeric value in the RDD.

    Returns None on an empty RDD.

    Raises TypeError if called on a non-empty RDD whose element type
    is not numeric.
    """
    self._entry()
    count = self._count()     # action
    if count == 0:
        # The emptiness check comes first, so an empty RDD yields None
        # even when its element type is not numeric.
        return None
    if not is_numeric_type(self.elem_type):
        # The message names this method; it previously said 'sum'.
        raise TypeError('min: non numeric type')
    return self._rdd.min()     # action
def mean(self):
    """
    Compute the mean of the values held in the RDD.

    An empty RDD yields None.  A non-empty RDD whose element type is not
    numeric raises TypeError.
    """
    self._entry()
    if self._count() == 0:     # action
        return None
    if is_numeric_type(self.elem_type):
        return self._rdd.mean()     # action
    raise TypeError('mean: non numeric type')
def sum(self):
    """
    Sum of all values in the RDD.

    Returns None on an empty RDD.

    The behavior depends on the element type:

    * numeric: the values are added together.
    * array.array or list: the elements are added position by position.
      Inputs are paired with zip, so when lengths differ the result has
      the length of the shortest one.
    * dict: values are added key by key; only keys present in both
      operands of each pairwise reduction are kept.

    Raises TypeError for any other element type (for example, strings).

    For large values, this may overflow without warning.
    """
    self._entry()
    count = self._count()     # action
    if count == 0:
        return None

    if is_numeric_type(self.elem_type):
        return self._rdd.sum()     # action

    if self.elem_type is array.array:
        def array_sum(x, y):
            # A typecode mismatch is only reported; the result is still
            # built with the left operand's typecode.
            if x.typecode != y.typecode:
                logging.warning('Sum: arrays are not compatible')
            return array.array(x.typecode, [a + b for a, b in zip(x, y)])
        return self._rdd.reduce(array_sum)     # action

    if self.elem_type is list:
        def list_sum(x, y):
            return [a + b for a, b in zip(x, y)]
        return self._rdd.reduce(list_sum)     # action

    if self.elem_type is dict:
        def dict_sum(x, y):
            # NOTE(review): iterating over the key intersection drops any
            # key missing from either side, so the get() defaults are never
            # used.  Confirm whether a union of keys was intended.
            return {k: x.get(k, 0) + y.get(k, 0) for k in set(x) & set(y)}
        return self._rdd.reduce(dict_sum)     # action

    raise TypeError('sum: non numeric type')
def construct_from_xarray(self, xa, sub_sketch_keys=None):
    """
    Populate this object's state from the array `xa`.

    Sets the following attributes:

    * dtype: the value returned by xa.dtype().
    * count: number of elements of xa's RDD for which is_missing() is false.
    * sketch_type: 'numeric', 'date' or 'non-numeric', chosen from dtype.
    * _rdd: the RDD returned by xa.to_rdd().
    * defined: the cached, filtered RDD that `count` was computed from.

    Raises NotImplementedError if sub_sketch_keys is not None.
    """
    self._entry(sub_sketch_keys=sub_sketch_keys)
    if sub_sketch_keys is not None:
        raise NotImplementedError('sub_sketch_keys mode not implemented')

    def has_value(elem):
        return not is_missing(elem)

    # these are not going through the xrdd layer -- should they?
    present = xa.to_rdd().filter(has_value)
    present.cache()

    self.dtype = xa.dtype()
    self.count = present.count()

    if util.is_numeric_type(self.dtype):
        kind = 'numeric'
    elif util.is_date_type(self.dtype):
        kind = 'date'
    else:
        kind = 'non-numeric'
    self.sketch_type = kind

    # compute others later if needed
    self._rdd = xa.to_rdd()
    self.defined = present
def var(self, ddof):
    """
    Variance of all the values in the RDD.

    `ddof` is the delta degrees of freedom.  With ddof == 0 the result
    comes from the RDD's variance(); otherwise it comes from
    sampleVariance().

    Returns None on an empty RDD.

    Raises TypeError if called on a non-empty RDD with non-numeric type.
    Raises ValueError if `ddof` is negative, greater than 1, or
    >= the number of elements.
    """
    self._entry(ddof=ddof)
    count = self._count()     # action
    if count == 0:
        return None
    if not is_numeric_type(self.elem_type):
        # The message names this method; it previously said 'mean'.
        raise TypeError('var: non numeric type')
    if ddof < 0 or ddof > 1 or ddof >= count:
        # The message names this method; it previously said 'std'.
        raise ValueError('var: invalid ddof {}'.format(ddof))
    if ddof == 0:
        return self._rdd.variance()     # action
    return self._rdd.sampleVariance()     # action