def construct_from_xarray(self, xa, sub_sketch_keys=None):
    """Initialise this sketch from the contents of ``xa``.

    Records the element type, the number of non-missing elements and a
    coarse ``sketch_type`` category.  Other summaries are not computed here.

    Args:
        xa: Source array object.  Must provide ``to_rdd()`` and ``dtype()``;
            the object returned by ``to_rdd()`` must support ``filter``,
            ``cache`` and ``count`` (presumably a Spark RDD -- confirm).
        sub_sketch_keys: Only ``None`` is supported.

    Raises:
        NotImplementedError: If ``sub_sketch_keys`` is not ``None``.

    Side effects:
        Sets ``self.dtype``, ``self.count``, ``self.sketch_type``,
        ``self._rdd`` and ``self.defined``.
    """
    self._entry(sub_sketch_keys=sub_sketch_keys)
    if sub_sketch_keys is not None:
        raise NotImplementedError('sub_sketch_keys mode not implemented')

    # these are not going through the xrdd layer -- should they?
    # Keep only the elements that is_missing() does not flag, and ask for
    # the filtered data to be cached because it is counted right below and
    # kept on the instance for later use.
    non_missing = xa.to_rdd().filter(lambda value: not is_missing(value))
    non_missing.cache()

    self.dtype = xa.dtype()
    self.count = non_missing.count()

    # Classify the element type: numeric takes precedence over date,
    # everything else is lumped together as 'non-numeric'.
    if util.is_numeric_type(self.dtype):
        self.sketch_type = 'numeric'
    else:
        self.sketch_type = (
            'date' if util.is_date_type(self.dtype) else 'non-numeric'
        )

    # compute others later if needed
    self._rdd = xa.to_rdd()
    self.defined = non_missing
def _create_stats(self):
    """Compute basic summary statistics over ``self.defined``.

    Does nothing when ``self.stats`` is already set.

    For date-typed data only ``self.min_val`` and ``self.max_val`` are
    computed, using ``self.defined.min()`` / ``.max()``.  If that raises
    ``py4j.protocol.Py4JJavaError`` both are set to ``None`` and a warning
    is logged.

    For every other type ``self.defined.stats()`` is evaluated once and
    used to populate ``min_val``, ``max_val``, ``mean_val``, ``sum_val``,
    ``variance_val`` and ``stdev_val``; the stats object itself is stored
    in ``self.stats``.
    """
    # calculate some basic statistics
    if self.stats is not None:
        # Already computed on an earlier call.
        return

    if util.is_date_type(self.dtype):
        # NOTE(review): this branch never assigns self.stats, so min/max are
        # recomputed on every call for date data -- confirm that is intended.
        try:
            self.min_val = normalize_number(self.defined.min())
            self.max_val = normalize_number(self.defined.max())
        except py4j.protocol.Py4JJavaError:
            # Best effort: leave both bounds undefined rather than failing.
            self.min_val = None
            self.max_val = None
            # logging.warn is a deprecated alias; use logging.warning.
            logging.warning(
                'Datetime max or min did not compute. '
                'Possible mixture of offset-native and offset-aware times.'
            )
        return

    stats = self.defined.stats()
    self.min_val = normalize_number(stats.min())
    self.max_val = normalize_number(stats.max())
    self.mean_val = normalize_number(stats.mean())
    self.sum_val = normalize_number(stats.sum())
    self.variance_val = normalize_number(stats.variance())
    self.stdev_val = normalize_number(stats.stdev())
    # Set last so a failure above does not leave a half-populated cache
    # marked as complete.
    self.stats = stats