def main():
    """Print the 15th percentile of every sliding window over ``random(1000)``.

    A window of ``wSize`` consecutive samples is moved over the data one
    position at a time. Each window is summarised with a fresh ``TDigest``
    and ``[window_number, p15]`` (window numbers start at 1) is appended to
    ``results``, which is printed once all windows are processed.

    ``wSize`` and ``results`` are not defined in this function; they are
    expected to exist at module level.
    """
    data = random(1000)
    # range() already drives the window index, so the manual ``i = 0`` /
    # ``i += 1`` bookkeeping of the earlier version was dead code.
    for start in range(len(data) - wSize + 1):
        digest = TDigest()
        digest.batch_update(data[start:start + wSize])
        results.append([start + 1, digest.percentile(15)])
    print(results)
def digest_partitions(values):
    """Summarise ``values`` in a newly created t-digest.

    Parameters
    ----------
    values :
        Passed unchanged to ``TDigest.batch_update``.

    Returns
    -------
    list
        A one-element list holding the populated digest.
    """
    partition_digest = TDigest()
    partition_digest.batch_update(values)
    return [partition_digest]
class BSketch:
    """BSketch: binning sketch for numerical values and binary target.

    Parameters
    ----------
    sketch : str, optional (default="gk")
        Sketch algorithm. Supported algorithms are "gk" (Greenwald-Khanna's)
        and "t-digest" (Ted Dunning) algorithm. Algorithm "t-digest" relies on
        `tdigest <https://github.com/CamDavidsonPilon/tdigest>`_.

    eps : float (default=0.01)
        Relative error epsilon.

    K : int (default=25)
        Parameter excess growth K to compute compress threshold in t-digest.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.
    """

    def __init__(self, sketch="gk", eps=0.01, K=25, special_codes=None):
        self.sketch = sketch
        self.eps = eps
        self.K = K
        self.special_codes = special_codes

        _check_parameters(sketch, eps, K, special_codes)

        # Missing / special records are only counted, never sketched.
        # Suffix ``_e`` = event (y == 1), ``_ne`` = non-event (y == 0).
        self._count_missing_e = 0
        self._count_missing_ne = 0
        self._count_special_e = 0
        self._count_special_ne = 0

        # One sketch per target class.
        if sketch == "gk":
            self._sketch_e = GK(eps)
            self._sketch_ne = GK(eps)
        elif sketch == "t-digest":
            self._sketch_e = TDigest(eps, K)
            self._sketch_ne = TDigest(eps, K)

    def add(self, x, y, check_input=False):
        """Add arrays to the sketch.

        Parameters
        ----------
        x : array-like, shape = (n_samples,)
            Training vector, where n_samples is the number of samples.

        y : array-like, shape = (n_samples,)
            Target vector relative to x.

        check_input : bool (default=False)
            Whether to check input arrays.
        """
        # Only the first six outputs of split_data are used here: the
        # (x, y) pairs that are sketched, and the pairs counted as missing
        # and as special.
        xc, yc, xm, ym, xs, ys, _, _, _, _, _, _, _ = split_data(
            dtype=None, x=x, y=y, special_codes=self.special_codes,
            check_input=check_input)

        # Add values to sketch
        mask = yc == 1

        if self.sketch == "gk":
            # GK is fed one value at a time.
            for v1 in xc[mask]:
                self._sketch_e.add(v1)

            for v0 in xc[~mask]:
                self._sketch_ne.add(v0)
        elif self.sketch == "t-digest":
            self._sketch_e.batch_update(xc[mask])
            self._sketch_ne.batch_update(xc[~mask])

        # Keep track of missing and special counts
        n_missing = len(ym)
        if n_missing:
            self._count_missing_e += np.count_nonzero(ym == 1)
            self._count_missing_ne += np.count_nonzero(ym == 0)

        n_special = len(ys)
        if n_special:
            self._count_special_e += np.count_nonzero(ys == 1)
            self._count_special_ne += np.count_nonzero(ys == 0)

    def bins(self, splits):
        """Event and non-events counts for each bin given a list of split
        points.

        Parameters
        ----------
        splits : array-like, shape = (n_splits,)
            List of split points.

        Returns
        -------
        bins : tuple of arrays of size n_splits + 1.
            ``(bins_e, bins_ne)``: event and non-event counts per bin.
        """
        n_bins = len(splits) + 1
        bins_e = np.zeros(n_bins, dtype=np.int64)
        bins_ne = np.zeros(n_bins, dtype=np.int64)

        indices_e, count_e = _indices_count(
            self.sketch, self._sketch_e, splits)

        indices_ne, count_ne = _indices_count(
            self.sketch, self._sketch_ne, splits)

        # Sum the counts of all sketch entries assigned to each bin index.
        for i in range(n_bins):
            bins_e[i] = count_e[(indices_e == i)].sum()
            bins_ne[i] = count_ne[(indices_ne == i)].sum()

        return bins_e, bins_ne

    def merge(self, bsketch):
        """Merge current instance with another BSketch instance.

        The missing and special counts of ``bsketch`` are always added to
        this instance. The internal sketches are only touched when
        ``bsketch`` holds sketched values.

        Parameters
        ----------
        bsketch : object
            BSketch instance.

        Raises
        ------
        Exception
            If ``bsketch`` does not share sketch type, eps, K and special
            codes with this instance.
        """
        if not self._mergeable(bsketch):
            raise Exception("bsketch does not share signature.")

        other_has_data = (bsketch._sketch_e.n != 0 or
                          bsketch._sketch_ne.n != 0)

        if other_has_data:
            if self._sketch_e.n == 0 and self._sketch_ne.n == 0:
                # Nothing sketched on this side yet: adopt the other
                # instance's sketches. NOTE: the sketch objects are shared
                # with ``bsketch``, not copied.
                self._sketch_e = bsketch._sketch_e
                self._sketch_ne = bsketch._sketch_ne
            elif self.sketch == "gk":
                self._sketch_e.merge(bsketch._sketch_e)
                self._sketch_ne.merge(bsketch._sketch_ne)
            elif self.sketch == "t-digest":
                self._sketch_e += bsketch._sketch_e
                self._sketch_ne += bsketch._sketch_ne

        # Merge missing and special counts. This runs on every path: the
        # earlier early-return / overwrite logic dropped these counts when
        # either side had no sketched values.
        self._count_missing_e += bsketch._count_missing_e
        self._count_missing_ne += bsketch._count_missing_ne
        self._count_special_e += bsketch._count_special_e
        self._count_special_ne += bsketch._count_special_ne

    def merge_sketches(self):
        """Merge event and non-event data internal sketches.

        Returns
        -------
        new_sketch : object
            For "gk", a new ``GK(self.eps)`` into which both internal
            sketches were merged; otherwise the result of adding the two
            internal sketches.
        """
        if self.sketch == "gk":
            new_sketch = GK(self.eps)

            new_sketch.merge(self._sketch_e)
            new_sketch.merge(self._sketch_ne)
        else:
            new_sketch = self._sketch_e + self._sketch_ne

        return new_sketch

    def _copy(self, bsketch):
        """Replace this instance's sketches and counts with those of
        ``bsketch``. Sketch objects are assigned by reference."""
        self._sketch_e = bsketch._sketch_e
        self._sketch_ne = bsketch._sketch_ne

        # Overwrite missing and special counts
        self._count_missing_e = bsketch._count_missing_e
        self._count_missing_ne = bsketch._count_missing_ne
        self._count_special_e = bsketch._count_special_e
        self._count_special_ne = bsketch._count_special_ne

    def _mergeable(self, other):
        """Return True when ``other`` has the same sketch type, eps and K,
        and (when both define special codes) the same set of codes."""
        special_eq = True
        if self.special_codes is not None and other.special_codes is not None:
            special_eq = set(self.special_codes) == set(other.special_codes)

        return (self.sketch == other.sketch and self.eps == other.eps and
                self.K == other.K and special_eq)

    @property
    def n_event(self):
        """Event count.

        Returns
        -------
        n_event : int
        """
        count = self._sketch_e.n
        return count + self._count_missing_e + self._count_special_e

    @property
    def n_nonevent(self):
        """Non-event count.

        Returns
        -------
        n_nonevent : int
        """
        count = self._sketch_ne.n
        return count + self._count_missing_ne + self._count_special_ne

    @property
    def n(self):
        """Records count.

        Returns
        -------
        n : int
        """
        return self.n_event + self.n_nonevent
def stream_read_json(fn):
    """Yield each top-level JSON document stored back-to-back in file ``fn``.

    The remainder of the file is first parsed as a single document. When
    the parser raises ``JSONDecodeError``, the text up to the reported
    error offset is re-read and parsed on its own, yielded, and parsing
    resumes from that offset. The generator ends once the remainder parses
    cleanly as one document.

    Parameters
    ----------
    fn : str
        Path of the file to read.

    Yields
    ------
    object
        One decoded JSON document at a time.
    """
    import json

    start_pos = 0
    with open(fn, 'r') as f:
        while True:
            try:
                obj = json.load(f)
            except json.JSONDecodeError as e:
                # e.pos is relative to the text just read, which started
                # at start_pos; re-read exactly that prefix.
                f.seek(start_pos)
                json_str = f.read(e.pos)
                obj = json.loads(json_str)
                start_pos += e.pos
                yield obj
            else:
                # The rest of the file was a single document: last one.
                yield obj
                return


# Build one digest per JSON document and fold them into a combined digest.
final_digest = TDigest()
json_object = stream_read_json('./metric-result-no-presumed-revenue.json')
for num, json_list in enumerate(json_object):
    num_list = pyjq.all('.[] | .metric', json_list)
    digest = TDigest()
    digest.batch_update(num_list)
    print(digest.percentile(50))
    final_digest = final_digest + digest

print('final: ' + str(final_digest.percentile(25)))
print('final: ' + str(final_digest.percentile(50)))
print('final: ' + str(final_digest.percentile(75)))
print('final: ' + str(final_digest.percentile(90)))
class Percentiles(DataFrameModule):
    """Module keeping running percentiles of one input column in a t-digest.

    Each step feeds newly created rows of the ``df`` input slot into the
    digest and writes one row of percentile estimates (plus the run number)
    to the output dataframe, keeping at most ``history`` rows.
    """
    parameters = [('percentiles', object, [0.25, 0.5, 0.75]),
                  ('history', np.dtype(int), 3)]

    def __init__(self, column, percentiles=None, **kwds):
        """Create the module for ``column``.

        ``percentiles`` defaults to the quartiles; values must lie in
        [0, 1] (a ``ValueError`` suggesting rescaled values is raised
        otherwise). The median is inserted when absent.
        """
        if not column:
            raise ProgressiveError('Need a column name')
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('df', type=pd.DataFrame)])
        super(Percentiles, self).__init__(dataframe_slot='percentiles',
                                          **kwds)
        self._column = column
        self.default_step_size = 1000
        self.tdigest = TDigest()

        if percentiles is None:
            quantiles = np.array([0.25, 0.5, 0.75])
        else:
            quantiles = np.asarray(percentiles)

        # All values have to be in [0, 1]; the default trivially is.
        if (quantiles > 1).any():
            quantiles = quantiles / 100.0
            msg = ("percentiles should all be in the interval [0, 1]. "
                   "Try {0} instead.")
            raise ValueError(msg.format(list(quantiles)))

        # Insert the median between the lower and upper halves if missing.
        if not (quantiles == 0.5).any():
            below = quantiles[quantiles < 0.5]
            above = quantiles[quantiles > 0.5]
            quantiles = np.hstack([below, 0.5, above])

        self._percentiles = quantiles

        self.schema = [(_pretty_name(q), np.dtype(float), np.nan)
                       for q in self._percentiles]
        self.schema.append(DataFrameModule.UPDATE_COLUMN_DESC)
        self._df = create_dataframe(self.schema)

    def is_ready(self):
        """Ready as soon as the ``df`` slot reports created rows."""
        if not self.get_input_slot('df').has_created():
            return super(Percentiles, self).is_ready()
        return True

    def run_step(self, run_number, step_size, howlong):
        """Consume up to ``step_size`` created rows and record percentiles."""
        slot = self.get_input_slot('df')
        slot.update(run_number)
        if slot.has_updated() or slot.has_deleted():
            # Start over: reset the slot and the digest.
            slot.reset()
            slot.update(run_number)
            self.tdigest = TDigest()  # reset

        indices = slot.next_created(step_size)
        steps = indices_len(indices)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=steps)

        input_df = slot.data()
        with slot.lock:
            x = self.filter_columns(input_df, fix_loc(indices))
            self.tdigest.batch_update(x)

        out = self._df
        # _percentiles are fractions; the digest is queried on a 0-100 scale.
        row = [self.tdigest.percentile(q * 100) for q in self._percentiles]
        row.append(run_number)
        with self.lock:
            out.loc[run_number] = row
            limit = self.params.history
            if len(out) > limit:
                self._df = out.loc[out.index[-limit:]]
        return self._return_run_step(slot.next_state(),
                                     steps_run=steps, reads=steps,
                                     updates=len(self._df))
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column. ``series.name`` is the key used to look up
        ``column_props`` and to label the result.
    column_props : dict
        Per-column properties. For ``series.name`` the keys ``"numeric"``,
        ``"notnulls"`` and ``"is_categorical"`` are read.
    delta : float
        Passed to ``TDigest`` on construction and re-applied to the digest
        before it is compressed.

    Returns
    -------
    dict or None
        ``None`` when the column is flagged as non-numeric or has no
        non-null values. Otherwise ``{col: colresult, "_columns": [col]}``
        where ``colresult`` holds basic statistics, percentiles, the
        log-transform flag, the compressed t-digest centroids, a histogram
        and a KDE.
    """
    col = series.name
    props = column_props[col]
    if not props["numeric"] or props["notnulls"] == 0:
        # Series is not numeric or is all NaNs.
        return None

    # Formatted rather than concatenated: column names need not be strings.
    logger.debug("column_summary - {}".format(col))

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ["mean", "min", "max", "std", "sum"]:
        val = getattr(data, m)()
        # Convert NumPy integers of any width to plain int.
        colresult[m] = int(val) if isinstance(val, np.integer) else val

    colresult["n"] = props["notnulls"]

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    # One call computes all requested percentiles.
    colresult["percentiles"] = dict(
        zip(percentiles, np.nanpercentile(series, percentiles)))
    colresult["median"] = colresult["percentiles"][50]
    colresult["iqr"] = (colresult["percentiles"][75] -
                        colresult["percentiles"][25])

    # Compute the t-digest.
    logger.debug("column_summary - {} - creating TDigest...".format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug("column_summary - {} - testing log trans...".format(col))
    try:
        colresult["logtrans"] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest; treat any
        # failure as "do not log-transform".
        logger.warning("test_logtrans has failed for column `{}`: {}".format(
            col, e))
        colresult["logtrans"] = False

    if colresult["logtrans"]:
        # Build a digest of the log of each centroid mean, same weights.
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult["logtrans_mean"] = _tdigest_mean(logdigest)
        colresult["logtrans_std"] = _tdigest_std(logdigest)
        colresult["logtrans_IQR"] = logdigest.percentile(
            75) - logdigest.percentile(25)

    logger.debug("column_summary - {} - should {}be log-transformed".format(
        col, "NOT " if not colresult["logtrans"] else ""))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug("column_summary - {} - computing histogram...".format(col))

    if props["is_categorical"]:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    elif colresult["logtrans"]:
        # Bin in log10 space, then map the edges back.
        counts, log_edges = np.histogram(np.log10(data),
                                         density=False,
                                         bins="fd")
        edges = 10**log_edges
    else:
        counts, edges = np.histogram(data, density=False, bins="fd")

    colresult["histogram"] = {
        "counts": counts.tolist(),
        "bin_edges": edges.tolist(),
    }

    # Compute KDE
    logger.debug("column_summary - {} - computing KDE...".format(col))
    bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1)

    logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw))

    if props["is_categorical"]:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult["min"], colresult["max"]
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult["logtrans"])

    colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()}

    return {col: colresult, "_columns": [col]}
class Percentiles(TableModule):
    """Module keeping running percentiles of one input column in a t-digest.

    Each step feeds newly created rows of the ``table`` input slot into the
    digest and adds one row of percentile estimates to the result table.
    """

    parameters = [
        ("percentiles", np.dtype(np.object_), [0.25, 0.5, 0.75]),
        ("history", np.dtype(int), 3),
    ]
    inputs = [SlotDescriptor("table", type=Table)]

    def __init__(
        self,
        column: str,
        percentiles: Optional[Union[List[float], np.ndarray[Any, Any]]] = None,
        **kwds: Any
    ) -> None:
        """Create the module for ``column``.

        ``percentiles`` defaults to the quartiles; values must lie in
        [0, 1] (a ``ValueError`` suggesting rescaled values is raised
        otherwise). The median is inserted when absent.
        """
        if not column:
            raise ProgressiveError("Need a column name")
        super().__init__(**kwds)
        self._columns = [column]
        self.default_step_size = 1000
        self.tdigest = TDigest()

        if percentiles is None:
            quantiles = np.array([0.25, 0.5, 0.75])
        else:
            quantiles = np.asarray(percentiles)

        # All values have to be in [0, 1]; the default trivially is.
        if (quantiles > 1).any():  # type: ignore
            quantiles = quantiles / 100.0
            msg = ("percentiles should all be in the interval [0, 1]. "
                   "Try {0} instead.")
            raise ValueError(msg.format(list(quantiles)))

        # Insert the median between the lower and upper halves if missing.
        if not (quantiles == 0.5).any():
            below = quantiles[quantiles < 0.5]
            above = quantiles[quantiles > 0.5]
            quantiles = np.hstack([below, 0.5, above])

        self._percentiles = quantiles
        self._pername: List[str] = [_pretty_name(q)
                                    for q in self._percentiles]
        # One "real" column per percentile name.
        fields = ["%s: real" % n for n in self._pername]
        dshape = "{" + ",".join(fields) + "}"
        self.result = Table(self.generate_table_name("percentiles"),
                            dshape=dshape, create=True)

    def is_ready(self) -> bool:
        """Ready as soon as the ``table`` slot has created rows pending."""
        slot = self.get_input_slot("table")
        if slot is None or not slot.created.any():
            return super().is_ready()
        return True

    def reset(self) -> None:
        """Discard everything accumulated so far by starting a new digest."""
        self.tdigest = TDigest()

    @process_slot("table", reset_cb="reset")
    @run_if_any
    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        """Consume up to ``step_size`` created rows and record percentiles."""
        assert self.context
        with self.context as ctx:
            slot = ctx.table
            indices = slot.created.next(length=step_size)
            steps = indices_len(indices)
            if steps == 0:
                return self._return_run_step(self.state_blocked,
                                             steps_run=steps)
            input_df = slot.data()
            x = self.filter_columns(input_df, fix_loc(indices))
            self.tdigest.batch_update(x[0])
            out = self.table
            # _percentiles are fractions; the digest is queried on a
            # 0-100 scale.
            row = {
                name: self.tdigest.percentile(q * 100)
                for name, q in zip(self._pername, self._percentiles)
            }
            out.add(row)
            # NOTE: the ``history`` parameter is declared, but this step
            # does not trim older result rows.
            return self._return_run_step(self.next_state(slot),
                                         steps_run=steps)
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column. ``series.name`` is the key used to look up
        ``column_props`` and to label the result.
    column_props : dict
        Per-column properties. For ``series.name`` the keys ``'numeric'``,
        ``'notnulls'`` and ``'is_categorical'`` are read.
    delta : float
        Passed to ``TDigest`` on construction and re-applied to the digest
        before it is compressed.

    Returns
    -------
    dict or None
        ``None`` when the column is flagged as non-numeric or has no
        non-null values. Otherwise ``{col: colresult, '_columns': [col]}``
        where ``colresult`` holds basic statistics, percentiles, the
        log-transform flag, the compressed t-digest centroids, a histogram
        and a KDE.
    """
    col = series.name
    props = column_props[col]
    if not props['numeric'] or props['notnulls'] == 0:
        # Series is not numeric or is all NaNs.
        return None

    # Formatted rather than concatenated: column names need not be strings.
    logger.debug('column_summary - {}'.format(col))

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ['mean', 'min', 'max', 'std', 'sum']:
        val = getattr(data, m)()
        # Convert NumPy integers of any width to plain int.
        colresult[m] = int(val) if isinstance(val, np.integer) else val

    colresult['n'] = props['notnulls']

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    # One call computes all requested percentiles.
    colresult['percentiles'] = dict(
        zip(percentiles, np.nanpercentile(series, percentiles)))
    colresult['median'] = colresult['percentiles'][50]
    colresult['iqr'] = (colresult['percentiles'][75] -
                        colresult['percentiles'][25])

    # Compute the t-digest.
    logger.debug('column_summary - {} - creating TDigest...'.format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug('column_summary - {} - testing log trans...'.format(col))
    try:
        colresult['logtrans'] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest; treat any
        # failure as "do not log-transform".
        logger.warning('test_logtrans has failed for column `{}`: {}'.format(
            col, e))
        colresult['logtrans'] = False

    if colresult['logtrans']:
        # Build a digest of the log of each centroid mean, same weights.
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult['logtrans_mean'] = _tdigest_mean(logdigest)
        colresult['logtrans_std'] = _tdigest_std(logdigest)
        colresult['logtrans_IQR'] = (logdigest.percentile(75) -
                                     logdigest.percentile(25))

    logger.debug('column_summary - {} - should {}be log-transformed'.format(
        col, 'NOT ' if not colresult['logtrans'] else ''))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult['tdigest'] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug('column_summary - {} - computing histogram...'.format(col))

    if props['is_categorical']:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    elif colresult['logtrans']:
        # Bin in log10 space, then map the edges back.
        counts, log_edges = np.histogram(np.log10(data),
                                         density=False,
                                         bins='fd')
        edges = 10**log_edges
    else:
        counts, edges = np.histogram(data, density=False, bins='fd')

    colresult['histogram'] = {
        'counts': counts.tolist(),
        'bin_edges': edges.tolist()
    }

    # Compute KDE
    logger.debug('column_summary - {} - computing KDE...'.format(col))
    bw = _bw_scott(colresult, colresult['n'], colresult['logtrans'], 1)

    logger.debug('column_summary - {} - KDE bw: {:.4g}'.format(col, bw))

    if props['is_categorical']:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult['min'], colresult['max']
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult['logtrans'])

    colresult['kde'] = {'x': kde_x.tolist(), 'y': kde_y.tolist()}

    return {col: colresult, '_columns': [col]}