Example #1
import numpy as np
from tdigest import TDigest

def main():
    # Sliding-window percentiles: a fresh digest is built per window,
    # since TDigest supports insertion but not removal of values.
    data = np.random.random(1000)
    wSize = 100  # window size (assumed; undefined in the original snippet)
    results = []

    for i in range(len(data) - wSize + 1):
        digest = TDigest()
        digest.batch_update(data[i:i + wSize])
        results.append([i + 1, digest.percentile(15)])
    print(results)
Example #2
from tdigest import TDigest

def digest_partitions(values):
    digest = TDigest()
    digest.batch_update(values)
    return [digest]
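
The single-element list returned here suits a map-style step whose per-partition digests are combined later. Below is a minimal sketch of that combine step, assuming the tdigest package's + operator (also used in Example #4) merges two digests; merge_digests and the sample partitions are illustrative, not part of the original snippet.

from functools import reduce

def merge_digests(partition_results):
    # Flatten the single-element lists, then fold with +, which
    # returns a new TDigest containing the centroids of both operands.
    digests = [d for part in partition_results for d in part]
    return reduce(lambda a, b: a + b, digests, TDigest())

parts = [digest_partitions([1, 2, 3]), digest_partitions([4, 5, 6])]
print(merge_digests(parts).percentile(50))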
Example #3
class BSketch:
    """BSketch: binning sketch for numerical values and binary target.

    Parameters
    ----------
    sketch : str, optional (default="gk")
        Sketch algorithm. Supported algorithms are "gk" (Greenwald-Khanna)
        and "t-digest" (Ted Dunning). The "t-digest" algorithm relies on
        `tdigest <https://github.com/CamDavidsonPilon/tdigest>`_.

    eps : float (default=0.01)
        Relative error epsilon.

    K : int (default=25)
        Excess growth parameter K used to compute the compression threshold in
        t-digest.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.
    """
    def __init__(self, sketch="gk", eps=0.01, K=25, special_codes=None):
        self.sketch = sketch
        self.eps = eps
        self.K = K
        self.special_codes = special_codes

        _check_parameters(sketch, eps, K, special_codes)

        self._count_missing_e = 0
        self._count_missing_ne = 0
        self._count_special_e = 0
        self._count_special_ne = 0

        if sketch == "gk":
            self._sketch_e = GK(eps)
            self._sketch_ne = GK(eps)
        elif sketch == "t-digest":
            self._sketch_e = TDigest(eps, K)
            self._sketch_ne = TDigest(eps, K)

    def add(self, x, y, check_input=False):
        """Add arrays to the sketch.

        Parameters
        ----------
        x : array-like, shape = (n_samples,)
            Training vector, where n_samples is the number of samples.

        y : array-like, shape = (n_samples,)
            Target vector relative to x.

        check_input : bool (default=False)
            Whether to check input arrays.
        """
        xc, yc, xm, ym, xs, ys, _, _, _, _, _, _, _ = split_data(
            dtype=None,
            x=x,
            y=y,
            special_codes=self.special_codes,
            check_input=check_input)

        # Add values to sketch
        mask = yc == 1

        if self.sketch == "gk":
            for v1 in xc[mask]:
                self._sketch_e.add(v1)

            for v0 in xc[~mask]:
                self._sketch_ne.add(v0)

        if self.sketch == "t-digest":
            self._sketch_e.batch_update(xc[mask])
            self._sketch_ne.batch_update(xc[~mask])

        # Keep track of missing and special counts
        n_missing = len(ym)
        if n_missing:
            self._count_missing_e += np.count_nonzero(ym == 1)
            self._count_missing_ne += np.count_nonzero(ym == 0)

        n_special = len(ys)
        if n_special:
            self._count_special_e += np.count_nonzero(ys == 1)
            self._count_special_ne += np.count_nonzero(ys == 0)

    def bins(self, splits):
        """Event and non-events counts for each bin given a list of split
        points.

        Parameters
        ----------
        splits : array-like, shape = (n_splits,)
            List of split points.

        Returns
        -------
        bins : tuple of two arrays, each of size n_splits + 1.
        """
        n_bins = len(splits) + 1
        bins_e = np.zeros(n_bins, dtype=np.int64)
        bins_ne = np.zeros(n_bins, dtype=np.int64)

        indices_e, count_e = _indices_count(self.sketch, self._sketch_e,
                                            splits)
        indices_ne, count_ne = _indices_count(self.sketch, self._sketch_ne,
                                              splits)

        for i in range(n_bins):
            bins_e[i] = count_e[(indices_e == i)].sum()
            bins_ne[i] = count_ne[(indices_ne == i)].sum()

        return bins_e, bins_ne

    def merge(self, bsketch):
        """Merge current instance with another BSketch instance.

        Parameters
        ----------
        bsketch : object
            BSketch instance.
        """
        if not self._mergeable(bsketch):
            raise Exception("bsketch does not share the same signature.")

        if bsketch._sketch_e.n == 0 and bsketch._sketch_ne.n == 0:
            return

        if self._sketch_e.n == 0 and self._sketch_ne.n == 0:
            self._copy(bsketch)
            return

        # Merge sketches
        if self.sketch == "gk":
            self._sketch_e.merge(bsketch._sketch_e)
            self._sketch_ne.merge(bsketch._sketch_ne)
        elif self.sketch == "t-digest":
            self._sketch_e += bsketch._sketch_e
            self._sketch_ne += bsketch._sketch_ne

        # Merge missing and special counts
        self._count_missing_e += bsketch._count_missing_e
        self._count_missing_ne += bsketch._count_missing_ne
        self._count_special_e += bsketch._count_special_e
        self._count_special_ne += bsketch._count_special_ne

    def merge_sketches(self):
        """Merge event and non-event data internal sketches."""
        if self.sketch == "gk":
            new_sketch = GK(self.eps)

            new_sketch.merge(self._sketch_e)
            new_sketch.merge(self._sketch_ne)
        else:
            new_sketch = self._sketch_e + self._sketch_ne

        return new_sketch

    def _copy(self, bsketch):
        self._sketch_e = bsketch._sketch_e
        self._sketch_ne = bsketch._sketch_ne

        # Merge missing and special counts
        self._count_missing_e = bsketch._count_missing_e
        self._count_missing_ne = bsketch._count_missing_ne
        self._count_special_e = bsketch._count_special_e
        self._count_special_ne = bsketch._count_special_ne

    def _mergeable(self, other):
        special_eq = True
        if self.special_codes is not None and other.special_codes is not None:
            special_eq = set(self.special_codes) == set(other.special_codes)

        return (self.sketch == other.sketch and self.eps == other.eps
                and self.K == other.K and special_eq)

    @property
    def n_event(self):
        """Event count.

        Returns
        -------
        n_event : int
        """
        count = self._sketch_e.n
        return count + self._count_missing_e + self._count_special_e

    @property
    def n_nonevent(self):
        """Non-event count.

        Returns
        -------
        n_nonevent : int
        """
        count = self._sketch_ne.n
        return count + self._count_missing_ne + self._count_special_ne

    @property
    def n(self):
        """Records count.

        Returns
        -------
        n : int
        """
        return self.n_event + self.n_nonevent
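
A minimal usage sketch built only from the methods defined above; the random data, the eps value, and the split points are illustrative assumptions, and GK, split_data and _indices_count are assumed to be provided by the surrounding module.

import numpy as np

# Hypothetical input: numerical values x with a binary target y.
x1, y1 = np.random.randn(1000), np.random.randint(0, 2, 1000)
x2, y2 = np.random.randn(1000), np.random.randint(0, 2, 1000)

bs1 = BSketch(sketch="gk", eps=0.01)
bs2 = BSketch(sketch="gk", eps=0.01)
bs1.add(x1, y1)
bs2.add(x2, y2)

bs1.merge(bs2)  # in-place merge; requires matching sketch, eps and K
bins_e, bins_ne = bs1.bins([-1.0, 0.0, 1.0])  # 3 splits -> 4 bins
print(bs1.n_event, bs1.n_nonevent, bs1.n)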
Example #4

import json

import pyjq
from tdigest import TDigest

def stream_read_json(fn):
    # Lazily yield successive JSON documents concatenated in one file:
    # on a decode error, parse the prefix that did decode, yield it,
    # and resume reading from the failure position.
    start_pos = 0
    with open(fn, 'r') as f:
        while True:
            try:
                obj = json.load(f)
                yield obj
                return
            except json.JSONDecodeError as e:
                f.seek(start_pos)
                json_str = f.read(e.pos)
                obj = json.loads(json_str)
                start_pos += e.pos
                yield obj


final_digest = TDigest()

json_object = stream_read_json('./metric-result-no-presumed-revenue.json')
for num, json_list in enumerate(json_object):
    num_list = pyjq.all('.[] | .metric', json_list)
    digest = TDigest()
    digest.batch_update(num_list)
    print(digest.percentile(50))
    final_digest = final_digest + digest

print('final: ' + str(final_digest.percentile(25)))
print('final: ' + str(final_digest.percentile(50)))
print('final: ' + str(final_digest.percentile(75)))
print('final: ' + str(final_digest.percentile(90)))
Example #5

from tdigest import TDigest

def digest_partitions(values):
    digest = TDigest()
    digest.batch_update(values)
    return [digest]
Example #6
class Percentiles(DataFrameModule):
    parameters = [('percentiles', object, [0.25, 0.5, 0.75]),
                  ('history', np.dtype(int), 3)]
                  
    def __init__(self, column, percentiles=None, **kwds):
        if not column:
            raise ProgressiveError('Need a column name')
        self._add_slots(kwds, 'input_descriptors',
                        [SlotDescriptor('df', type=pd.DataFrame)])
        super(Percentiles, self).__init__(dataframe_slot='percentiles', **kwds)
        self._column = column
        self.default_step_size = 1000
        self.tdigest = TDigest()

        if percentiles is None:
            percentiles = np.array([0.25, 0.5, 0.75])
        else:
            # get them all to be in [0, 1]
            percentiles = np.asarray(percentiles)
            if (percentiles > 1).any():
                percentiles = percentiles / 100.0
                msg = ("percentiles should all be in the interval [0, 1]. "
                       "Try {0} instead.")
                raise ValueError(msg.format(list(percentiles)))
            if (percentiles != 0.5).all():  # median isn't included
                lh = percentiles[percentiles < .5]
                uh = percentiles[percentiles > .5]
                percentiles = np.hstack([lh, 0.5, uh])

        self._percentiles = percentiles
        
        self.schema = [(_pretty_name(x), np.dtype(float), np.nan) for x in self._percentiles]
        self.schema.append(DataFrameModule.UPDATE_COLUMN_DESC)
        self._df = create_dataframe(self.schema)

    def is_ready(self):
        if self.get_input_slot('df').has_created():
            return True
        return super(Percentiles, self).is_ready()

    def run_step(self, run_number, step_size, howlong):
        dfslot = self.get_input_slot('df')
        dfslot.update(run_number)
        if dfslot.has_updated() or dfslot.has_deleted():
            dfslot.reset()
            dfslot.update(run_number)
            self.tdigest = TDigest() # reset

        indices = dfslot.next_created(step_size)
        steps = indices_len(indices)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=steps)
        input_df = dfslot.data()
        with dfslot.lock:
            x = self.filter_columns(input_df, fix_loc(indices))
            self.tdigest.batch_update(x)
        df = self._df
        values = []
        for p in self._percentiles:
            values.append(self.tdigest.percentile(p*100))
        values.append(run_number)
        with self.lock:
            df.loc[run_number] = values
            if len(df) > self.params.history:
                self._df = df.loc[df.index[-self.params.history:]]
        return self._return_run_step(dfslot.next_state(),
                                     steps_run=steps, reads=steps, updates=len(self._df))
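
Stripped of the framework plumbing, run_step boils down to feeding each newly created chunk into one long-lived TDigest and reading percentiles after every step. A standalone sketch of that pattern (the chunk size and percentile list are illustrative):

import numpy as np
from tdigest import TDigest

digest = TDigest()
for step in range(5):
    chunk = np.random.randn(1000)   # stand-in for the newly created rows
    digest.batch_update(chunk)      # incremental update, no reset needed
    print(step, [digest.percentile(p * 100) for p in (0.25, 0.5, 0.75)])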
Example #7
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : TODO
        TODO
    delta : float
        TODO

    Returns
    -------
    TODO
    """
    col = series.name
    if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0:
        # Series is not numeric or is all NaNs.
        return None

    logger.debug("column_summary - " + col)

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ["mean", "min", "max", "std", "sum"]:
        val = getattr(data, m)()
        if type(val) is np.int64:
            colresult[m] = int(val)
        else:
            colresult[m] = val

    colresult["n"] = column_props[col]["notnulls"]

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult["percentiles"] = {
        perc: np.nanpercentile(series, perc)
        for perc in percentiles
    }
    colresult["median"] = colresult["percentiles"][50]
    colresult["iqr"] = (colresult["percentiles"][75] -
                        colresult["percentiles"][25])

    # Compute the t-digest.
    logger.debug("column_summary - {} - creating TDigest...".format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug("column_summary - {} - testing log trans...".format(col))
    try:
        colresult["logtrans"] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning("test_logtrans has failed for column `{}`: {}".format(
            col, e))
        colresult["logtrans"] = False

    if colresult["logtrans"]:
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult["logtrans_mean"] = _tdigest_mean(logdigest)
        colresult["logtrans_std"] = _tdigest_std(logdigest)
        colresult["logtrans_IQR"] = logdigest.percentile(
            75) - logdigest.percentile(25)

    logger.debug("column_summary - {} - should {}be log-transformed".format(
        col, "NOT " if not colresult["logtrans"] else ""))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug("column_summary - {} - computing histogram...".format(col))

    if column_props[col]["is_categorical"]:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult["logtrans"]:
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False,
                                             bins="fd")
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins="fd")

    colresult["histogram"] = {
        "counts": counts.tolist(),
        "bin_edges": edges.tolist(),
    }

    # Compute KDE
    logger.debug("column_summary - {} - computing KDE...".format(col))
    bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1)

    logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw))

    if column_props[col]["is_categorical"]:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult["min"], colresult["max"]
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult["logtrans"])

    colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()}

    return {col: colresult, "_columns": [col]}
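
colresult["tdigest"] stores the compressed digest as plain (mean, count) pairs, which makes it easy to persist. A digest can later be rebuilt by replaying each pair through update(value, weight), the same call used for logdigest above; digest_from_centroids is a hypothetical helper, not part of the code above.

from tdigest import TDigest

def digest_from_centroids(pairs, delta=0.01):
    digest = TDigest(delta)
    for mean, count in pairs:
        digest.update(mean, count)  # replay one stored centroid
    return digest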
Example #8
class Percentiles(TableModule):
    parameters = [
        ("percentiles", np.dtype(np.object_), [0.25, 0.5, 0.75]),
        ("history", np.dtype(int), 3),
    ]
    inputs = [SlotDescriptor("table", type=Table)]

    def __init__(self,
                 column: str,
                 percentiles: Optional[Union[List[float],
                                             np.ndarray[Any, Any]]] = None,
                 **kwds: Any) -> None:
        if not column:
            raise ProgressiveError("Need a column name")
        super(Percentiles, self).__init__(**kwds)
        self._columns = [column]
        self.default_step_size = 1000
        self.tdigest = TDigest()

        if percentiles is None:
            percentiles = np.array([0.25, 0.5, 0.75])
        else:
            # get them all to be in [0, 1]
            percentiles = np.asarray(percentiles)
            if (percentiles > 1).any():  # type: ignore
                percentiles = percentiles / 100.0
                msg = ("percentiles should all be in the interval [0, 1]. "
                       "Try {0} instead.")
                raise ValueError(msg.format(list(percentiles)))
            if (percentiles != 0.5).all():  # median isn't included
                lh = percentiles[percentiles < 0.5]
                uh = percentiles[percentiles > 0.5]
                percentiles = np.hstack([lh, 0.5, uh])

        self._percentiles = percentiles
        self._pername: List[str] = [_pretty_name(x) for x in self._percentiles]
        dshape = "{" + ",".join(["%s: real" % n for n in self._pername]) + "}"
        self.result = Table(self.generate_table_name("percentiles"),
                            dshape=dshape,
                            create=True)

    def is_ready(self) -> bool:
        slot = self.get_input_slot("table")
        if slot is not None and slot.created.any():
            return True
        return super(Percentiles, self).is_ready()

    def reset(self) -> None:
        self.tdigest = TDigest()

    @process_slot("table", reset_cb="reset")
    @run_if_any
    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        assert self.context
        with self.context as ctx:
            dfslot = ctx.table
            indices = dfslot.created.next(length=step_size)
            steps = indices_len(indices)
            if steps == 0:
                return self._return_run_step(self.state_blocked,
                                             steps_run=steps)
            input_df = dfslot.data()
            x = self.filter_columns(input_df, fix_loc(indices))
            self.tdigest.batch_update(x[0])
            df = self.table
            values = {}
            for n, p in zip(self._pername, self._percentiles):
                values[n] = self.tdigest.percentile(p * 100)
            df.add(values)
            # with self.lock:
            #     df.loc[run_number] = values
            #     if len(df) > self.params.history:
            #         self._df = df.loc[df.index[-self.params.history:]]
            return self._return_run_step(self.next_state(dfslot),
                                         steps_run=steps)
Example #9
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : TODO
        TODO
    delta : float
        TODO

    Returns
    -------
    TODO
    """
    col = series.name
    if not column_props[col]['numeric'] or column_props[col]['notnulls'] == 0:
        # Series is not numeric or is all NaNs.
        return None

    logger.debug('column_summary - ' + col)

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ['mean', 'min', 'max', 'std', 'sum']:
        val = getattr(data, m)()
        if type(val) is np.int64:
            colresult[m] = int(val)
        else:
            colresult[m] = val

    colresult['n'] = column_props[col]['notnulls']

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult['percentiles'] = {
        perc: np.nanpercentile(series, perc)
        for perc in percentiles
    }
    colresult['median'] = colresult['percentiles'][50]
    colresult['iqr'] = (colresult['percentiles'][75] -
                        colresult['percentiles'][25])

    # Compute the t-digest.
    logger.debug('column_summary - {} - creating TDigest...'.format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug('column_summary - {} - testing log trans...'.format(col))
    try:
        colresult['logtrans'] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning('test_logtrans has failed for column `{}`: {}'.format(
            col, e))
        colresult['logtrans'] = False

    if colresult['logtrans']:
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult['logtrans_mean'] = _tdigest_mean(logdigest)
        colresult['logtrans_std'] = _tdigest_std(logdigest)
        colresult['logtrans_IQR'] = (logdigest.percentile(75) -
                                     logdigest.percentile(25))

    logger.debug('column_summary - {} - should {}be log-transformed'.format(
        col, 'NOT ' if not colresult['logtrans'] else ''))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult['tdigest'] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug('column_summary - {} - computing histogram...'.format(col))

    if column_props[col]['is_categorical']:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult['logtrans']:
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False,
                                             bins='fd')
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins='fd')

    colresult['histogram'] = {
        'counts': counts.tolist(),
        'bin_edges': edges.tolist()
    }

    # Compute KDE
    logger.debug('column_summary - {} - computing KDE...'.format(col))
    bw = _bw_scott(colresult, colresult['n'], colresult['logtrans'], 1)

    logger.debug('column_summary - {} - KDE bw: {:.4g}'.format(col, bw))

    if column_props[col]['is_categorical']:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult['min'], colresult['max']
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult['logtrans'])

    colresult['kde'] = {'x': kde_x.tolist(), 'y': kde_y.tolist()}

    return {col: colresult, '_columns': [col]}