Example #1
0
    def run_step(self, run_number, step_size, howlong):
        """Feed newly created rows into the t-digest and append the
        requested percentile estimates as a new row of the output frame."""
        slot = self.get_input_slot('df')
        slot.update(run_number)
        # Updates or deletions invalidate the running digest: reset and
        # start accumulating from scratch.
        if slot.has_updated() or slot.has_deleted():
            slot.reset()
            slot.update(run_number)
            self.tdigest = TDigest()

        created = slot.next_created(step_size)
        steps = indices_len(created)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=steps)
        input_df = slot.data()
        with slot.lock:
            self.tdigest.batch_update(
                self.filter_columns(input_df, fix_loc(created)))
        # One estimate per requested percentile (t-digest takes 0-100),
        # plus the run number in the framework's update column.
        row = [self.tdigest.percentile(p * 100) for p in self._percentiles]
        row.append(run_number)
        with self.lock:
            df = self._df
            df.loc[run_number] = row
            # Keep only the most recent `history` rows.
            if len(df) > self.params.history:
                self._df = df.loc[df.index[-self.params.history:]]
        return self._return_run_step(slot.next_state(),
                                     steps_run=steps, reads=steps,
                                     updates=len(self._df))
class Digest:
    """Small wrapper around a TDigest that also counts added values.

    The underlying digest is primed with a single 0 sample; the counter
    deliberately starts at 0 and only tracks values added via ``add``.
    An ``asyncio.Lock`` is exposed for callers that need to serialize
    access.
    """

    def __init__(self):
        self.digest = TDigest()
        self.digest.update(0)  # seed sample, excluded from the count
        self._count = 0
        self.lock = asyncio.Lock()

    def add(self, v):
        """Record one value and bump the counter."""
        self._count += 1
        self.digest.update(v)

    def percentile(self, v):
        """Estimated percentile *v* (0-100) of the values seen so far."""
        return self.digest.percentile(v)

    def count(self):
        """How many values were added through :meth:`add`."""
        return self._count
Example #3
0
    def __init__(self, column, percentiles=None, **kwds):
        """Set up progressive percentile computation for *column*.

        Parameters
        ----------
        column : str
            Name of the dataframe column to summarise; required.
        percentiles : sequence of float, optional
            Percentile points, all in [0, 1]; defaults to the quartiles.
            The median (0.5) is always included.
        **kwds
            Forwarded to the DataFrameModule constructor.
        """
        if not column:
            raise ProgressiveError('Need a column name')
        # Declare the 'df' input slot before invoking the parent ctor.
        self._add_slots(kwds,'input_descriptors',
                        [SlotDescriptor('df', type=pd.DataFrame)])
        super(Percentiles, self).__init__(dataframe_slot='percentiles', **kwds)
        self._column = column
        self.default_step_size = 1000
        # Streaming t-digest used to estimate the percentiles.
        self.tdigest = TDigest()

        if percentiles is None:
            percentiles = np.array([0.25, 0.5, 0.75])
        else:
            # get them all to be in [0, 1]
            percentiles = np.asarray(percentiles)
            if (percentiles > 1).any():
                # Out-of-range input is rejected; the /100 rescaling only
                # builds the suggested values for the error message.
                percentiles = percentiles / 100.0
                msg = ("percentiles should all be in the interval [0, 1]. "
                       "Try {0} instead.")
                raise ValueError(msg.format(list(percentiles)))
            if (percentiles != 0.5).all():  # median isn't included
                lh = percentiles[percentiles < .5]
                uh = percentiles[percentiles > .5]
                percentiles = np.hstack([lh, 0.5, uh])

        self._percentiles = percentiles

        # One float column per percentile, plus the framework's update column.
        self.schema = [(_pretty_name(x), np.dtype(float), np.nan) for x in self._percentiles]
        self.schema.append(DataFrameModule.UPDATE_COLUMN_DESC)
        self._df = create_dataframe(self.schema)
 def __init__(self, config, logger, options):
     """Wire up the SeasonalDecomposition task.

     ``options`` must provide 'plugin', 'service' and 'params' entries;
     the sinks are fixed to Redis (metrics in) and Graphite (out).
     """
     super(SeasonalDecomposition, self).__init__(config, logger, resource={'metric_sink': 'RedisSink',
                                                                           'output_sink': 'GraphiteSink'})
     self.plugin = options['plugin']
     self.service = options['service']
     self.params = options['params']
     # Key under which this service's t-digest state is stored
     # (presumably in the Redis sink — TODO confirm against callers).
     self.tdigest_key = 'td:%s' % self.service
     self.td = TDigest()
     # Dispatch table: error-evaluation strategy name -> handler method.
     self.error_eval = {
         'tukey': self._eval_tukey,
         'quantile': self._eval_quantile
     }
Example #5
0
    def detect_anomalies(self, data, anomaly_fraction):
        """Return Anomaly entries whose reconstruction error exceeds the
        (1 - anomaly_fraction) quantile of the absolute errors."""
        data = np.asanyarray(data)
        # Promote 1-D input to a single-column 2-D array.
        if data.ndim == 1:
            data = data[:, np.newaxis]
        signal = self.reconstruct_signal(data)

        digest = TDigest()
        delta = np.zeros(data.shape)
        for row in xrange(data.shape[0]):
            err = self.compute_error(data[row, :], signal[row, :])
            delta[row, :] = err
            digest.update(np.abs(err))

        # Threshold so that roughly `anomaly_fraction` of rows exceed it.
        threshold = digest.quantile(1 - anomaly_fraction)

        return [Anomaly(data[idx], err, idx)
                for idx, err in enumerate(delta)
                if np.abs(err) > threshold]
Example #6
0
 def reset(self) -> None:
     # Discard all accumulated samples by replacing the digest.
     self.tdigest = TDigest()
 def __init__(self):
     # t-digest primed with a single 0 sample; the counter deliberately
     # stays at 0, so the seed is excluded from the count.
     self.digest = TDigest()
     self.digest.update(0)
     self._count = 0
     self.lock = asyncio.Lock()
Example #8
0
class Percentiles(DataFrameModule):
    """Progressively compute percentiles of one dataframe column using a
    streaming t-digest, keeping a bounded history of results."""
    parameters = [('percentiles', object, [0.25, 0.5, 0.75]),
                  ('history', np.dtype(int), 3)]
                  
    def __init__(self, column, percentiles=None, **kwds):
        """Set up the module for *column*.

        Parameters
        ----------
        column : str
            Name of the column to summarise; required.
        percentiles : sequence of float, optional
            Percentile points, all in [0, 1]; defaults to the quartiles.
            The median (0.5) is always included.
        **kwds
            Forwarded to the DataFrameModule constructor.
        """
        if not column:
            raise ProgressiveError('Need a column name')
        # Declare the 'df' input slot before invoking the parent ctor.
        self._add_slots(kwds,'input_descriptors',
                        [SlotDescriptor('df', type=pd.DataFrame)])
        super(Percentiles, self).__init__(dataframe_slot='percentiles', **kwds)
        self._column = column
        self.default_step_size = 1000
        # Streaming t-digest used to estimate the percentiles.
        self.tdigest = TDigest()

        if percentiles is None:
            percentiles = np.array([0.25, 0.5, 0.75])
        else:
            # get them all to be in [0, 1]
            percentiles = np.asarray(percentiles)
            if (percentiles > 1).any():
                # Out-of-range input is rejected; the /100 rescaling only
                # builds the suggested values for the error message.
                percentiles = percentiles / 100.0
                msg = ("percentiles should all be in the interval [0, 1]. "
                       "Try {0} instead.")
                raise ValueError(msg.format(list(percentiles)))
            if (percentiles != 0.5).all():  # median isn't included
                lh = percentiles[percentiles < .5]
                uh = percentiles[percentiles > .5]
                percentiles = np.hstack([lh, 0.5, uh])

        self._percentiles = percentiles
        
        # One float column per percentile, plus the framework's update column.
        self.schema = [(_pretty_name(x), np.dtype(float), np.nan) for x in self._percentiles]
        self.schema.append(DataFrameModule.UPDATE_COLUMN_DESC)
        self._df = create_dataframe(self.schema)

    def is_ready(self):
        """Ready as soon as the input slot has newly created rows."""
        if self.get_input_slot('df').has_created():
            return True
        return super(Percentiles, self).is_ready()

    def run_step(self,run_number,step_size,howlong):
        """Feed newly created rows into the t-digest and append the
        percentile estimates as a new row of the output dataframe."""
        dfslot = self.get_input_slot('df')
        dfslot.update(run_number)
        # Updates or deletions invalidate the running digest.
        if dfslot.has_updated() or dfslot.has_deleted():
            dfslot.reset()
            dfslot.update(run_number)
            self.tdigest = TDigest() # reset

        indices = dfslot.next_created(step_size)
        steps = indices_len(indices)
        if steps == 0:
            return self._return_run_step(self.state_blocked, steps_run=steps)
        input_df = dfslot.data()
        with dfslot.lock:
            x = self.filter_columns(input_df, fix_loc(indices))
            self.tdigest.batch_update(x)
        df = self._df
        values = []
        # t-digest expects percentile points in 0-100.
        for p in self._percentiles:
            values.append(self.tdigest.percentile(p*100))
        values.append(run_number)
        with self.lock:
            df.loc[run_number] = values
            # Keep only the most recent `history` rows.
            if len(df) > self.params.history:
                self._df = df.loc[df.index[-self.params.history:]]
        return self._return_run_step(dfslot.next_state(),
                                     steps_run=steps, reads=steps, updates=len(self._df))
Example #9
0
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : dict
        Per-column properties; must provide "numeric", "notnulls" and
        "is_categorical" entries for ``series.name``.
    delta : float
        Compression parameter forwarded to the t-digest.

    Returns
    -------
    dict or None
        ``{col: summary, "_columns": [col]}``, or ``None`` when the
        column is non-numeric or entirely null.
    """
    col = series.name
    if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0:
        # Series is not numeric or is all NaNs.
        return None

    logger.debug("column_summary - " + col)

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ["mean", "min", "max", "std", "sum"]:
        val = getattr(data, m)()
        # Convert any numpy integer (int32 on Windows, int64 elsewhere) to
        # a plain Python int; the original `type(val) is np.int64` check
        # missed the other numpy integer widths.
        if isinstance(val, np.integer):
            colresult[m] = int(val)
        else:
            colresult[m] = val

    colresult["n"] = column_props[col]["notnulls"]

    # nanpercentile ignores the NaNs still present in `series`.
    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult["percentiles"] = {
        perc: np.nanpercentile(series, perc)
        for perc in percentiles
    }
    colresult["median"] = colresult["percentiles"][50]
    colresult["iqr"] = (colresult["percentiles"][75] -
                        colresult["percentiles"][25])

    # Compute the t-digest.
    logger.debug("column_summary - {} - creating TDigest...".format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug("column_summary - {} - testing log trans...".format(col))
    try:
        colresult["logtrans"] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning("test_logtrans has failed for column `{}`: {}".format(
            col, e))
        colresult["logtrans"] = False

    if colresult["logtrans"]:
        # Build a digest of the log-transformed centroids to summarise
        # the distribution in log space.
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult["logtrans_mean"] = _tdigest_mean(logdigest)
        colresult["logtrans_std"] = _tdigest_std(logdigest)
        colresult["logtrans_IQR"] = logdigest.percentile(
            75) - logdigest.percentile(25)

    logger.debug("column_summary - {} - should {}be log-transformed".format(
        col, "NOT " if not colresult["logtrans"] else ""))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug("column_summary - {} - computing histogram...".format(col))

    if column_props[col]["is_categorical"]:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult["logtrans"]:
            # Histogram in log10 space, edges mapped back to linear scale.
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False,
                                             bins="fd")
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins="fd")

    colresult["histogram"] = {
        "counts": counts.tolist(),
        "bin_edges": edges.tolist(),
    }

    # Compute KDE
    logger.debug("column_summary - {} - computing KDE...".format(col))
    bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1)

    logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw))

    if column_props[col]["is_categorical"]:
        # No meaningful KDE for categoricals; emit a degenerate curve.
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult["min"], colresult["max"]
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult["logtrans"])

    colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()}

    return {col: colresult, "_columns": [col]}
Example #10
0
def ingestRecordsMultiProc(dimensionsMetrics, dimensionsEvents, args):
    """Split the hosts across worker processes, run the ingestion, then
    merge each worker's latency digest/variance into overall statistics.

    Parameters
    ----------
    dimensionsMetrics, dimensionsEvents : list
        Metric/event dimension records; events are matched to the host
        subset assigned to each process.
    args : argparse.Namespace
        Must provide at least ``processes``.
    """
    ## Register sigint handler
    signal.signal(signal.SIGINT, signalHandlerMultiProc)

    numHosts = len(dimensionsMetrics)
    remainder = numHosts % args.processes
    startId = 0

    ingestionStart = timer()

    for processId in range(1, args.processes + 1):
        # Spread the remainder one host at a time over the first workers.
        endId = startId + int(
            numHosts / args.processes) + (1 if remainder > 0 else 0)
        if endId > numHosts:
            print(
                "Number of processes more than number of hosts, skipping process creation"
            )
            break
        print("Starting process {} with host ranges: [{}, {}]".format(
            processId, startId, endId - 1))

        ## Select a subset of hosts
        dimensionsMetricsLocal = dimensionsMetrics[startId:endId]
        dimensionsMetricsSet = set()
        for dim in dimensionsMetricsLocal:
            dimensionsMetricsSet.add(
                (dim.region, dim.cell, dim.silo, dim.availability_zone,
                 dim.microservice_name, dim.instance_name))
        dimensionsEventsLocal = list()
        ## Select the dimension events for the hosts selected above.
        for dim in dimensionsEvents:
            host = (dim.region, dim.cell, dim.silo, dim.availability_zone,
                    dim.microservice_name, dim.instance_name)
            if host in dimensionsMetricsSet:
                dimensionsEventsLocal.append(dim)

        print(
            "Starting process {} with host ranges: [{}, {}]. Metrics: {}. Events: {}"
            .format(processId, startId, endId - 1, len(dimensionsMetricsLocal),
                    len(dimensionsEventsLocal)))
        lowUtilizationHosts, highUtilizationHosts = initializeHighAndLowUtilizationHosts(
            len(dimensionsMetricsLocal))
        parentConn, childConn = multiprocessing.Pipe()
        manager = multiprocessing.Manager()
        event = manager.Event()
        process = MultiProcessIngestWorker(
            processId, args, dimensionsMetricsLocal, dimensionsEventsLocal,
            highUtilizationHosts, lowUtilizationHosts, childConn, event)
        process.start()
        processes.append((process, parentConn, event))
        remainder -= 1
        startId = endId

    success = 0
    count = 0
    totalLatency = 0.0
    aggregatedDigests = TDigest()
    pooledVariance = 0.0
    for p, conn, event in processes:
        output = conn.recv()
        p.join()
        # Fix: identity comparison with None (was `output == None`).
        if output is None:
            continue

        success += output.success
        ## Pool the variance.
        if count == 0:
            pooledVariance = output.variance
        else:
            pooledVariance = ((count - 1) * pooledVariance +
                              (output.count - 1) * output.variance) / (
                                  (count - 1) + (output.count - 1))
        count += output.count
        aggregatedDigests += output.digest
        totalLatency += output.sum

    # Guard against division by zero when no worker produced output.
    if count > 0:
        print(
            "[OVERALL] Total={:,}, Success={:,}, Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}"
            .format(count, success, round(totalLatency / count, 3),
                    round(math.sqrt(pooledVariance), 3),
                    round(aggregatedDigests.percentile(50), 3),
                    round(aggregatedDigests.percentile(90), 3),
                    round(aggregatedDigests.percentile(99), 3)))
    else:
        print("[OVERALL] No records were ingested.")

    ingestionEnd = timer()
    print("Total time to ingest: {:,} seconds".format(
        round(ingestionEnd - ingestionStart, 2)))
 def __init__(self):
     # Initialise the parent class, then attach a fresh t-digest for
     # streaming percentile estimation.
     super().__init__()
     self.digest = TDigest()
Example #12
0
class BSketch:
    """BSketch: binning sketch for numerical values and binary target.

    Parameters
    ----------
    sketch : str, optional (default="gk")
        Sketch algorithm. Supported algorithms are "gk" (Greenwald-Khanna's)
        and "t-digest" (Ted Dunning) algorithm. Algorithm "t-digest" relies on
        `tdigest <https://github.com/CamDavidsonPilon/tdigest>`_.

    eps : float (default=0.01)
        Relative error epsilon.

    K : int (default=25)
        Parameter excess growth K to compute compress threshold in t-digest.

    special_codes : array-like or None, optional (default=None)
        List of special codes. Use special codes to specify the data values
        that must be treated separately.
    """
    def __init__(self, sketch="gk", eps=0.01, K=25, special_codes=None):
        self.sketch = sketch
        self.eps = eps
        self.K = K
        self.special_codes = special_codes

        _check_parameters(sketch, eps, K, special_codes)

        # Running counts for missing/special values, which bypass the
        # quantile sketches entirely (split by event/non-event).
        self._count_missing_e = 0
        self._count_missing_ne = 0
        self._count_special_e = 0
        self._count_special_ne = 0

        # One backend sketch per target class: _e for events (y == 1),
        # _ne for non-events (y == 0).
        if sketch == "gk":
            self._sketch_e = GK(eps)
            self._sketch_ne = GK(eps)
        elif sketch == "t-digest":
            self._sketch_e = TDigest(eps, K)
            self._sketch_ne = TDigest(eps, K)

    def add(self, x, y, check_input=False):
        """Add arrays to the sketch.

        Parameters
        ----------
        x : array-like, shape = (n_samples,)
            Training vector, where n_samples is the number of samples.

        y : array-like, shape = (n_samples,)
            Target vector relative to x.

        check_input : bool (default=False)
            Whether to check input arrays.
        """
        # split_data separates clean (xc/yc), missing (xm/ym) and
        # special-code (xs/ys) records.
        xc, yc, xm, ym, xs, ys, _, _, _, _, _, _, _ = split_data(
            dtype=None,
            x=x,
            y=y,
            special_codes=self.special_codes,
            check_input=check_input)

        # Add values to sketch
        mask = yc == 1

        if self.sketch == "gk":
            for v1 in xc[mask]:
                self._sketch_e.add(v1)

            for v0 in xc[~mask]:
                self._sketch_ne.add(v0)

        if self.sketch == "t-digest":
            self._sketch_e.batch_update(xc[mask])
            self._sketch_ne.batch_update(xc[~mask])

        # Keep track of missing and special counts
        n_missing = len(ym)
        if n_missing:
            self._count_missing_e += np.count_nonzero(ym == 1)
            self._count_missing_ne += np.count_nonzero(ym == 0)

        n_special = len(ys)
        if n_special:
            self._count_special_e += np.count_nonzero(ys == 1)
            self._count_special_ne += np.count_nonzero(ys == 0)

    def bins(self, splits):
        """Event and non-events counts for each bin given a list of split
        points.

        Parameters
        ----------
        splits : array-like, shape = (n_splits,)
            List of split points.

        Returns
        -------
        bins : tuple of arrays of size n_splits + 1.
        """
        n_bins = len(splits) + 1
        bins_e = np.zeros(n_bins).astype(np.int64)
        bins_ne = np.zeros(n_bins).astype(np.int64)

        indices_e, count_e = self._indices_count(self._sketch_e, splits)
        indices_ne, count_ne = self._indices_count(self._sketch_ne, splits)

        # Sum the per-entry counts falling into each bin.
        for i in range(n_bins):
            bins_e[i] = count_e[(indices_e == i)].sum()
            bins_ne[i] = count_ne[(indices_ne == i)].sum()

        return bins_e, bins_ne

    def merge(self, bsketch):
        """Merge current instance with another BSketch instance.

        Parameters
        ----------
        bsketch : object
            BSketch instance.
        """
        if not self._mergeable(bsketch):
            raise Exception("bsketch does not share signature.")

        # Nothing to merge from an empty sketch.
        if bsketch._sketch_e.n == 0 and bsketch._sketch_ne.n == 0:
            return

        # If we are empty, just adopt the other sketch's state.
        if self._sketch_e.n == 0 and self._sketch_ne.n == 0:
            self._copy(bsketch)
            return

        # Merge sketches
        if self.sketch == "gk":
            self._sketch_e.merge(bsketch._sketch_e)
            self._sketch_ne.merge(bsketch._sketch_ne)
        elif self.sketch == "t-digest":
            self._sketch_e += bsketch._sketch_e
            self._sketch_ne += bsketch._sketch_ne

        # Merge missing and special counts
        self._count_missing_e += bsketch._count_missing_e
        self._count_missing_ne += bsketch._count_missing_ne
        self._count_special_e += bsketch._count_special_e
        self._count_special_ne += bsketch._count_special_ne

    def merge_sketches(self):
        """Merge event and non-event data internal sketches."""
        if self.sketch == "gk":
            new_sketch = GK(self.eps)

            new_sketch.merge(self._sketch_e)
            new_sketch.merge(self._sketch_ne)
        else:
            new_sketch = self._sketch_e + self._sketch_ne

        return new_sketch

    def _copy(self, bsketch):
        # Adopt the other instance's sketches and counters wholesale.
        self._sketch_e = bsketch._sketch_e
        self._sketch_ne = bsketch._sketch_ne

        # Merge missing and special counts
        self._count_missing_e = bsketch._count_missing_e
        self._count_missing_ne = bsketch._count_missing_ne
        self._count_special_e = bsketch._count_special_e
        self._count_special_ne = bsketch._count_special_ne

    def _indices_count(self, sketch, splits):
        """Extract (bin index, count) per sketch entry/centroid."""
        values = np.zeros(len(sketch))
        count = np.zeros(len(sketch))

        if self.sketch == "gk":
            for i, entry in enumerate(sketch.entries):
                values[i] = entry.value
                count[i] = entry.g

        elif self.sketch == "t-digest":
            # Each centroid contributes its mean value with its weight.
            for i, key in enumerate(sketch.C.keys()):
                centroid = sketch.C.get_value(key)
                values[i] = centroid.mean
                count[i] = centroid.count

        indices = np.searchsorted(splits, values, side='left')
        return indices, count

    def _mergeable(self, other):
        # Two sketches merge only if they share algorithm and parameters.
        special_eq = True
        if self.special_codes is not None and other.special_codes is not None:
            special_eq = set(self.special_codes) == set(other.special_codes)

        return (self.sketch == other.sketch and self.eps == other.eps
                and self.K == other.K and special_eq)

    @property
    def n_event(self):
        """Event count.

        Returns
        -------
        n_event : int
        """
        count = self._sketch_e.n
        return count + self._count_missing_e + self._count_special_e

    @property
    def n_nonevent(self):
        """Non-event count.

        Returns
        -------
        n_nonevent : int
        """
        count = self._sketch_ne.n
        return count + self._count_missing_ne + self._count_special_ne

    @property
    def n(self):
        """Records count.

        Returns
        -------
        n : int
        """
        return self.n_event + self.n_nonevent
Example #13
0
def initialise_digest(v):
    """Return a new TDigest seeded with the single observation *v*."""
    digest = TDigest()
    digest.update(v)
    return digest
Example #14
0
    def run(self):
        """Process entry point: spawn the ingestion threads, wait for
        them, pool their latency statistics and send the summary back
        over the pipe (None on failure)."""
        global lock
        global seriesId
        global timestamp

        with lock:
            ## Randomly pick a series ID to start for this process.
            seriesId = random.randint(
                0,
                len(self.dimensionEvents) + len(self.dimensionMetrics) - 1)
            timestamp = getTimestampMillis()
            print("Process {} using start series ID: {}".format(
                self.processId, seriesId))

        ## Register sigint handler
        signal.signal(signal.SIGINT, signalHandler)
        overallSummary = None
        ingestionStart = timer()

        try:
            # One ingestion thread per requested concurrency unit.
            for threadId in range(self.args.concurrency):
                threadIdStr = "{}-{}".format(self.processId, threadId + 1)
                print("Starting ThreadId: {}".format(threadIdStr))
                thread = IngestionThread(threadIdStr, self.args,
                                         self.dimensionMetrics,
                                         self.dimensionEvents,
                                         self.highUtilizationHosts,
                                         self.lowUtilizationHosts, self.event)
                thread.start()
                self.threads.append(thread)

            success = 0
            count = 0
            totalLatency = 0.0
            aggregatedDigests = TDigest()
            pooledVariance = 0.0
            for t in self.threads:
                t.join()
                success += t.success
                ## Pool the variance.
                if count == 0:
                    pooledVariance = t.variance
                else:
                    pooledVariance = ((count - 1) * pooledVariance +
                                      (t.count - 1) * t.variance) / (
                                          (count - 1) + (t.count - 1))
                count += t.count
                # t-digests merge via +, giving process-wide percentiles.
                aggregatedDigests += t.digest
                totalLatency += t.sum

            print(
                "[Process: {}] Total={:,}, Success={:,}, Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}"
                .format(self.processId, count, success,
                        round(totalLatency / count, 3),
                        round(math.sqrt(pooledVariance), 3),
                        round(aggregatedDigests.percentile(50), 3),
                        round(aggregatedDigests.percentile(90), 3),
                        round(aggregatedDigests.percentile(99), 3)))

            overallSummary = IngestionSummaryStats(aggregatedDigests, count,
                                                   success, totalLatency,
                                                   pooledVariance)
            ingestionEnd = timer()
            print("Total time to ingest: {:,} seconds".format(
                round(ingestionEnd - ingestionStart, 2)))
        finally:
            # Always report back, even on error (overallSummary is None then).
            self.conn.send(overallSummary)
Example #15
0
from tdigest import TDigest

consumer = KafkaConsumer('demo-topic',
                         group_id=None,
                         bootstrap_servers='127.0.0.1:9092',
                         value_deserializer=lambda v: json.loads(v))

entity_detectors = {}
counter = 0

for msg in consumer:
    entity_id = msg.value['id']
    value = msg.value['value']

    if entity_id not in entity_detectors:
        entity_detectors[entity_id] = TDigest()

    #Get entity specific anomaly detector
    detector = entity_detectors[entity_id]

    #Check if detector is empty
    if (10 > len(detector)):
        detector.update(value)
        continue

    #Get bounds
    upp_bound = detector.percentile(99.9)
    low_bound = detector.percentile(0.1)

    #Display info
    if (0 == (counter % 5)):
Example #16
0
class IngestionThread(threading.Thread):
    """Worker thread that writes metric/event records to Timestream and
    tracks per-thread latency statistics (t-digest, mean, variance)."""

    def __init__(self, threadId, args, dimensionMetrics, dimensionEvents,
                 highUtilizationHosts, lowUtilizationHosts, event):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.args = args
        self.dimensionMetrics = dimensionMetrics
        self.dimensionEvents = dimensionEvents
        self.client = tswrite.createWriteClient(args.endpoint,
                                                profile=args.profile)
        self.databaseName = args.databaseName
        self.tableName = args.tableName
        self.numMetrics = len(dimensionMetrics)
        self.numEvents = len(dimensionEvents)
        self.digest = TDigest(
        )  ## Use the t-digest to compute the streaming percentiles
        self.count = 0
        self.success = 0
        self.sum = 0.0
        self.variance = float('nan')
        self.highUtilizationHosts = highUtilizationHosts
        self.lowUtilizationHosts = lowUtilizationHosts
        self.sigInt = False
        self.event = event

    def run(self):
        """Ingest records until interrupted, sharing the global series
        cursor (seriesId/timestamp) with sibling threads under `lock`."""
        global seriesId
        global timestamp
        global lock

        idx = 0
        # Welford running-mean/M^2 accumulators for the variance.
        mean = 0.0
        squared = 0.0

        while True:
            with lock:
                if self.sigInt == True or sigInt == True or self.event.is_set(
                ):
                    print("Thread {} exiting.".format(self.threadId))
                    break

                seriesId += 1
                if seriesId >= self.numMetrics + self.numEvents:
                    ## Wrapping around, so move to new timestamp.
                    seriesId = 0
                    newTimestamp = timestamp + self.args.intervalMillis
                    currentTime = getCurrentTimestampMillis()
                    ## Check if the timestamps are falling behind
                    if newTimestamp < currentTime - 0.05 * self.args.intervalMillis:
                        print(
                            "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes."
                            .format(self.args.intervalMillis,
                                    currentTime - timestamp))
                        ## Move time forward.
                        timestamp = getTimestampMillis()
                    else:
                        timestamp = newTimestamp
                        ## Check if we are ingesting too fast, then slow down.
                        if timestamp > currentTime - 1000:
                            ## Slow down
                            sleepTimeSecs = int(
                                (timestamp - currentTime) / 1000)
                            print("Thread {} sleeping for {} secs".format(
                                self.threadId, sleepTimeSecs))
                            time.sleep(sleepTimeSecs)

                    now = datetime.datetime.now()
                    print(
                        "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}."
                        .format(self.threadId,
                                now.strftime("%Y-%m-%d %H:%M:%S"), timestamp))

                # Snapshot the shared cursor before releasing the lock.
                localSeriesId = seriesId
                localTimestamp = timestamp

            # Series IDs below numMetrics are metrics; the rest are events.
            if localSeriesId < self.numMetrics:
                commonAttributes = model.createWriteRecordCommonAttributes(
                    self.dimensionMetrics[localSeriesId])
                records = model.createRandomMetrics(seriesId, localTimestamp,
                                                    "MILLISECONDS",
                                                    self.highUtilizationHosts,
                                                    self.lowUtilizationHosts)
            else:
                commonAttributes = model.createWriteRecordCommonAttributes(
                    self.dimensionEvents[localSeriesId - self.numMetrics])
                records = model.createRandomEvent(localTimestamp,
                                                  "MILLISECONDS")

            idx += 1
            start = timer()
            try:
                writeResult = tswrite.writeRecords(self.client,
                                                   self.databaseName,
                                                   self.tableName,
                                                   commonAttributes, records)
                self.success += 1
            except Exception as e:
                print(e)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=2,
                                          file=sys.stdout)
                requestId = "RequestId: {}".format(
                    e.response['ResponseMetadata']['RequestId'])
                print(requestId)
                print(json.dumps(commonAttributes, indent=2))
                print(json.dumps(records, indent=2))
                continue
            finally:
                # Latency bookkeeping runs for successes and failures alike.
                self.count += 1
                end = timer()
                cur = end - start
                self.digest.update(cur)
                self.sum += cur
                ## Computing the streaming M^2 (squared distance from mean)
                delta = cur - mean
                mean += delta / self.count
                squared += delta * (cur - mean)
                if self.count > 1:
                    self.variance = float(squared / (self.count - 1))

            requestId = writeResult['ResponseMetadata']['RequestId']
            # Periodic progress line with streaming percentiles.
            if idx % 1000 == 0:
                now = datetime.datetime.now()
                print(
                    "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}"
                    .format(self.threadId, idx,
                            now.strftime("%Y-%m-%d %H:%M:%S"), requestId,
                            round(self.sum / self.count, 3),
                            round(math.sqrt(self.variance), 3),
                            round(self.digest.percentile(50), 3),
                            round(self.digest.percentile(90), 3),
                            round(self.digest.percentile(99), 3)))

    def interrupt(self):
        """Request a graceful stop; the run loop checks sigInt under lock."""
        print("Interrupting thread: ", self.threadId)
        self.sigInt = True
Example #17
0
class Percentiles(TableModule):
    """Progressively estimate percentiles of one column of the input table.

    A t-digest sketch is updated with each batch of newly created rows,
    and one row of percentile estimates is appended to ``self.result``
    per run step.
    """

    parameters = [
        ("percentiles", np.dtype(np.object_), [0.25, 0.5, 0.75]),
        ("history", np.dtype(int), 3),
    ]
    inputs = [SlotDescriptor("table", type=Table)]

    def __init__(self,
                 column: str,
                 percentiles: Optional[Union[List[float],
                                             np.ndarray[Any, Any]]] = None,
                 **kwds: Any) -> None:
        """Create the module for `column`.

        Parameters
        ----------
        column : str
            Name of the column whose percentiles are estimated.
        percentiles : list of float or ndarray, optional
            Fractions in the interval [0, 1]; defaults to the quartiles.
            The median (0.5) is inserted when missing.
        kwds : Any
            Forwarded to ``TableModule``.

        Raises
        ------
        ProgressiveError
            If `column` is empty.
        ValueError
            If any requested percentile lies outside [0, 1].
        """
        if not column:
            raise ProgressiveError("Need a column name")
        super(Percentiles, self).__init__(**kwds)
        self._columns = [column]
        self.default_step_size = 1000
        self.tdigest = TDigest()

        if percentiles is None:
            percentiles = np.array([0.25, 0.5, 0.75])
        else:
            # get them all to be in [0, 1]
            percentiles = np.asarray(percentiles)
            if (percentiles > 1).any():  # type: ignore
                # Values look like they were given on a 0-100 scale;
                # suggest the rescaled equivalents in the error message.
                percentiles = percentiles / 100.0
                msg = ("percentiles should all be in the interval [0, 1]. "
                       "Try {0} instead.")
                raise ValueError(msg.format(list(percentiles)))
            if (percentiles != 0.5).all():  # median isn't included
                # Insert the median between the lower and upper halves.
                lh = percentiles[percentiles < 0.5]
                uh = percentiles[percentiles > 0.5]
                percentiles = np.hstack([lh, 0.5, uh])

        self._percentiles = percentiles
        # One readable output-column name per requested percentile.
        self._pername: List[str] = [_pretty_name(x) for x in self._percentiles]
        dshape = "{" + ",".join(["%s: real" % n for n in self._pername]) + "}"
        self.result = Table(self.generate_table_name("percentiles"),
                            dshape=dshape,
                            create=True)

    def is_ready(self) -> bool:
        """Ready as soon as the input slot has newly created rows."""
        slot = self.get_input_slot("table")
        if slot is not None and slot.created.any():
            return True
        return super(Percentiles, self).is_ready()

    def reset(self) -> None:
        """Discard all accumulated state by starting a fresh t-digest."""
        self.tdigest = TDigest()

    @process_slot("table", reset_cb="reset")
    @run_if_any
    def run_step(self, run_number: int, step_size: int,
                 howlong: float) -> ReturnRunStep:
        """Consume up to `step_size` new rows, update the t-digest, and
        append the current percentile estimates to the result table."""
        assert self.context
        with self.context as ctx:
            dfslot = ctx.table
            indices = dfslot.created.next(length=step_size)
            steps = indices_len(indices)
            if steps == 0:
                return self._return_run_step(self.state_blocked,
                                             steps_run=steps)
            input_df = dfslot.data()
            # filter_columns restricts to self._columns; x[0] is presumably
            # the single selected column — confirm filter_columns contract.
            x = self.filter_columns(input_df, fix_loc(indices))
            self.tdigest.batch_update(x[0])
            df = self.table
            values = {}
            for n, p in zip(self._pername, self._percentiles):
                # TDigest.percentile expects a 0-100 scale.
                values[n] = self.tdigest.percentile(p * 100)
            df.add(values)
            # with self.lock:
            #     df.loc[run_number] = values
            #     if len(df) > self.params.history:
            #         self._df = df.loc[df.index[-self.params.history:]]
            return self._return_run_step(self.next_state(dfslot),
                                         steps_run=steps)
Example #18
0
def digest_partitions(values):
    """Build a t-digest from `values` and return it wrapped in a list."""
    partition_digest = TDigest()
    partition_digest.batch_update(values)
    return [partition_digest]
 def __init__(self,**kwargs):
     """Forward keyword arguments to the parent class and attach an
     empty TDigest accumulator."""
     super().__init__(**kwargs)
     self.digest = TDigest()
Example #20
0
    def find_equalization_params(self, batch, component, survey_id_col, sample_size=10000,
                                 container_name='equal_params', **kwargs):
        """ Estimates 95th percentile of absolute values for each seismic survey
        in dataset for equalization.

        This method utilizes t-digest structure for batch-wise estimation of rank-based statistics,
        namely 95th percentile.

        Parameters
        ----------
        batch : SeismicBatch or B() named expression.
            Current batch from pipeline.
        component : str
            Component with shot gathers.
        survey_id_col : str
            Column in index that indicates names of seismic
            surveys from different seasons.
        sample_size: int, optional
            Number of elements to draw from each shot gather to update
            estimates of TDigest. Time for each update grows linearly
            with `sample_size`. Default is 10000.
        container_name: str, optional
            Name of the `SeismicDataset` attribute to store a dict
            with estimated percentile. Also contains `survey_id_col`
            key and corresponding value.
        kwargs: misc
            Parameters for TDigest objects.

        Raises
        ------
        ValueError : If index is not FieldIndex.
        ValueError : If shot gather with same id is contained in more
                     than one survey.

        Note
        ----
        Dictionary with estimated percentile can be obtained from pipeline using `D(container_name)`.
        """
        if not isinstance(self.index, FieldIndex):
            raise ValueError("Index must be FieldIndex, not {}".format(type(self.index)))

        # Lazily create one TDigest per survey on the first call and cache
        # them on a private attribute so estimates accumulate across batches.
        private_name = '_' + container_name
        params = getattr(self, private_name, None)
        if params is None:
            surveys = np.unique(self.index.get_df()[survey_id_col])
            delta, k = kwargs.pop('delta', 0.01), kwargs.pop('K', 25)
            params = dict(zip(surveys, [TDigest(delta, k) for _ in surveys]))
            setattr(self, private_name, params)

        for idx in batch.indices:
            # Each field record must belong to exactly one survey.
            surveys_by_fieldrecord = np.unique(batch.index.get_df(index=idx)[survey_id_col])
            if len(surveys_by_fieldrecord) != 1:
                raise ValueError('Field {} represents data from more than one survey!'.format(idx))
            survey = surveys_by_fieldrecord[0]

            # Random sample (with replacement, np.random.choice default)
            # keeps each digest update cheap regardless of gather size.
            pos = batch.index.get_pos(idx)
            sample = np.random.choice(getattr(batch, component)[pos].reshape(-1), size=sample_size)

            params[survey].batch_update(np.absolute(sample))

        # Report only surveys whose digest has actually seen data (n > 0).
        statistics = dict([survey, digest.percentile(95)]
                          for survey, digest in params.items() if digest.n > 0)
        statistics['survey_id_col'] = survey_id_col
        setattr(self, container_name, statistics)
Example #21
0
from sim_wallet import simWallet
from tdigest import TDigest

import time

# Streaming t-digest over observed prices; percentile(15) below serves as
# a low-end reference price for the buy/sell thresholds.
digest = TDigest()

wallet = simWallet()

wallet.print_wallet()

initial_value = wallet.estimate_total()

print('Initial wallet value is {} BTC.'.format(initial_value), flush=True)

# Trading loop: buy when the price dips well below the running 15th
# percentile, sell when it rises well above it.
# NOTE(review): there is no sleep/backoff here — presumably
# wallet.update_price() blocks or rate-limits; confirm to avoid a busy loop.
while True:

    current_price = wallet.update_price()
    digest.update(current_price)

    digest_value = digest.percentile(15)

    print('\n\nCurrent BNB/BTC price is {}. Digest value is {}'.format(
        current_price, digest_value),
          flush=True)

    # Buy 1 BNB when the price is more than 10% below the reference.
    if current_price < 0.9 * digest_value:
        wallet.buy_bnb(1)

    # Sell 1 BNB when the price is more than 10% above the reference.
    if current_price > 1.1 * digest_value:
        wallet.sell_bnb(1)
class SeasonalDecomposition(BaseTask):
    """Flag anomalous metric values via STL seasonal decomposition.

    Reads a window of datapoints from the metric sink, decomposes it with
    R's ``stl`` (through rpy2), scores the remainder (error) term against
    a t-digest of historical errors, and writes seasonal/trend/error and
    the flag state to the output sink. The t-digest is persisted between
    runs under the Redis key ``td:<service>``.
    """

    def __init__(self, config, logger, options):
        super(SeasonalDecomposition, self).__init__(config, logger, resource={'metric_sink': 'RedisSink',
                                                                              'output_sink': 'GraphiteSink'})
        self.plugin = options['plugin']
        self.service = options['service']
        self.params = options['params']
        # Redis key holding the serialized t-digest between runs.
        self.tdigest_key = 'td:%s' % self.service
        self.td = TDigest()
        # Dispatch table: configured 'error_handling' name -> evaluator.
        self.error_eval = {
            'tukey': self._eval_tukey,
            'quantile': self._eval_quantile
        }

    def _eval_quantile(self, error):
        """Flag `error` outside the [alpha/2, 1 - alpha/2] quantile band.

        Returns a state dict with 'flag' (-1/0/1), 'lower', 'upper', 'alpha'.
        """
        state = {}
        alpha = self.params['error_params']['alpha']
        lower = self.td.quantile(alpha / 2)
        upper = self.td.quantile(1 - alpha / 2)
        # Optional configured floors/ceilings tighten the band.
        if 'minimal_lower_threshold' in self.params['error_params']:
            lower = max(
                lower, self.params['error_params']['minimal_lower_threshold'])
        if 'minimal_upper_threshold' in self.params['error_params']:
            upper = min(
                upper, self.params['error_params']['minimal_upper_threshold'])
        flag = 0
        if error > upper:
            flag = 1
        elif error < lower:
            flag = -1
        state['flag'] = flag
        state['lower'] = lower
        state['upper'] = upper
        state['alpha'] = alpha
        return state

    def _eval_tukey(self, error):
        """Flag `error` outside Tukey fences (quartiles +/- iqr_scaling*IQR).

        Returns a state dict with 'flag' (-1/0/1), 'lower' and 'upper'.
        """
        state = {}
        iqr_scaling = self.params['error_params'].get('iqr_scaling', 1.5)
        quantile_25 = self.td.quantile(0.25)
        quantile_75 = self.td.quantile(0.75)
        iqr = quantile_75 - quantile_25
        lower = quantile_25 - iqr_scaling * iqr
        upper = quantile_75 + iqr_scaling * iqr
        # Optional configured floors/ceilings tighten the fences.
        if 'minimal_lower_threshold' in self.params['error_params']:
            lower = max(
                lower, self.params['error_params']['minimal_lower_threshold'])
        if 'minimal_upper_threshold' in self.params['error_params']:
            upper = min(
                upper, self.params['error_params']['minimal_upper_threshold'])
        flag = 0
        if error > upper:
            flag = 1
        elif error < lower:
            flag = -1
        state['flag'] = flag
        state['lower'] = lower
        state['upper'] = upper
        return state

    def read(self):
        """Restore the persisted t-digest and load a clean data window.

        Returns the most recent ``period_length * seasons`` datapoints on
        a regular time grid, or None if the data is missing, too sparse,
        or too old.
        """
        metric = self.params['metric']
        period_length = self.params['period_length']
        seasons = self.params['seasons']
        default = self.params['default']
        tdigest_json = [el for el in self.metric_sink.read(self.tdigest_key)]
        if tdigest_json:
            # Centroids are persisted as [mean, count] pairs.
            centroids = json.loads(tdigest_json[0])
            for mean, count in centroids:
                self.td.add(mean, count)

        # gather data and assure requirements
        data = [el for el in self.metric_sink.read(metric)]
        data = sorted(data, key=lambda tup: tup.timestamp)
        step_size = find_step_size(data)
        if not step_size:
            self.logger.error(
                'Datapoints have no common time grid or are not enough. Exiting')
            return None
        # BUGFIX: staleness is now - newest timestamp; the original operands
        # were reversed, so the "too old" check could never trigger.
        age = int(time()) - data[-1].timestamp
        if age > 2 * step_size:
            self.logger.error('Datapoints are too old (%d sec). Exiting' % age)
            return None
        data = insert_missing_datapoints(data, default, step_size)
        if len(data) < period_length * seasons:
            self.logger.error(
                'Not enough (%d) datapoints. Exiting' % len(data))
            return None
        data = data[-period_length * seasons - 1:-1]

        return data

    def process(self, data):
        """Run STL on `data` and evaluate the latest error term.

        Returns (seasonal, trend, error, state); on STL failure or empty
        input returns zeros with a state of {'flag': -1}.
        """
        if data:
            period_length = self.params['period_length']
            error_type = self.params.get('error_type', 'norm')
            data = [float(el.value) for el in data]

            try:
                r_stl = robjects.r.stl
                r_ts = robjects.r.ts
                r_data_ts = r_ts(data, frequency=period_length)
                r_res = r_stl(r_data_ts, s_window="periodic", robust=True)
                # Columns of the STL result matrix: seasonal, trend,
                # remainder; take the most recent row of each.
                r_res_ts = asarray(r_res[0])
                seasonal = r_res_ts[:, 0][-1]
                trend = r_res_ts[:, 1][-1]
                _error = r_res_ts[:, 2][-1]
                model = seasonal + trend
            except Exception as e:
                self.logger.error('STL Call failed: %s. Exiting' % e)
                return (0.0, 0.0, 0.0, {'flag': -1})

            if error_type == 'norm':
                # Relative error; -1 sentinel when the model is exactly 0.
                error = _error / model if model != 0 else -1
            elif error_type == 'median':
                error = data[-1] - seasonal - median(data)
            elif error_type == 'stl':
                error = _error
            else:
                # BUGFIX: an unknown error_type previously left `error`
                # unbound and crashed with a NameError further down.
                raise ValueError('Unknown error_type: %s' % error_type)

            # add error to distribution and evaluate
            self.td.add(error, 1.0)
            state = self.error_eval[self.params['error_handling']](error)
            # Persist the updated digest for the next run.
            self.metric_sink.write(
                [RedisGeneric(self.tdigest_key, self.td.serialize())])

            return (seasonal, trend, error, state)

        else:
            return (0.0, 0.0, 0.0, {'flag': -1})

    def write(self, state):
        """Emit seasonal/trend/error plus every state field to the sink."""
        (seasonal, trend, error, state) = state
        prefix = '%s.%s' % (self.plugin, self.service)
        now = int(time())
        # BUGFIX: dict.iteritems() does not exist on Python 3; items()
        # behaves identically on both Python 2 and 3.
        for name, value in state.items():
            self.sink.write(
                TimeSeriesTuple('%s.%s' % (prefix, name), now, value))

        self.sink.write(
            TimeSeriesTuple('%s.%s' % (prefix, 'seasonal'), now, seasonal))
        self.sink.write(
            TimeSeriesTuple('%s.%s' % (prefix, 'trend'), now, trend))
        self.sink.write(
            TimeSeriesTuple('%s.%s' % (prefix, 'error'), now, error))

    def run(self):
        """Execute one read -> process -> write cycle."""
        data = self.read()
        state = self.process(data)
        self.write(state)
        return True
Example #23
0
    # Decompress the dump on first use; skip when already decompressed.
    if path.exists(decompressed_fname) is False:
        print("Decompressing {}".format(filename))
        decompress_file(filename)

    docs = []
    # Stream-parse the MediaWiki XML export instead of loading it whole.
    tree = ET.iterparse(decompressed_fname)
    print("Reading {}\n".format(decompressed_fname))
    progress = tqdm(unit="docs")

    doc = {}
    text = None
    comment = None
    username = None
    timestamp = None
    # t-digest over page timestamps (used later for time-range queries).
    ts_digest = TDigest()
    for event, elem in tree:
        if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
            doc = {}
            doc["title"] = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}title")
            doc["text"] = text
            doc["comment"] = comment
            doc["username"] = username
            # NOTE(review): text/comment/username/timestamp are presumably
            # filled by handlers for child tags not visible here; timestamp
            # starts as None and int(None) would raise — confirm child
            # elements always precede the page end event.
            doc["timestamp"] = int(timestamp)
            ts_digest.update(int(timestamp))
            if doc["text"] is not None and doc["comment"] is not None and doc["username"] is not None and doc[
                "timestamp"] is not None:
                # total_docs is defined outside this fragment.
                total_docs = total_docs + 1
                docs.append(doc)
                progress.update()
                elem.clear()  # won't need the children any more
def digest_partitions(values):
    """Return a one-element list containing a TDigest updated with `values`."""
    td = TDigest()
    td.batch_update(values)
    return [td]
Example #25
0
def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname, indexname, docs, stop_words,
                                use_numeric_range_searchs, ts_digest, p_writes):
    """Generate `total_benchmark_commands` mixed read/write benchmark commands.

    Writes are built from a random document via `use_case_to_cmd`; reads
    are FT.SEARCH rows (1-word, 2-word union, or 2-word intersection),
    optionally prefixed with an @timestamp numeric-range filter whose
    lower bound is drawn from `ts_digest` using a lognormal percentile
    distribution. `p_writes` is the probability a command is a write.

    Returns (total_benchmark_reads, total_benchmark_writes).

    NOTE(review): the csv writerow calls are commented out, so the opened
    bench/all CSV files receive no rows — confirm this is intentional.
    """
    total_benchmark_reads = 0
    total_benchmark_writes = 0
    all_csvfile = open(all_fname, 'a', newline='')
    bench_csvfile = open(bench_fname, 'w', newline='')
    all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    progress = tqdm(unit="docs", total=total_benchmark_commands)
    total_docs = len(docs)

    ## timestamp related
    timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
    min_ts = ts_digest.percentile(0.0)
    max_ts = ts_digest.percentile(100.0)
    # Tracks generated query time-range sizes for the CDF plot below.
    query_range_digest = TDigest()

    generated_commands = 0
    while generated_commands < total_benchmark_commands:
        # Lower bound of the query time range: larger lognormal values map
        # to more recent timestamps.
        query_ts_pdist = timestamps_pdist[generated_commands]
        percentile = (1.0 - query_ts_pdist) * 100.0
        query_min_ts = ts_digest.percentile(percentile)

        random_doc_pos = random.randint(0, total_docs - 1)
        doc = docs[random_doc_pos]
        # decide read or write
        p_cmd = random.random()
        if p_cmd < p_writes:
            ## WRITE
            total_benchmark_writes = total_benchmark_writes + 1
            generated_row, doc_size = use_case_to_cmd(use_ftadd, doc["title"], doc["text"], doc["comment"],
                                                      doc["username"],
                                                      doc["timestamp"],
                                                      generated_commands)

        else:
            ## READ
            total_benchmark_reads = total_benchmark_reads + 1
            words, totalW = getQueryWords(doc, stop_words, 2)

            choice = random.choices(["simple-1word-query", "2word-union-query", "2word-intersection-query"])[0]
            generated_row = None
            numeric_range_str = ""
            if use_numeric_range_searchs:
                numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                query_range_digest.update(int(max_ts - query_min_ts))
            if choice == "simple-1word-query" and len(words) >= 1:
                generated_row = generate_ft_search_row(indexname, "simple-1word-query",
                                                       "{}{}".format(numeric_range_str, words[0]))
            elif choice == "2word-union-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-union-query",
                                                       "{}{} {}".format(numeric_range_str, words[0], words[1]))
            elif choice == "2word-intersection-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-intersection-query",
                                                       "{}{}|{}".format(numeric_range_str, words[0], words[1]))
        # Only count the command when a row was produced (a read can fail
        # when the doc has too few query words).
        # NOTE(review): total_benchmark_reads is still incremented in that
        # failed case — confirm the read counter is meant to include retries.
        if generated_row != None:
            #             all_csv_writer.writerow(generated_row)
            #             bench_csv_writer.writerow(generated_row)
            progress.update()
            generated_commands = generated_commands + 1
    progress.close()
    bench_csvfile.close()
    all_csvfile.close()

    #     print()
    # Plot the CDF of query time-range sizes and report keyspace coverage.
    xx = []
    yy = []
    p90 = query_range_digest.percentile(90.0)
    dataset_percent = ts_digest.cdf(p90)

    print("90% of the read queries target at max {} percent o keyspace".format(dataset_percent))
    print("100% of the read queries target at max {} percent o keyspace".format(ts_digest.cdf(max_ts - min_ts)))
    for centroid in query_range_digest.centroids_to_list():
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(query_range_digest.cdf(ts_m))
    plt.scatter(xx, yy)

    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()

    return total_benchmark_reads, total_benchmark_writes
Example #26
0
numberOfEnds = 3

# Simulation horizon, split into fixed-length logging intervals.
duration = 10000
lengthOfInterval = 1000
numberOfIntervals = round(duration / lengthOfInterval)

# an interval will not be logged unless an event happens afterwards,
# so the last interval will not be logged

# interval counter
i = 1
j = 1
k = 1
l = 1

# stores values per interval
# NOTE(review): numberOfNodes is not defined in this snippet — presumably
# set elsewhere; verify, otherwise these lines raise NameError.
t = 0
q = [0 for x in range(numberOfNodes)]
c = [0 for y in range(numberOfNodes)]

# stores all values

throughput = []
timeToComplete = []
numberOfArrivals = []
cpu = []

# tdigest
# NOTE(review): this binding shadows the stdlib `time` module if it was
# imported earlier in the file.

time = TDigest()
Example #27
0
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : dict
        Per-column precomputed properties; this function reads the keys
        'numeric', 'notnulls' and 'is_categorical' for `series.name`.
    delta : float
        t-digest accuracy/compression parameter; passed to TDigest and
        re-applied before compressing the digest for storage.

    Returns
    -------
    dict or None
        ``{col: colresult, '_columns': [col]}`` where `colresult` holds
        basic statistics, percentiles, t-digest centroids, histogram and
        KDE. None when the column is non-numeric or all NaN.
    """
    col = series.name
    if not column_props[col]['numeric'] or column_props[col]['notnulls'] == 0:
        # Series is not numeric or is all NaNs.
        return None

    logger.debug('column_summary - ' + col)

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ['mean', 'min', 'max', 'std', 'sum']:
        val = getattr(data, m)()
        # Coerce numpy int64 scalars to plain int (e.g. for serialization).
        if type(val) is np.int64:
            colresult[m] = int(val)
        else:
            colresult[m] = val

    colresult['n'] = column_props[col]['notnulls']

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult['percentiles'] = {
        perc: np.nanpercentile(series, perc)
        for perc in percentiles
    }
    colresult['median'] = colresult['percentiles'][50]
    colresult['iqr'] = (colresult['percentiles'][75] -
                        colresult['percentiles'][25])

    # Compute the t-digest.
    logger.debug('column_summary - {} - creating TDigest...'.format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug('column_summary - {} - testing log trans...'.format(col))
    try:
        colresult['logtrans'] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning('test_logtrans has failed for column `{}`: {}'.format(
            col, e))
        colresult['logtrans'] = False

    if colresult['logtrans']:
        # Summarise the log-transformed distribution by feeding the digest
        # centroids (mean, count) through log().
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult['logtrans_mean'] = _tdigest_mean(logdigest)
        colresult['logtrans_std'] = _tdigest_std(logdigest)
        colresult['logtrans_IQR'] = (logdigest.percentile(75) -
                                     logdigest.percentile(25))

    logger.debug('column_summary - {} - should {}be log-transformed'.format(
        col, 'NOT ' if not colresult['logtrans'] else ''))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult['tdigest'] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug('column_summary - {} - computing histogram...'.format(col))

    if column_props[col]['is_categorical']:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult['logtrans']:
            # Histogram in log10 space, then map the bin edges back.
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False,
                                             bins='fd')
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins='fd')

    colresult['histogram'] = {
        'counts': counts.tolist(),
        'bin_edges': edges.tolist()
    }

    # Compute KDE
    logger.debug('column_summary - {} - computing KDE...'.format(col))
    bw = _bw_scott(colresult, colresult['n'], colresult['logtrans'], 1)

    logger.debug('column_summary - {} - KDE bw: {:.4g}'.format(col, bw))

    if column_props[col]['is_categorical']:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult['min'], colresult['max']
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult['logtrans'])

    colresult['kde'] = {'x': kde_x.tolist(), 'y': kde_y.tolist()}

    return {col: colresult, '_columns': [col]}