# The original left `random`, `wSize` and `results` undefined; numpy's random()
# and a window size of 100 are assumed here to make the snippet runnable.
from numpy.random import random
from tdigest import TDigest

wSize = 100


def main():
    data = random(1000)
    results = []
    # Slide a window over the data and record the 15th percentile of each window.
    for i in range(len(data) - wSize + 1):
        digest = TDigest()
        digest.batch_update(data[i:i + wSize])
        results.append([i + 1, digest.percentile(15)])
    print(results)
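# Cross-check sketch (not part of the original script): on a single window the
# t-digest's percentile estimate should land close to numpy's exact percentile.
import numpy as np
from tdigest import TDigest

window = np.random.random(100)
d = TDigest()
d.batch_update(window)
print(d.percentile(15), np.percentile(window, 15))  # approximately equal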
def get_footer_stats(self):
    try:
        self.check_progress = False
        try:
            avg_ms = sum(self.ms_list) / len(self.ms_list)
        except ZeroDivisionError:
            # Handles the case where every check failed (empty ms_list).
            avg_ms = 0
        # Print the percentile distribution.
        if sys.modules.get("tdigest"):
            digest = TDigest()
            for item in self.ms_list:
                digest.update(item)
            p50 = digest.percentile(50)
            p90 = digest.percentile(90)
            p99 = digest.percentile(99)
            percentile_string = "\tp50: %.2f p90: %.2f p99: %.2f" % (p50, p90, p99)
        else:
            percentile_string = ""
        footer = "\rtotal: %d success: %d failure: %d s_rate: %.2f f_rate: %.2f avg_ms: %.2f ms" % (
            self.check_count, self.check_success_count, self.check_failure_count,
            self.check_success_count / self.check_count,
            self.check_failure_count / self.check_count, avg_ms)
        #print(footer + percentile_string, file=sys.stderr, flush=True)
        print(footer + percentile_string, flush=True)
        if self.check_success_count / self.check_count == 0:
            # If nothing succeeded, exit with status code 1.
            exit(1)
        elif self.check_success_count / self.check_count < 1:
            # Exit with the integer success-rate percentage as the status code.
            exit(trunc(self.check_success_count / self.check_count * 100))
        else:
            # Everything succeeded: exit code 0.
            #exit(0)
            pass
    except Exception:
        # Catching Exception (not BaseException) keeps exit()'s SystemExit from being swallowed.
        pass
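# Illustrative sketch (hypothetical helper, not from the original): the exit-code
# scheme above encodes the success rate, so a caller can distinguish "all failed" (1),
# "partial success" (integer percentage), and "all succeeded" (0).
from math import trunc

def exit_code(success, total):
    rate = success / total
    if rate == 0:
        return 1
    if rate < 1:
        return trunc(rate * 100)
    return 0

assert exit_code(0, 10) == 1
assert exit_code(3, 4) == 75
assert exit_code(10, 10) == 0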
class DigestMachine(DistMachine):

    def __init__(self):
        super().__init__()
        self.digest = TDigest()

    def update(self, value, dt=None, **kwargs):
        self.digest.update(value)

    def inv_cdf(self, p):
        return self.digest.percentile(100. * p)
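# Usage sketch (illustrative; standard-normal data assumed): inv_cdf() takes p in
# [0, 1] and maps it onto TDigest.percentile(), which expects a value in [0, 100].
import numpy as np

machine = DigestMachine()
for v in np.random.standard_normal(10000):
    machine.update(v)
print(machine.inv_cdf(0.5))    # approximate median, near 0.0
print(machine.inv_cdf(0.975))  # near 1.96 for N(0, 1) data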
class Digest:

    def __init__(self):
        self.digest = TDigest()
        self.digest.update(0)
        self._count = 0
        self.lock = asyncio.Lock()

    def add(self, v):
        self.digest.update(v)
        self._count += 1

    def percentile(self, v):
        return self.digest.percentile(v)

    def count(self):
        return self._count
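# Hedged usage sketch (an assumption, since the wrapper above never acquires its own
# lock): concurrent coroutines would be expected to take digest.lock themselves.
import asyncio

async def record(d, samples):
    for s in samples:
        async with d.lock:  # serialise updates across coroutines
            d.add(s)

async def demo():
    d = Digest()
    await asyncio.gather(record(d, range(100)), record(d, range(100, 200)))
    print(d.count(), d.percentile(50))

asyncio.run(demo())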
class IngestionThread(threading.Thread): def __init__(self, threadId, args, dimensionMetrics, dimensionEvents, highUtilizationHosts, lowUtilizationHosts, event): threading.Thread.__init__(self) self.threadId = threadId self.args = args self.dimensionMetrics = dimensionMetrics self.dimensionEvents = dimensionEvents self.client = tswrite.createWriteClient(region=args.region, profile=args.profile, endpoint=args.endpoint) self.databaseName = args.databaseName self.tableName = args.tableName self.numMetrics = len(dimensionMetrics) self.numEvents = len(dimensionEvents) self.digest = TDigest( ) ## Use the t-digest to compute the streaming percentiles self.count = 0 self.success = 0 self.sum = 0.0 self.variance = float('nan') self.highUtilizationHosts = highUtilizationHosts self.lowUtilizationHosts = lowUtilizationHosts self.sigInt = False self.event = event self.recordsWritten = 0 def run(self): global seriesId global timestamp global lock idx = 0 mean = 0.0 squared = 0.0 addReqId = self.args.addReqId addReqIdAsDim = addReqId and self.args.addReqIdAsDim addReqIdAsMeasure = addReqId and not self.args.addReqIdAsDim writeRecordsBatch = list() recordsToWrite = list() while True: with lock: if self.sigInt == True or sigInt == True or self.event.is_set( ): print("Thread {} exiting.".format(self.threadId)) break seriesId += 1 if seriesId >= self.numMetrics + self.numEvents: ## Wrapping around, so move to new timestamp. seriesId = 0 newTimestamp = timestamp + self.args.intervalMillis currentTime = getCurrentTimestampMillis() ## Check if the timestamps are falling behind if newTimestamp < currentTime - 0.05 * self.args.intervalMillis: print( "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes." .format(self.args.intervalMillis, currentTime - timestamp)) ## Move time forward. timestamp = getTimestampMillis() else: timestamp = newTimestamp ## Check if we are ingesting too fast, then slow down. if timestamp > currentTime - 1000: ## Slow down sleepTimeSecs = int( (timestamp - currentTime) / 1000) print("Thread {} sleeping for {} secs".format( self.threadId, sleepTimeSecs)) time.sleep(sleepTimeSecs) now = datetime.datetime.now() print( "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}." .format(self.threadId, now.strftime("%Y-%m-%d %H:%M:%S"), timestamp)) localSeriesId = seriesId localTimestamp = timestamp if localSeriesId < self.numMetrics: dimensions = model.createDimensionsEntry( self.dimensionMetrics[localSeriesId], addReqId=addReqIdAsDim) records = model.createRandomMetrics(localSeriesId, dimensions, localTimestamp, "MILLISECONDS", self.highUtilizationHosts, self.lowUtilizationHosts, wide=self.args.wide, addReqId=addReqIdAsMeasure) else: dimensions = model.createDimensionsEntry( self.dimensionEvents[localSeriesId - self.numMetrics], addReqId=addReqIdAsDim) records = model.createRandomEvent(dimensions, localTimestamp, "MILLISECONDS", wide=self.args.wide, addReqId=addReqIdAsMeasure) if self.args.batchWrites: if len(writeRecordsBatch) + len( records) <= self.args.batchSize: writeRecordsBatch.extend(records) ## Generate more data, unless we're wrapping around, at which point, drain any pending records. 
if localSeriesId < self.numMetrics + self.numEvents: continue else: ## transfer a subset of values from the records produced into the batch spaceRemaining = self.args.batchSize - len( writeRecordsBatch) assert (spaceRemaining < len(records)) ## Transfer 0 - spaceRemaining - 1 to be written with this batch, and spaceRemaining - end in the next batch ## If spaceRemaining is 0, then just write what we have accumulated so far if spaceRemaining > 0: writeRecordsBatch.extend(records[0:spaceRemaining]) ## The batch is full, now we issue the write record request. recordsToWrite.clear() recordsToWrite.extend(writeRecordsBatch) writeRecordsBatch.clear() writeRecordsBatch.extend(records[spaceRemaining:]) else: recordsToWrite.clear() recordsToWrite.extend(records) idx += 1 start = timer() try: writeResult = tswrite.writeRecords(self.client, self.databaseName, self.tableName, recordsToWrite) self.recordsWritten += len(recordsToWrite) self.success += 1 except Exception as e: print(e) exc_type, exc_value, exc_traceback = sys.exc_info() traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) requestId = "RequestId: {}".format( e.response['ResponseMetadata']['RequestId']) print(requestId) print(json.dumps(dimensions, indent=2)) print(json.dumps(records, indent=2)) continue finally: self.count += 1 end = timer() cur = end - start self.digest.update(cur) self.sum += cur ## Computing the streaming M^2 (squared distance from mean) delta = cur - mean mean += delta / self.count squared += delta * (cur - mean) if self.count > 1: self.variance = float(squared / (self.count - 1)) requestId = writeResult['ResponseMetadata']['RequestId'] if idx % 1000 == 0: now = datetime.datetime.now() print( "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}. Records written: {}" .format(self.threadId, idx, now.strftime("%Y-%m-%d %H:%M:%S"), requestId, round(self.sum / self.count, 3), round(math.sqrt(self.variance), 3), round(self.digest.percentile(50), 3), round(self.digest.percentile(90), 3), round(self.digest.percentile(99), 3), self.recordsWritten)) def interrupt(self): print("Interrupting thread: ", self.threadId) self.sigInt = True
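# Clarifying sketch (illustration only): the mean/squared bookkeeping in run() above is
# Welford's online algorithm; the sample variance is M2 / (n - 1) once n > 1.
def welford(samples):
    count, mean, m2 = 0, 0.0, 0.0
    for x in samples:
        count += 1
        delta = x - mean
        mean += delta / count
        m2 += delta * (x - mean)
    return mean, m2 / (count - 1) if count > 1 else float('nan')

print(welford([0.012, 0.015, 0.011, 0.018]))  # (mean latency, sample variance)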
def ingestRecordsMultiProc(dimensionsMetrics, dimensionsEvents, args): ## Register sigint handler signal.signal(signal.SIGINT, signalHandlerMultiProc) numHosts = len(dimensionsMetrics) remainder = numHosts % args.processes startId = 0 ingestionStart = timer() for processId in range(1, args.processes + 1): endId = startId + int( numHosts / args.processes) + (1 if remainder > 0 else 0) if endId > numHosts: print( "Number of processes more than number of hosts, skipping process creation" ) break print("Starting process {} with host ranges: [{}, {}]".format( processId, startId, endId - 1)) ## Select a subset of hosts dimensionsMetricsLocal = dimensionsMetrics[startId:endId] dimensionsMetricsSet = set() for dim in dimensionsMetricsLocal: dimensionsMetricsSet.add( (dim.region, dim.cell, dim.silo, dim.availability_zone, dim.microservice_name, dim.instance_name)) dimensionsEventsLocal = list() ## Select the dimension events for the hosts selected above. for dim in dimensionsEvents: host = (dim.region, dim.cell, dim.silo, dim.availability_zone, dim.microservice_name, dim.instance_name) if host in dimensionsMetricsSet: dimensionsEventsLocal.append(dim) print( "Starting process {} with host ranges: [{}, {}]. Metrics: {}. Events: {}" .format(processId, startId, endId - 1, len(dimensionsMetricsLocal), len(dimensionsEventsLocal))) lowUtilizationHosts, highUtilizationHosts = initializeHighAndLowUtilizationHosts( len(dimensionsMetricsLocal)) parentConn, childConn = multiprocessing.Pipe() manager = multiprocessing.Manager() event = manager.Event() process = MultiProcessIngestWorker( processId, args, dimensionsMetricsLocal, dimensionsEventsLocal, highUtilizationHosts, lowUtilizationHosts, childConn, event) process.start() processes.append((process, parentConn, event)) remainder -= 1 startId = endId success = 0 count = 0 recordsWritten = 0 totalLatency = 0.0 aggregatedDigests = TDigest() pooledVariance = 0.0 for p, conn, event in processes: output = conn.recv() p.join() if output == None: continue success += output.success ## Pool the variance. if count == 0: pooledVariance = output.variance else: pooledVariance = ((count - 1) * pooledVariance + (output.count - 1) * output.variance) / ( (count - 1) + (output.count - 1)) count += output.count recordsWritten += output.records aggregatedDigests += output.digest totalLatency += output.sum print( "[OVERALL] Total={:,}, Success={:,}, Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}. Records written: {}" .format(count, success, round(totalLatency / count, 3), round(math.sqrt(pooledVariance), 3), round(aggregatedDigests.percentile(50), 3), round(aggregatedDigests.percentile(90), 3), round(aggregatedDigests.percentile(99), 3), recordsWritten)) ingestionEnd = timer() print("Total time to ingest: {:,} seconds. TPS: {:,}, Records/sec: {:,}". format(round(ingestionEnd - ingestionStart, 2), round(count / (ingestionEnd - ingestionStart), 2), round(recordsWritten / (ingestionEnd - ingestionStart), 2)))
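# Stripped-down sketch of the combine step above (an illustration, not the original
# code): the tdigest package overloads "+" to merge digests, and the per-worker sample
# variances are pooled weighted by their degrees of freedom.
from tdigest import TDigest

def combine(workers):
    # each worker exposes .digest, .count, .variance and .sum, as in the snippets above
    merged, count, total, pooled_var = TDigest(), 0, 0.0, 0.0
    for w in workers:
        if count == 0:
            pooled_var = w.variance
        else:
            pooled_var = ((count - 1) * pooled_var + (w.count - 1) * w.variance) / (
                (count - 1) + (w.count - 1))
        count += w.count
        total += w.sum
        merged = merged + w.digest
    return merged, count, total / count, pooled_var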
def run(self): global lock global seriesId global timestamp with lock: ## Randomly pick a series ID to start for this process. seriesId = random.randint( 0, len(self.dimensionEvents) + len(self.dimensionMetrics) - 1) timestamp = getTimestampMillis() print("Process {} using start series ID: {}".format( self.processId, seriesId)) ## Register sigint handler signal.signal(signal.SIGINT, signalHandler) overallSummary = None ingestionStart = timer() try: for threadId in range(self.args.concurrency): threadIdStr = "{}-{}".format(self.processId, threadId + 1) print("Starting ThreadId: {}".format(threadIdStr)) thread = IngestionThread(threadIdStr, self.args, self.dimensionMetrics, self.dimensionEvents, self.highUtilizationHosts, self.lowUtilizationHosts, self.event) thread.start() self.threads.append(thread) success = 0 count = 0 recordsWritten = 0 totalLatency = 0.0 aggregatedDigests = TDigest() pooledVariance = 0.0 for t in self.threads: t.join() success += t.success ## Pool the variance. if count == 0: pooledVariance = t.variance else: pooledVariance = ((count - 1) * pooledVariance + (t.count - 1) * t.variance) / ( (count - 1) + (t.count - 1)) count += t.count recordsWritten += t.recordsWritten aggregatedDigests += t.digest totalLatency += t.sum print( "[Process: {}] Total={:,}, Success={:,}, Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}. Records written: {}" .format(self.processId, count, success, round(totalLatency / count, 3), round(math.sqrt(pooledVariance), 3), round(aggregatedDigests.percentile(50), 3), round(aggregatedDigests.percentile(90), 3), round(aggregatedDigests.percentile(99), 3), recordsWritten)) overallSummary = IngestionSummaryStats(aggregatedDigests, count, success, totalLatency, pooledVariance, recordsWritten) ingestionEnd = timer() print("Total time to ingest: {:,} seconds".format( round(ingestionEnd - ingestionStart, 2))) finally: self.conn.send(overallSummary)
digest = TDigest()
wallet = simWallet()
wallet.print_wallet()
initial_value = wallet.estimate_total()
print('Initial wallet value is {} BTC.'.format(initial_value), flush=True)

while True:
    current_price = wallet.update_price()
    digest.update(current_price)
    digest_value = digest.percentile(15)
    print('\n\nCurrent BNB/BTC price is {}. Digest value is {}'.format(
        current_price, digest_value), flush=True)
    if current_price > 1.1 * digest_value:
        wallet.buy_bnb(1)
    if current_price < 0.9 * digest_value:
        wallet.sell_bnb(1)
    percent = int(wallet.estimate_total() / initial_value * 100)
    print('\nCurrent wallet value is {}% of initial'.format(percent), flush=True)
import json

import pyjq
from tdigest import TDigest


def stream_read_json(fn):
    """Lazily yield successive JSON documents concatenated in a single file."""
    start_pos = 0
    with open(fn, 'r') as f:
        while True:
            try:
                obj = json.load(f)
                yield obj
                return
            except json.JSONDecodeError as e:
                f.seek(start_pos)
                json_str = f.read(e.pos)
                obj = json.loads(json_str)
                start_pos += e.pos
                yield obj


final_digest = TDigest()
json_object = stream_read_json('./metric-result-no-presumed-revenue.json')
for num, json_list in enumerate(json_object):
    num_list = pyjq.all('.[] | .metric', json_list)
    digest = TDigest()
    digest.batch_update(num_list)
    print(digest.percentile(50))
    final_digest = final_digest + digest

print('final: ' + str(final_digest.percentile(25)))
print('final: ' + str(final_digest.percentile(50)))
print('final: ' + str(final_digest.percentile(75)))
print('final: ' + str(final_digest.percentile(90)))
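# Illustration with synthetic data (not the metric file above): merging per-chunk
# digests with "+" gives roughly the same percentiles as digesting every value at once,
# which is what makes the per-file "final" percentiles meaningful.
import numpy as np
from tdigest import TDigest

values = np.random.lognormal(size=30000)
merged, whole = TDigest(), TDigest()
whole.batch_update(values)
for chunk in np.array_split(values, 10):
    part = TDigest()
    part.batch_update(chunk)
    merged = merged + part
print(merged.percentile(90), whole.percentile(90))  # close to each other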
class Percentiles(DataFrameModule): parameters = [('percentiles', object, [0.25, 0.5, 0.75]), ('history', np.dtype(int), 3)] def __init__(self, column, percentiles=None, **kwds): if not column: raise ProgressiveError('Need a column name') self._add_slots(kwds,'input_descriptors', [SlotDescriptor('df', type=pd.DataFrame)]) super(Percentiles, self).__init__(dataframe_slot='percentiles', **kwds) self._column = column self.default_step_size = 1000 self.tdigest = TDigest() if percentiles is None: percentiles = np.array([0.25, 0.5, 0.75]) else: # get them all to be in [0, 1] percentiles = np.asarray(percentiles) if (percentiles > 1).any(): percentiles = percentiles / 100.0 msg = ("percentiles should all be in the interval [0, 1]. " "Try {0} instead.") raise ValueError(msg.format(list(percentiles))) if (percentiles != 0.5).all(): # median isn't included lh = percentiles[percentiles < .5] uh = percentiles[percentiles > .5] percentiles = np.hstack([lh, 0.5, uh]) self._percentiles = percentiles self.schema = [(_pretty_name(x), np.dtype(float), np.nan) for x in self._percentiles] self.schema.append(DataFrameModule.UPDATE_COLUMN_DESC) self._df = create_dataframe(self.schema) def is_ready(self): if self.get_input_slot('df').has_created(): return True return super(Percentiles, self).is_ready() def run_step(self,run_number,step_size,howlong): dfslot = self.get_input_slot('df') dfslot.update(run_number) if dfslot.has_updated() or dfslot.has_deleted(): dfslot.reset() dfslot.update(run_number) self.tdigest = TDigest() # reset indices = dfslot.next_created(step_size) steps = indices_len(indices) if steps == 0: return self._return_run_step(self.state_blocked, steps_run=steps) input_df = dfslot.data() with dfslot.lock: x = self.filter_columns(input_df, fix_loc(indices)) self.tdigest.batch_update(x) df = self._df values = [] for p in self._percentiles: values.append(self.tdigest.percentile(p*100)) values.append(run_number) with self.lock: df.loc[run_number] = values if len(df) > self.params.history: self._df = df.loc[df.index[-self.params.history:]] return self._return_run_step(dfslot.next_state(), steps_run=steps, reads=steps, updates=len(self._df))
def column_summary(series, column_props, delta=0.01): """Summarise a numeric column. Parameters ---------- series : pd.Series Numeric column. column_props : TODO TODO delta : float TODO Returns ------- TODO """ col = series.name if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0: # Series is not numeric or is all NaNs. return None logger.debug("column_summary - " + col) # select non-nulls from column data = series.dropna() colresult = {} for m in ["mean", "min", "max", "std", "sum"]: val = getattr(data, m)() if type(val) is np.int64: colresult[m] = int(val) else: colresult[m] = val colresult["n"] = column_props[col]["notnulls"] percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9] colresult["percentiles"] = { perc: np.nanpercentile(series, perc) for perc in percentiles } colresult["median"] = colresult["percentiles"][50] colresult["iqr"] = (colresult["percentiles"][75] - colresult["percentiles"][25]) # Compute the t-digest. logger.debug("column_summary - {} - creating TDigest...".format(col)) digest = TDigest(delta) digest.batch_update(data) logger.debug("column_summary - {} - testing log trans...".format(col)) try: colresult["logtrans"] = bool(_test_logtrans(digest)) except Exception as e: # Hard to pinpoint problems with the logtrans TDigest. logger.warning("test_logtrans has failed for column `{}`: {}".format( col, e)) colresult["logtrans"] = False if colresult["logtrans"]: logdigest = TDigest() for c in digest.C.values(): logdigest.update(np.log(c.mean), c.count) colresult["logtrans_mean"] = _tdigest_mean(logdigest) colresult["logtrans_std"] = _tdigest_std(logdigest) colresult["logtrans_IQR"] = logdigest.percentile( 75) - logdigest.percentile(25) logger.debug("column_summary - {} - should {}be log-transformed".format( col, "NOT " if not colresult["logtrans"] else "")) # Compress and store the t-digest. digest.delta = delta digest.compress() colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()] # Compute histogram logger.debug("column_summary - {} - computing histogram...".format(col)) if column_props[col]["is_categorical"]: # Compute frequency table and store as histogram counts, edges = _compute_histogram_from_frequencies(data) else: if colresult["logtrans"]: counts, log_edges = np.histogram(np.log10(data), density=False, bins="fd") edges = 10**log_edges else: counts, edges = np.histogram(data, density=False, bins="fd") colresult["histogram"] = { "counts": counts.tolist(), "bin_edges": edges.tolist(), } # Compute KDE logger.debug("column_summary - {} - computing KDE...".format(col)) bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1) logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw)) if column_props[col]["is_categorical"]: kde_x, kde_y = np.zeros(1), np.zeros(1) else: coord_range = colresult["min"], colresult["max"] kde_x, kde_y = _compute_smoothed_histogram( data, bw, coord_range, logtrans=colresult["logtrans"]) colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()} return {col: colresult, "_columns": [col]}
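# Hedged sketch (an assumption; the real _tdigest_mean/_tdigest_std helpers are not
# shown in this snippet): a centroid-weighted mean and standard deviation can be read
# off a TDigest's centroids, which is presumably what the log-transform branch uses.
import math

def tdigest_mean(digest):
    total = sum(c.count for c in digest.C.values())
    return sum(c.mean * c.count for c in digest.C.values()) / total

def tdigest_std(digest):
    mu, total = tdigest_mean(digest), sum(c.count for c in digest.C.values())
    return math.sqrt(sum(c.count * (c.mean - mu) ** 2 for c in digest.C.values()) / total)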
def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname, indexname, docs,
                                stop_words, use_numeric_range_searchs, ts_digest, p_writes):
    total_benchmark_reads = 0
    total_benchmark_writes = 0
    all_csvfile = open(all_fname, 'a', newline='')
    bench_csvfile = open(bench_fname, 'w', newline='')
    all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    progress = tqdm(unit="docs", total=total_benchmark_commands)
    total_docs = len(docs)
    ## timestamp related
    timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
    min_ts = ts_digest.percentile(0.0)
    max_ts = ts_digest.percentile(100.0)
    query_range_digest = TDigest()
    generated_commands = 0
    while generated_commands < total_benchmark_commands:
        query_ts_pdist = timestamps_pdist[generated_commands]
        percentile = (1.0 - query_ts_pdist) * 100.0
        query_min_ts = ts_digest.percentile(percentile)
        random_doc_pos = random.randint(0, total_docs - 1)
        doc = docs[random_doc_pos]
        # decide read or write
        p_cmd = random.random()
        if p_cmd < p_writes:
            ## WRITE
            total_benchmark_writes = total_benchmark_writes + 1
            generated_row, doc_size = use_case_to_cmd(use_ftadd, doc["title"], doc["text"],
                                                      doc["comment"], doc["username"],
                                                      doc["timestamp"], generated_commands)
        else:
            ## READ
            total_benchmark_reads = total_benchmark_reads + 1
            words, totalW = getQueryWords(doc, stop_words, 2)
            choice = random.choices(["simple-1word-query", "2word-union-query",
                                     "2word-intersection-query"])[0]
            generated_row = None
            numeric_range_str = ""
            if use_numeric_range_searchs:
                numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                query_range_digest.update(int(max_ts - query_min_ts))
            if choice == "simple-1word-query" and len(words) >= 1:
                generated_row = generate_ft_search_row(indexname, "simple-1word-query",
                                                       "{}{}".format(numeric_range_str, words[0]))
            elif choice == "2word-union-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-union-query",
                                                       "{}{} {}".format(numeric_range_str, words[0], words[1]))
            elif choice == "2word-intersection-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-intersection-query",
                                                       "{}{}|{}".format(numeric_range_str, words[0], words[1]))
        if generated_row != None:
            # all_csv_writer.writerow(generated_row)
            # bench_csv_writer.writerow(generated_row)
            progress.update()
            generated_commands = generated_commands + 1
    progress.close()
    bench_csvfile.close()
    all_csvfile.close()
    # print()
    xx = []
    yy = []
    p90 = query_range_digest.percentile(90.0)
    dataset_percent = ts_digest.cdf(p90)
    print("90% of the read queries target at max {} percent of the keyspace".format(dataset_percent))
    print("100% of the read queries target at max {} percent of the keyspace".format(ts_digest.cdf(max_ts - min_ts)))
    for centroid in query_range_digest.centroids_to_list():
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(query_range_digest.cdf(ts_m))
    plt.scatter(xx, yy)
    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()
    return total_benchmark_reads, total_benchmark_writes
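# Illustration (synthetic timestamps, not the real document corpus): percentile() and
# cdf() are approximate inverses of each other, which is what the keyspace-coverage
# prints above rely on.
import numpy as np
from tdigest import TDigest

ts_digest = TDigest()
ts_digest.batch_update(np.random.uniform(1600000000, 1700000000, 50000))
p90_ts = ts_digest.percentile(90.0)
print(ts_digest.cdf(p90_ts))  # ~0.9: share of the keyspace at or below the 90th percentile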
class Percentiles(TableModule): parameters = [ ("percentiles", np.dtype(np.object_), [0.25, 0.5, 0.75]), ("history", np.dtype(int), 3), ] inputs = [SlotDescriptor("table", type=Table)] def __init__(self, column: str, percentiles: Optional[Union[List[float], np.ndarray[Any, Any]]] = None, **kwds: Any) -> None: if not column: raise ProgressiveError("Need a column name") super(Percentiles, self).__init__(**kwds) self._columns = [column] self.default_step_size = 1000 self.tdigest = TDigest() if percentiles is None: percentiles = np.array([0.25, 0.5, 0.75]) else: # get them all to be in [0, 1] percentiles = np.asarray(percentiles) if (percentiles > 1).any(): # type: ignore percentiles = percentiles / 100.0 msg = ("percentiles should all be in the interval [0, 1]. " "Try {0} instead.") raise ValueError(msg.format(list(percentiles))) if (percentiles != 0.5).all(): # median isn't included lh = percentiles[percentiles < 0.5] uh = percentiles[percentiles > 0.5] percentiles = np.hstack([lh, 0.5, uh]) self._percentiles = percentiles self._pername: List[str] = [_pretty_name(x) for x in self._percentiles] dshape = "{" + ",".join(["%s: real" % n for n in self._pername]) + "}" self.result = Table(self.generate_table_name("percentiles"), dshape=dshape, create=True) def is_ready(self) -> bool: slot = self.get_input_slot("table") if slot is not None and slot.created.any(): return True return super(Percentiles, self).is_ready() def reset(self) -> None: self.tdigest = TDigest() @process_slot("table", reset_cb="reset") @run_if_any def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep: assert self.context with self.context as ctx: dfslot = ctx.table indices = dfslot.created.next(length=step_size) steps = indices_len(indices) if steps == 0: return self._return_run_step(self.state_blocked, steps_run=steps) input_df = dfslot.data() x = self.filter_columns(input_df, fix_loc(indices)) self.tdigest.batch_update(x[0]) df = self.table values = {} for n, p in zip(self._pername, self._percentiles): values[n] = self.tdigest.percentile(p * 100) df.add(values) # with self.lock: # df.loc[run_number] = values # if len(df) > self.params.history: # self._df = df.loc[df.index[-self.params.history:]] return self._return_run_step(self.next_state(dfslot), steps_run=steps)
class IngestionThread(threading.Thread): def __init__(self, threadId, args, dimensionMetrics, dimensionEvents, highUtilizationHosts, lowUtilizationHosts, event): threading.Thread.__init__(self) self.threadId = threadId self.args = args self.dimensionMetrics = dimensionMetrics self.dimensionEvents = dimensionEvents self.client = tswrite.createWriteClient(args.endpoint, profile=args.profile) self.databaseName = args.databaseName self.tableName = args.tableName self.numMetrics = len(dimensionMetrics) self.numEvents = len(dimensionEvents) self.digest = TDigest( ) ## Use the t-digest to compute the streaming percentiles self.count = 0 self.success = 0 self.sum = 0.0 self.variance = float('nan') self.highUtilizationHosts = highUtilizationHosts self.lowUtilizationHosts = lowUtilizationHosts self.sigInt = False self.event = event def run(self): global seriesId global timestamp global lock idx = 0 mean = 0.0 squared = 0.0 while True: with lock: if self.sigInt == True or sigInt == True or self.event.is_set( ): print("Thread {} exiting.".format(self.threadId)) break seriesId += 1 if seriesId >= self.numMetrics + self.numEvents: ## Wrapping around, so move to new timestamp. seriesId = 0 newTimestamp = timestamp + self.args.intervalMillis currentTime = getCurrentTimestampMillis() ## Check if the timestamps are falling behind if newTimestamp < currentTime - 0.05 * self.args.intervalMillis: print( "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes." .format(self.args.intervalMillis, currentTime - timestamp)) ## Move time forward. timestamp = getTimestampMillis() else: timestamp = newTimestamp ## Check if we are ingesting too fast, then slow down. if timestamp > currentTime - 1000: ## Slow down sleepTimeSecs = int( (timestamp - currentTime) / 1000) print("Thread {} sleeping for {} secs".format( self.threadId, sleepTimeSecs)) time.sleep(sleepTimeSecs) now = datetime.datetime.now() print( "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}." 
.format(self.threadId, now.strftime("%Y-%m-%d %H:%M:%S"), timestamp)) localSeriesId = seriesId localTimestamp = timestamp if localSeriesId < self.numMetrics: commonAttributes = model.createWriteRecordCommonAttributes( self.dimensionMetrics[localSeriesId]) records = model.createRandomMetrics(seriesId, localTimestamp, "MILLISECONDS", self.highUtilizationHosts, self.lowUtilizationHosts) else: commonAttributes = model.createWriteRecordCommonAttributes( self.dimensionEvents[localSeriesId - self.numMetrics]) records = model.createRandomEvent(localTimestamp, "MILLISECONDS") idx += 1 start = timer() try: writeResult = tswrite.writeRecords(self.client, self.databaseName, self.tableName, commonAttributes, records) self.success += 1 except Exception as e: print(e) exc_type, exc_value, exc_traceback = sys.exc_info() traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) requestId = "RequestId: {}".format( e.response['ResponseMetadata']['RequestId']) print(requestId) print(json.dumps(commonAttributes, indent=2)) print(json.dumps(records, indent=2)) continue finally: self.count += 1 end = timer() cur = end - start self.digest.update(cur) self.sum += cur ## Computing the streaming M^2 (squared distance from mean) delta = cur - mean mean += delta / self.count squared += delta * (cur - mean) if self.count > 1: self.variance = float(squared / (self.count - 1)) requestId = writeResult['ResponseMetadata']['RequestId'] if idx % 1000 == 0: now = datetime.datetime.now() print( "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}" .format(self.threadId, idx, now.strftime("%Y-%m-%d %H:%M:%S"), requestId, round(self.sum / self.count, 3), round(math.sqrt(self.variance), 3), round(self.digest.percentile(50), 3), round(self.digest.percentile(90), 3), round(self.digest.percentile(99), 3))) def interrupt(self): print("Interrupting thread: ", self.threadId) self.sigInt = True