# The original left `random`, `wSize` and `results` undefined; numpy's random()
# and a window size of 100 are assumed here to make the snippet runnable.
from numpy.random import random
from tdigest import TDigest

wSize = 100


def main():
    data = random(1000)
    results = []
    # Slide a window over the data and record the 15th percentile of each window.
    for i in range(len(data) - wSize + 1):
        digest = TDigest()
        digest.batch_update(data[i:i + wSize])
        results.append([i + 1, digest.percentile(15)])
    print(results)
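# Cross-check sketch (not part of the original script): on a single window the
# t-digest's percentile estimate should land close to numpy's exact percentile.
import numpy as np
from tdigest import TDigest

window = np.random.random(100)
d = TDigest()
d.batch_update(window)
print(d.percentile(15), np.percentile(window, 15))  # approximately equal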
def get_footer_stats(self):
    try:
        self.check_progress = False
        try:
            avg_ms = sum(self.ms_list) / len(self.ms_list)
        except ZeroDivisionError:
            # Handles the case where every check failed (empty ms_list).
            avg_ms = 0
        # Print the percentile distribution.
        if sys.modules.get("tdigest"):
            digest = TDigest()
            for item in self.ms_list:
                digest.update(item)
            p50 = digest.percentile(50)
            p90 = digest.percentile(90)
            p99 = digest.percentile(99)
            percentile_string = "\tp50: %.2f p90: %.2f p99: %.2f" % (p50, p90, p99)
        else:
            percentile_string = ""
        footer = "\rtotal: %d success: %d failure: %d s_rate: %.2f f_rate: %.2f avg_ms: %.2f ms" % (
            self.check_count, self.check_success_count, self.check_failure_count,
            self.check_success_count / self.check_count,
            self.check_failure_count / self.check_count, avg_ms)
        #print(footer + percentile_string, file=sys.stderr, flush=True)
        print(footer + percentile_string, flush=True)
        if self.check_success_count / self.check_count == 0:
            # If nothing succeeded, exit with status code 1.
            exit(1)
        elif self.check_success_count / self.check_count < 1:
            # Exit with the integer success-rate percentage as the status code.
            exit(trunc(self.check_success_count / self.check_count * 100))
        else:
            # Everything succeeded: exit code 0.
            #exit(0)
            pass
    except Exception:
        # Catching Exception (not BaseException) keeps exit()'s SystemExit from being swallowed.
        pass
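# Illustrative sketch (hypothetical helper, not from the original): the exit-code
# scheme above encodes the success rate, so a caller can distinguish "all failed" (1),
# "partial success" (integer percentage), and "all succeeded" (0).
from math import trunc

def exit_code(success, total):
    rate = success / total
    if rate == 0:
        return 1
    if rate < 1:
        return trunc(rate * 100)
    return 0

assert exit_code(0, 10) == 1
assert exit_code(3, 4) == 75
assert exit_code(10, 10) == 0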
class DigestMachine(DistMachine):

    def __init__(self):
        super().__init__()
        self.digest = TDigest()

    def update(self, value, dt=None, **kwargs):
        self.digest.update(value)

    def inv_cdf(self, p):
        return self.digest.percentile(100. * p)
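# Usage sketch (illustrative; standard-normal data assumed): inv_cdf() takes p in
# [0, 1] and maps it onto TDigest.percentile(), which expects a value in [0, 100].
import numpy as np

machine = DigestMachine()
for v in np.random.standard_normal(10000):
    machine.update(v)
print(machine.inv_cdf(0.5))    # approximate median, near 0.0
print(machine.inv_cdf(0.975))  # near 1.96 for N(0, 1) data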
class Digest:

    def __init__(self):
        self.digest = TDigest()
        self.digest.update(0)
        self._count = 0
        self.lock = asyncio.Lock()

    def add(self, v):
        self.digest.update(v)
        self._count += 1

    def percentile(self, v):
        return self.digest.percentile(v)

    def count(self):
        return self._count
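# Hedged usage sketch (an assumption, since the wrapper above never acquires its own
# lock): concurrent coroutines would be expected to take digest.lock themselves.
import asyncio

async def record(d, samples):
    for s in samples:
        async with d.lock:  # serialise updates across coroutines
            d.add(s)

async def demo():
    d = Digest()
    await asyncio.gather(record(d, range(100)), record(d, range(100, 200)))
    print(d.count(), d.percentile(50))

asyncio.run(demo())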
class IngestionThread(threading.Thread): def __init__(self, threadId, args, dimensionMetrics, dimensionEvents, highUtilizationHosts, lowUtilizationHosts, event): threading.Thread.__init__(self) self.threadId = threadId self.args = args self.dimensionMetrics = dimensionMetrics self.dimensionEvents = dimensionEvents self.client = tswrite.createWriteClient(region=args.region, profile=args.profile, endpoint=args.endpoint) self.databaseName = args.databaseName self.tableName = args.tableName self.numMetrics = len(dimensionMetrics) self.numEvents = len(dimensionEvents) self.digest = TDigest( ) ## Use the t-digest to compute the streaming percentiles self.count = 0 self.success = 0 self.sum = 0.0 self.variance = float('nan') self.highUtilizationHosts = highUtilizationHosts self.lowUtilizationHosts = lowUtilizationHosts self.sigInt = False self.event = event self.recordsWritten = 0 def run(self): global seriesId global timestamp global lock idx = 0 mean = 0.0 squared = 0.0 addReqId = self.args.addReqId addReqIdAsDim = addReqId and self.args.addReqIdAsDim addReqIdAsMeasure = addReqId and not self.args.addReqIdAsDim writeRecordsBatch = list() recordsToWrite = list() while True: with lock: if self.sigInt == True or sigInt == True or self.event.is_set( ): print("Thread {} exiting.".format(self.threadId)) break seriesId += 1 if seriesId >= self.numMetrics + self.numEvents: ## Wrapping around, so move to new timestamp. seriesId = 0 newTimestamp = timestamp + self.args.intervalMillis currentTime = getCurrentTimestampMillis() ## Check if the timestamps are falling behind if newTimestamp < currentTime - 0.05 * self.args.intervalMillis: print( "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes." .format(self.args.intervalMillis, currentTime - timestamp)) ## Move time forward. timestamp = getTimestampMillis() else: timestamp = newTimestamp ## Check if we are ingesting too fast, then slow down. if timestamp > currentTime - 1000: ## Slow down sleepTimeSecs = int( (timestamp - currentTime) / 1000) print("Thread {} sleeping for {} secs".format( self.threadId, sleepTimeSecs)) time.sleep(sleepTimeSecs) now = datetime.datetime.now() print( "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}." .format(self.threadId, now.strftime("%Y-%m-%d %H:%M:%S"), timestamp)) localSeriesId = seriesId localTimestamp = timestamp if localSeriesId < self.numMetrics: dimensions = model.createDimensionsEntry( self.dimensionMetrics[localSeriesId], addReqId=addReqIdAsDim) records = model.createRandomMetrics(localSeriesId, dimensions, localTimestamp, "MILLISECONDS", self.highUtilizationHosts, self.lowUtilizationHosts, wide=self.args.wide, addReqId=addReqIdAsMeasure) else: dimensions = model.createDimensionsEntry( self.dimensionEvents[localSeriesId - self.numMetrics], addReqId=addReqIdAsDim) records = model.createRandomEvent(dimensions, localTimestamp, "MILLISECONDS", wide=self.args.wide, addReqId=addReqIdAsMeasure) if self.args.batchWrites: if len(writeRecordsBatch) + len( records) <= self.args.batchSize: writeRecordsBatch.extend(records) ## Generate more data, unless we're wrapping around, at which point, drain any pending records. 
if localSeriesId < self.numMetrics + self.numEvents: continue else: ## transfer a subset of values from the records produced into the batch spaceRemaining = self.args.batchSize - len( writeRecordsBatch) assert (spaceRemaining < len(records)) ## Transfer 0 - spaceRemaining - 1 to be written with this batch, and spaceRemaining - end in the next batch ## If spaceRemaining is 0, then just write what we have accumulated so far if spaceRemaining > 0: writeRecordsBatch.extend(records[0:spaceRemaining]) ## The batch is full, now we issue the write record request. recordsToWrite.clear() recordsToWrite.extend(writeRecordsBatch) writeRecordsBatch.clear() writeRecordsBatch.extend(records[spaceRemaining:]) else: recordsToWrite.clear() recordsToWrite.extend(records) idx += 1 start = timer() try: writeResult = tswrite.writeRecords(self.client, self.databaseName, self.tableName, recordsToWrite) self.recordsWritten += len(recordsToWrite) self.success += 1 except Exception as e: print(e) exc_type, exc_value, exc_traceback = sys.exc_info() traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) requestId = "RequestId: {}".format( e.response['ResponseMetadata']['RequestId']) print(requestId) print(json.dumps(dimensions, indent=2)) print(json.dumps(records, indent=2)) continue finally: self.count += 1 end = timer() cur = end - start self.digest.update(cur) self.sum += cur ## Computing the streaming M^2 (squared distance from mean) delta = cur - mean mean += delta / self.count squared += delta * (cur - mean) if self.count > 1: self.variance = float(squared / (self.count - 1)) requestId = writeResult['ResponseMetadata']['RequestId'] if idx % 1000 == 0: now = datetime.datetime.now() print( "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}. Records written: {}" .format(self.threadId, idx, now.strftime("%Y-%m-%d %H:%M:%S"), requestId, round(self.sum / self.count, 3), round(math.sqrt(self.variance), 3), round(self.digest.percentile(50), 3), round(self.digest.percentile(90), 3), round(self.digest.percentile(99), 3), self.recordsWritten)) def interrupt(self): print("Interrupting thread: ", self.threadId) self.sigInt = True
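# Clarifying sketch (illustration only): the mean/squared bookkeeping in run() above is
# Welford's online algorithm; the sample variance is M2 / (n - 1) once n > 1.
def welford(samples):
    count, mean, m2 = 0, 0.0, 0.0
    for x in samples:
        count += 1
        delta = x - mean
        mean += delta / count
        m2 += delta * (x - mean)
    return mean, m2 / (count - 1) if count > 1 else float('nan')

print(welford([0.012, 0.015, 0.011, 0.018]))  # (mean latency, sample variance)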
def ingestRecordsMultiProc(dimensionsMetrics, dimensionsEvents, args): ## Register sigint handler signal.signal(signal.SIGINT, signalHandlerMultiProc) numHosts = len(dimensionsMetrics) remainder = numHosts % args.processes startId = 0 ingestionStart = timer() for processId in range(1, args.processes + 1): endId = startId + int( numHosts / args.processes) + (1 if remainder > 0 else 0) if endId > numHosts: print( "Number of processes more than number of hosts, skipping process creation" ) break print("Starting process {} with host ranges: [{}, {}]".format( processId, startId, endId - 1)) ## Select a subset of hosts dimensionsMetricsLocal = dimensionsMetrics[startId:endId] dimensionsMetricsSet = set() for dim in dimensionsMetricsLocal: dimensionsMetricsSet.add( (dim.region, dim.cell, dim.silo, dim.availability_zone, dim.microservice_name, dim.instance_name)) dimensionsEventsLocal = list() ## Select the dimension events for the hosts selected above. for dim in dimensionsEvents: host = (dim.region, dim.cell, dim.silo, dim.availability_zone, dim.microservice_name, dim.instance_name) if host in dimensionsMetricsSet: dimensionsEventsLocal.append(dim) print( "Starting process {} with host ranges: [{}, {}]. Metrics: {}. Events: {}" .format(processId, startId, endId - 1, len(dimensionsMetricsLocal), len(dimensionsEventsLocal))) lowUtilizationHosts, highUtilizationHosts = initializeHighAndLowUtilizationHosts( len(dimensionsMetricsLocal)) parentConn, childConn = multiprocessing.Pipe() manager = multiprocessing.Manager() event = manager.Event() process = MultiProcessIngestWorker( processId, args, dimensionsMetricsLocal, dimensionsEventsLocal, highUtilizationHosts, lowUtilizationHosts, childConn, event) process.start() processes.append((process, parentConn, event)) remainder -= 1 startId = endId success = 0 count = 0 recordsWritten = 0 totalLatency = 0.0 aggregatedDigests = TDigest() pooledVariance = 0.0 for p, conn, event in processes: output = conn.recv() p.join() if output == None: continue success += output.success ## Pool the variance. if count == 0: pooledVariance = output.variance else: pooledVariance = ((count - 1) * pooledVariance + (output.count - 1) * output.variance) / ( (count - 1) + (output.count - 1)) count += output.count recordsWritten += output.records aggregatedDigests += output.digest totalLatency += output.sum print( "[OVERALL] Total={:,}, Success={:,}, Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}. Records written: {}" .format(count, success, round(totalLatency / count, 3), round(math.sqrt(pooledVariance), 3), round(aggregatedDigests.percentile(50), 3), round(aggregatedDigests.percentile(90), 3), round(aggregatedDigests.percentile(99), 3), recordsWritten)) ingestionEnd = timer() print("Total time to ingest: {:,} seconds. TPS: {:,}, Records/sec: {:,}". format(round(ingestionEnd - ingestionStart, 2), round(count / (ingestionEnd - ingestionStart), 2), round(recordsWritten / (ingestionEnd - ingestionStart), 2)))
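# Stripped-down sketch of the combine step above (an illustration, not the original
# code): the tdigest package overloads "+" to merge digests, and the per-worker sample
# variances are pooled weighted by their degrees of freedom.
from tdigest import TDigest

def combine(workers):
    # each worker exposes .digest, .count, .variance and .sum, as in the snippets above
    merged, count, total, pooled_var = TDigest(), 0, 0.0, 0.0
    for w in workers:
        if count == 0:
            pooled_var = w.variance
        else:
            pooled_var = ((count - 1) * pooled_var + (w.count - 1) * w.variance) / (
                (count - 1) + (w.count - 1))
        count += w.count
        total += w.sum
        merged = merged + w.digest
    return merged, count, total / count, pooled_var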
def run(self): global lock global seriesId global timestamp with lock: ## Randomly pick a series ID to start for this process. seriesId = random.randint( 0, len(self.dimensionEvents) + len(self.dimensionMetrics) - 1) timestamp = getTimestampMillis() print("Process {} using start series ID: {}".format( self.processId, seriesId)) ## Register sigint handler signal.signal(signal.SIGINT, signalHandler) overallSummary = None ingestionStart = timer() try: for threadId in range(self.args.concurrency): threadIdStr = "{}-{}".format(self.processId, threadId + 1) print("Starting ThreadId: {}".format(threadIdStr)) thread = IngestionThread(threadIdStr, self.args, self.dimensionMetrics, self.dimensionEvents, self.highUtilizationHosts, self.lowUtilizationHosts, self.event) thread.start() self.threads.append(thread) success = 0 count = 0 recordsWritten = 0 totalLatency = 0.0 aggregatedDigests = TDigest() pooledVariance = 0.0 for t in self.threads: t.join() success += t.success ## Pool the variance. if count == 0: pooledVariance = t.variance else: pooledVariance = ((count - 1) * pooledVariance + (t.count - 1) * t.variance) / ( (count - 1) + (t.count - 1)) count += t.count recordsWritten += t.recordsWritten aggregatedDigests += t.digest totalLatency += t.sum print( "[Process: {}] Total={:,}, Success={:,}, Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}. Records written: {}" .format(self.processId, count, success, round(totalLatency / count, 3), round(math.sqrt(pooledVariance), 3), round(aggregatedDigests.percentile(50), 3), round(aggregatedDigests.percentile(90), 3), round(aggregatedDigests.percentile(99), 3), recordsWritten)) overallSummary = IngestionSummaryStats(aggregatedDigests, count, success, totalLatency, pooledVariance, recordsWritten) ingestionEnd = timer() print("Total time to ingest: {:,} seconds".format( round(ingestionEnd - ingestionStart, 2))) finally: self.conn.send(overallSummary)
digest = TDigest()
wallet = simWallet()
wallet.print_wallet()
initial_value = wallet.estimate_total()
print('Initial wallet value is {} BTC.'.format(initial_value), flush=True)

while True:
    current_price = wallet.update_price()
    digest.update(current_price)
    digest_value = digest.percentile(15)
    print('\n\nCurrent BNB/BTC price is {}. Digest value is {}'.format(
        current_price, digest_value), flush=True)
    if current_price > 1.1 * digest_value:
        wallet.buy_bnb(1)
    if current_price < 0.9 * digest_value:
        wallet.sell_bnb(1)
    percent = int(wallet.estimate_total() / initial_value * 100)
    print('\nCurrent wallet value is {}% of initial'.format(percent), flush=True)
import json

import pyjq
from tdigest import TDigest


def stream_read_json(fn):
    """Lazily yield successive JSON documents concatenated in a single file."""
    start_pos = 0
    with open(fn, 'r') as f:
        while True:
            try:
                obj = json.load(f)
                yield obj
                return
            except json.JSONDecodeError as e:
                f.seek(start_pos)
                json_str = f.read(e.pos)
                obj = json.loads(json_str)
                start_pos += e.pos
                yield obj


final_digest = TDigest()
json_object = stream_read_json('./metric-result-no-presumed-revenue.json')
for num, json_list in enumerate(json_object):
    num_list = pyjq.all('.[] | .metric', json_list)
    digest = TDigest()
    digest.batch_update(num_list)
    print(digest.percentile(50))
    final_digest = final_digest + digest

print('final: ' + str(final_digest.percentile(25)))
print('final: ' + str(final_digest.percentile(50)))
print('final: ' + str(final_digest.percentile(75)))
print('final: ' + str(final_digest.percentile(90)))
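# Illustration with synthetic data (not the metric file above): merging per-chunk
# digests with "+" gives roughly the same percentiles as digesting every value at once,
# which is what makes the per-file "final" percentiles meaningful.
import numpy as np
from tdigest import TDigest

values = np.random.lognormal(size=30000)
merged, whole = TDigest(), TDigest()
whole.batch_update(values)
for chunk in np.array_split(values, 10):
    part = TDigest()
    part.batch_update(chunk)
    merged = merged + part
print(merged.percentile(90), whole.percentile(90))  # close to each other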
class Percentiles(DataFrameModule): parameters = [('percentiles', object, [0.25, 0.5, 0.75]), ('history', np.dtype(int), 3)] def __init__(self, column, percentiles=None, **kwds): if not column: raise ProgressiveError('Need a column name') self._add_slots(kwds,'input_descriptors', [SlotDescriptor('df', type=pd.DataFrame)]) super(Percentiles, self).__init__(dataframe_slot='percentiles', **kwds) self._column = column self.default_step_size = 1000 self.tdigest = TDigest() if percentiles is None: percentiles = np.array([0.25, 0.5, 0.75]) else: # get them all to be in [0, 1] percentiles = np.asarray(percentiles) if (percentiles > 1).any(): percentiles = percentiles / 100.0 msg = ("percentiles should all be in the interval [0, 1]. " "Try {0} instead.") raise ValueError(msg.format(list(percentiles))) if (percentiles != 0.5).all(): # median isn't included lh = percentiles[percentiles < .5] uh = percentiles[percentiles > .5] percentiles = np.hstack([lh, 0.5, uh]) self._percentiles = percentiles self.schema = [(_pretty_name(x), np.dtype(float), np.nan) for x in self._percentiles] self.schema.append(DataFrameModule.UPDATE_COLUMN_DESC) self._df = create_dataframe(self.schema) def is_ready(self): if self.get_input_slot('df').has_created(): return True return super(Percentiles, self).is_ready() def run_step(self,run_number,step_size,howlong): dfslot = self.get_input_slot('df') dfslot.update(run_number) if dfslot.has_updated() or dfslot.has_deleted(): dfslot.reset() dfslot.update(run_number) self.tdigest = TDigest() # reset indices = dfslot.next_created(step_size) steps = indices_len(indices) if steps == 0: return self._return_run_step(self.state_blocked, steps_run=steps) input_df = dfslot.data() with dfslot.lock: x = self.filter_columns(input_df, fix_loc(indices)) self.tdigest.batch_update(x) df = self._df values = [] for p in self._percentiles: values.append(self.tdigest.percentile(p*100)) values.append(run_number) with self.lock: df.loc[run_number] = values if len(df) > self.params.history: self._df = df.loc[df.index[-self.params.history:]] return self._return_run_step(dfslot.next_state(), steps_run=steps, reads=steps, updates=len(self._df))
def column_summary(series, column_props, delta=0.01): """Summarise a numeric column. Parameters ---------- series : pd.Series Numeric column. column_props : TODO TODO delta : float TODO Returns ------- TODO """ col = series.name if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0: # Series is not numeric or is all NaNs. return None logger.debug("column_summary - " + col) # select non-nulls from column data = series.dropna() colresult = {} for m in ["mean", "min", "max", "std", "sum"]: val = getattr(data, m)() if type(val) is np.int64: colresult[m] = int(val) else: colresult[m] = val colresult["n"] = column_props[col]["notnulls"] percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9] colresult["percentiles"] = { perc: np.nanpercentile(series, perc) for perc in percentiles } colresult["median"] = colresult["percentiles"][50] colresult["iqr"] = (colresult["percentiles"][75] - colresult["percentiles"][25]) # Compute the t-digest. logger.debug("column_summary - {} - creating TDigest...".format(col)) digest = TDigest(delta) digest.batch_update(data) logger.debug("column_summary - {} - testing log trans...".format(col)) try: colresult["logtrans"] = bool(_test_logtrans(digest)) except Exception as e: # Hard to pinpoint problems with the logtrans TDigest. logger.warning("test_logtrans has failed for column `{}`: {}".format( col, e)) colresult["logtrans"] = False if colresult["logtrans"]: logdigest = TDigest() for c in digest.C.values(): logdigest.update(np.log(c.mean), c.count) colresult["logtrans_mean"] = _tdigest_mean(logdigest) colresult["logtrans_std"] = _tdigest_std(logdigest) colresult["logtrans_IQR"] = logdigest.percentile( 75) - logdigest.percentile(25) logger.debug("column_summary - {} - should {}be log-transformed".format( col, "NOT " if not colresult["logtrans"] else "")) # Compress and store the t-digest. digest.delta = delta digest.compress() colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()] # Compute histogram logger.debug("column_summary - {} - computing histogram...".format(col)) if column_props[col]["is_categorical"]: # Compute frequency table and store as histogram counts, edges = _compute_histogram_from_frequencies(data) else: if colresult["logtrans"]: counts, log_edges = np.histogram(np.log10(data), density=False, bins="fd") edges = 10**log_edges else: counts, edges = np.histogram(data, density=False, bins="fd") colresult["histogram"] = { "counts": counts.tolist(), "bin_edges": edges.tolist(), } # Compute KDE logger.debug("column_summary - {} - computing KDE...".format(col)) bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1) logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw)) if column_props[col]["is_categorical"]: kde_x, kde_y = np.zeros(1), np.zeros(1) else: coord_range = colresult["min"], colresult["max"] kde_x, kde_y = _compute_smoothed_histogram( data, bw, coord_range, logtrans=colresult["logtrans"]) colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()} return {col: colresult, "_columns": [col]}
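# Hedged sketch (an assumption; the real _tdigest_mean/_tdigest_std helpers are not
# shown in this snippet): a centroid-weighted mean and standard deviation can be read
# off a TDigest's centroids, which is presumably what the log-transform branch uses.
import math

def tdigest_mean(digest):
    total = sum(c.count for c in digest.C.values())
    return sum(c.mean * c.count for c in digest.C.values()) / total

def tdigest_std(digest):
    mu, total = tdigest_mean(digest), sum(c.count for c in digest.C.values())
    return math.sqrt(sum(c.count * (c.mean - mu) ** 2 for c in digest.C.values()) / total)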
def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname, indexname, docs,
                                stop_words, use_numeric_range_searchs, ts_digest, p_writes):
    total_benchmark_reads = 0
    total_benchmark_writes = 0
    all_csvfile = open(all_fname, 'a', newline='')
    bench_csvfile = open(bench_fname, 'w', newline='')
    all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    progress = tqdm(unit="docs", total=total_benchmark_commands)
    total_docs = len(docs)
    ## timestamp related
    timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
    min_ts = ts_digest.percentile(0.0)
    max_ts = ts_digest.percentile(100.0)
    query_range_digest = TDigest()
    generated_commands = 0
    while generated_commands < total_benchmark_commands:
        query_ts_pdist = timestamps_pdist[generated_commands]
        percentile = (1.0 - query_ts_pdist) * 100.0
        query_min_ts = ts_digest.percentile(percentile)
        random_doc_pos = random.randint(0, total_docs - 1)
        doc = docs[random_doc_pos]
        # decide read or write
        p_cmd = random.random()
        if p_cmd < p_writes:
            ## WRITE
            total_benchmark_writes = total_benchmark_writes + 1
            generated_row, doc_size = use_case_to_cmd(use_ftadd, doc["title"], doc["text"],
                                                      doc["comment"], doc["username"],
                                                      doc["timestamp"], generated_commands)
        else:
            ## READ
            total_benchmark_reads = total_benchmark_reads + 1
            words, totalW = getQueryWords(doc, stop_words, 2)
            choice = random.choices(["simple-1word-query", "2word-union-query",
                                     "2word-intersection-query"])[0]
            generated_row = None
            numeric_range_str = ""
            if use_numeric_range_searchs:
                numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                query_range_digest.update(int(max_ts - query_min_ts))
            if choice == "simple-1word-query" and len(words) >= 1:
                generated_row = generate_ft_search_row(indexname, "simple-1word-query",
                                                       "{}{}".format(numeric_range_str, words[0]))
            elif choice == "2word-union-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-union-query",
                                                       "{}{} {}".format(numeric_range_str, words[0], words[1]))
            elif choice == "2word-intersection-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-intersection-query",
                                                       "{}{}|{}".format(numeric_range_str, words[0], words[1]))
        if generated_row != None:
            # all_csv_writer.writerow(generated_row)
            # bench_csv_writer.writerow(generated_row)
            progress.update()
            generated_commands = generated_commands + 1
    progress.close()
    bench_csvfile.close()
    all_csvfile.close()
    # print()
    xx = []
    yy = []
    p90 = query_range_digest.percentile(90.0)
    dataset_percent = ts_digest.cdf(p90)
    print("90% of the read queries target at max {} percent of the keyspace".format(dataset_percent))
    print("100% of the read queries target at max {} percent of the keyspace".format(ts_digest.cdf(max_ts - min_ts)))
    for centroid in query_range_digest.centroids_to_list():
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(query_range_digest.cdf(ts_m))
    plt.scatter(xx, yy)
    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()
    return total_benchmark_reads, total_benchmark_writes
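# Illustration (synthetic timestamps, not the real document corpus): percentile() and
# cdf() are approximate inverses of each other, which is what the keyspace-coverage
# prints above rely on.
import numpy as np
from tdigest import TDigest

ts_digest = TDigest()
ts_digest.batch_update(np.random.uniform(1600000000, 1700000000, 50000))
p90_ts = ts_digest.percentile(90.0)
print(ts_digest.cdf(p90_ts))  # ~0.9: share of the keyspace at or below the 90th percentile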
class Percentiles(TableModule): parameters = [ ("percentiles", np.dtype(np.object_), [0.25, 0.5, 0.75]), ("history", np.dtype(int), 3), ] inputs = [SlotDescriptor("table", type=Table)] def __init__(self, column: str, percentiles: Optional[Union[List[float], np.ndarray[Any, Any]]] = None, **kwds: Any) -> None: if not column: raise ProgressiveError("Need a column name") super(Percentiles, self).__init__(**kwds) self._columns = [column] self.default_step_size = 1000 self.tdigest = TDigest() if percentiles is None: percentiles = np.array([0.25, 0.5, 0.75]) else: # get them all to be in [0, 1] percentiles = np.asarray(percentiles) if (percentiles > 1).any(): # type: ignore percentiles = percentiles / 100.0 msg = ("percentiles should all be in the interval [0, 1]. " "Try {0} instead.") raise ValueError(msg.format(list(percentiles))) if (percentiles != 0.5).all(): # median isn't included lh = percentiles[percentiles < 0.5] uh = percentiles[percentiles > 0.5] percentiles = np.hstack([lh, 0.5, uh]) self._percentiles = percentiles self._pername: List[str] = [_pretty_name(x) for x in self._percentiles] dshape = "{" + ",".join(["%s: real" % n for n in self._pername]) + "}" self.result = Table(self.generate_table_name("percentiles"), dshape=dshape, create=True) def is_ready(self) -> bool: slot = self.get_input_slot("table") if slot is not None and slot.created.any(): return True return super(Percentiles, self).is_ready() def reset(self) -> None: self.tdigest = TDigest() @process_slot("table", reset_cb="reset") @run_if_any def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep: assert self.context with self.context as ctx: dfslot = ctx.table indices = dfslot.created.next(length=step_size) steps = indices_len(indices) if steps == 0: return self._return_run_step(self.state_blocked, steps_run=steps) input_df = dfslot.data() x = self.filter_columns(input_df, fix_loc(indices)) self.tdigest.batch_update(x[0]) df = self.table values = {} for n, p in zip(self._pername, self._percentiles): values[n] = self.tdigest.percentile(p * 100) df.add(values) # with self.lock: # df.loc[run_number] = values # if len(df) > self.params.history: # self._df = df.loc[df.index[-self.params.history:]] return self._return_run_step(self.next_state(dfslot), steps_run=steps)
class IngestionThread(threading.Thread): def __init__(self, threadId, args, dimensionMetrics, dimensionEvents, highUtilizationHosts, lowUtilizationHosts, event): threading.Thread.__init__(self) self.threadId = threadId self.args = args self.dimensionMetrics = dimensionMetrics self.dimensionEvents = dimensionEvents self.client = tswrite.createWriteClient(args.endpoint, profile=args.profile) self.databaseName = args.databaseName self.tableName = args.tableName self.numMetrics = len(dimensionMetrics) self.numEvents = len(dimensionEvents) self.digest = TDigest( ) ## Use the t-digest to compute the streaming percentiles self.count = 0 self.success = 0 self.sum = 0.0 self.variance = float('nan') self.highUtilizationHosts = highUtilizationHosts self.lowUtilizationHosts = lowUtilizationHosts self.sigInt = False self.event = event def run(self): global seriesId global timestamp global lock idx = 0 mean = 0.0 squared = 0.0 while True: with lock: if self.sigInt == True or sigInt == True or self.event.is_set( ): print("Thread {} exiting.".format(self.threadId)) break seriesId += 1 if seriesId >= self.numMetrics + self.numEvents: ## Wrapping around, so move to new timestamp. seriesId = 0 newTimestamp = timestamp + self.args.intervalMillis currentTime = getCurrentTimestampMillis() ## Check if the timestamps are falling behind if newTimestamp < currentTime - 0.05 * self.args.intervalMillis: print( "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes." .format(self.args.intervalMillis, currentTime - timestamp)) ## Move time forward. timestamp = getTimestampMillis() else: timestamp = newTimestamp ## Check if we are ingesting too fast, then slow down. if timestamp > currentTime - 1000: ## Slow down sleepTimeSecs = int( (timestamp - currentTime) / 1000) print("Thread {} sleeping for {} secs".format( self.threadId, sleepTimeSecs)) time.sleep(sleepTimeSecs) now = datetime.datetime.now() print( "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}." 
.format(self.threadId, now.strftime("%Y-%m-%d %H:%M:%S"), timestamp)) localSeriesId = seriesId localTimestamp = timestamp if localSeriesId < self.numMetrics: commonAttributes = model.createWriteRecordCommonAttributes( self.dimensionMetrics[localSeriesId]) records = model.createRandomMetrics(seriesId, localTimestamp, "MILLISECONDS", self.highUtilizationHosts, self.lowUtilizationHosts) else: commonAttributes = model.createWriteRecordCommonAttributes( self.dimensionEvents[localSeriesId - self.numMetrics]) records = model.createRandomEvent(localTimestamp, "MILLISECONDS") idx += 1 start = timer() try: writeResult = tswrite.writeRecords(self.client, self.databaseName, self.tableName, commonAttributes, records) self.success += 1 except Exception as e: print(e) exc_type, exc_value, exc_traceback = sys.exc_info() traceback.print_exception(exc_type, exc_value, exc_traceback, limit=2, file=sys.stdout) requestId = "RequestId: {}".format( e.response['ResponseMetadata']['RequestId']) print(requestId) print(json.dumps(commonAttributes, indent=2)) print(json.dumps(records, indent=2)) continue finally: self.count += 1 end = timer() cur = end - start self.digest.update(cur) self.sum += cur ## Computing the streaming M^2 (squared distance from mean) delta = cur - mean mean += delta / self.count squared += delta * (cur - mean) if self.count > 1: self.variance = float(squared / (self.count - 1)) requestId = writeResult['ResponseMetadata']['RequestId'] if idx % 1000 == 0: now = datetime.datetime.now() print( "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}" .format(self.threadId, idx, now.strftime("%Y-%m-%d %H:%M:%S"), requestId, round(self.sum / self.count, 3), round(math.sqrt(self.variance), 3), round(self.digest.percentile(50), 3), round(self.digest.percentile(90), 3), round(self.digest.percentile(99), 3))) def interrupt(self): print("Interrupting thread: ", self.threadId) self.sigInt = True