def _test_logtrans(digest):
    """Decide whether log-transforming a t-digest makes it look more normal.

    The digest and a log-transformed copy of its centroids are each scored
    with ``_tdigest_norm_kstest`` (per the original notes: a simplified
    two-sided Kolmogorov-Smirnov test against a normal distribution whose
    location and scale are estimated from the data).

    Parameters
    ----------
    digest : tdigest.TDigest
        t-digest data structure.

    Returns
    -------
    bool-like
        ``False`` when the 0th percentile is not strictly positive.
        Otherwise truthy only if the log digest has the smaller KS statistic
        and the larger p-value, its p-value is above
        ``LOGNORMALITY_P_THRESH`` and the untransformed p-value is below it.
    """
    # A logarithm cannot be taken of zero or negative values.
    if digest.percentile(0) <= 0:
        return False

    # Rebuild the digest centroid by centroid on the log scale.
    log_digest = TDigest()
    for centroid in digest.C.values():
        log_digest.update(np.log(centroid.mean), centroid.count)

    log_stat, log_p = _tdigest_norm_kstest(log_digest)
    lin_stat, lin_p = _tdigest_norm_kstest(digest)
    logger.debug("KSnorm: log: {:.2g}, {:.2g}; linear: {:.2g}, {:.2g}".format(
        log_stat, log_p, lin_stat, lin_p))

    closer_fit = log_stat < lin_stat
    higher_p = log_p > lin_p
    log_is_normal = log_p > LOGNORMALITY_P_THRESH
    linear_is_not = lin_p < LOGNORMALITY_P_THRESH
    return closer_fit and higher_p and log_is_normal and linear_is_not
class DigestMachine(DistMachine):
    """``DistMachine`` whose state is a single t-digest sketch."""

    def __init__(self):
        super().__init__()
        self.digest = TDigest()

    def update(self, value, dt=None, **kwargs):
        """Add ``value`` to the sketch; ``dt`` and extra keywords are unused."""
        self.digest.update(value)

    def inv_cdf(self, p):
        """Return the sketch's percentile at ``100 * p``."""
        percent = 100. * p
        return self.digest.percentile(percent)
class Digest:
    """Thin wrapper around a t-digest that also counts added values."""

    def __init__(self):
        sketch = TDigest()
        # The sketch is seeded with a single 0, which is not counted below.
        sketch.update(0)
        self.digest = sketch
        self._count = 0
        self.lock = asyncio.Lock()

    def add(self, v):
        """Feed ``v`` into the sketch and bump the counter."""
        self.digest.update(v)
        self._count = self._count + 1

    def percentile(self, v):
        """Return the sketch's value at percentile ``v``."""
        result = self.digest.percentile(v)
        return result

    def count(self):
        """Return how many values were added through :meth:`add`."""
        return self._count
def initial_state(self, name, **ignore):
    """Decide if it is a process or not, and create an initial sketch of the
    CDF of values or of changes in values.

    Parameters
    ----------
    name : str
        Stream name, forwarded to ``get_lagged_values`` / ``get_lagged_times``.
    **ignore
        Extra keyword arguments are accepted and not used.

    Returns
    -------
    dict
        Keys ``'t'`` (first lagged time), ``'digest'`` (the populated
        ``TDigest``), ``'as_process'`` (result of ``is_process``), ``'dt'``
        (``approx_dt(times)``) and ``'name'``.
    """
    # This is one off. Restarting may change the classification !
    values = self.get_lagged_values(name=name)
    times = self.get_lagged_times(name=name)
    digest = TDigest()

    # Classify exactly once so the stored flag always matches the data that
    # was actually fed into the digest.
    as_process = is_process(values)
    if as_process:
        # Sketch successive differences; two zeros are appended before
        # differencing, as in the original implementation.
        data = np.diff(list(values) + [0., 0.])
    else:
        data = values

    for value in data:
        digest.update(value)

    return {
        't': times[0],
        'digest': digest,
        'as_process': as_process,
        'dt': approx_dt(times),
        'name': name,
    }
def _tdigest_normalise(digest):
    """Build a copy of a t-digest with every centroid mean standardised.

    Each centroid mean is shifted by ``_tdigest_mean(digest)`` and divided by
    ``_tdigest_std(digest)``; centroid counts are carried over unchanged.

    Parameters
    ----------
    digest : tdigest.TDigest
        t-digest data structure.

    Returns
    -------
    tdigest.TDigest
        New digest holding the rescaled centroids.
    """
    centre = _tdigest_mean(digest)
    spread = _tdigest_std(digest)

    normalised = TDigest()
    for centroid in digest.C.values():
        scaled = (centroid.mean - centre) / spread
        normalised.update(scaled, centroid.count)
    return normalised
def get_footer_stats(self):
    """Print the summary footer and exit with a status reflecting success rate.

    Clears ``self.check_progress``, prints totals, success/failure rates and
    the mean of ``self.ms_list`` (plus p50/p90/p99 when the ``tdigest`` module
    is loaded), then:

    * exits with status 1 when no check succeeded,
    * exits with the truncated success percentage when only some succeeded,
    * returns normally when every check succeeded.

    Any ``Exception`` raised along the way is swallowed; ``exit`` still
    propagates because ``SystemExit`` is not an ``Exception`` subclass.
    """
    try:
        self.check_progress = False

        # An empty latency list means every check failed.
        try:
            avg_ms = sum(self.ms_list) / len(self.ms_list)
        except ZeroDivisionError:
            avg_ms = 0

        # Percentile distribution, only when tdigest has been imported.
        percentile_string = ""
        if sys.modules.get("tdigest"):
            digest = TDigest()
            for latency in self.ms_list:
                digest.update(latency)
            p50 = digest.percentile(50)
            p90 = digest.percentile(90)
            p99 = digest.percentile(99)
            percentile_string = "\tp50: %.2f p90: %.2f p99: %.2f" % (
                p50, p90, p99)

        footer_fields = (
            self.check_count,
            self.check_success_count,
            self.check_failure_count,
            self.check_success_count / self.check_count,
            self.check_failure_count / self.check_count,
            avg_ms,
        )
        footer = "\rtotal: %d success: %d failure: %d s_rate: %.2f f_rate: %.2f avg_ms: %.2f ms" % footer_fields
        print(footer + percentile_string, flush=True)

        success_rate = self.check_success_count / self.check_count
        if success_rate == 0:
            # Nothing succeeded: exit status 1.
            exit(1)
        elif success_rate < 1:
            # Partial success: the integer success percentage is the status.
            exit(trunc(success_rate * 100))
        # Everything succeeded: fall through without exiting.
    except Exception:
        # Catching Exception (not BaseException) lets exit() propagate.
        pass
def detect_anomalies(self, data, anomaly_fraction):
    """Flag rows whose reconstruction error exceeds a quantile threshold.

    The input is reconstructed with ``self.reconstruct_signal``; the per-row
    error from ``self.compute_error`` is stored and its absolute value fed to
    a t-digest. Rows whose absolute error is above the digest's
    ``1 - anomaly_fraction`` quantile are reported.

    Parameters
    ----------
    data : array-like
        Observations; a 1-D input is treated as a single column.
    anomaly_fraction : float
        Fraction used to derive the threshold quantile
        (``1 - anomaly_fraction``).

    Returns
    -------
    list
        ``Anomaly(data[i], error_i, i)`` for every flagged row ``i``.
    """
    data = np.asanyarray(data)
    if len(data.shape) == 1:
        data = data[:, np.newaxis]

    signal = self.reconstruct_signal(data)
    digest = TDigest()
    n = data.shape[0]
    delta = np.zeros(data.shape)

    # ``range`` instead of ``xrange``: the latter does not exist on Python 3,
    # while ``range`` is valid on both major versions.
    for i in range(n):
        error = self.compute_error(data[i, :], signal[i, :])
        delta[i, :] = error
        digest.update(np.abs(error))

    threshold = digest.quantile(1 - anomaly_fraction)

    anomalies = []
    for i in range(n):
        element = delta[i]
        if np.abs(element) > threshold:
            anomalies.append(Anomaly(data[i], element, i))
    return anomalies
class RouteStat:
    """Per-route timing accumulator: count, sum, sum of squares and a t-digest."""

    __slots__ = [
        "method",
        "route",
        "statusCode",
        "count",
        "sum",
        "sumsq",
        "time",
        "td",
        "tdigest",
    ]

    def __init__(self, *, method="", route="", status_code=0, time=None):
        self.method = method
        self.route = route
        self.statusCode = status_code
        self.count = 0
        self.sum = 0
        self.sumsq = 0
        self.time = time_trunc_minute(time)
        self.td = TDigest(K=20)
        # Filled in lazily whenever ``__dict__`` is read.
        self.tdigest = None

    @property
    def __dict__(self):
        """Return every slot except ``td`` as a plain dict.

        As a side effect, ``tdigest`` is refreshed with the base64 (ASCII)
        encoding of ``as_bytes(self.td)`` before the dict is built.
        """
        encoded = base64.b64encode(as_bytes(self.td))
        self.tdigest = encoded.decode("ascii")

        exported = {}
        for slot in self.__slots__:
            if slot != "td":
                exported[slot] = getattr(self, slot)
        return exported

    def add(self, ms):
        """Record one timing sample ``ms``."""
        self.count = self.count + 1
        self.sum = self.sum + ms
        self.sumsq = self.sumsq + ms * ms
        self.td.update(ms)
def initialise_digest(v):
    """Return a fresh ``TDigest`` that has been updated once with ``v``."""
    digest = TDigest()
    digest.update(v)
    return digest
import time digest = TDigest() wallet = simWallet() wallet.print_wallet() initial_value = wallet.estimate_total() print('Initial wallet value is {} BTC.'.format(initial_value), flush=True) while True: current_price = wallet.update_price() digest.update(current_price) digest_value = digest.percentile(15) print('\n\nCurrent BNB/BTC price is {}. Digest value is {}'.format( current_price, digest_value), flush=True) if current_price > 1.1 * digest_value: wallet.buy_bnb(1) if current_price < 0.9 * digest_value: wallet.sell_bnb(1) percent = int(wallet.estimate_total() / initial_value * 100) print('\nCurrent wallet value is {}% of initial'.format(percent),
class IngestionThread(threading.Thread):
    """Worker thread that writes generated records and tracks write latency.

    Threads share the module-level ``seriesId`` / ``timestamp`` cursor (guarded
    by the module-level ``lock``). Each iteration claims the next series,
    generates its records, writes them, and folds the request latency into a
    t-digest plus streaming mean/variance accumulators.
    """

    def __init__(self, threadId, args, dimensionMetrics, dimensionEvents,
                 highUtilizationHosts, lowUtilizationHosts, event):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.args = args
        self.dimensionMetrics = dimensionMetrics
        self.dimensionEvents = dimensionEvents
        self.client = tswrite.createWriteClient(args.endpoint,
                                                profile=args.profile)
        self.databaseName = args.databaseName
        self.tableName = args.tableName
        self.numMetrics = len(dimensionMetrics)
        self.numEvents = len(dimensionEvents)
        self.digest = TDigest(
        )  ## Use the t-digest to compute the streaming percentiles
        self.count = 0
        self.success = 0
        self.sum = 0.0
        self.variance = float('nan')
        self.highUtilizationHosts = highUtilizationHosts
        self.lowUtilizationHosts = lowUtilizationHosts
        self.sigInt = False
        self.event = event

    def run(self):
        """Ingest until interrupted, printing latency stats every 1000 writes."""
        global seriesId
        global timestamp
        global lock

        idx = 0
        mean = 0.0
        squared = 0.0

        while True:
            with lock:
                if self.sigInt == True or sigInt == True or self.event.is_set(
                ):
                    print("Thread {} exiting.".format(self.threadId))
                    break

                seriesId += 1
                if seriesId >= self.numMetrics + self.numEvents:
                    ## Wrapping around, so move to new timestamp.
                    seriesId = 0
                    newTimestamp = timestamp + self.args.intervalMillis
                    currentTime = getCurrentTimestampMillis()
                    ## Check if the timestamps are falling behind
                    if newTimestamp < currentTime - 0.05 * self.args.intervalMillis:
                        print(
                            "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes."
                            .format(self.args.intervalMillis,
                                    currentTime - timestamp))
                        ## Move time forward.
                        timestamp = getTimestampMillis()
                    else:
                        timestamp = newTimestamp
                        ## Check if we are ingesting too fast, then slow down.
                        if timestamp > currentTime - 1000:
                            ## Slow down
                            sleepTimeSecs = int(
                                (timestamp - currentTime) / 1000)
                            print("Thread {} sleeping for {} secs".format(
                                self.threadId, sleepTimeSecs))
                            time.sleep(sleepTimeSecs)

                    now = datetime.datetime.now()
                    print(
                        "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}."
                        .format(self.threadId,
                                now.strftime("%Y-%m-%d %H:%M:%S"), timestamp))

                # Snapshot the shared cursor while the lock is still held.
                localSeriesId = seriesId
                localTimestamp = timestamp

            if localSeriesId < self.numMetrics:
                commonAttributes = model.createWriteRecordCommonAttributes(
                    self.dimensionMetrics[localSeriesId])
                # Use the snapshot, not the shared global: another thread may
                # already have advanced ``seriesId`` once the lock is released.
                records = model.createRandomMetrics(localSeriesId,
                                                    localTimestamp,
                                                    "MILLISECONDS",
                                                    self.highUtilizationHosts,
                                                    self.lowUtilizationHosts)
            else:
                commonAttributes = model.createWriteRecordCommonAttributes(
                    self.dimensionEvents[localSeriesId - self.numMetrics])
                records = model.createRandomEvent(localTimestamp,
                                                  "MILLISECONDS")

            idx += 1
            start = timer()
            try:
                writeResult = tswrite.writeRecords(self.client,
                                                   self.databaseName,
                                                   self.tableName,
                                                   commonAttributes, records)
                self.success += 1
            except Exception as e:
                print(e)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=2,
                                          file=sys.stdout)
                # Not every exception carries service response metadata; a
                # failed lookup here must not kill the ingestion thread.
                try:
                    requestId = "RequestId: {}".format(
                        e.response['ResponseMetadata']['RequestId'])
                except (AttributeError, KeyError, TypeError):
                    requestId = "RequestId: unavailable"
                print(requestId)
                print(json.dumps(commonAttributes, indent=2))
                print(json.dumps(records, indent=2))
                continue
            finally:
                # Latency is recorded for failed requests as well.
                self.count += 1
                end = timer()
                cur = end - start
                self.digest.update(cur)
                self.sum += cur
                ## Computing the streaming M^2 (squared distance from mean)
                delta = cur - mean
                mean += delta / self.count
                squared += delta * (cur - mean)
                if self.count > 1:
                    self.variance = float(squared / (self.count - 1))

            requestId = writeResult['ResponseMetadata']['RequestId']
            if idx % 1000 == 0:
                now = datetime.datetime.now()
                print(
                    "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}"
                    .format(self.threadId, idx,
                            now.strftime("%Y-%m-%d %H:%M:%S"), requestId,
                            round(self.sum / self.count, 3),
                            round(math.sqrt(self.variance), 3),
                            round(self.digest.percentile(50), 3),
                            round(self.digest.percentile(90), 3),
                            round(self.digest.percentile(99), 3)))

    def interrupt(self):
        """Ask the thread to stop at the start of its next iteration."""
        print("Interrupting thread: ", self.threadId)
        self.sigInt = True
def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname, indexname, docs, stop_words,
                                use_numeric_range_searchs, ts_digest, p_writes):
    """Generate a mix of write and read benchmark commands and plot query ranges.

    For each command a random document is picked; with probability
    ``p_writes`` a write row is built via ``use_case_to_cmd``, otherwise one of
    three search-query shapes is built from the document's words. When
    ``use_numeric_range_searchs`` is set, read queries are prefixed with a
    ``@timestamp`` range whose lower bound is a percentile of ``ts_digest``,
    and the range width is collected in a t-digest that is summarised and
    plotted at the end.

    Parameters
    ----------
    total_benchmark_commands : int
        Number of commands to generate.
    bench_fname, all_fname : str
        CSV paths; ``bench_fname`` is truncated, ``all_fname`` is opened for
        append.
    indexname : str
        Index name passed to ``generate_ft_search_row``.
    docs : list of dict
        Documents with ``title``, ``text``, ``comment``, ``username`` and
        ``timestamp`` keys.
    stop_words
        Forwarded to ``getQueryWords``.
    use_numeric_range_searchs : bool
        Whether read queries carry a timestamp range filter.
    ts_digest
        t-digest of document timestamps (``percentile`` / ``cdf`` are used).
    p_writes : float
        Probability that a generated command is a write.

    Returns
    -------
    tuple of (int, int)
        ``(total_benchmark_reads, total_benchmark_writes)``; reads are counted
        per attempt, even when no query row could be built.
    """
    total_benchmark_reads = 0
    total_benchmark_writes = 0

    # Context managers guarantee both files are closed even if generation
    # fails part-way through.
    with open(all_fname, 'a', newline='') as all_csvfile, \
            open(bench_fname, 'w', newline='') as bench_csvfile:
        all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        progress = tqdm(unit="docs", total=total_benchmark_commands)
        total_docs = len(docs)

        ## timestamp related
        timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
        min_ts = ts_digest.percentile(0.0)
        max_ts = ts_digest.percentile(100.0)
        query_range_digest = TDigest()

        generated_commands = 0
        while generated_commands < total_benchmark_commands:
            query_ts_pdist = timestamps_pdist[generated_commands]
            percentile = (1.0 - query_ts_pdist) * 100.0
            query_min_ts = ts_digest.percentile(percentile)

            random_doc_pos = random.randint(0, total_docs - 1)
            doc = docs[random_doc_pos]

            # decide read or write
            p_cmd = random.random()
            if p_cmd < p_writes:
                ## WRITE
                total_benchmark_writes = total_benchmark_writes + 1
                generated_row, _doc_size = use_case_to_cmd(use_ftadd, doc["title"], doc["text"], doc["comment"],
                                                           doc["username"],
                                                           doc["timestamp"],
                                                           generated_commands)
            else:
                ## READ
                total_benchmark_reads = total_benchmark_reads + 1
                words, _total_words = getQueryWords(doc, stop_words, 2)
                choice = random.choices(["simple-1word-query", "2word-union-query", "2word-intersection-query"])[0]
                generated_row = None
                numeric_range_str = ""
                if use_numeric_range_searchs:
                    numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                    query_range_digest.update(int(max_ts - query_min_ts))
                # A row is produced only when the document yields enough
                # words for the chosen query shape; otherwise we retry.
                if choice == "simple-1word-query" and len(words) >= 1:
                    generated_row = generate_ft_search_row(indexname, "simple-1word-query",
                                                           "{}{}".format(numeric_range_str, words[0]))
                elif choice == "2word-union-query" and len(words) >= 2:
                    generated_row = generate_ft_search_row(indexname, "2word-union-query",
                                                           "{}{} {}".format(numeric_range_str, words[0], words[1]))
                elif choice == "2word-intersection-query" and len(words) >= 2:
                    generated_row = generate_ft_search_row(indexname, "2word-intersection-query",
                                                           "{}{}|{}".format(numeric_range_str, words[0], words[1]))

            if generated_row is not None:
                # NOTE(review): row output is currently disabled; confirm
                # whether these writes should be re-enabled.
                #             all_csv_writer.writerow(generated_row)
                #             bench_csv_writer.writerow(generated_row)
                progress.update()
                generated_commands = generated_commands + 1

        progress.close()

    # print()
    xx = []
    yy = []
    p90 = query_range_digest.percentile(90.0)
    dataset_percent = ts_digest.cdf(p90)
    print("90% of the read queries target at max {} percent o keyspace".format(dataset_percent))
    print("100% of the read queries target at max {} percent o keyspace".format(ts_digest.cdf(max_ts - min_ts)))
    for centroid in query_range_digest.centroids_to_list():
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(query_range_digest.cdf(ts_m))
    plt.scatter(xx, yy)
    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()

    return total_benchmark_reads, total_benchmark_writes
# Walk the parsed MediaWiki export and collect one document per <page>.
# Field values are captured from <revision>/<contributor> elements as they are
# encountered and attached to the page when the page element itself is seen.
doc = {}
text = None
comment = None
username = None
timestamp = None
# Sketch of the document timestamps seen so far.
ts_digest = TDigest()
for event, elem in tree:
    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
        doc = {}
        doc["title"] = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}title")
        doc["text"] = text
        doc["comment"] = comment
        doc["username"] = username
        # ``timestamp`` is still None if no revision has been seen yet;
        # int(None) would raise before the completeness check below could
        # skip the page, so convert only when a value is present.
        if timestamp is not None:
            doc["timestamp"] = int(timestamp)
            ts_digest.update(int(timestamp))
        else:
            doc["timestamp"] = None
        if doc["text"] is not None and doc["comment"] is not None and doc["username"] is not None and doc[
                "timestamp"] is not None:
            total_docs = total_docs + 1
            docs.append(doc)
            progress.update()
            elem.clear()  # won't need the children any more
    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}revision":
        text = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}text")
        comment = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}comment")
        ts = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}timestamp")
        dt = parse(ts)
        timestamp = dt.timestamp()
    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}contributor":
        username = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}username")
class IngestionThread(threading.Thread):
    """Worker thread that writes generated records, optionally in batches.

    Threads share the module-level ``seriesId`` / ``timestamp`` cursor (guarded
    by the module-level ``lock``). Each iteration claims the next series,
    generates its records, optionally accumulates them into a batch of at most
    ``args.batchSize`` records, writes them, and folds the request latency into
    a t-digest plus streaming mean/variance accumulators.
    """

    def __init__(self, threadId, args, dimensionMetrics, dimensionEvents,
                 highUtilizationHosts, lowUtilizationHosts, event):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.args = args
        self.dimensionMetrics = dimensionMetrics
        self.dimensionEvents = dimensionEvents
        self.client = tswrite.createWriteClient(region=args.region,
                                                profile=args.profile,
                                                endpoint=args.endpoint)
        self.databaseName = args.databaseName
        self.tableName = args.tableName
        self.numMetrics = len(dimensionMetrics)
        self.numEvents = len(dimensionEvents)
        self.digest = TDigest(
        )  ## Use the t-digest to compute the streaming percentiles
        self.count = 0
        self.success = 0
        self.sum = 0.0
        self.variance = float('nan')
        self.highUtilizationHosts = highUtilizationHosts
        self.lowUtilizationHosts = lowUtilizationHosts
        self.sigInt = False
        self.event = event
        self.recordsWritten = 0

    def run(self):
        """Ingest until interrupted, printing latency stats every 1000 writes."""
        global seriesId
        global timestamp
        global lock

        idx = 0
        mean = 0.0
        squared = 0.0
        addReqId = self.args.addReqId
        addReqIdAsDim = addReqId and self.args.addReqIdAsDim
        addReqIdAsMeasure = addReqId and not self.args.addReqIdAsDim
        writeRecordsBatch = list()
        recordsToWrite = list()

        while True:
            with lock:
                if self.sigInt == True or sigInt == True or self.event.is_set(
                ):
                    print("Thread {} exiting.".format(self.threadId))
                    break

                seriesId += 1
                if seriesId >= self.numMetrics + self.numEvents:
                    ## Wrapping around, so move to new timestamp.
                    seriesId = 0
                    newTimestamp = timestamp + self.args.intervalMillis
                    currentTime = getCurrentTimestampMillis()
                    ## Check if the timestamps are falling behind
                    if newTimestamp < currentTime - 0.05 * self.args.intervalMillis:
                        print(
                            "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes."
                            .format(self.args.intervalMillis,
                                    currentTime - timestamp))
                        ## Move time forward.
                        timestamp = getTimestampMillis()
                    else:
                        timestamp = newTimestamp
                        ## Check if we are ingesting too fast, then slow down.
                        if timestamp > currentTime - 1000:
                            ## Slow down
                            sleepTimeSecs = int(
                                (timestamp - currentTime) / 1000)
                            print("Thread {} sleeping for {} secs".format(
                                self.threadId, sleepTimeSecs))
                            time.sleep(sleepTimeSecs)

                    now = datetime.datetime.now()
                    print(
                        "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}."
                        .format(self.threadId,
                                now.strftime("%Y-%m-%d %H:%M:%S"), timestamp))

                # Snapshot the shared cursor while the lock is still held.
                localSeriesId = seriesId
                localTimestamp = timestamp

            if localSeriesId < self.numMetrics:
                dimensions = model.createDimensionsEntry(
                    self.dimensionMetrics[localSeriesId],
                    addReqId=addReqIdAsDim)
                records = model.createRandomMetrics(localSeriesId,
                                                    dimensions,
                                                    localTimestamp,
                                                    "MILLISECONDS",
                                                    self.highUtilizationHosts,
                                                    self.lowUtilizationHosts,
                                                    wide=self.args.wide,
                                                    addReqId=addReqIdAsMeasure)
            else:
                dimensions = model.createDimensionsEntry(
                    self.dimensionEvents[localSeriesId - self.numMetrics],
                    addReqId=addReqIdAsDim)
                records = model.createRandomEvent(dimensions,
                                                  localTimestamp,
                                                  "MILLISECONDS",
                                                  wide=self.args.wide,
                                                  addReqId=addReqIdAsMeasure)

            if self.args.batchWrites:
                if len(writeRecordsBatch) + len(
                        records) <= self.args.batchSize:
                    writeRecordsBatch.extend(records)
                    ## Generate more data, unless we're wrapping around, at which point, drain any pending records.
                    # NOTE(review): localSeriesId appears to always be below
                    # numMetrics + numEvents here (the cursor wraps to 0
                    # above), so this looks like it always continues -
                    # confirm the intended drain condition.
                    if localSeriesId < self.numMetrics + self.numEvents:
                        continue
                else:
                    ## transfer a subset of values from the records produced into the batch
                    spaceRemaining = self.args.batchSize - len(
                        writeRecordsBatch)
                    assert (spaceRemaining < len(records))
                    ## Transfer 0 - spaceRemaining - 1 to be written with this batch, and spaceRemaining - end in the next batch
                    ## If spaceRemaining is 0, then just write what we have accumulated so far
                    if spaceRemaining > 0:
                        writeRecordsBatch.extend(records[0:spaceRemaining])
                    ## The batch is full, now we issue the write record request.
                    recordsToWrite.clear()
                    recordsToWrite.extend(writeRecordsBatch)
                    writeRecordsBatch.clear()
                    writeRecordsBatch.extend(records[spaceRemaining:])
            else:
                recordsToWrite.clear()
                recordsToWrite.extend(records)

            idx += 1
            start = timer()
            try:
                writeResult = tswrite.writeRecords(self.client,
                                                   self.databaseName,
                                                   self.tableName,
                                                   recordsToWrite)
                self.recordsWritten += len(recordsToWrite)
                self.success += 1
            except Exception as e:
                print(e)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=2,
                                          file=sys.stdout)
                # Not every exception carries service response metadata; a
                # failed lookup here must not kill the ingestion thread.
                try:
                    requestId = "RequestId: {}".format(
                        e.response['ResponseMetadata']['RequestId'])
                except (AttributeError, KeyError, TypeError):
                    requestId = "RequestId: unavailable"
                print(requestId)
                print(json.dumps(dimensions, indent=2))
                print(json.dumps(records, indent=2))
                continue
            finally:
                # Latency is recorded for failed requests as well.
                self.count += 1
                end = timer()
                cur = end - start
                self.digest.update(cur)
                self.sum += cur
                ## Computing the streaming M^2 (squared distance from mean)
                delta = cur - mean
                mean += delta / self.count
                squared += delta * (cur - mean)
                if self.count > 1:
                    self.variance = float(squared / (self.count - 1))

            requestId = writeResult['ResponseMetadata']['RequestId']
            if idx % 1000 == 0:
                now = datetime.datetime.now()
                print(
                    "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}. Records written: {}"
                    .format(self.threadId, idx,
                            now.strftime("%Y-%m-%d %H:%M:%S"), requestId,
                            round(self.sum / self.count, 3),
                            round(math.sqrt(self.variance), 3),
                            round(self.digest.percentile(50), 3),
                            round(self.digest.percentile(90), 3),
                            round(self.digest.percentile(99), 3),
                            self.recordsWritten))

    def interrupt(self):
        """Ask the thread to stop at the start of its next iteration."""
        print("Interrupting thread: ", self.threadId)
        self.sigInt = True
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : dict
        Per-column properties keyed by column name; the entry for this column
        is read for its ``"numeric"``, ``"notnulls"`` and ``"is_categorical"``
        values.
    delta : float
        Passed to ``TDigest`` on construction and re-applied to the digest
        before it is compressed for storage.

    Returns
    -------
    dict or None
        ``None`` when the column is not numeric or has no non-null values.
        Otherwise ``{col: summary, "_columns": [col]}`` where ``summary``
        holds basic statistics, percentiles, the log-transform verdict, the
        compressed t-digest centroids, a histogram and a KDE.
    """
    col = series.name
    if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0:
        # Series is not numeric or is all NaNs.
        return None

    # Formatting (rather than ``+``) keeps this working for non-string
    # column names such as integers.
    logger.debug("column_summary - {}".format(col))

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ["mean", "min", "max", "std", "sum"]:
        val = getattr(data, m)()
        if type(val) is np.int64:
            colresult[m] = int(val)
        else:
            colresult[m] = val

    colresult["n"] = column_props[col]["notnulls"]

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult["percentiles"] = {
        perc: np.nanpercentile(series, perc)
        for perc in percentiles
    }
    colresult["median"] = colresult["percentiles"][50]
    colresult["iqr"] = (colresult["percentiles"][75] -
                        colresult["percentiles"][25])

    # Compute the t-digest.
    logger.debug("column_summary - {} - creating TDigest...".format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug("column_summary - {} - testing log trans...".format(col))
    try:
        colresult["logtrans"] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning("test_logtrans has failed for column `{}`: {}".format(
            col, e))
        colresult["logtrans"] = False

    if colresult["logtrans"]:
        # Rebuild the digest on the log scale to report log-space statistics.
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult["logtrans_mean"] = _tdigest_mean(logdigest)
        colresult["logtrans_std"] = _tdigest_std(logdigest)
        colresult["logtrans_IQR"] = logdigest.percentile(
            75) - logdigest.percentile(25)

    logger.debug("column_summary - {} - should {}be log-transformed".format(
        col, "NOT " if not colresult["logtrans"] else ""))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug("column_summary - {} - computing histogram...".format(col))

    if column_props[col]["is_categorical"]:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult["logtrans"]:
            # Bin in log10 space, then map the edges back to data units.
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False,
                                             bins="fd")
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins="fd")

    colresult["histogram"] = {
        "counts": counts.tolist(),
        "bin_edges": edges.tolist(),
    }

    # Compute KDE
    logger.debug("column_summary - {} - computing KDE...".format(col))
    bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1)
    logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw))

    if column_props[col]["is_categorical"]:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult["min"], colresult["max"]
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult["logtrans"])

    colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()}

    return {col: colresult, "_columns": [col]}
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : dict
        Per-column properties keyed by column name; the entry for this column
        is read for its ``'numeric'``, ``'notnulls'`` and ``'is_categorical'``
        values.
    delta : float
        Passed to ``TDigest`` on construction and re-applied to the digest
        before it is compressed for storage.

    Returns
    -------
    dict or None
        ``None`` when the column is not numeric or has no non-null values.
        Otherwise ``{col: summary, '_columns': [col]}`` where ``summary``
        holds basic statistics, percentiles, the log-transform verdict, the
        compressed t-digest centroids, a histogram and a KDE.
    """
    col = series.name
    if not column_props[col]['numeric'] or column_props[col]['notnulls'] == 0:
        # Series is not numeric or is all NaNs.
        return None

    # Formatting (rather than ``+``) keeps this working for non-string
    # column names such as integers.
    logger.debug('column_summary - {}'.format(col))

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ['mean', 'min', 'max', 'std', 'sum']:
        val = getattr(data, m)()
        if type(val) is np.int64:
            colresult[m] = int(val)
        else:
            colresult[m] = val

    colresult['n'] = column_props[col]['notnulls']

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult['percentiles'] = {
        perc: np.nanpercentile(series, perc)
        for perc in percentiles
    }
    colresult['median'] = colresult['percentiles'][50]
    colresult['iqr'] = (colresult['percentiles'][75] -
                        colresult['percentiles'][25])

    # Compute the t-digest.
    logger.debug('column_summary - {} - creating TDigest...'.format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug('column_summary - {} - testing log trans...'.format(col))
    try:
        colresult['logtrans'] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning('test_logtrans has failed for column `{}`: {}'.format(
            col, e))
        colresult['logtrans'] = False

    if colresult['logtrans']:
        # Rebuild the digest on the log scale to report log-space statistics.
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult['logtrans_mean'] = _tdigest_mean(logdigest)
        colresult['logtrans_std'] = _tdigest_std(logdigest)
        colresult['logtrans_IQR'] = (logdigest.percentile(75) -
                                     logdigest.percentile(25))

    logger.debug('column_summary - {} - should {}be log-transformed'.format(
        col, 'NOT ' if not colresult['logtrans'] else ''))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult['tdigest'] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug('column_summary - {} - computing histogram...'.format(col))

    if column_props[col]['is_categorical']:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult['logtrans']:
            # Bin in log10 space, then map the edges back to data units.
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False,
                                             bins='fd')
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins='fd')

    colresult['histogram'] = {
        'counts': counts.tolist(),
        'bin_edges': edges.tolist()
    }

    # Compute KDE
    logger.debug('column_summary - {} - computing KDE...'.format(col))
    bw = _bw_scott(colresult, colresult['n'], colresult['logtrans'], 1)
    logger.debug('column_summary - {} - KDE bw: {:.4g}'.format(col, bw))

    if column_props[col]['is_categorical']:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult['min'], colresult['max']
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult['logtrans'])

    colresult['kde'] = {'x': kde_x.tolist(), 'y': kde_y.tolist()}

    return {col: colresult, '_columns': [col]}