Example #1
def _test_logtrans(digest):
    """
    Test if t-digest distribution is more normal when log-transformed.

    Test whether a log-transform improves normality of data with a
    simplified Kolmogorov-Smirnov two-sided test (the location and scale
    of the normal distribution are estimated from the median and
    standard deviation of the data).

    Parameters
    ----------
    digest : tdigest.TDigest
        t-digest data structure.

    Returns
    -------
    bool
        True if the log-transformed data passes the normality test
        (p-value above ``LOGNORMALITY_P_THRESH``) with a better KS
        statistic than the untransformed data, which itself must fail the
        test; False otherwise, including when the data contain
        non-positive values.
    """
    if digest.percentile(0) <= 0:
        return False

    logdigest = TDigest()
    for c in digest.C.values():
        logdigest.update(np.log(c.mean), c.count)

    lKS, lp = _tdigest_norm_kstest(logdigest)
    KS, p = _tdigest_norm_kstest(digest)
    logger.debug("KSnorm: log: {:.2g}, {:.2g}; linear: {:.2g}, {:.2g}".format(
        lKS, lp, KS, p))

    return ((lKS < KS) and (lp > p) and (lp > LOGNORMALITY_P_THRESH)
            and (p < LOGNORMALITY_P_THRESH))
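A minimal usage sketch, assuming `_test_logtrans` and the helpers it calls (`_tdigest_norm_kstest`, `LOGNORMALITY_P_THRESH`, `logger`) are importable from the surrounding module: lognormal samples should typically be flagged for a log-transform, while already-normal positive samples should not.

import numpy as np
from tdigest import TDigest

rng = np.random.default_rng(0)

lognormal_digest = TDigest()
lognormal_digest.batch_update(rng.lognormal(mean=0.0, sigma=1.0, size=5000))

normal_digest = TDigest()
normal_digest.batch_update(rng.normal(loc=10.0, scale=1.0, size=5000))

print(_test_logtrans(lognormal_digest))  # typically True: log-transform helps
print(_test_logtrans(normal_digest))     # typically False: already near-normal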
Example #2
class DigestMachine(DistMachine):

    def __init__(self):
        super().__init__()
        self.digest = TDigest()

    def update(self, value, dt=None, **kwargs):
        self.digest.update(value)

    def inv_cdf(self, p):
        return self.digest.percentile(100. * p)
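A short sketch of how this machine might be driven, assuming `DistMachine.__init__` needs no arguments: samples go in through `update`, and `inv_cdf` converts a probability in [0, 1] to the 0-100 percentile scale the t-digest expects.

machine = DigestMachine()
for value in [3.1, 4.7, 2.2, 5.9, 4.4]:
    machine.update(value)

median = machine.inv_cdf(0.5)   # 50th percentile of the observed values
p95 = machine.inv_cdf(0.95)     # 95th percentile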
Example #3
class Digest:
    def __init__(self):
        self.digest = TDigest()
        self.digest.update(0)
        self._count = 0
        self.lock = asyncio.Lock()

    def add(self, v):
        self.digest.update(v)
        self._count += 1

    def percentile(self, v):
        return self.digest.percentile(v)

    def count(self):
        return self._count
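The constructor creates an `asyncio.Lock` but the methods shown never acquire it, so callers presumably hold it themselves when mutating the digest from coroutines. A hedged sketch of that usage:

import asyncio

async def record(digest, value):
    # Assumption: writers serialise updates with the lock the class exposes.
    async with digest.lock:
        digest.add(value)

async def main():
    d = Digest()
    await asyncio.gather(*(record(d, v) for v in range(100)))
    print(d.count(), d.percentile(50))

asyncio.run(main())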
Example #4
    def initial_state(self, name, **ignore):
        """Decide whether the stream behaves like a process and build an
        initial sketch of the CDF of its values (or of changes in values)."""
        # This is a one-off decision: restarting may change the classification!
        values = self.get_lagged_values(name=name)
        times = self.get_lagged_times(name=name)
        digest = TDigest()
        as_process = is_process(values)
        data = np.diff(list(values) + [0., 0.]) if as_process else values
        for value in data:
            digest.update(value)
        return {
            't': times[0],
            'digest': digest,
            'as_process': as_process,
            'dt': approx_dt(times),
            'name': name
        }
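For a process-like series the digest tracks changes rather than levels, so a later update would presumably feed it increments. A hypothetical companion step illustrating that (the `state` dict is the one returned above):

def update_state(state, value, prev_value):
    # Hypothetical companion to initial_state: a process-like stream feeds the
    # digest increments, a non-process stream feeds it raw values.
    x = value - prev_value if state['as_process'] else value
    state['digest'].update(x)
    return state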
Example #5
def _tdigest_normalise(digest):
    """TODO

    Parameters
    ----------
    digest : tdigest.TDigest
        t-digest data structure.

    Returns
    -------
    tdigest.TDigest
        New digest whose centroids are the original centroid means,
        standardised as ``(mean - m) / s``.
    """
    m = _tdigest_mean(digest)
    s = _tdigest_std(digest)
    ndigest = TDigest()
    for x in digest.C.values():
        ndigest.update((x.mean - m) / s, x.count)
    return ndigest
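A quick sanity check of the standardisation, assuming `_tdigest_mean` and `_tdigest_std` come from the same module: the normalised digest should come out with mean near 0 and standard deviation near 1.

import numpy as np
from tdigest import TDigest

digest = TDigest()
digest.batch_update(np.random.normal(loc=5.0, scale=2.0, size=2000))

ndigest = _tdigest_normalise(digest)
print(_tdigest_mean(ndigest))  # expected to be roughly 0
print(_tdigest_std(ndigest))   # expected to be roughly 1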
Example #6
    def get_footer_stats(self):
        try:
            self.check_progress = False
            try:
                avg_ms = sum(self.ms_list) / len(self.ms_list)
            except ZeroDivisionError:
                # Catch the case where every check failed (no latencies recorded).
                avg_ms = 0

            # Print the percentile distribution (p50 / p90 / p99).
            if sys.modules.get("tdigest"):
                digest = TDigest()
                for item in self.ms_list:
                    digest.update(item)
                p50 = digest.percentile(50)
                p90 = digest.percentile(90)
                p99 = digest.percentile(99)
                percentile_string = "\tp50: %.2f p90: %.2f p99: %.2f" % (
                    p50, p90, p99)
            else:
                percentile_string = ""

            footer = "\rtotal: %d  success: %d  failure: %d  s_rate: %.2f  f_rate: %.2f  avg_ms: %.2f ms" % (
                self.check_count, self.check_success_count,
                self.check_failure_count,
                self.check_success_count / self.check_count,
                self.check_failure_count / self.check_count, avg_ms)

            #print(footer + percentile_string, file=sys.stderr, flush=True)
            print(footer + percentile_string, flush=True)
            if self.check_success_count / self.check_count == 0:
                # If nothing succeeded, exit with status code 1.
                exit(1)
            elif self.check_success_count / self.check_count < 1:
                # Exit with the success rate (as an integer percentage) as the status code.
                exit(trunc(self.check_success_count / self.check_count * 100))
            else:
                # Everything succeeded; fall through with exit status 0.
                #exit(0)
                pass
        except Exception:
            # Catch only Exception so the SystemExit raised by exit() is not swallowed.
            pass
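The percentile computation above is the reusable core; a standalone sketch of the same idea, using an import guard instead of the `sys.modules` check so it also works when `tdigest` has not been imported elsewhere:

def latency_percentiles(ms_list):
    """Return a 'p50 p90 p99' summary string, or '' if tdigest is unavailable."""
    try:
        from tdigest import TDigest
    except ImportError:
        return ""
    digest = TDigest()
    for ms in ms_list:
        digest.update(ms)
    return "p50: %.2f p90: %.2f p99: %.2f" % (
        digest.percentile(50), digest.percentile(90), digest.percentile(99))

print(latency_percentiles([12.0, 15.5, 9.8, 102.3, 14.1]))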
Example #7
    def detect_anomalies(self, data, anomaly_fraction):
        data = np.asanyarray(data)
        if len(data.shape) == 1:
            data = data[:, np.newaxis]
        signal = self.reconstruct_signal(data)
        digest = TDigest()

        n = data.shape[0]
        delta = np.zeros(data.shape)

        for i in xrange(n):
            error = self.compute_error(data[i, :], signal[i, :])
            delta[i, :] = error
            digest.update(np.abs(error))

        threshold = digest.quantile(1 - anomaly_fraction)

        anomalies = []
        for i in xrange(n):
            element = delta[i]
            if np.abs(element) > threshold:
                anomalies.append(Anomaly(data[i], element, i))

        return anomalies
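The key step is thresholding absolute reconstruction errors at the `(1 - anomaly_fraction)` quantile of their digest. With the common Python `tdigest` package the same cutoff can be expressed via `percentile`, which takes values on a 0-100 scale (a sketch under that assumption, with random errors standing in for real reconstruction errors):

import numpy as np
from tdigest import TDigest

errors = np.abs(np.random.normal(size=1000))  # stand-in for reconstruction errors
digest = TDigest()
digest.batch_update(errors)

anomaly_fraction = 0.01
threshold = digest.percentile(100.0 * (1.0 - anomaly_fraction))
anomalies = errors[errors > threshold]  # roughly the top 1% of errors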
Example #9
class RouteStat:
    __slots__ = [
        "method",
        "route",
        "statusCode",
        "count",
        "sum",
        "sumsq",
        "time",
        "td",
        "tdigest",
    ]

    @property
    def __dict__(self):
        tdigest = as_bytes(self.td)
        self.tdigest = base64.b64encode(tdigest).decode("ascii")

        return {s: getattr(self, s) for s in self.__slots__ if s != "td"}

    def __init__(self, *, method="", route="", status_code=0, time=None):
        self.method = method
        self.route = route
        self.statusCode = status_code
        self.count = 0
        self.sum = 0
        self.sumsq = 0
        self.time = time_trunc_minute(time)
        self.td = TDigest(K=20)
        self.tdigest = None

    def add(self, ms):
        self.count += 1
        self.sum += ms
        self.sumsq += ms * ms
        self.td.update(ms)
Example #10
def initialise_digest(v):
    d = TDigest()
    d.update(v)
    return d
Example #11
import time

digest = TDigest()

wallet = simWallet()

wallet.print_wallet()

initial_value = wallet.estimate_total()

print('Initial wallet value is {} BTC.'.format(initial_value), flush=True)

while True:

    current_price = wallet.update_price()
    digest.update(current_price)

    digest_value = digest.percentile(15)

    print('\n\nCurrent BNB/BTC price is {}. Digest value is {}'.format(
        current_price, digest_value),
          flush=True)

    if current_price > 1.1 * digest_value:
        wallet.buy_bnb(1)

    if current_price < 0.9 * digest_value:
        wallet.sell_bnb(1)

    percent = int(wallet.estimate_total() / initial_value * 100)
    print('\nCurrent wallet value is {}% of initial'.format(percent),
          flush=True)
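The loop buys when the price runs 10% above the digest's 15th percentile and sells when it drops 10% below it; the same rule pulled out as a small pure function (illustrative only, not part of the script):

def trade_signal(price, reference, band=0.1):
    """Illustrative rule: 'buy' above the band, 'sell' below it, else 'hold'."""
    if price > (1.0 + band) * reference:
        return 'buy'
    if price < (1.0 - band) * reference:
        return 'sell'
    return 'hold'

# e.g. trade_signal(current_price, digest.percentile(15))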
Example #12
class IngestionThread(threading.Thread):
    def __init__(self, threadId, args, dimensionMetrics, dimensionEvents,
                 highUtilizationHosts, lowUtilizationHosts, event):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.args = args
        self.dimensionMetrics = dimensionMetrics
        self.dimensionEvents = dimensionEvents
        self.client = tswrite.createWriteClient(args.endpoint,
                                                profile=args.profile)
        self.databaseName = args.databaseName
        self.tableName = args.tableName
        self.numMetrics = len(dimensionMetrics)
        self.numEvents = len(dimensionEvents)
        ## Use the t-digest to compute the streaming percentiles.
        self.digest = TDigest()
        self.count = 0
        self.success = 0
        self.sum = 0.0
        self.variance = float('nan')
        self.highUtilizationHosts = highUtilizationHosts
        self.lowUtilizationHosts = lowUtilizationHosts
        self.sigInt = False
        self.event = event

    def run(self):
        global seriesId
        global timestamp
        global lock

        idx = 0
        mean = 0.0
        squared = 0.0

        while True:
            with lock:
                if self.sigInt or sigInt or self.event.is_set():
                    print("Thread {} exiting.".format(self.threadId))
                    break

                seriesId += 1
                if seriesId >= self.numMetrics + self.numEvents:
                    ## Wrapping around, so move to new timestamp.
                    seriesId = 0
                    newTimestamp = timestamp + self.args.intervalMillis
                    currentTime = getCurrentTimestampMillis()
                    ## Check if the timestamps are falling behind
                    if newTimestamp < currentTime - 0.05 * self.args.intervalMillis:
                        print(
                            "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes."
                            .format(self.args.intervalMillis,
                                    currentTime - timestamp))
                        ## Move time forward.
                        timestamp = getTimestampMillis()
                    else:
                        timestamp = newTimestamp
                        ## Check if we are ingesting too fast, then slow down.
                        if timestamp > currentTime - 1000:
                            ## Slow down
                            sleepTimeSecs = int(
                                (timestamp - currentTime) / 1000)
                            print("Thread {} sleeping for {} secs".format(
                                self.threadId, sleepTimeSecs))
                            time.sleep(sleepTimeSecs)

                    now = datetime.datetime.now()
                    print(
                        "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}."
                        .format(self.threadId,
                                now.strftime("%Y-%m-%d %H:%M:%S"), timestamp))

                localSeriesId = seriesId
                localTimestamp = timestamp

            if localSeriesId < self.numMetrics:
                commonAttributes = model.createWriteRecordCommonAttributes(
                    self.dimensionMetrics[localSeriesId])
                ## Use the series id captured while holding the lock.
                records = model.createRandomMetrics(localSeriesId, localTimestamp,
                                                    "MILLISECONDS",
                                                    self.highUtilizationHosts,
                                                    self.lowUtilizationHosts)
            else:
                commonAttributes = model.createWriteRecordCommonAttributes(
                    self.dimensionEvents[localSeriesId - self.numMetrics])
                records = model.createRandomEvent(localTimestamp,
                                                  "MILLISECONDS")

            idx += 1
            start = timer()
            try:
                writeResult = tswrite.writeRecords(self.client,
                                                   self.databaseName,
                                                   self.tableName,
                                                   commonAttributes, records)
                self.success += 1
            except Exception as e:
                print(e)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=2,
                                          file=sys.stdout)
                requestId = "RequestId: {}".format(
                    e.response['ResponseMetadata']['RequestId'])
                print(requestId)
                print(json.dumps(commonAttributes, indent=2))
                print(json.dumps(records, indent=2))
                continue
            finally:
                self.count += 1
                end = timer()
                cur = end - start
                self.digest.update(cur)
                self.sum += cur
                ## Computing the streaming M^2 (squared distance from mean)
                delta = cur - mean
                mean += delta / self.count
                squared += delta * (cur - mean)
                if self.count > 1:
                    self.variance = float(squared / (self.count - 1))

            requestId = writeResult['ResponseMetadata']['RequestId']
            if idx % 1000 == 0:
                now = datetime.datetime.now()
                print(
                    "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}"
                    .format(self.threadId, idx,
                            now.strftime("%Y-%m-%d %H:%M:%S"), requestId,
                            round(self.sum / self.count, 3),
                            round(math.sqrt(self.variance), 3),
                            round(self.digest.percentile(50), 3),
                            round(self.digest.percentile(90), 3),
                            round(self.digest.percentile(99), 3)))

    def interrupt(self):
        print("Interrupting thread: ", self.threadId)
        self.sigInt = True
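The `finally` block maintains two streaming summaries at once: the t-digest for percentiles and Welford's algorithm (the `mean`/`squared` accumulators) for mean and sample variance. A self-contained sketch of just that bookkeeping:

from tdigest import TDigest

def summarize(latencies):
    """Stream values through a t-digest plus Welford's mean/variance."""
    digest = TDigest()
    count, mean, m2 = 0, 0.0, 0.0
    for x in latencies:
        digest.update(x)
        count += 1
        delta = x - mean
        mean += delta / count
        m2 += delta * (x - mean)
    variance = m2 / (count - 1) if count > 1 else float('nan')
    return mean, variance, digest.percentile(50), digest.percentile(99)

print(summarize([0.12, 0.10, 0.45, 0.11, 0.13]))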
Example #13
def generate_benchmark_commands(total_benchmark_commands, bench_fname, all_fname, indexname, docs, stop_words,
                                use_numeric_range_searchs, ts_digest, p_writes):
    total_benchmark_reads = 0
    total_benchmark_writes = 0
    all_csvfile = open(all_fname, 'a', newline='')
    bench_csvfile = open(bench_fname, 'w', newline='')
    all_csv_writer = csv.writer(all_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    bench_csv_writer = csv.writer(bench_csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
    progress = tqdm(unit="docs", total=total_benchmark_commands)
    total_docs = len(docs)

    ## timestamp related
    timestamps_pdist = generate_lognormal_dist(total_benchmark_commands)
    min_ts = ts_digest.percentile(0.0)
    max_ts = ts_digest.percentile(100.0)
    query_range_digest = TDigest()

    generated_commands = 0
    while generated_commands < total_benchmark_commands:
        query_ts_pdist = timestamps_pdist[generated_commands]
        percentile = (1.0 - query_ts_pdist) * 100.0
        query_min_ts = ts_digest.percentile(percentile)

        random_doc_pos = random.randint(0, total_docs - 1)
        doc = docs[random_doc_pos]
        # decide read or write
        p_cmd = random.random()
        if p_cmd < p_writes:
            ## WRITE
            total_benchmark_writes = total_benchmark_writes + 1
            generated_row, doc_size = use_case_to_cmd(use_ftadd, doc["title"], doc["text"], doc["comment"],
                                                      doc["username"],
                                                      doc["timestamp"],
                                                      generated_commands)

        else:
            ## READ
            total_benchmark_reads = total_benchmark_reads + 1
            words, totalW = getQueryWords(doc, stop_words, 2)

            choice = random.choices(["simple-1word-query", "2word-union-query", "2word-intersection-query"])[0]
            generated_row = None
            numeric_range_str = ""
            if use_numeric_range_searchs:
                numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)
                query_range_digest.update(int(max_ts - query_min_ts))
            if choice == "simple-1word-query" and len(words) >= 1:
                generated_row = generate_ft_search_row(indexname, "simple-1word-query",
                                                       "{}{}".format(numeric_range_str, words[0]))
            elif choice == "2word-union-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-union-query",
                                                       "{}{} {}".format(numeric_range_str, words[0], words[1]))
            elif choice == "2word-intersection-query" and len(words) >= 2:
                generated_row = generate_ft_search_row(indexname, "2word-intersection-query",
                                                       "{}{}|{}".format(numeric_range_str, words[0], words[1]))
        if generated_row is not None:
            #             all_csv_writer.writerow(generated_row)
            #             bench_csv_writer.writerow(generated_row)
            progress.update()
            generated_commands = generated_commands + 1
    progress.close()
    bench_csvfile.close()
    all_csvfile.close()

    #     print()
    xx = []
    yy = []
    p90 = query_range_digest.percentile(90.0)
    dataset_percent = ts_digest.cdf(p90)

    print("90% of the read queries target at max {} percent o keyspace".format(dataset_percent))
    print("100% of the read queries target at max {} percent o keyspace".format(ts_digest.cdf(max_ts - min_ts)))
    for centroid in query_range_digest.centroids_to_list():
        ts_m = centroid["m"]
        xx.append(ts_m)
        yy.append(query_range_digest.cdf(ts_m))
    plt.scatter(xx, yy)

    plt.title('EnWiki pages Query time range')
    plt.xlabel('Query time range')
    plt.ylabel('cdf')
    plt.xscale('log')
    plt.show()

    return total_benchmark_reads, total_benchmark_writes
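Each read query turns one draw from the lognormal distribution into a percentile of the timestamp digest and uses the result as the lower bound of the range filter. A compact sketch of that mapping (the uniform draw below stands in for one `generate_lognormal_dist` value, whose exact range the snippet does not show):

import numpy as np
from tdigest import TDigest

ts_digest = TDigest()
ts_digest.batch_update(np.random.uniform(1_500_000_000, 1_600_000_000, 1000))

max_ts = ts_digest.percentile(100.0)
p = float(np.random.random())              # stand-in for one lognormal draw
percentile = (1.0 - p) * 100.0
query_min_ts = ts_digest.percentile(percentile)
numeric_range_str = "@timestamp:[{} {}] ".format(query_min_ts, max_ts)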
Example #14
    doc = {}
    text = None
    comment = None
    username = None
    timestamp = None
    ts_digest = TDigest()
    for event, elem in tree:
        if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
            doc = {}
            doc["title"] = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}title")
            doc["text"] = text
            doc["comment"] = comment
            doc["username"] = username
            doc["timestamp"] = int(timestamp)
            ts_digest.update(int(timestamp))
            if doc["text"] is not None and doc["comment"] is not None and doc["username"] is not None and doc[
                "timestamp"] is not None:
                total_docs = total_docs + 1
                docs.append(doc)
                progress.update()
                elem.clear()  # won't need the children any more
        if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}revision":
            text = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}text")
            comment = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}comment")
            ts = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}timestamp")
            dt = parse(ts)
            timestamp = dt.timestamp()
        if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}contributor":
            username = elem.findtext("{http://www.mediawiki.org/xml/export-0.10/}username")
Example #15
class IngestionThread(threading.Thread):
    def __init__(self, threadId, args, dimensionMetrics, dimensionEvents,
                 highUtilizationHosts, lowUtilizationHosts, event):
        threading.Thread.__init__(self)
        self.threadId = threadId
        self.args = args
        self.dimensionMetrics = dimensionMetrics
        self.dimensionEvents = dimensionEvents
        self.client = tswrite.createWriteClient(region=args.region,
                                                profile=args.profile,
                                                endpoint=args.endpoint)
        self.databaseName = args.databaseName
        self.tableName = args.tableName
        self.numMetrics = len(dimensionMetrics)
        self.numEvents = len(dimensionEvents)
        ## Use the t-digest to compute the streaming percentiles.
        self.digest = TDigest()
        self.count = 0
        self.success = 0
        self.sum = 0.0
        self.variance = float('nan')
        self.highUtilizationHosts = highUtilizationHosts
        self.lowUtilizationHosts = lowUtilizationHosts
        self.sigInt = False
        self.event = event
        self.recordsWritten = 0

    def run(self):
        global seriesId
        global timestamp
        global lock

        idx = 0
        mean = 0.0
        squared = 0.0
        addReqId = self.args.addReqId
        addReqIdAsDim = addReqId and self.args.addReqIdAsDim
        addReqIdAsMeasure = addReqId and not self.args.addReqIdAsDim

        writeRecordsBatch = list()
        recordsToWrite = list()

        while True:
            with lock:
                if self.sigInt or sigInt or self.event.is_set():
                    print("Thread {} exiting.".format(self.threadId))
                    break

                seriesId += 1
                if seriesId >= self.numMetrics + self.numEvents:
                    ## Wrapping around, so move to new timestamp.
                    seriesId = 0
                    newTimestamp = timestamp + self.args.intervalMillis
                    currentTime = getCurrentTimestampMillis()
                    ## Check if the timestamps are falling behind
                    if newTimestamp < currentTime - 0.05 * self.args.intervalMillis:
                        print(
                            "Can't keep up ingestion to the desired inter-event interval. Expected interval: {} ms. Actual: {} ms. Consider increasing concurrency or processes."
                            .format(self.args.intervalMillis,
                                    currentTime - timestamp))
                        ## Move time forward.
                        timestamp = getTimestampMillis()
                    else:
                        timestamp = newTimestamp
                        ## Check if we are ingesting too fast, then slow down.
                        if timestamp > currentTime - 1000:
                            ## Slow down
                            sleepTimeSecs = int(
                                (timestamp - currentTime) / 1000)
                            print("Thread {} sleeping for {} secs".format(
                                self.threadId, sleepTimeSecs))
                            time.sleep(sleepTimeSecs)

                    now = datetime.datetime.now()
                    print(
                        "Resetting to first series from thread: [{}] at time {}. Timestamp set to: {}."
                        .format(self.threadId,
                                now.strftime("%Y-%m-%d %H:%M:%S"), timestamp))

                localSeriesId = seriesId
                localTimestamp = timestamp

            if localSeriesId < self.numMetrics:
                dimensions = model.createDimensionsEntry(
                    self.dimensionMetrics[localSeriesId],
                    addReqId=addReqIdAsDim)
                records = model.createRandomMetrics(localSeriesId,
                                                    dimensions,
                                                    localTimestamp,
                                                    "MILLISECONDS",
                                                    self.highUtilizationHosts,
                                                    self.lowUtilizationHosts,
                                                    wide=self.args.wide,
                                                    addReqId=addReqIdAsMeasure)
            else:
                dimensions = model.createDimensionsEntry(
                    self.dimensionEvents[localSeriesId - self.numMetrics],
                    addReqId=addReqIdAsDim)
                records = model.createRandomEvent(dimensions,
                                                  localTimestamp,
                                                  "MILLISECONDS",
                                                  wide=self.args.wide,
                                                  addReqId=addReqIdAsMeasure)

            if self.args.batchWrites:
                if len(writeRecordsBatch) + len(
                        records) <= self.args.batchSize:
                    writeRecordsBatch.extend(records)
                    ## Generate more data, unless we're wrapping around, at which point, drain any pending records.
                    if localSeriesId < self.numMetrics + self.numEvents:
                        continue
                else:
                    ## transfer a subset of values from the records produced into the batch
                    spaceRemaining = self.args.batchSize - len(
                        writeRecordsBatch)
                    assert (spaceRemaining < len(records))
                    ## Transfer 0 - spaceRemaining - 1 to be written with this batch, and spaceRemaining - end in the next batch
                    ## If spaceRemaining is 0, then just write what we have accumulated so far
                    if spaceRemaining > 0:
                        writeRecordsBatch.extend(records[0:spaceRemaining])
                    ## The batch is full, now we issue the write record request.
                    recordsToWrite.clear()
                    recordsToWrite.extend(writeRecordsBatch)
                    writeRecordsBatch.clear()
                    writeRecordsBatch.extend(records[spaceRemaining:])
            else:
                recordsToWrite.clear()
                recordsToWrite.extend(records)

            idx += 1
            start = timer()
            try:
                writeResult = tswrite.writeRecords(self.client,
                                                   self.databaseName,
                                                   self.tableName,
                                                   recordsToWrite)
                self.recordsWritten += len(recordsToWrite)
                self.success += 1
            except Exception as e:
                print(e)
                exc_type, exc_value, exc_traceback = sys.exc_info()
                traceback.print_exception(exc_type,
                                          exc_value,
                                          exc_traceback,
                                          limit=2,
                                          file=sys.stdout)
                requestId = "RequestId: {}".format(
                    e.response['ResponseMetadata']['RequestId'])
                print(requestId)
                print(json.dumps(dimensions, indent=2))
                print(json.dumps(records, indent=2))
                continue
            finally:
                self.count += 1
                end = timer()
                cur = end - start
                self.digest.update(cur)
                self.sum += cur
                ## Computing the streaming M^2 (squared distance from mean)
                delta = cur - mean
                mean += delta / self.count
                squared += delta * (cur - mean)
                if self.count > 1:
                    self.variance = float(squared / (self.count - 1))

            requestId = writeResult['ResponseMetadata']['RequestId']
            if idx % 1000 == 0:
                now = datetime.datetime.now()
                print(
                    "{}. {}. {}. Last RequestId: {}. Avg={:,}, Stddev={:,}, 50thPerc={:,}, 90thPerc={:,}, 99thPerc={:,}. Records written: {}"
                    .format(self.threadId, idx,
                            now.strftime("%Y-%m-%d %H:%M:%S"), requestId,
                            round(self.sum / self.count, 3),
                            round(math.sqrt(self.variance), 3),
                            round(self.digest.percentile(50), 3),
                            round(self.digest.percentile(90), 3),
                            round(self.digest.percentile(99), 3),
                            self.recordsWritten))

    def interrupt(self):
        print("Interrupting thread: ", self.threadId)
        self.sigInt = True
Example #16
def column_summary(series, column_props, delta=0.01):
    """Summarise a numeric column.

    Parameters
    ----------
    series : pd.Series
        Numeric column.
    column_props : dict
        Per-column properties; for this column it must provide the
        ``numeric``, ``notnulls`` and ``is_categorical`` entries.
    delta : float
        Compression parameter passed to the t-digest.

    Returns
    -------
    dict or None
        ``{col: summary, "_columns": [col]}`` where ``summary`` holds basic
        statistics, percentiles, the compressed t-digest, a histogram and a
        KDE; None if the column is not numeric or contains only nulls.
    """
    col = series.name
    if not column_props[col]["numeric"] or column_props[col]["notnulls"] == 0:
        # Series is not numeric or is all NaNs.
        return None

    logger.debug("column_summary - " + col)

    # select non-nulls from column
    data = series.dropna()

    colresult = {}
    for m in ["mean", "min", "max", "std", "sum"]:
        val = getattr(data, m)()
        if type(val) is np.int64:
            colresult[m] = int(val)
        else:
            colresult[m] = val

    colresult["n"] = column_props[col]["notnulls"]

    percentiles = [0.1, 1, 10, 25, 50, 75, 90, 99, 99.9]
    colresult["percentiles"] = {
        perc: np.nanpercentile(series, perc)
        for perc in percentiles
    }
    colresult["median"] = colresult["percentiles"][50]
    colresult["iqr"] = (colresult["percentiles"][75] -
                        colresult["percentiles"][25])

    # Compute the t-digest.
    logger.debug("column_summary - {} - creating TDigest...".format(col))
    digest = TDigest(delta)
    digest.batch_update(data)

    logger.debug("column_summary - {} - testing log trans...".format(col))
    try:
        colresult["logtrans"] = bool(_test_logtrans(digest))
    except Exception as e:
        # Hard to pinpoint problems with the logtrans TDigest.
        logger.warning("test_logtrans has failed for column `{}`: {}".format(
            col, e))
        colresult["logtrans"] = False

    if colresult["logtrans"]:
        logdigest = TDigest()
        for c in digest.C.values():
            logdigest.update(np.log(c.mean), c.count)
        colresult["logtrans_mean"] = _tdigest_mean(logdigest)
        colresult["logtrans_std"] = _tdigest_std(logdigest)
        colresult["logtrans_IQR"] = logdigest.percentile(
            75) - logdigest.percentile(25)

    logger.debug("column_summary - {} - should {}be log-transformed".format(
        col, "NOT " if not colresult["logtrans"] else ""))

    # Compress and store the t-digest.
    digest.delta = delta
    digest.compress()
    colresult["tdigest"] = [(c.mean, c.count) for c in digest.C.values()]

    # Compute histogram
    logger.debug("column_summary - {} - computing histogram...".format(col))

    if column_props[col]["is_categorical"]:
        # Compute frequency table and store as histogram
        counts, edges = _compute_histogram_from_frequencies(data)
    else:
        if colresult["logtrans"]:
            counts, log_edges = np.histogram(np.log10(data),
                                             density=False,
                                             bins="fd")
            edges = 10**log_edges
        else:
            counts, edges = np.histogram(data, density=False, bins="fd")

    colresult["histogram"] = {
        "counts": counts.tolist(),
        "bin_edges": edges.tolist(),
    }

    # Compute KDE
    logger.debug("column_summary - {} - computing KDE...".format(col))
    bw = _bw_scott(colresult, colresult["n"], colresult["logtrans"], 1)

    logger.debug("column_summary - {} - KDE bw: {:.4g}".format(col, bw))

    if column_props[col]["is_categorical"]:
        kde_x, kde_y = np.zeros(1), np.zeros(1)
    else:
        coord_range = colresult["min"], colresult["max"]
        kde_x, kde_y = _compute_smoothed_histogram(
            data, bw, coord_range, logtrans=colresult["logtrans"])

    colresult["kde"] = {"x": kde_x.tolist(), "y": kde_y.tolist()}

    return {col: colresult, "_columns": [col]}
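`colresult["tdigest"]` stores the digest as plain `(mean, count)` centroid pairs. One way to rebuild an approximate, queryable digest from that stored form (a sketch; `digest_from_centroids` is not part of any library shown here):

from tdigest import TDigest

def digest_from_centroids(pairs, delta=0.01):
    """Rebuild an approximate TDigest from stored (mean, count) pairs."""
    digest = TDigest(delta)
    for mean, count in pairs:
        digest.update(mean, count)
    return digest

# e.g. restored = digest_from_centroids(colresult["tdigest"])
#      restored.percentile(50) approximates the original column median.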