예제 #1
0
    def write(self, detector):
        """Print the detector's metadata and a short data summary to stdout.

        Every attribute of ``detector`` is printed in name order, except the
        bulk fields (data, errors, counter). The minimum and maximum of
        ``detector.data`` follow. When ``self.options.details`` is set,
        text-mode plots of ``detector.data_raw`` are attempted; a plotting
        package that cannot be imported is reported as a warning only.

        Always returns 0.
        """
        bulk_fields = {'data', 'data_raw', 'error', 'error_raw', 'counter'}
        banner = 75 * "*"

        attributes = detector.__dict__
        for key in sorted(attributes):
            if key in bulk_fields:
                continue  # not metadata
            print("{:24s}: '{:s}'".format(str(key), str(attributes[key])))

        # basic statistics of the data itself
        print(banner)
        lowest = detector.data.min()
        highest = detector.data.max()
        print("Data min: {:g}, max: {:g}".format(lowest, highest))

        if not self.options.details:
            return 0

        # scatter plot, for one-dimensional detectors only
        if detector.dimension == 1:
            try:
                from hipsterplot import plot
                print(banner)
                print("Data scatter plot")
                plot(detector.data_raw)
            except ImportError:
                logger.warning("Detailed summary requires installation of hipsterplot package")

        # histogram, regardless of dimension
        try:
            from bashplotlib.histogram import plot_hist
            print(banner)
            print("Data histogram")
            plot_hist(detector.data_raw, bincount=70, xlab=False, showSummary=True)
        except ImportError:
            logger.warning("Detailed summary requires installation of bashplotlib package")

        return 0
예제 #2
0
def explore_col(df, col, histo=True, nb_uniques=10):
    """Print a text summary of one DataFrame column.

    The report contains the column name in a banner, the Python type of the
    first value, the first ten values, and the min/max of the non-NaN values.
    For numeric columns it adds the mean, the quartiles and the standard
    deviation, plus an optional text histogram. It ends with the number of
    distinct values (listed with their percentage share when there are fewer
    than ``nb_uniques``) and the NaN count.

    Args:
        df: DataFrame holding the column.
        col: Name of the column to describe.
        histo: When true, draw a histogram for numeric columns via ``plot_hist``.
        nb_uniques: Distinct values are listed only when there are fewer than this.

    Returns:
        None; everything is printed to stdout.
    """
    series = df[col]
    present = series.dropna()
    # Positional access: ``series[0]`` is a *label* lookup and raises KeyError
    # whenever the index does not contain the label 0 (filtered or re-indexed
    # frames). An empty column has no first value at all.
    first = series.iloc[0] if len(series) else None

    banner = '-' * (len(col) + 4)
    print(banner)
    print(f"| {col} |")
    print(banner)
    print(f"type: {type(first)}")
    print(series[:10])
    print(f"\nmin: {present.min()}")
    print(f"max: {present.max()}")

    if isinstance(first, (int, float, np.integer, np.floating)):
        print(f"mean: {np.mean(series)}")
        print("\n---------------------------------")
        print("|  25%  |  50%  |  75%  |  std  |")
        print("---------------------------------")
        # every cell is padded/truncated to exactly 7 characters
        quartile_cells = [(str(val) + '       ')[:7]
                          for val in series.quantile([.25, .5, .75])]
        std_cell = (str(series.std()) + '       ')[:7]
        print("|" + "|".join(quartile_cells) + "|" + std_cell + "|")
        print("---------------------------------")
        if histo:
            print("histogram:")
            if isinstance(first, bool):
                # booleans are plotted as 0/1 (anything else as -1)
                plot_hist(
                    present.apply(lambda row: 0 if row is False else 1 if row is True else -1),
                    bincount=30,
                    pch='.',
                    xlab=True
                )
            else:
                plot_hist(present, bincount=30, pch='.', xlab=True)

    uniques = sorted(present.unique())
    print(f"contains {len(uniques)} different values")

    if len(uniques) < nb_uniques:
        print("uniques values: ")
        columns = '| ' + ' | '.join([str(elem) for elem in uniques]) + ' |'
        print('-' * len(columns))
        print(columns)
        print('-' * len(columns))
        # percentage of rows holding each value, cut to the width of the value
        cells = []
        for elem in uniques:
            width = len(str(elem))
            share = (series == elem).sum() / series.shape[0] * 100
            cells.append((str(share) + ' ' * width)[:width])
        print('| ' + '%| '.join(cells) + '%|')
        print('-' * len(columns))

    print(f"contains {series.shape[0] - present.shape[0]} NaN values\n")
예제 #3
0
파일: stats.py 프로젝트: weif000/cameo
 def display(self):
     """Draw the knockout-size histogram and a size-vs-fitness scatter plot.

     Both plots are rendered as text through ``plot_hist`` and
     ``plot_scatter``; nothing is returned.
     """
     knockout_sizes = list(self.knockouts_hist)
     plot_hist(knockout_sizes,
               title="Knockout size distribution",
               colour="blue")

     # plot_scatter receives each point as one "x, y" text line
     solutions = self.solution.solutions
     lines = []
     for size, fitness in zip(solutions['Size'], solutions['Fitness']):
         lines.append("%s, %s" % (size, fitness))

     plot_scatter(
         lines, None, None, 20, "*", "blue",
         "Correlation between number of knockouts and fitness")
예제 #4
0
def plot(clk_json):
    """Plot a text histogram of the popcounts of the CLKs in a JSON stream.

    ``clk_json`` is a readable file object whose JSON document stores the
    encodings under the ``'clks'`` key.

    Raises:
        DescribeError: if the stream is not valid JSON or holds no CLKs.
    """
    try:
        # the data was written with json.dump({'clks': clk_data}, output)
        clks = json.load(clk_json)['clks']
    except ValueError as err:
        # json.decoder.JSONDecodeError would be more specific on Python 3,
        # but it does not exist on Python 2.
        raise_from(DescribeError('The input is not a valid JSON file.'), err)

    if len(clks) == 0:
        raise DescribeError('No clks found')

    popcounts = []
    for encoded in clks:
        popcounts.append(deserialize_bitarray(encoded).count())
    plot_hist(popcounts, bincount=60, title='popcounts', xlab=True, showSummary=True)
예제 #5
0
    def write(self, estimator):
        """Print the estimator's metadata and per-page data summaries to stdout.

        Attributes of ``estimator`` are printed in name order, skipping the
        bulk fields and the page list. Each page in ``estimator.pages`` then
        gets its own metadata listing followed by the min/max of its
        ``data_raw``. When ``self.options.details`` is set, text-mode plots of
        ``estimator.data_raw`` are attempted; a plotting package that cannot
        be imported is reported as a warning only.

        Always returns 0.
        """
        estimator_bulk = {'data', 'data_raw', 'error', 'error_raw', 'counter', 'pages'}
        page_bulk = {'data', 'data_raw', 'error', 'error_raw'}

        for name, value in sorted(estimator.__dict__.items()):
            # skip non-metadata fields
            if name not in estimator_bulk:
                print("{:24s}: '{:s}'".format(str(name), str(value)))
        # print some data-related statistics
        print(75 * "*")

        page_count = len(estimator.pages)
        for page_no, page in enumerate(estimator.pages):
            print("Page {} / {}".format(page_no, page_count))
            for name, value in sorted(page.__dict__.items()):
                # skip non-metadata fields
                if name not in page_bulk:
                    print("\t{:24s}: '{:s}'".format(str(name), str(value)))
            print("Data min: {:g}, max: {:g}".format(page.data_raw.min(), page.data_raw.max()))
            print(75 * "-")

        if self.options.details:
            # print data scatter-plot if possible
            # The pages live on the estimator; the original read ``self.pages``
            # here, which is inconsistent with the page loop above.
            if estimator.dimension == 1 and page_count == 1:
                try:
                    from hipsterplot import plot
                    print(75 * "*")
                    print("Data scatter plot")
                    plot(estimator.data_raw)
                except ImportError:
                    logger.warning("Detailed summary requires installation of hipsterplot package")
            # print data histogram if possible
            try:
                from bashplotlib.histogram import plot_hist
                print(75 * "*")
                print("Data histogram")
                plot_hist(estimator.data_raw, bincount=70, xlab=False, showSummary=True)
            except ImportError:
                logger.warning("Detailed summary requires installation of bashplotlib package")

        return 0
예제 #6
0
def _do_dir(target_dir: pathlib.Path):
    """Log the size of every image under ``target_dir`` and plot size histograms.

    All regular files below the directory are opened with PIL; files that
    cannot be processed are logged as skipped. Histograms of the widths, the
    heights and ``sqrt(width * height)`` are printed afterwards.
    """
    widths = []
    heights = []
    candidates = [entry for entry in target_dir.glob("**/*") if entry.is_file()]
    for path in tqdm.tqdm(candidates, ascii=True, ncols=100):
        try:
            with PIL.Image.open(path) as img:
                # apply the EXIF orientation when possible; keep the image as-is otherwise
                try:
                    img = PIL.ImageOps.exif_transpose(img)
                except Exception:
                    pass
                widths.append(img.width)
                heights.append(img.height)
                # keep the log line from clobbering the progress bar
                with tqdm.tqdm.external_write_mode():
                    logger.info(
                        f"{path.relative_to(target_dir)}: width={img.width} height={img.height}"
                    )
        except Exception:
            logger.info(f"{path.relative_to(target_dir)}: skip")

    width_arr = np.array(widths)
    height_arr = np.array(heights)
    plot_hist(width_arr, title="width", bincount=20, showSummary=True)
    plot_hist(height_arr, title="height", bincount=20, showSummary=True)

    geometric_sizes = np.sqrt(width_arr * height_arr)
    plot_hist(geometric_sizes, title="sqrt(width * height)", bincount=20, showSummary=True)
예제 #7
0
from bashplotlib.scatterplot import plot_scatter
from bashplotlib.histogram import plot_hist
# Draw a text histogram with explicit height, marker character, colour and an
# empty title; bincount and xlab are passed as None.
# NOTE(review): the data argument is the bare integer 5. plot_hist presumably
# expects a file path or a sequence of numbers here - confirm the intended input.
plot_hist(5,
          height=20.0,
          bincount=None,
          pch='0',
          colour='white',
          title='',
          xlab=None)
예제 #8
0
#!/usr/bin/env python3

# sudo pip3 install bashplotlib
# bashplotlib is a python package and command line tool for making basic plots in the terminal.
# It’s a quick way to visualize data when you don’t have a GUI.

import numpy as np
from bashplotlib.histogram import plot_hist

# 10000 draws from a normal distribution with mean 0 and standard deviation 1
arr = np.random.normal(loc=0, scale=1, size=10000)

# render the sample as a 50-bin text histogram
plot_hist(arr, bincount=50)

#* 647|                         oo
#* 613|                        ooooo
#* 579|                       oooooo
#* 545|                      ooooooo o
#* 511|                     oooooooooo
#* 477|                     ooooooooooo
#* 443|                     ooooooooooo
#* 409|                     oooooooooooo
#* 375|                    ooooooooooooo
#* 341|                   ooooooooooooooo
#* 307|                   ooooooooooooooo
#* 273|                   oooooooooooooooo
#* 239|                  ooooooooooooooooo
#* 205|                 oooooooooooooooooo
#* 171|                 ooooooooooooooooooo
#* 137|                ooooooooooooooooooooo
#* 103|               ooooooooooooooooooooooo
#*  69|             oooooooooooooooooooooooooo
예제 #9
0
def plot_beta(a, b, max=1.0):
    """Plot a text histogram of 10000 Beta(a, b) draws scaled by ``max``."""
    draws = beta.rvs(a, b, size=10000)
    scaled = draws * max
    plot_hist(scaled, bincount=BINCOUNT, height=10, xlab=True)
예제 #10
0
#! /home/gregory.ashton/anaconda2/bin/python

import sys

# Log files to scan are given on the command line.
files = sys.argv[1:]

# Collect one entry (in hours) per 'Total Remote Usage' line found.
CPU_hours = []
for path in files:
    with open(path, 'r') as f:
        for line in f:
            if 'Total Remote Usage' not in line:
                continue
            # the third space-separated field is an H:M:S value with a trailing comma
            stamp = line.lstrip(' ').split(' ')[2].rstrip(',')
            h, m, s = [float(u) for u in stamp.split(':')]
            CPU_hours.append(h + m / 60. + s / (60.**2))

print('\nCPU hours summary')
print('  Total: {}'.format(sum(CPU_hours)))
if CPU_hours:
    print('  Shortest: {}'.format(min(CPU_hours)))
    print('  Longest: {}'.format(max(CPU_hours)))
else:
    # min()/max() raise ValueError on an empty list
    print("  No 'Total Remote Usage' lines found")
print('')

if CPU_hours:
    try:
        from bashplotlib.histogram import plot_hist
        plot_hist(CPU_hours, bincount=50, xlab=True)
    except (ImportError, IOError):
        # the histogram is optional: a missing bashplotlib raises ImportError,
        # which the original `except IOError` did not cover
        pass
def plot_norm(mean, variation):
    """Plot a text histogram of 10000 draws from ``norm.rvs(mean, variation)``."""
    draws = norm.rvs(mean, variation, size=10000)
    plot_hist(draws, bincount=BINCOUNT, height=10, xlab=True)
def plot_beta(a, b, max=1.0):
    """Plot a text histogram of 10000 Beta(a, b) draws multiplied by ``max``."""
    unit_draws = beta.rvs(a, b, size=10000)
    plot_hist(unit_draws * max, bincount=BINCOUNT, height=10, xlab=True)
def plot_gamma(k, theta):
    """Plot a text histogram of 10000 gamma draws with shape ``k`` and scale ``theta``."""
    draws = gamma.rvs(k, scale=theta, size=10000)
    plot_hist(draws, bincount=BINCOUNT, height=10, xlab=True)
예제 #14
0
import json
from bashplotlib.histogram import plot_hist

# Load each sample file inside a `with` block so the handle is closed
# deterministically instead of being left to the garbage collector.
with open('lambdaSamples.json') as sample_file:
    data = json.load(sample_file)
plot_hist(data, height=25, bincount=80, xlab=True, title="Lambda samples")

with open('muSamples.json') as sample_file:
    data = json.load(sample_file)
plot_hist(data, height=25, bincount=80, xlab=True, title="Mu samples")
예제 #15
0
def main():
    """Score every test target sentence by its overlap with the training targets.

    Test pairs are read from ``args.test_file`` and training pairs from the
    comma-separated ``args.train_files``. All target sentences are sorted
    together; each test sentence is then compared with the nearest training
    sentence before and after it in that order. The score is the LCS length
    divided by the longer token count, and the best of the two is kept.

    The number and the ids of test sentences scoring above 0.8 are printed,
    a histogram of all scores is drawn, and the scores are written to
    ``args.output_file``, one per line in test order.
    """
    args = Args()
    datasets = {
        "test": read_pairs(args.test_file),
    }
    train_src, train_tgt = [], []
    for train_path in args.train_files.split(","):
        cur_src, cur_tgt = read_pairs(train_path)
        train_src += cur_src
        train_tgt += cur_tgt
    datasets["train"] = (train_src, train_tgt)

    # (sentence, test index) for test items, (sentence, -1) for training items
    merged_dataset = sorted([(tgt, idx if key == "test" else -1)
                             for key, (src_data, tgt_data) in datasets.items()
                             for idx, tgt in enumerate(tgt_data)])

    cnt = 0
    progress = tqdm(merged_dataset)
    overlap_ids = []
    overlap_scores = {}
    for idx, (test_sent, key) in enumerate(progress):
        if key == -1:
            continue
        # nearest training sentence on either side in sorted order
        left = next(
            (i for i in range(idx - 1, -1, -1) if merged_dataset[i][1] == -1),
            None)
        right = next((i for i in range(idx + 1, len(merged_dataset))
                      if merged_dataset[i][1] == -1), None)
        candidates = []
        # Compare against None explicitly: index 0 is a valid neighbour but is
        # falsy, so a plain truth test silently dropped the first sorted entry.
        if left is not None:
            candidates.append(tokenize(merged_dataset[left][0]))
        if right is not None:
            candidates.append(tokenize(merged_dataset[right][0]))
        if len(candidates) == 0:
            # NOTE: debugging hook kept from the original; hit only when there
            # is no training sentence on either side.
            breakpoint()
            continue
        test_sent = tokenize(test_sent)
        max_overlap, cand_sent = max(
            (cotra.utils.lcs(test_sent, cand) / max(len(test_sent), len(cand)),
             cand) for cand in candidates)
        if max_overlap > 0.8:
            cnt += 1
            progress.set_postfix(cnt=cnt)
            overlap_ids.append(key)
        overlap_scores[key] = max_overlap
    print(cnt)
    print(overlap_ids)
    if len(overlap_scores) != len(datasets["test"][0]):
        # NOTE: debugging hook kept from the original; some test sentence got no score.
        breakpoint()
    # back to a list in original test order
    overlap_scores = [
        overlap_scores[idx] for idx in range(len(overlap_scores))
    ]
    plot_hist(overlap_scores,
              height=10,
              pch="x",
              xlab=True,
              showSummary=True,
              bincount=70)

    with open(args.output_file, "w") as f:
        f.write("\n".join(map(str, overlap_scores)))
예제 #16
0
def describe(clk_json):
    """Print an ASCII histogram of the popcounts of the CLKs in ``clk_json``.

    ``clk_json`` is a readable file object whose JSON document stores the
    encodings under the ``'clks'`` key.
    """
    serialized = json.load(clk_json)['clks']
    bitarrays = []
    for clk in serialized:
        bitarrays.append(deserialize_bitarray(clk))
    counts = get_encoding_popcounts(bitarrays)
    plot_hist(counts, bincount=60, title='popcounts', xlab=True, showSummary=True)
예제 #17
0
import matplotlib
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
from bashplotlib.histogram import plot_hist

# Authenticate against Firebase with the service-account key file.
cred = credentials.Certificate("db-rix.json")
firebase_admin.initialize_app(cred)

# Fetch every record stored under /traffic_v2.
ref = db.reference('/traffic_v2', url="https://<your_db_name>.firebaseio.com/")
data = ref.get()

up = []
down = []
for value in data.values():
    up.append(value['upload'])
    down.append(value['download'])

# differences between consecutive readings
up_delta = [later - earlier for earlier, later in zip(up, up[1:])]
down_delta = [later - earlier for earlier, later in zip(down, down[1:])]
index = list(range(1, len(up)))
plot_hist(up)

예제 #18
0
def plot_gamma(k, theta):
    """Plot a text histogram of 10000 gamma draws (shape ``k``, scale ``theta``)."""
    values = gamma.rvs(k, scale=theta, size=10000)
    plot_hist(values, bincount=BINCOUNT, height=10, xlab=True)
def main_work():
    """Summarise a pipe-delimited transcript file and optionally filter it.

    Each non-empty input line is split on ``|``. The first three fields are
    the utterance base name, the plain text and the normalised text; with
    ``-phone`` the 4th field holds space-delimited phones, and with
    ``-speaker`` the 5th field holds the speaker id.

    The function prints the symbol inventory, length statistics and text
    histograms of the sentence lengths (letters/phones, and frames when
    ``-cmp`` points at a directory of ``<base>.npy`` feature files). When
    ``-o`` is given, every record whose audio features were found is written
    to that file.

    All printing uses the single-argument ``print(...)`` form, which behaves
    the same on Python 2 and Python 3.
    """

    #################################################

    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-i', dest='infile', required=True)
    a.add_argument('-cmp', dest='cmpdir', required=False, default='')
    a.add_argument('-maxframes', dest='maxframes', type=int, default=10000000)
    a.add_argument('-maxletters',
                   dest='maxletters',
                   type=int,
                   default=10000000)
    a.add_argument(
        '-phone',
        action='store_true',
        help=
        'Use the 4th field of transcript, containing space-delimited phones')
    a.add_argument('-speaker', action='store_true')
    a.add_argument('-o', dest='outfile', required=False, default='')

    opts = a.parse_args()

    # ===============================================

    # Read the transcript; the handle is closed even if reading fails.
    with codecs.open(opts.infile, 'r', 'utf-8', errors='ignore') as infile:
        texts = infile.readlines()
    texts = [line.strip('\n\r |') for line in texts]
    texts = [t for t in texts if t != '']
    texts = [line.strip().split("|") for line in texts]

    # every record must have the same number of fields as the first one
    for line in texts:
        assert len(line) == len(texts[0]), line

    # lengths of all sentences, before any -maxframes / -maxletters filtering
    all_frame_lengths = []
    all_letter_lengths = []

    # lengths of the sentences that pass the filters
    frame_lengths = []
    letter_lengths = []
    # NOTE(review): nothing is ever appended to `outlines`, so the
    # "processed successfully" count printed below is always 0.
    outlines = []
    letter_inventory = {}
    speaker_ids = {}

    missing_audio = []
    for text in tqdm(texts):

        base, _plaintext, normtext = text[:3]
        if opts.phone:
            assert len(text) >= 4
            symbols = text[3]
            symbols = re.split(r'\s+', symbols)
        else:
            symbols = list(normtext.lower())
        all_letter_lengths.append(len(symbols))

        if opts.speaker:
            assert len(text) >= 5
            speaker_id = text[4]
            speaker_ids[speaker_id] = ''

        if opts.cmpdir:
            cmpfile = os.path.join(opts.cmpdir, base + '.npy')
            if not os.path.exists(cmpfile):
                missing_audio.append(base)
                continue
            # the stored array must be 2-D; its first axis is the frame count
            nframe_audio, _ndim = np.load(cmpfile).shape

            all_frame_lengths.append(nframe_audio)
            if nframe_audio > opts.maxframes:
                continue
            frame_lengths.append(nframe_audio)

        if len(symbols) > opts.maxletters:
            continue

        letter_lengths.append(len(symbols))

        letter_inventory.update(dict(zip(symbols, symbols)))

    print('')
    print('%s of %s sentences processed successfully -- to debug rerun ....' % (
        len(outlines), len(texts)))
    print('')

    print('missing audio for :')
    print(missing_audio)
    print('')
    print('')
    print('Keep %s of %s letter/phone tokens' % (len(letter_lengths),
                                                 len(all_letter_lengths)))
    print('Keep %s of %s frames' % (len(frame_lengths), len(all_frame_lengths)))
    print('')

    if opts.phone:
        print('------------------')
        print('Observed phones:')
        print('------------------')
        print('')
        print(sorted([str(k) for k in letter_inventory.keys()]))
        print('------------------')

        junctures = [
            p for p in letter_inventory.keys()
            if p.startswith(u'<') and p.endswith(u'>')
        ]
        # list.remove raises ValueError for an absent item, so drop the
        # start/end markers only when the data actually contains them
        for marker in ('<_START_>', '<_END_>'):
            if marker in junctures:
                junctures.remove(marker)

        junctures = [p.strip('<>') for p in junctures]
        junctures = [j for j in junctures if j != '']
        print('')
        print('')
        print('Junctures (many of these may be spurious):')
        print('')
        print(' '.join(sorted(junctures)))
        print('')

    else:
        print('------------------')
        print('Observed letters:')
        print('------------------')
        print('')
        print([''.join(sorted(letter_inventory.keys()))])
        print('------------------')
    print('')
    print('Letter/phone length max:')
    print(max(letter_lengths))

    if opts.speaker:
        print('------------------')
        print('Observed speakers:')
        print('------------------')
        print('')
        print(sorted(speaker_ids.keys()))
        print('------------------')

    if opts.cmpdir:
        print('Frame length max:')
        print(max(frame_lengths))

    plot_hist(letter_lengths,
              xlab=True,
              bincount=30,
              title='Histogram of sentence length in letters',
              height=10,
              colour='k')

    if opts.cmpdir:
        plot_hist(frame_lengths,
                  xlab=True,
                  bincount=30,
                  title='Histogram of sentence length in frames',
                  height=10,
                  colour='k')

    print('')
    print('Run again with -maxframes and -maxletters to exclude the tail...')

    if opts.outfile:
        # set membership keeps the filter O(1) per record
        missing = set(missing_audio)
        with open(opts.outfile, 'w') as f:
            for text in tqdm(texts):
                base = text[0]
                if base not in missing:
                    f.write('|'.join(text) + '\n')
예제 #20
0
def plot_norm(mean, variation):
    """Plot a text histogram of 10000 values drawn via ``norm.rvs(mean, variation)``."""
    values = norm.rvs(mean, variation, size=10000)
    plot_hist(values, bincount=BINCOUNT, height=10, xlab=True)
예제 #21
0
# scratch.py
from bashplotlib.horizontal_histogram import plot_horiz_hist
from bashplotlib.histogram import plot_hist
import random

# 99 random integers between 1 and 100 (inclusive)
nums = [random.randint(1, 100) for _ in range(1, 100)]

# plot settings, passed positionally to plot_hist below
height = 20
bincount = 10
binwidth = 1
char = 'o'
color = 'default'
title = 'My Bar Chart'
displayXLabel = True
showSum = True
yStart = True

plot_hist(
    nums, height, bincount, binwidth, char, color, title,
    displayXLabel, showSum, yStart,
    xtitle="My x", ytitle="My y",
)
예제 #22
0
 def display(self):
     """Show the knockout-size histogram, then a size-vs-fitness scatter plot.

     Both are drawn as text via ``plot_hist`` and ``plot_scatter``.
     """
     plot_hist(list(self.knockouts_hist), title="Knockout size distribution", colour="blue")

     # one "x, y" text line per solution
     table = self.solution.solutions
     pairs = zip(table['Size'], table['Fitness'])
     lines = ["%s, %s" % pair for pair in pairs]

     scatter_title = "Correlation between number of knockouts and fitness"
     plot_scatter(lines, None, None, 20, "*", "blue", scatter_title)
예제 #23
0
#!/usr/bin/env python3.6

from bashplotlib.histogram import plot_hist
import glob
import re
import argparse

import fi_tools

parser = argparse.ArgumentParser('Run profiling experiments')
parser.add_argument('-t', '--tool', help='tool to run', required=True)
args = parser.parse_args()


def _collect_field(file_key, field):
    """Return the digits following ``<field>=`` in each matching result file.

    Files are located with the glob ``*/<name>``, where ``<name>`` is
    ``fi_tools.files[args.tool][file_key]``. Values are returned as strings,
    in glob order. Files without a match are reported and skipped instead of
    failing with a TypeError on a ``None`` match.
    """
    # NOTE: re.match anchors at the start of the text and `.` does not match
    # newlines, so the field has to appear on the first line of the file.
    pattern = re.compile(r'.*%s=(\d+).*' % field)
    values = []
    for path in glob.glob('*/' + fi_tools.files[args.tool][file_key]):
        with open(path, 'r') as handle:
            m = pattern.match(handle.read())
        if m is None:
            print('warning: no %s= entry found in %s' % (field, path))
            continue
        values.append(m[1])
    return values


bitflips = _collect_field('injection', 'bitflip')

print(bitflips)
plot_hist(bitflips, title='Distro bitflip', bincount=128, xlab=True)

fi_index = _collect_field('target', 'fi_index')
plot_hist(fi_index, title='Distro fi_index', bincount=128, xlab=True)