def write(self, detector):
    """Dump detector metadata and basic data statistics to standard output.

    Every attribute of ``detector`` except the bulk fields (``data``,
    ``data_raw``, ``error``, ``error_raw``, ``counter``) is printed as
    ``name: 'value'`` in alphabetical order, followed by the minimum and
    maximum of ``detector.data``.

    When ``self.options.details`` is set, terminal plots are attempted too:
    a scatter plot of ``detector.data_raw`` (only when
    ``detector.dimension == 1``, via ``hipsterplot``) and a histogram of
    ``detector.data_raw`` (via ``bashplotlib``).  A plotting package that
    cannot be imported is reported through ``logger.warning``.

    Returns:
        Always 0.
    """
    separator = 75 * "*"
    bulk_fields = {'data', 'data_raw', 'error', 'error_raw', 'counter'}

    # metadata only: the bulk arrays are summarised below instead
    for key, val in sorted(detector.__dict__.items()):
        if key in bulk_fields:
            continue
        print("{:24s}: '{:s}'".format(str(key), str(val)))

    # data-related statistics
    print(separator)
    lowest = detector.data.min()
    highest = detector.data.max()
    print("Data min: {:g}, max: {:g}".format(lowest, highest))

    if not self.options.details:
        return 0

    # scatter plot is only meaningful for one-dimensional data
    if detector.dimension == 1:
        try:
            from hipsterplot import plot
            print(separator)
            print("Data scatter plot")
            plot(detector.data_raw)
        except ImportError:
            logger.warning("Detailed summary requires installation of hipsterplot package")

    # histogram is attempted for any dimension
    try:
        from bashplotlib.histogram import plot_hist
        print(separator)
        print("Data histogram")
        plot_hist(detector.data_raw, bincount=70, xlab=False, showSummary=True)
    except ImportError:
        logger.warning("Detailed summary requires installation of bashplotlib package")

    return 0
def explore_col(df, col, histo=True, nb_uniques=10):
    """Print a plain-text exploration report for one DataFrame column.

    The report contains a boxed title, the Python type of the first value,
    the first ten rows, the min and max of the non-null values and the
    number of distinct and missing values.  When the first value is numeric
    it also prints the mean, a quartile/std table and (optionally) a
    terminal histogram.

    Args:
        df: pandas DataFrame holding the column.
        col: label of the column to describe; its length sizes the title box.
        histo: when true, numeric columns also get a histogram via
            ``plot_hist``.
        nb_uniques: distinct values (and their share in percent) are listed
            only when there are fewer than this many.

    Returns:
        None.  Everything is written to standard output.
    """
    series = df[col]

    title_rule = '-' * (len(col) + 4)
    print(title_rule)
    print(f"| {col} |")
    print(title_rule)

    # An empty column has no first value to inspect; report it and stop
    # instead of failing on the lookup below.
    if series.shape[0] == 0:
        print("column is empty\n")
        return

    # Positional access: the label-based ``df[col][0]`` raises KeyError when
    # the index has no label 0 (filtered frames, custom indexes).
    first = series.iloc[0]
    non_null = series.dropna()

    print(f"type: {type(first)}")
    print(series[:10])
    print(f"\nmin: {non_null.min()}")
    print(f"max: {non_null.max()}")

    is_numeric = (
        isinstance(first, (int, float))
        or np.issubdtype(type(first), np.integer)
        or np.issubdtype(type(first), np.floating)
    )
    if is_numeric:
        print(f"mean: {np.mean(series)}")

        # Four 7-character cells plus five pipes give the 33-character rule,
        # so every cell is padded/truncated to exactly 7 characters.
        table_rule = "---------------------------------"
        cells = [str(val).ljust(7)[:7] for val in series.quantile([.25, .5, .75])]
        cells.append(str(series.std()).ljust(7)[:7])
        print("\n" + table_rule)
        print("|  25%  |  50%  |  75%  |  std  |")
        print(table_rule)
        print("|" + "|".join(cells) + "|")
        print(table_rule)

        if histo:
            print("histogram:")
            if isinstance(first, bool):
                # map booleans onto 0/1 (anything else onto -1) for binning
                as_numbers = non_null.apply(
                    lambda row: 0 if row is False else 1 if row is True else -1
                )
                plot_hist(as_numbers, bincount=30, pch='.', xlab=True)
            else:
                plot_hist(non_null, bincount=30, pch='.', xlab=True)

    uniques = sorted(non_null.unique())
    print(f"contains {len(uniques)} different values")
    if len(uniques) < nb_uniques:
        print("uniques values: ")
        columns = '| ' + ' | '.join([str(elem) for elem in uniques]) + ' |'
        print('-' * len(columns))
        print(columns)
        print('-' * len(columns))
        shares = []
        for elem in uniques:
            # each percentage is cut/padded to the width of its value so the
            # row lines up with the header row above
            width = len(str(elem))
            percent = series.where(series == elem).dropna().count() / series.shape[0] * 100
            shares.append((str(percent) + ' ' * width)[:width])
        print('| ' + '%| '.join(shares) + '%|')
        print('-' * len(columns))

    print(f"contains {series.shape[0] - non_null.shape[0]} NaN values\n")
def display(self):
    """Draw the knockout-size histogram and the size/fitness scatter plot.

    The histogram is built from ``self.knockouts_hist``; the scatter plot
    pairs the ``'Size'`` and ``'Fitness'`` columns of
    ``self.solution.solutions`` as ``"size, fitness"`` text lines.
    """
    knockout_sizes = list(self.knockouts_hist)
    plot_hist(knockout_sizes, title="Knockout size distribution", colour="blue")

    points = []
    for size, fitness in zip(self.solution.solutions['Size'],
                             self.solution.solutions['Fitness']):
        points.append("%s, %s" % (size, fitness))

    plot_scatter(
        points, None, None, 20, "*", "blue",
        "Correlation between number of knockouts and fitness")
def plot(clk_json):
    """Plot a terminal histogram of the popcounts of the CLKs in a JSON file.

    Args:
        clk_json: open file object holding JSON of the form
            ``{'clks': [...]}`` (the data was written with
            ``json.dump({'clks': clk_data}, output)``).

    Raises:
        DescribeError: if the input is not valid JSON, or contains no CLKs.
    """
    try:
        clks = json.load(clk_json)['clks']
    except ValueError as e:
        # json.decoder.JSONDecodeError would be more specific on Python 3,
        # but it does not exist on Python 2; it derives from ValueError.
        raise_from(DescribeError('The input is not a valid JSON file.'), e)

    if len(clks) < 1:
        raise DescribeError('No clks found')

    popcounts = []
    for clk in clks:
        popcounts.append(deserialize_bitarray(clk).count())

    plot_hist(popcounts, bincount=60, title='popcounts', xlab=True, showSummary=True)
def write(self, estimator):
    """Dump estimator metadata and per-page data statistics to standard output.

    Every attribute of ``estimator`` except the bulk fields is printed as
    ``name: 'value'`` in alphabetical order.  Then, for each page in
    ``estimator.pages``, the page metadata is printed followed by the
    minimum and maximum of ``page.data_raw``.

    When ``self.options.details`` is set, terminal plots are attempted too:
    a scatter plot (only for a one-dimensional estimator with exactly one
    page, via ``hipsterplot``) and a histogram (via ``bashplotlib``).  A
    plotting package that cannot be imported is reported through
    ``logger.warning``.

    Returns:
        Always 0.
    """
    separator = 75 * "*"
    estimator_bulk = {'data', 'data_raw', 'error', 'error_raw', 'counter', 'pages'}
    page_bulk = {'data', 'data_raw', 'error', 'error_raw'}

    # estimator-level metadata (non-metadata fields are skipped)
    for name, value in sorted(estimator.__dict__.items()):
        if name not in estimator_bulk:
            print("{:24s}: '{:s}'".format(str(name), str(value)))

    # per-page metadata and data-related statistics
    print(separator)
    page_count = len(estimator.pages)
    for page_no, page in enumerate(estimator.pages):
        print("Page {} / {}".format(page_no, page_count))
        for name, value in sorted(page.__dict__.items()):
            if name not in page_bulk:
                print("\t{:24s}: '{:s}'".format(str(name), str(value)))
        print("Data min: {:g}, max: {:g}".format(page.data_raw.min(), page.data_raw.max()))
        print(75 * "-")

    if self.options.details:
        # The pages belong to the estimator, not to this writer: the guard
        # used to read ``self.pages``, which this method never sets up.
        if estimator.dimension == 1 and page_count == 1:
            try:
                from hipsterplot import plot
                print(separator)
                print("Data scatter plot")
                # NOTE(review): the statistics above read page.data_raw; whether
                # the estimator itself exposes data_raw is not visible here - confirm.
                plot(estimator.data_raw)
            except ImportError:
                logger.warning("Detailed summary requires installation of hipsterplot package")

        # histogram is attempted regardless of dimension / page count
        try:
            from bashplotlib.histogram import plot_hist
            print(separator)
            print("Data histogram")
            plot_hist(estimator.data_raw, bincount=70, xlab=False, showSummary=True)
        except ImportError:
            logger.warning("Detailed summary requires installation of bashplotlib package")

    return 0
def _do_dir(target_dir: pathlib.Path):
    """Collect image dimensions below ``target_dir`` and plot their histograms.

    Every regular file found recursively is opened with PIL; its width and
    height (after an attempted EXIF transpose) are logged and recorded.
    Files that cannot be processed are logged as ``skip``.  Finally three
    terminal histograms are drawn: widths, heights and
    ``sqrt(width * height)``.
    """
    widths = []
    heights = []
    candidates = [entry for entry in target_dir.glob("**/*") if entry.is_file()]

    for path in tqdm.tqdm(candidates, ascii=True, ncols=100):
        try:
            with PIL.Image.open(path) as img:
                # best effort: fall back to the untransposed image on failure
                try:
                    img = PIL.ImageOps.exif_transpose(img)
                except Exception:
                    pass
                widths.append(img.width)
                heights.append(img.height)
                # keep the log line from clashing with the progress bar
                with tqdm.tqdm.external_write_mode():
                    logger.info(
                        f"{path.relative_to(target_dir)}: width={img.width} height={img.height}"
                    )
        except Exception:
            logger.info(f"{path.relative_to(target_dir)}: skip")

    width_arr = np.array(widths)
    height_arr = np.array(heights)
    plot_hist(width_arr, title="width", bincount=20, showSummary=True)
    plot_hist(height_arr, title="height", bincount=20, showSummary=True)

    geometric_size = np.sqrt(width_arr * height_arr)
    plot_hist(geometric_size, title="sqrt(width * height)", bincount=20, showSummary=True)
from bashplotlib.scatterplot import plot_scatter
from bashplotlib.histogram import plot_hist

# Draw a histogram with explicit display options (height, point character,
# colour, empty title, no x-axis labels).
# NOTE(review): the first argument is the bare integer 5; presumably plot_hist
# expects a collection of numbers or a file name - confirm what was intended.
plot_hist(5, height=20.0, bincount=None, pch='0', colour='white', title='', xlab=None)
#!/usr/bin/env python3
# Install with: sudo pip3 install bashplotlib
# bashplotlib is a Python package and command-line tool for making basic
# plots in the terminal. It is a quick way to visualize data when you
# don't have a GUI.
import numpy as np
from bashplotlib.histogram import plot_hist

# 10000 draws from a normal distribution (mean 0, standard deviation 1)
SAMPLE_COUNT = 10000
arr = np.random.normal(loc=0, scale=1, size=SAMPLE_COUNT)

plot_hist(arr, bincount=50)

# Example of the rendered histogram:
#* 647| oo
#* 613| ooooo
#* 579| oooooo
#* 545| ooooooo o
#* 511| oooooooooo
#* 477| ooooooooooo
#* 443| ooooooooooo
#* 409| oooooooooooo
#* 375| ooooooooooooo
#* 341| ooooooooooooooo
#* 307| ooooooooooooooo
#* 273| oooooooooooooooo
#* 239| ooooooooooooooooo
#* 205| oooooooooooooooooo
#* 171| ooooooooooooooooooo
#* 137| ooooooooooooooooooooo
#* 103| ooooooooooooooooooooooo
#* 69| oooooooooooooooooooooooooo
def plot_beta(a, b, max=1.0):
    """Plot a terminal histogram of 10000 Beta(a, b) draws scaled by ``max``.

    Args:
        a: first shape parameter passed to ``beta.rvs``.
        b: second shape parameter passed to ``beta.rvs``.
        max: factor every draw is multiplied by before plotting.
    """
    draws = beta.rvs(a, b, size=10000)
    scaled = draws * max
    plot_hist(scaled, bincount=BINCOUNT, height=10, xlab=True)
#! /home/gregory.ashton/anaconda2/bin/python
"""Summarise the CPU hours reported in the log files given on the command line.

Each line containing 'Total Remote Usage' contributes one H:M:S duration.
The total, shortest and longest durations are printed in hours and, when
bashplotlib is available, a terminal histogram is drawn.
"""
import sys


def parse_cpu_hours(lines):
    """Return the durations (in hours) of all 'Total Remote Usage' lines.

    Args:
        lines: iterable of text lines (e.g. an open file).

    Returns:
        List of floats, one per matching line, in input order.
    """
    hours = []
    for line in lines:
        if 'Total Remote Usage' in line:
            # the third space-separated field holds the H:M:S stamp,
            # followed by a comma
            stamp = line.lstrip(' ').split(' ')[2].rstrip(',')
            h, m, s = [float(u) for u in stamp.split(':')]
            hours.append(h + m / 60. + s / (60.**2))
    return hours


def main(files):
    """Print the CPU-hours summary for ``files`` and return the durations.

    Args:
        files: paths of the log files to scan.

    Returns:
        List of the CPU hours found (empty when nothing matched).
    """
    CPU_hours = []
    for path in files:
        with open(path, 'r') as f:
            CPU_hours.extend(parse_cpu_hours(f))

    print('\nCPU hours summary')
    if not CPU_hours:
        # min()/max() raise ValueError on an empty list; say so instead
        print(' No Total Remote Usage lines found')
        return CPU_hours
    print(' Total: {}'.format(sum(CPU_hours)))
    print(' Shortest: {}'.format(min(CPU_hours)))
    print(' Longest: {}'.format(max(CPU_hours)))
    print('')

    # The histogram is optional: a missing bashplotlib raises ImportError,
    # which the former ``except IOError`` did not catch.
    try:
        from bashplotlib.histogram import plot_hist
    except ImportError:
        return CPU_hours
    try:
        plot_hist(CPU_hours, bincount=50, xlab=True)
    except IOError:
        pass
    return CPU_hours


if __name__ == '__main__':
    main(sys.argv[1:])
def plot_norm(mean, variation):
    """Plot a terminal histogram of 10000 draws from ``norm.rvs``.

    Args:
        mean: first positional argument handed to ``norm.rvs``.
        variation: second positional argument handed to ``norm.rvs``.
    """
    draws = norm.rvs(mean, variation, size=10000)
    plot_hist(
        draws,
        bincount=BINCOUNT,
        height=10,
        xlab=True,
    )
def plot_gamma(k, theta):
    """Plot a terminal histogram of 10000 draws from ``gamma.rvs``.

    Args:
        k: shape argument handed to ``gamma.rvs``.
        theta: value handed to ``gamma.rvs`` as ``scale``.
    """
    draws = gamma.rvs(k, scale=theta, size=10000)
    hist_options = dict(bincount=BINCOUNT, height=10, xlab=True)
    plot_hist(draws, **hist_options)
import json
from bashplotlib.histogram import plot_hist


def _load_samples(path):
    """Return the JSON content of ``path``, closing the file afterwards."""
    # a context manager replaces the bare ``json.load(open(...))``, which
    # left the file handle open
    with open(path) as handle:
        return json.load(handle)


# Histogram of the lambda samples
data = _load_samples('lambdaSamples.json')
plot_hist(data, height=25, bincount=80, xlab=True, title="Lambda samples")

# Histogram of the mu samples
data = _load_samples('muSamples.json')
plot_hist(data, height=25, bincount=80, xlab=True, title="Mu samples")
def main():
    """Score how closely each test target sentence is duplicated in training data.

    All training and test target sentences are sorted together.  For every
    test sentence, the nearest training sentence on each side in sort order
    is tokenized and compared with ``cotra.utils.lcs``, normalised by the
    longer of the two token sequences.  The best score per test sentence is
    kept; scores above 0.8 are counted and their test indices printed.

    The scores are shown as a terminal histogram and written, one per line
    in test order, to ``args.output_file``.
    """
    args = Args()
    datasets = {
        # "train": read_pairs(args.train_file, tuple_separator=" ▁|SEP|▁ ", token_separator=" ", decode=True),
        # "train": read_pairs(args.train_file),
        "test": read_pairs(args.test_file),
    }

    # the training set is the concatenation of all comma-separated files
    train_src, train_tgt = [], []
    for train_path in args.train_files.split(","):
        cur_src, cur_tgt = read_pairs(train_path)
        train_src += cur_src
        train_tgt += cur_tgt
    datasets["train"] = (train_src, train_tgt)

    # (sentence, marker): marker is the test index, or -1 for training data
    merged_dataset = sorted([(tgt, idx if key == "test" else -1)
                             for key, (src_data, tgt_data) in datasets.items()
                             for idx, tgt in enumerate(tgt_data)])

    cnt = 0
    progress = tqdm(merged_dataset)
    overlap_ids = []
    overlap_scores = {}
    for idx, (test_sent, key) in enumerate(progress):
        if key == -1:
            continue

        # closest training sentences before and after this position
        left = next(
            (i for i in range(idx - 1, -1, -1) if merged_dataset[i][1] == -1),
            None)
        right = next((i for i in range(idx + 1, len(merged_dataset))
                      if merged_dataset[i][1] == -1), None)

        candidates = []
        # compare against None explicitly: index 0 is a valid neighbour but
        # is falsy, so a plain truth test silently dropped it
        if left is not None:
            candidates.append(tokenize(merged_dataset[left][0]))
        if right is not None:
            candidates.append(tokenize(merged_dataset[right][0]))
        if len(candidates) == 0:
            # no training data at all: drop into the debugger, then skip
            breakpoint()
            continue

        test_sent = tokenize(test_sent)
        max_overlap = max(
            cotra.utils.lcs(test_sent, cand) / max(len(test_sent), len(cand))
            for cand in candidates)
        if max_overlap > 0.8:
            cnt += 1
            progress.set_postfix(cnt=cnt)
            overlap_ids.append(key)
        overlap_scores[key] = max_overlap

    print(cnt)
    print(overlap_ids)

    # every test sentence is expected to have received a score
    if len(overlap_scores) != len(datasets["test"][0]):
        breakpoint()
    overlap_scores = [
        overlap_scores[idx] for idx in range(len(overlap_scores))
    ]
    plot_hist(overlap_scores,
              height=10,
              pch="x",
              xlab=True,
              showSummary=True,
              bincount=70)
    with open(args.output_file, "w") as f:
        f.write("\n".join(map(str, overlap_scores)))
def describe(clk_json):
    """Show the distribution of the CLKs' popcounts as an ASCII plot.

    Args:
        clk_json: open file object with JSON holding the serialized CLKs
            under the ``'clks'`` key.
    """
    serialized = json.load(clk_json)['clks']

    bitarrays = []
    for clk in serialized:
        bitarrays.append(deserialize_bitarray(clk))

    counts = get_encoding_popcounts(bitarrays)
    plot_hist(counts, bincount=60, title='popcounts', xlab=True, showSummary=True)
import matplotlib
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
from bashplotlib.histogram import plot_hist

# Connect to the Firebase realtime database with the service-account key
cred = credentials.Certificate("db-rix.json")
firebase_admin.initialize_app(cred)
ref = db.reference('/traffic_v2', url="https://<your_db_name>.firebaseio.com/")
data = ref.get()

# One upload / download reading per stored record
records = list(data.values())
up = [record['upload'] for record in records]
down = [record['download'] for record in records]

# Differences between consecutive readings
up_delta = [later - earlier for earlier, later in zip(up, up[1:])]
down_delta = [later - earlier for earlier, later in zip(down, down[1:])]
index = list(range(1, len(up)))

# Terminal histogram of the raw upload readings
plot_hist(up)
def main_work():
    """Check a '|'-delimited transcript and report length statistics.

    Reads the transcript given with ``-i``, derives the symbol sequence of
    each entry (lower-cased letters of the third field, or the
    whitespace-split fourth field with ``-phone``), optionally looks up the
    matching ``<base>.npy`` file in the ``-cmp`` directory to get a frame
    count, and prints the observed inventory, maxima and terminal
    histograms.  With ``-o`` the entries whose audio was found are written
    back out, one '|'-joined line each.

    NOTE(review): this function uses Python 2 print statements.
    """

    #################################################

    # ======== Get stuff from command line ==========

    a = ArgumentParser()
    a.add_argument('-i', dest='infile', required=True)
    a.add_argument('-cmp', dest='cmpdir', required=False, default='')
    a.add_argument('-maxframes', dest='maxframes', type=int, default=10000000)
    a.add_argument('-maxletters', dest='maxletters', type=int, default=10000000)
    a.add_argument(
        '-phone',
        action='store_true',
        help=
        'Use the 4th field of transcript, containing space-delimited phones')
    a.add_argument('-speaker', action='store_true')
    a.add_argument('-o', dest='outfile', required=False, default='')

    opts = a.parse_args()

    # ===============================================

    # Read the transcript, drop blank lines and split every line into fields
    texts = codecs.open(opts.infile, 'r', 'utf-8', errors='ignore').readlines()
    texts = [line.strip('\n\r |') for line in texts]
    texts = [t for t in texts if t != '']
    texts = [line.strip().split("|") for line in texts]

    # every line must have the same number of fields as the first one
    for line in texts:
        assert len(line) == len(texts[0]), line

    #print prompts

    # "all_*" lists count every entry seen; the plain lists only keep the
    # entries that pass the -maxframes / -maxletters limits
    all_frame_lengths = []
    all_letter_lengths = []
    frame_lengths = []
    letter_lengths = []

    outlines = []
    letter_inventory = {}
    speaker_ids = {}
    missing_audio = []
    for text in tqdm(texts):
        base, plaintext, normtext = text[:3]
        if opts.phone:
            assert len(text) >= 4
            symbols = text[3]
            symbols = re.split('\s+', symbols)
        else:
            symbols = list(normtext.lower())
        all_letter_lengths.append(len(symbols))

        if opts.speaker:
            assert len(text) >= 5
            speaker_id = text[4]
            # dict used as a set of observed speaker ids
            speaker_ids[speaker_id] = ''

        if opts.cmpdir:
            cmpfile = os.path.join(opts.cmpdir, base + '.npy')
            if not os.path.exists(cmpfile):
                missing_audio.append(base)
                continue
            # the loaded array must be 2-D; its first dimension is used as
            # the number of frames
            nframe_audio, ndim = np.load(cmpfile).shape
            #print base
            all_frame_lengths.append(nframe_audio)
            if nframe_audio > opts.maxframes:
                continue
            frame_lengths.append(nframe_audio)

        if len(symbols) > opts.maxletters:
            continue
        letter_lengths.append(len(symbols))

        # dict used as a set of observed symbols
        letter_inventory.update(dict(zip(symbols, symbols)))
        #print phones
        #outline = '%s||%s'%(base, normtext)
        #outlines.append(outline)

    # NOTE(review): nothing appends to `outlines` (the append above is
    # commented out), so the first number printed below is always 0.
    print
    print '%s of %s sentences processed successfully -- to debug rerun ....' % (
        len(outlines), len(texts))
    print
    print 'missing audio for :'
    print missing_audio
    print
    print
    print 'Keep %s of %s letter/phone tokens' % (len(letter_lengths),
                                                 len(all_letter_lengths))
    print 'Keep %s of %s frames' % (len(frame_lengths), len(all_frame_lengths))
    print

    if opts.phone:
        print '------------------'
        print 'Observed phones:'
        print '------------------'
        print
        print sorted([str(k) for k in letter_inventory.keys()])
        print '------------------'
        # junctures are the symbols wrapped in angle brackets
        junctures = [
            p for p in letter_inventory.keys()
            if p.startswith(u'<') and p.endswith(u'>')
        ]
        # list.remove raises ValueError if either marker was never observed
        junctures.remove('<_START_>')
        junctures.remove('<_END_>')
        junctures = [p.strip('<>') for p in junctures]
        junctures = [j for j in junctures if j != '']
        print
        print
        print 'Junctures (many of these may be spurious):'
        print
        print ' '.join(sorted(junctures))
        print
    else:
        print '------------------'
        print 'Observed letters:'
        print '------------------'
        print
        print[''.join(sorted(letter_inventory.keys()))]
        print '------------------'
    print
    print 'Letter/phone length max:'
    print max(letter_lengths)

    if opts.speaker:
        print '------------------'
        print 'Observed speakers:'
        print '------------------'
        print
        print sorted(speaker_ids.keys())
        print '------------------'

    if opts.cmpdir:
        print 'Frame length max:'
        print max(frame_lengths)

    ## 5% for testing:
    # outlines = np.array(outlines) ## for indexing with lists
    # n_train = int(len(outlines) * 0.95)
    # ixx = range(len(outlines))
    # random.seed(12345)
    # random.shuffle(ixx) ## TODO seeds everywhere
    # train_ixx = sorted(ixx[:n_train])
    # test_ixx = sorted(ixx[n_train:])

    #writelist(outlines, opts.outfile)
    # writelist(outlines[test_ixx], opts.outfile + '.test')

    # Terminal histograms of the kept lengths
    plot_hist(letter_lengths,
              xlab=True,
              bincount=30,
              title='Histogram of sentence length in letters',
              height=10,
              colour='k')

    if opts.cmpdir:
        plot_hist(frame_lengths,
                  xlab=True,
                  bincount=30,
                  title='Histogram of sentence length in frames',
                  height=10,
                  colour='k')

    print
    print 'Run again with -maxframes and -maxletters to exclude the tail...'

    # Write back every entry whose audio was found, fields re-joined with '|'
    if opts.outfile:
        f = open(opts.outfile, 'w')
        for text in tqdm(texts):
            base = text[0]
            if base not in missing_audio:
                f.write('|'.join(text) + '\n')
        f.close()
# scratch.py
# Scratch script: feed 99 random integers to plot_hist with every display
# option spelled out.
from bashplotlib.horizontal_histogram import plot_horiz_hist
from bashplotlib.histogram import plot_hist
import random

# 99 random integers between 1 and 100 (inclusive)
nums = [random.randint(1, 100) for _ in range(1, 100)]

# display options, passed positionally below in exactly this order
height = 20
bincount = 10
binwidth = 1
char = 'o'
color = 'default'
title = 'My Bar Chart'
displayXLabel = True
showSum = True
yStart = True

plot_hist(
    nums,
    height,
    bincount,
    binwidth,
    char,
    color,
    title,
    displayXLabel,
    showSum,
    yStart,
    xtitle="My x",
    ytitle="My y",
)
def display(self):
    """Show the knockout statistics as terminal plots.

    First a histogram of ``self.knockouts_hist``, then a scatter plot of
    the ``'Size'`` column against the ``'Fitness'`` column of
    ``self.solution.solutions``.
    """
    plot_hist(
        list(self.knockouts_hist),
        title="Knockout size distribution",
        colour="blue",
    )

    # plot_scatter receives its points as "x, y" text lines
    size_fitness_pairs = zip(self.solution.solutions['Size'],
                             self.solution.solutions['Fitness'])
    lines = ["%s, %s" % pair for pair in size_fitness_pairs]

    scatter_title = "Correlation between number of knockouts and fitness"
    plot_scatter(lines, None, None, 20, "*", "blue", scatter_title)
#!/usr/bin/env python3.6
"""Plot the distribution of injected bit flips and fault-injection indices.

For the tool selected with ``-t`` the per-run 'injection' and 'target'
files (one directory level below the current directory) are scanned and
the extracted numbers are shown as terminal histograms.
"""

from bashplotlib.histogram import plot_hist
import glob
import re
import sys
import argparse
import fi_tools


def _collect_values(file_pattern, regex):
    """Return group 1 of ``regex`` matched against each file under ``file_pattern``.

    Files whose content does not match are reported on stderr and skipped
    instead of failing with a TypeError on the missing match object.
    """
    values = []
    for path in glob.glob(file_pattern):
        # separate names for the path and the handle: the loop variable used
        # to be rebound by ``with ... as f``
        with open(path, 'r') as handle:
            m = regex.match(handle.read())
        if m is None:
            print('warning: no match for %s in %s' % (regex.pattern, path), file=sys.stderr)
            continue
        values.append(m[1])
    return values


parser = argparse.ArgumentParser('Run profiling experiments')
parser.add_argument('-t', '--tool', help='tool to run', required=True)
args = parser.parse_args()

# Raw strings keep ``\d`` a regex escape rather than an invalid string escape.
bitflips = _collect_values('*/' + fi_tools.files[args.tool]['injection'],
                           re.compile(r'.*bitflip=(\d+).*'))
print(bitflips)
plot_hist(bitflips, title='Distro bitflip', bincount=128, xlab=True)

fi_index = _collect_values('*/' + fi_tools.files[args.tool]['target'],
                           re.compile(r'.*fi_index=(\d+).*'))
plot_hist(fi_index, title='Distro fi_index', bincount=128, xlab=True)