def column_iterator(d, label, first):
    # Build a per-column iterator: apply any type-specific formatter,
    # substitute the configured replacement for None, and quote where needed.
    col = d.columns[label]
    f = format.get(col.type, str)
    it = d.iterate(sliceno, label, status_reporting=first)
    none_as = resolve_none(label, col)
    if none_as is not None:
        none_as = quote_func(none_as)
        if needs_quoting(col.type):
            if f:
                it = (none_as if v is None else quote_func(f(v)) for v in it)
            else:
                it = (none_as if v is None else quote_func(v) for v in it)
        else:
            if f:
                it = (none_as if v is None else f(v) for v in it)
            else:
                it = (none_as if v is None else v for v in it)
    elif f:
        if needs_quoting(col.type):
            it = (quote_func(f(v)) for v in it)
        else:
            it = imap(f, it)
    elif needs_quoting(col.type):
        it = imap(quote_func, it)
    return it
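These snippets rely on Python 2/3 compatibility aliases (imap, izip, unicode, PY2, PY3) that are imported elsewhere in their modules. A minimal sketch of such a shim, using only the standard library (names and layout here are illustrative assumptions, not the project's actual compat module):

# Illustrative compat shim (assumption, not the project's actual module):
# provides lazy imap/izip and a unicode alias on both Python 2 and 3.
import sys

PY2 = sys.version_info[0] == 2
PY3 = not PY2

if PY2:
    from itertools import imap, izip  # lazy map/zip on Python 2
else:
    imap = map      # Python 3's map is already lazy
    izip = zip      # Python 3's zip is already lazy
    unicode = str   # type alias used in isinstance() checks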
def csvexport(sliceno, filename, labelsonfirstline):
    assert len(options.separator) == 1
    assert options.quote_fields in ('', "'", '"',)
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobids.previous:
            prev_source = job_params(jobids.previous).datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        mkwrite = mkwrite_gz
    elif filename.lower().endswith('.csv'):
        mkwrite = mkwrite_uncompressed
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    iters = []
    first = True
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        t = d.columns[label].type
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
        elif t in ('float32', 'float64', 'number'):
            it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with mkwrite(filename) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q
            if labelsonfirstline:
                write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))
def mk_iter(col):
    if ds.columns[col].backing_type in ('unicode', 'ascii',):
        return ds._column_iterator(sliceno, col, _type='unicode')
    else:
        return imap(str, ds._column_iterator(sliceno, col))
def synthesis(slices, analysis_res, prepare_res):
    dw, dws, lines, _, column2type, columns, rev_rename = prepare_res
    analysis_res = list(analysis_res)
    if options.filter_bad:
        # Report how many lines were filtered out, per slice and per column.
        bad_line_count_per_slice = [sum(data[1]) for data in analysis_res]
        lines = [num - b for num, b in zip(lines, bad_line_count_per_slice)]
        bad_line_count_total = sum(bad_line_count_per_slice)
        if bad_line_count_total:
            print('Slice Bad line count')
            for sliceno, cnt in enumerate(bad_line_count_per_slice):
                print('%5d %d' % (sliceno, cnt,))
            print('total %d' % (bad_line_count_total,))
            print()
            print('Slice Bad line number')
            reported_count = 0
            for sliceno, data in enumerate(analysis_res):
                if sum(data[1]) and reported_count < 32:
                    with open('badmap%d' % (sliceno,), 'rb') as fh:
                        badmap = mmap(fh.fileno(), 0, prot=PROT_READ)
                        for ix, v in enumerate(imap(ord, badmap)):
                            if v:
                                # Each set bit in the badmap marks one bad line.
                                for jx in range(8):
                                    if v & (1 << jx):
                                        print('%5d %d' % (sliceno, ix * 8 + jx,))
                                        reported_count += 1
                                        if reported_count >= 32:
                                            break
                                if reported_count >= 32:
                                    break
                        badmap.close()
            if reported_count >= 32:
                print('...')
            print()
            print('Bad line count Column')
            for colname in columns:
                cnt = sum(sum(data[0].get(colname, ())) for data in analysis_res)
                if cnt:
                    print('%14d %s' % (cnt, colname,))
            print()
        for sliceno in range(slices):
            unlink('badmap%d' % (sliceno,))
    if options.defaults and sum(sum(data[2].values()) for data in analysis_res):
        # Report lines where a default value was substituted.
        print('Defaulted values')
        for colname in sorted(options.defaults):
            defaulted = [data[2][colname] for data in analysis_res]
            if sum(defaulted):
                print(' %s:' % (colname,))
                print(' Slice Defaulted line count')
                slicecnt = 0
                for sliceno, cnt in enumerate(defaulted):
                    if cnt:
                        print(' %5d %d' % (sliceno, cnt,))
                        slicecnt += 1
                if slicecnt > 1:
                    print(' total %d' % (sum(defaulted),))
    if dws:  # rehashing
        if dw:  # not as a chain
            # Concatenate the per-slice rehashed writers into the final dataset.
            final_bad_count = [data[1] for data in analysis_res]
            hash_lines = [data[4] for data in analysis_res]
            for colname in dw.columns:
                for sliceno in range(slices):
                    out_fn = dw.column_filename(colname, sliceno=sliceno)
                    with open(out_fn, 'wb') as out_fh:
                        for s in range(slices):
                            if hash_lines[s][sliceno] - final_bad_count[s][sliceno]:
                                src_fn = dws[s].column_filename(colname, sliceno=sliceno)
                                with open(src_fn, 'rb') as in_fh:
                                    copyfileobj(in_fh, out_fh)
            for sliced_dw in dws:
                if sliced_dw:
                    sliced_dw.discard()
            for sliceno, counts in enumerate(zip(*[data[4] for data in analysis_res])):
                bad_counts = (data[1][sliceno] for data in analysis_res)
                dw.set_lines(sliceno, sum(counts) - sum(bad_counts))
            for sliceno, data in enumerate(analysis_res):
                dw.set_minmax(sliceno, data[3])
        else:
            for sliceno, data in enumerate(analysis_res):
                if dws[sliceno]:
                    dws[sliceno].set_minmax(-1, data[3])
                    for s, count in enumerate(data[4]):
                        dws[sliceno].set_lines(s, count - data[1][s])
    else:
        for sliceno, count in enumerate(lines):
            dw.set_lines(sliceno, count)
        for sliceno, data in enumerate(analysis_res):
            dw.set_minmax(sliceno, data[3])
    used = {rev_rename.get(colname, colname) for colname in column2type}
    discarded = set(datasets.source.columns) - used
    if discarded:
        print('Discarded columns:')
        template = ' %%-%ds %%s' % (max(len(colname) for colname in discarded),)
        for colname in discarded:
            print(template % (colname, datasets.source.columns[colname].type,))
def csvexport(sliceno, filename, labelsonfirstline):
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobs.previous:
            prev_source = jobs.previous.params.datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        open_func = partial(gzip.open, compresslevel=options.compression)
    elif filename.lower().endswith('.csv'):
        open_func = open
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    if PY2:
        open_func = partial(open_func, mode='wb')
    else:
        open_func = partial(open_func, mode='xt', encoding='utf-8')
    iters = []
    first = True
    dumps = JSONEncoder(
        sort_keys=True,
        ensure_ascii=True,
        check_circular=False,
    ).encode
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        t = d.columns[label].type
        if d.columns[label].none_support:
            if t == 'bytes' or (PY2 and t == 'ascii'):
                it = imap(nonefix_b, it)
            elif t in ('ascii', 'unicode',):
                it = imap(nonefix_u, it)
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
        elif t in ('float32', 'float64',):
            it = imap(repr, it)
        elif t == 'number':
            if PY2:
                it = imap(lambda n: str(n) if isinstance(n, long) else repr(n), it)
            else:
                it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with writer(open_func(filename)) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q
            if labelsonfirstline:
                write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))
def analysis(sliceno):
    chain = datasets.source.chain(stop_ds={jobs.previous: 'source'}, length=options.length)
    return set(imap(unicode, chain.iterate(sliceno, options.column)))
def grep(ds, sliceno):
    # Use bytes for everything if anything is bytes, str otherwise. (For speed.)
    if any(ds.columns[col].backing_type == 'bytes' for col in (grep_columns or columns or ds.columns)):
        def strbytes(v):
            return str(v).encode('utf-8', 'replace')
        def mk_iter(col):
            if ds.columns[col].backing_type in ('bytes', 'unicode', 'ascii',):
                return ds._column_iterator(sliceno, col, _type='bytes')
            else:
                return imap(strbytes, ds._column_iterator(sliceno, col))
        chk = pat_b.search
    else:
        def mk_iter(col):
            if ds.columns[col].backing_type in ('unicode', 'ascii',):
                return ds._column_iterator(sliceno, col, _type='unicode')
            else:
                return imap(str, ds._column_iterator(sliceno, col))
        chk = pat_s.search
    def fmt(v):
        if not isinstance(v, (unicode, bytes)):
            v = str(v)
        if isinstance(v, unicode):
            v = v.encode('utf-8', 'replace')
        return v
    def color(item):
        pos = 0
        parts = []
        for m in pat_b.finditer(item):
            a, b = m.span()
            parts.extend((item[pos:a], b'\x1b[31m', item[a:b], b'\x1b[m'))
            pos = b
        parts.append(item[pos:])
        return b''.join(parts)
    prefix = []
    if args.show_dataset:
        prefix.append(ds.encode('utf-8'))
    if args.show_sliceno:
        prefix.append(str(sliceno).encode('utf-8'))
    prefix = tuple(prefix)
    def show(prefix, items):
        items = map(fmt, items)
        if args.color:
            items = map(color, items)
        # This will be atomic if the line is not too long
        # (at least up to PIPE_BUF bytes, should be at least 512).
        write(1, separator_b.join(prefix + tuple(items)) + b'\n')
    if grep_columns and grep_columns != set(columns or ds.columns):
        grep_iter = izip(*(mk_iter(col) for col in grep_columns))
        lines_iter = ds.iterate(sliceno, columns)
    else:
        grep_iter = repeat(None)
        lines_iter = izip(*(mk_iter(col) for col in (columns or sorted(ds.columns))))
    lines = izip(grep_iter, lines_iter)
    if args.show_lineno:
        for lineno, (grep_items, items) in enumerate(lines):
            if any(imap(chk, grep_items or items)):
                show(prefix + (str(lineno).encode('utf-8'),), items)
    else:
        for grep_items, items in lines:
            if any(imap(chk, grep_items or items)):
                show(prefix, items)