def sort(columniter):
    with status('Determining sort order'):
        info = datasets.source.columns
        if sum(info[column].type not in nononehandling_types for column in options.sort_columns):
            # At least one sort column can have unsortable values
            first = True
            iters = []
            for column in options.sort_columns:
                it = columniter(column, status_reporting=first)
                first = False
                if info[column].type not in nononehandling_types:
                    it = filter_unsortable(column, it)
                iters.append(it)
            if len(iters) == 1:
                # Special case to not make tuples when there is only one column.
                lst = list(iters[0])
            else:
                lst = list(izip(*iters))
        else:
            columns = options.sort_columns
            if len(columns) == 1:
                # Special case to not make tuples when there is only one column.
                columns = columns[0]
            lst = list(columniter(columns))
        reverse = (options.sort_order == 'descending')
    with status('Creating sort list'):
        return sorted(range(len(lst)), key=lst.__getitem__, reverse=reverse)
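# --- Illustrative sketch (not part of the original code) ---
# sort() above returns a permutation (a list of row indices), not the sorted
# values: sorted(range(len(lst)), key=lst.__getitem__) orders the indices by
# the values they point at. A minimal, self-contained demonstration:
values = ['b', 'c', 'a']
order = sorted(range(len(values)), key=values.__getitem__)
assert order == [2, 0, 1]                        # index of 'a', then 'b', then 'c'
assert [values[i] for i in order] == ['a', 'b', 'c']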
def analysis(sliceno, prepare_res):
    write = prepare_res[0].write_list
    ix = prepare_res[1]
    d = datasets.source
    to_copy = d.lines[sliceno]
    if to_copy == 0:
        # bail out empty slices right away
        return
    to_skip = sum(d.lines[:sliceno])
    if to_skip:
        it = d.iterate('roundrobin', slice=to_skip - bool(options.trigger_column))
        if options.trigger_column:
            trigger_v = next(it)[ix]
            # keep skipping until trigger value changes
            for v in it:
                to_copy -= 1
                if v[ix] != trigger_v:
                    write(v)
                    break
            if to_copy == 0:
                return  # no lines left for this slice
    else:
        it = d.iterate('roundrobin')
    # write the lines belonging here
    # (zip so we don't have to count down to_copy manually)
    for _, v in izip(range(to_copy), it):
        write(v)
    if options.trigger_column:
        trigger_v = v[ix]
        # keep copying until trigger value changes or lines run out
        for v in it:
            if trigger_v != v[ix]:
                break
            write(v)
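# --- Illustrative sketch (not part of the original code) ---
# A minimal model of the skip/copy arithmetic in analysis() above: the full
# roundrobin-ordered stream is split into contiguous per-slice blocks by
# skipping the lines of earlier slices and copying this slice's share.
# Names (roundrobin_lines, lines_per_slice) are invented for the example and
# trigger_column handling is not modelled.
def split_roundrobin(roundrobin_lines, lines_per_slice, sliceno):
    to_skip = sum(lines_per_slice[:sliceno])   # lines belonging to earlier slices
    to_copy = lines_per_slice[sliceno]         # lines belonging to this slice
    return roundrobin_lines[to_skip:to_skip + to_copy]

# 7 lines distributed over 3 slices as [3, 2, 2]; slice 1 gets lines 3-4.
assert split_roundrobin(list('abcdefg'), [3, 2, 2], 1) == ['d', 'e']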
def analysis(sliceno, prepare_res):
    writers, columns, chain = prepare_res
    key_it = chain.iterate(sliceno, options.column)
    # we can't just use chain.iterate because of protections against changing types with copy_mode
    values_it = itertools.chain.from_iterable(
        ds.iterate(sliceno, columns, copy_mode=True, status_reporting=False)
        for ds in chain
    )
    for key, values in izip(key_it, values_it):
        writers[unicode(key)].write(*values)
def __init__(self, slices):
    slices = range(slices)
    self._slices = iter(slices)
    tuple_len = pickle_load("Analysis.tuple")
    if tuple_len is False:
        # analysis() returned a single value per slice
        self._is_tupled = False
    else:
        # analysis() returned a tuple per slice: build one lazy loader per
        # tuple position and zip them back into per-slice tuples.
        self._is_tupled = True
        self._loaders = [self._loader(ix, iter(slices)) for ix in range(tuple_len)]
        self._tupled = izip(*self._loaders)
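# --- Illustrative sketch (not part of the original code) ---
# The tupled branch above builds one lazy loader per position of the tuple
# that analysis() returned, then zips them back into per-slice tuples. The
# same shape with plain lists instead of pickle-backed loaders (all names
# here are invented for the example):
per_slice_results = [(1, 'a'), (2, 'b'), (3, 'c')]   # one tuple per slice
def loader(ix):
    # lazily yield position ix of every slice's result
    return (t[ix] for t in per_slice_results)
loaders = [loader(ix) for ix in range(2)]
tupled = zip(*loaders)
assert next(tupled) == (1, 'a')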
def csvexport(sliceno, filename, labelsonfirstline):
    assert len(options.separator) == 1
    assert options.quote_fields in ('', "'", '"',)
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobids.previous:
            prev_source = job_params(jobids.previous).datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        mkwrite = mkwrite_gz
    elif filename.lower().endswith('.csv'):
        mkwrite = mkwrite_uncompressed
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    iters = []
    first = True
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        t = d.columns[label].type
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
        elif t in ('float32', 'float64', 'number'):
            it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with mkwrite(filename) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q
            if labelsonfirstline:
                write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))
def sort(columniter):
    with status('Determining sort order'):
        info = datasets.source.columns
        special_handling = set()
        for column in options.sort_columns:
            if info[column].type.startswith('float') or info[column].type == 'number':  # for NaN
                special_handling.add(column)
            if info[column].none_support:
                special_handling.add(column)
        if special_handling:
            # At least one sort column can have unsortable values
            first = True
            iters = []
            for column in options.sort_columns:
                it = columniter(column, status_reporting=first)
                first = False
                if column in special_handling:
                    it = filter_unsortable(column, it)
                iters.append(it)
            if len(iters) == 1:
                # Special case to not make tuples when there is only one column.
                lst = list(iters[0])
            else:
                lst = list(izip(*iters))
        else:
            columns = options.sort_columns
            if len(columns) == 1:
                # Special case to not make tuples when there is only one column.
                columns = columns[0]
            lst = list(columniter(columns))
        if options.trigger_column:
            if len(options.sort_columns) == 1:
                sort_extra = lst
            else:
                with status('Creating trigger list'):
                    ix = options.sort_columns.index(options.trigger_column)
                    sort_extra = [el[ix] for el in lst]
        else:
            sort_extra = None
        reverse = (options.sort_order == 'descending')
    with status('Creating sort list'):
        return sorted(range(len(lst)), key=lst.__getitem__, reverse=reverse), sort_extra
def grep(ds, sliceno):
    def no_conv(v):
        return v

    def mk_conv(col):
        if ds.columns[col].type in ('bytes', 'unicode', 'ascii',):
            if not ds.columns[col].none_support:
                return no_conv
        return unicode

    chk = pat_s.search

    def mk_iter(col):
        if ds.columns[col].type == 'ascii':
            it = ds._column_iterator(sliceno, col, _type='unicode')
        else:
            it = ds._column_iterator(sliceno, col)
        if ds.columns[col].type == 'bytes':
            errors = 'replace' if PY2 else 'surrogateescape'
            if ds.columns[col].none_support:
                it = (None if v is None else v.decode('utf-8', errors) for v in it)
            else:
                it = (v.decode('utf-8', errors) for v in it)
        return it

    def colour_item(item):
        pos = 0
        parts = []
        for m in pat_s.finditer(item):
            a, b = m.span()
            parts.extend((item[pos:a], colour.red(item[a:b])))
            pos = b
        parts.append(item[pos:])
        return ''.join(parts)

    if args.format == 'json':
        prefix = {}
        dumps = json.JSONEncoder(ensure_ascii=False, default=json_default).encode
        if args.show_dataset:
            prefix['dataset'] = ds
        if args.show_sliceno:
            prefix['sliceno'] = sliceno

        def show():
            d = dict(zip(used_columns, items))
            if args.show_lineno:
                prefix['lineno'] = lineno
            if prefix:
                prefix['data'] = d
                d = prefix
            return dumps(d).encode('utf-8', 'surrogatepass')
    else:
        prefix = []
        if args.show_dataset:
            prefix.append(ds)
        if args.show_sliceno:
            prefix.append(str(sliceno))
        prefix = tuple(prefix)

        def show():
            data = list(prefix)
            if args.show_lineno:
                data.append(unicode(lineno))
            if PY2:
                show_items = (v if isinstance(v, unicode) else str(v).decode('utf-8', 'replace') for v in items)
            else:
                show_items = map(str, items)
            show_items = list(show_items)
            lens = (len(item) for item in data + show_items)
            if highlight_matches:
                show_items = list(map(colour_item, show_items))
            if escape_item:
                lens_unesc = (len(item) for item in data + show_items)
                show_items = list(map(escape_item, show_items))
                lens_esc = (len(item) for item in data + show_items)
                lens = (l + esc - unesc for l, unesc, esc in zip(lens, lens_unesc, lens_esc))
            data.extend(show_items)
            return separate(data, lens).encode('utf-8', errors)

    used_columns = columns or sorted(ds.columns)
    if grep_columns and grep_columns != set(used_columns):
        grep_iter = izip(*(mk_iter(col) for col in grep_columns))
        conv_items = [mk_conv(col) for col in grep_columns]
    else:
        grep_iter = repeat(None)
        conv_items = [mk_conv(col) for col in used_columns]
    lines_iter = izip(*(mk_iter(col) for col in used_columns))
    for lineno, (grep_items, items) in enumerate(izip(grep_iter, lines_iter)):
        if any(chk(conv(item)) for conv, item in izip(conv_items, grep_items or items)):
            # This will be atomic if the line is not too long
            # (at least up to PIPE_BUF bytes, should be at least 512).
            write(1, show() + b'\n')
def csvexport(sliceno, filename, labelsonfirstline):
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobs.previous:
            prev_source = jobs.previous.params.datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        open_func = partial(gzip.open, compresslevel=options.compression)
    elif filename.lower().endswith('.csv'):
        open_func = open
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    if PY2:
        open_func = partial(open_func, mode='wb')
    else:
        open_func = partial(open_func, mode='xt', encoding='utf-8')
    iters = []
    first = True
    dumps = JSONEncoder(
        sort_keys=True,
        ensure_ascii=True,
        check_circular=False,
    ).encode
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        t = d.columns[label].type
        if d.columns[label].none_support:
            if t == 'bytes' or (PY2 and t == 'ascii'):
                it = imap(nonefix_b, it)
            elif t in ('ascii', 'unicode',):
                it = imap(nonefix_u, it)
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
        elif t in ('float32', 'float64',):
            it = imap(repr, it)
        elif t == 'number':
            if PY2:
                it = imap(lambda n: str(n) if isinstance(n, long) else repr(n), it)
            else:
                it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with writer(open_func(filename)) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q
            if labelsonfirstline:
                write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))
def csvexport(sliceno, filename, labelsonfirstline):
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobs.previous:
            prev_source = jobs.previous.params.datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if options.filename.lower().endswith('.gz') or '.gz.' in options.filename.lower():
        open_func = partial(gzip.open, compresslevel=options.compression)
    else:
        open_func = open
    if PY2:
        open_func = partial(open_func, mode='wb')
    else:
        open_func = partial(open_func, mode='xt', encoding='utf-8')
    if options.none_as:
        if isinstance(options.none_as, dict):
            bad_none = set(options.none_as) - set(options.labels)
            assert not bad_none, 'Unknown labels in none_as: %r' % (bad_none,)
        else:
            assert isinstance(options.none_as, str), "What did you pass as none_as?"

    def resolve_none(label, col):
        d = options.none_as or {}
        if col.type in ('json', 'pickle',):
            if isinstance(options.none_as, str):
                return options.none_as
            return d.get(label)
        elif col.none_support:
            if isinstance(options.none_as, str):
                return options.none_as
            return d.get(label, 'None')

    q = options.quote_fields
    qq = q + q
    sep = options.separator

    def quote_always(v):
        return q + v.replace(q, qq) + q

    if q in '"\'':  # special case so both quotes will quote the other
        def quote_if_needed(v):
            if v and (v[0] in '"\'' or v[-1] in '"\'' or sep in v):
                return q + v.replace(q, qq) + q
            else:
                return v
    else:
        def quote_if_needed(v):
            if v.startswith(q) or v.endswith(q) or sep in v:
                return q + v.replace(q, qq) + q
            else:
                return v

    if not q:
        quote_func = str
    elif options.lazy_quotes and sep:  # always quote if no separator
        quote_func = quote_if_needed
    else:
        quote_func = quote_always

    def needs_quoting(typ):
        if not q:
            return False
        if not options.lazy_quotes:
            return True
        # maybe we can skip quoting because values that need quoting are impossible?
        if typ in ('int32', 'int64', 'bits32', 'bits64',):
            possible = '0123456789-'
        elif typ in ('float32', 'float64', 'number',):
            possible = '0123456789-+einfa.'
        else:
            possible = False
        if possible:
            q_s = set(q)
            sep_s = set(sep)
            possible_s = set(possible)
            if q_s - possible_s and sep_s - possible_s:
                return False
        return True

    def column_iterator(d, label, first):
        col = d.columns[label]
        f = format.get(col.type, str)
        it = d.iterate(sliceno, label, status_reporting=first)
        none_as = resolve_none(label, col)
        if none_as is not None:
            none_as = quote_func(none_as)
            if needs_quoting(col.type):
                if f:
                    it = (none_as if v is None else quote_func(f(v)) for v in it)
                else:
                    it = (none_as if v is None else quote_func(v) for v in it)
            else:
                if f:
                    it = (none_as if v is None else f(v) for v in it)
                else:
                    it = (none_as if v is None else v for v in it)
        elif f:
            if needs_quoting(col.type):
                it = (quote_func(f(v)) for v in it)
            else:
                it = imap(f, it)
        elif needs_quoting(col.type):
            it = imap(quote_func, it)
        return it

    def outer_iterator(label, first):
        return chain.from_iterable(
            column_iterator(d, label, first) for d in datasets.source
        )

    iters = []
    first = True
    for label in options.labels:
        iters.append(outer_iterator(label, first))
        first = False
    it = izip(*iters)
    with writer(open_func(filename)) as write:
        if labelsonfirstline:
            write(enc(sep.join(map(quote_func, options.labels))))
        for data in it:
            write(sep.join(data))
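# --- Illustrative sketch (not part of the original code) ---
# The needs_quoting() shortcut above relies on some column types only ever
# rendering to a fixed character set. The same check in isolation, with
# made-up separator and quote characters:
int_chars = set('0123456789-')
sep, quote = ',', '"'
# Neither the separator nor the quote can appear in an integer's text,
# so integer columns can be written unquoted even with lazy_quotes enabled.
assert not (set(sep) & int_chars) and not (set(quote) & int_chars)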
def grep(ds, sliceno, out):
    out.start(ds)
    if len(patterns) == 1:
        chk = patterns[0].search
    else:
        def chk(s):
            return any(p.search(s) for p in patterns)
    first = [True]

    def mk_iter(col):
        kw = {}
        if first[0]:
            first[0] = False
            lines = ds.lines[sliceno]
            if lines > status_interval[sliceno]:
                def cb(n):
                    q_status.put((sliceno, False))
                    out.excite()
                kw['callback'] = cb
                kw['callback_interval'] = status_interval[sliceno]
        if ds.columns[col].type == 'ascii':
            kw['_type'] = 'unicode'
        it = ds._column_iterator(sliceno, col, **kw)
        if ds.columns[col].type == 'bytes':
            errors = 'replace' if PY2 else 'surrogateescape'
            if ds.columns[col].none_support:
                it = (None if v is None else v.decode('utf-8', errors) for v in it)
            else:
                it = (v.decode('utf-8', errors) for v in it)
        return it

    used_columns = columns_for_ds(ds)
    used_grep_columns = grep_columns and columns_for_ds(ds, grep_columns)
    if grep_columns and set(used_grep_columns) != set(used_columns):
        grep_iter = izip(*(mk_iter(col) for col in used_grep_columns))
    else:
        grep_iter = repeat(None)
    lines_iter = izip(*(mk_iter(col) for col in used_columns))
    if args.before_context:
        before = deque((), args.before_context)
    else:
        before = None
    if args.format == 'json':
        prefix = {}
        if args.show_dataset:
            prefix['dataset'] = ds
        if args.show_sliceno:
            prefix['sliceno'] = sliceno
        show = make_show(prefix, used_columns)
    else:
        prefix = []
        if args.show_dataset:
            prefix.append(ds)
        if args.show_sliceno:
            prefix.append(str(sliceno))
        prefix = tuple(prefix)
        show = make_show(prefix, used_columns)
    if args.invert_match:
        maybe_invert = operator.not_
    else:
        maybe_invert = bool
    to_show = 0
    for lineno, (grep_items, items) in enumerate(izip(grep_iter, lines_iter)):
        if maybe_invert(any(chk(unicode(item)) for item in grep_items or items)):
            if q_list:
                q_list.put((ds, sliceno))
                return
            while before:
                out.put(show(*before.popleft()))
            to_show = 1 + args.after_context
        if to_show:
            out.put(show(lineno, items))
            to_show -= 1
        elif before is not None:
            before.append((lineno, items))
    out.end(ds)
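# --- Illustrative sketch (not part of the original code) ---
# before-context in grep() above is a bounded deque: non-matching lines are
# appended and silently fall off the left end, so when a match arrives the
# deque holds exactly the last N lines. Standalone demonstration:
from collections import deque
before = deque((), 2)                 # keep at most 2 lines of context
for line in ['x', 'y', 'z', 'HIT']:
    if line == 'HIT':
        assert list(before) == ['y', 'z']
        break
    before.append(line)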
def grep(ds, sliceno):
    # Use bytes for everything if anything is bytes, str otherwise. (For speed.)
    if any(ds.columns[col].backing_type == 'bytes' for col in (grep_columns or columns or ds.columns)):
        def strbytes(v):
            return str(v).encode('utf-8', 'replace')

        def mk_iter(col):
            if ds.columns[col].backing_type in ('bytes', 'unicode', 'ascii',):
                return ds._column_iterator(sliceno, col, _type='bytes')
            else:
                return imap(strbytes, ds._column_iterator(sliceno, col))
        chk = pat_b.search
    else:
        def mk_iter(col):
            if ds.columns[col].backing_type in ('unicode', 'ascii',):
                return ds._column_iterator(sliceno, col, _type='unicode')
            else:
                return imap(str, ds._column_iterator(sliceno, col))
        chk = pat_s.search

    def fmt(v):
        if not isinstance(v, (unicode, bytes)):
            v = str(v)
        if isinstance(v, unicode):
            v = v.encode('utf-8', 'replace')
        return v

    def color(item):
        pos = 0
        parts = []
        for m in pat_b.finditer(item):
            a, b = m.span()
            parts.extend((item[pos:a], b'\x1b[31m', item[a:b], b'\x1b[m'))
            pos = b
        parts.append(item[pos:])
        return b''.join(parts)

    prefix = []
    if args.show_dataset:
        prefix.append(ds.encode('utf-8'))
    if args.show_sliceno:
        prefix.append(str(sliceno).encode('utf-8'))
    prefix = tuple(prefix)

    def show(prefix, items):
        items = map(fmt, items)
        if args.color:
            items = map(color, items)
        # This will be atomic if the line is not too long
        # (at least up to PIPE_BUF bytes, should be at least 512).
        write(1, separator_b.join(prefix + tuple(items)) + b'\n')

    if grep_columns and grep_columns != set(columns or ds.columns):
        grep_iter = izip(*(mk_iter(col) for col in grep_columns))
        lines_iter = ds.iterate(sliceno, columns)
    else:
        grep_iter = repeat(None)
        lines_iter = izip(*(mk_iter(col) for col in (columns or sorted(ds.columns))))
    lines = izip(grep_iter, lines_iter)
    if args.show_lineno:
        for lineno, (grep_items, items) in enumerate(lines):
            if any(imap(chk, grep_items or items)):
                show(prefix + (str(lineno).encode('utf-8'),), items)
    else:
        for grep_items, items in lines:
            if any(imap(chk, grep_items or items)):
                show(prefix, items)