Esempio n. 1
0
def main(argv, cfg):
    """Show setup.json, dataset list, output or path for the given jobs.

    Args:
        argv: Argument list. The first element is removed and used as the
            program name; the remainder is parsed as options and job specs.
        cfg: Configuration object. It is handed to name2job, and its ``url``
            attribute is handed to show.

    Returns:
        0 if every job spec could be shown, 1 if at least one of them failed.

    Raises:
        OSError: Re-raised unchanged when its errno is EPIPE, so that a
            closed output pipe is not reported as a failed job.
    """
    parser = ArgumentParser(
        prog=argv.pop(0),
        description='show setup.json, dataset list, etc for jobs',
        formatter_class=RawTextHelpFormatter,
    )
    # The three output modes cannot be combined.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-o', '--output', action='store_true',
                       help='show job output')
    group.add_argument('-O', '--just-output', action='store_true',
                       help='show only job output')
    group.add_argument('-P', '--just-path', action='store_true',
                       help='show only job path')
    parser.add_argument(
        'jobid',
        nargs='+',
        metavar='jobid/jobspec',
        help=(
            'jobid is just a jobid.\n'
            'you can also use path, method or :urdlist:[entry].\n'
            'path is to a jobdir (with setup.json in it).\n'
            'method is the latest (current) job with that method (i.e\n'
            'the latest finished job with current source code).\n'
            ':urdlist:[entry] looks up jobs in urd. details are in the\n'
            'urd help, except here entry defaults to -1 and you can\'t\n'
            'list things (no .../ or .../since/x).\n'
            'you can use spec~ or spec~N to go back N current jobs\n'
            'with that method or spec^ or spec^N to follow .previous'
        ),
    )
    args = parser.parse_intermixed_args(argv)

    res = 0
    for path in args.jobid:
        try:
            job = name2job(cfg, path)
            if args.just_output:
                out = job.output()
                if out:
                    # Make sure the output ends with exactly one newline
                    # without adding a second one.
                    print(out, end='' if out.endswith('\n') else '\n')
            elif args.just_path:
                print(job.path)
            else:
                show(cfg.url, job, args.output)
        except JobNotFound as e:
            # Diagnostics belong on stderr (like the generic handler below),
            # so stdout only ever carries job data.
            print(e, file=sys.stderr)
            res = 1
        except Exception as e:
            # A closed pipe is not a problem with this job; let it propagate.
            if isinstance(e, OSError) and e.errno == errno.EPIPE:
                raise
            print_exc(file=sys.stderr)
            print("Failed to show %r" % (path, ), file=sys.stderr)
            res = 1
    return res
Esempio n. 2
0
def main(argv, cfg):
    """List build scripts found in the configured method packages.

    Args:
        argv: Argument list. The first element is removed and used as the
            program name; the rest is parsed as options and match substrings.
        cfg: Configuration object; its ``method_directories`` attribute is
            iterated as importable package names.

    Scripts are files named build.py or build_*.py in each package
    directory. Only scripts whose dotted module name contains at least one
    of the match substrings are imported and listed. Import failures are
    reported on stderr and the script is skipped.
    """
    parser = ArgumentParser(
        prog=argv.pop(0),
        description="lists and describes build scripts",
    )
    parser.add_argument(
        '-s', '--short', action='store_true', help='short listing')
    parser.add_argument(
        '-p', '--path', action='store_true', help='show package paths')
    parser.add_argument(
        'match', nargs='*', default=[], help='substring used for matching')
    args = parser.parse_intermixed_args(argv)
    columns = terminal_size().columns

    if not args.match:
        # Without any pattern everything matches, shown in short format.
        args.match = ['']
        args.short = True

    packages = []
    for package in cfg.method_directories:
        path = dirname(import_module(package).__file__)
        found = glob(path + '/build.py') + glob(path + '/build_*.py')
        scripts = []
        for item in sorted(found):
            # Strip the ".py" suffix before taking the file name.
            name = basename(item[:-3])
            modname = '.'.join((package, name))
            if not any(m in modname for m in args.match):
                continue
            try:
                module = import_module(modname)
            except Exception as e:
                print('%s%s: %s%s' % (colour.RED, item, e, colour.RESET),
                      file=sys.stderr)
                continue
            scripts.append((name, getattr(module, 'description', '')))
        packages.append((package, path, scripts))

    for package, path, scripts in sorted(packages):
        if not scripts:
            continue
        print(path + '/' if args.path else package)
        printdesc(sorted(scripts), columns, full=not args.short)
Esempio n. 3
0
def main(argv, cfg):
    """Show setup.json, dataset list, output or path for the given jobs.

    Args:
        argv: Argument list. The first element is removed and used as the
            program name; the remainder is parsed as options and job specs.
        cfg: Configuration object. It is handed to name2job, and its ``url``
            attribute is handed to show.

    Returns:
        0 if every job spec could be shown, 1 if at least one of them failed.

    Raises:
        IOError: Re-raised unchanged when its errno is EPIPE, so that a
            closed output pipe is not reported as a failed job.
    """
    # Imported locally because this block did not previously depend on sys.
    import sys

    descr = 'show setup.json, dataset list, etc for jobs'
    parser = ArgumentParser(prog=argv.pop(0), description=descr)
    # The three output modes cannot be combined.
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-o', '--output', action='store_true',
                       help='show job output')
    group.add_argument('-O', '--just-output', action='store_true',
                       help='show only job output')
    group.add_argument('-P', '--just-path', action='store_true',
                       help='show only job path')
    parser.add_argument(
        'jobid',
        nargs='+',
        metavar='jobid/path/method',
        help=(
            'method shows the latest (current) job with that method\n'
            '(i.e. the latest finished job with current source code)\n'
            'you can use spec~ or spec~N to go back N current jobs\n'
            'with that method or spec^ or spec^N to follow .previous'
        ),
    )
    args = parser.parse_intermixed_args(argv)

    res = 0
    for path in args.jobid:
        try:
            job = name2job(cfg, path)
            if args.just_output:
                out = job.output()
                if out:
                    # Make sure the output ends with exactly one newline
                    # without adding a second one.
                    print(out, end='' if out.endswith('\n') else '\n')
            elif args.just_path:
                print(job.path)
            else:
                show(cfg.url, job, args.output)
        except JobNotFound as e:
            # Diagnostics go to stderr so stdout only ever carries job data.
            print(e, file=sys.stderr)
            res = 1
        except Exception as e:
            # A closed pipe is not a problem with this job; let it propagate.
            if isinstance(e, IOError) and e.errno == errno.EPIPE:
                raise
            # print_exc() already writes to stderr by default; keep the
            # summary line on the same stream as the traceback.
            print_exc()
            print("Failed to show %r" % (path, ), file=sys.stderr)
            res = 1
    return res
Esempio n. 4
0
def main(argv, cfg):
    """Search dataset columns for a regular expression and print matches.

    The first element of argv is removed and used as the program name; the
    rest is parsed as options, a pattern, one or more datasets and optional
    column names. Every line where the pattern matches at least one of the
    searched columns is written to file descriptor 1.

    Unless --ordered is given, every wanted slice except the first one is
    searched in its own child process while the parent handles the first.

    Args:
        argv: Argument list (mutated: the first element is popped).
        cfg: Configuration object handed to name2ds.

    Returns:
        1 if no dataset was given or a dataset lacks a requested column,
        otherwise None (the function falls off the end).
    """
    parser = ArgumentParser(
        usage="%(prog)s [options] pattern ds [ds [...]] [column [column [...]]",
        prog=argv.pop(0),
    )
    parser.add_argument(
        '-c',
        '--chain',
        action='store_true',
        help="follow dataset chains",
    )
    parser.add_argument(
        '--colour',
        '--color',
        nargs='?',
        const='always',
        choices=['auto', 'never', 'always'],
        type=str.lower,
        help="colour matched text. can be auto, never or always",
        metavar='WHEN',
    )
    parser.add_argument(
        '-i',
        '--ignore-case',
        action='store_true',
        help="case insensitive pattern",
    )
    parser.add_argument(
        '-H',
        '--headers',
        action='store_true',
        help="print column names before output (and on each change)",
    )
    parser.add_argument(
        '-O',
        '--ordered',
        action='store_true',
        help="output in order (one slice at a time)",
    )
    parser.add_argument(
        '-g',
        '--grep',
        action='append',
        help="grep this column only, can be specified multiple times",
        metavar='COLUMN')
    parser.add_argument(
        '-s',
        '--slice',
        action='append',
        help="grep this slice only, can be specified multiple times",
        type=int)
    parser.add_argument(
        '-D',
        '--show-dataset',
        action='store_true',
        help="show dataset on matching lines",
    )
    parser.add_argument(
        '-S',
        '--show-sliceno',
        action='store_true',
        help="show sliceno on matching lines",
    )
    parser.add_argument(
        '-L',
        '--show-lineno',
        action='store_true',
        help="show lineno (per slice) on matching lines",
    )
    supported_formats = (
        'csv',
        'raw',
        'json',
    )
    parser.add_argument(
        '-f',
        '--format',
        default='csv',
        choices=supported_formats,
        help="output format, csv (default) / " +
        ' / '.join(supported_formats[1:]),
        metavar='FORMAT',
    )
    parser.add_argument(
        '-t',
        '--separator',
        help="field separator, default tab / tab-like spaces",
    )
    parser.add_argument('pattern')
    parser.add_argument(
        'dataset', help='can be specified in the same ways as for "ax ds"')
    parser.add_argument('columns', nargs='*', default=[])
    args = parser.parse_intermixed_args(argv)

    pat_s = re.compile(args.pattern, re.IGNORECASE if args.ignore_case else 0)
    # The first positional after the pattern is always resolved as a dataset.
    datasets = [name2ds(cfg, args.dataset)]
    columns = []

    # Remaining positionals are tried as datasets until one fails to resolve;
    # from that one on, everything is treated as a column name.
    for ds_or_col in args.columns:
        if columns:
            columns.append(ds_or_col)
        else:
            try:
                datasets.append(name2ds(cfg, ds_or_col))
            except Exception:
                columns.append(ds_or_col)

    # NOTE(review): datasets always holds at least one entry at this point
    # (it is created with one element above), so this branch looks unreachable.
    if not datasets:
        parser.print_help(file=sys.stderr)
        return 1

    # None means "search the same columns that are displayed".
    grep_columns = set(args.grep or ())
    if grep_columns == set(columns):
        grep_columns = None

    # g is not defined in this function; presumably a module-level object
    # whose .slices is the number of slices - TODO confirm.
    if args.slice:
        # Keep the order given on the command line, dropping duplicates.
        want_slices = []
        for s in args.slice:
            assert 0 <= s < g.slices, "Slice %d not available" % (s, )
            if s not in want_slices:
                want_slices.append(s)
    else:
        want_slices = list(range(g.slices))

    if args.chain:
        # Replace each dataset with every dataset in its chain.
        datasets = list(chain.from_iterable(ds.chain() for ds in datasets))

    # Verify that every dataset has all columns that will be shown or searched.
    if columns or grep_columns:
        bad = False
        need_cols = set(columns)
        if grep_columns:
            need_cols.update(grep_columns)
        for ds in datasets:
            missing = need_cols - set(ds.columns)
            if missing:
                print('ERROR: %s does not have columns %r' % (
                    ds,
                    missing,
                ),
                      file=sys.stderr)
                bad = True
        if bad:
            return 1

    # never and always override env settings, auto (default) sets from env/tty
    if args.colour == 'never':
        colour.disable()
        highlight_matches = False
    elif args.colour == 'always':
        colour.enable()
        highlight_matches = True
    else:
        highlight_matches = colour.enabled

    # Don't highlight everything when just trying to cat
    if args.pattern == '':
        highlight_matches = False

    # Without an explicit separator, a plain tab is used when stdout is not
    # a terminal; on a terminal the tab-like padding below is used instead.
    separator = args.separator
    if separator is None and not sys.stdout.isatty():
        separator = '\t'

    if separator is None:
        # special case where we try to be like a tab, but with spaces.
        # this is useful because terminals typically don't style tabs.
        def separate(items, lens):
            things = []
            for item, item_len in zip(items, lens):
                things.append(item)
                # Pad to the next multiple of 8, always at least one space.
                spaces = 8 - (item_len % 8)
                things.append(colour(' ' * spaces, 'cyan', 'underline'))
            # Drop the padding after the last item.
            return ''.join(things[:-1])

        # Still needed by escape_item, which checks for the separator in items.
        separator = '\t'
    else:
        separator_coloured = colour(separator, 'cyan', 'underline')

        # lens is accepted for signature compatibility but not used here.
        def separate(items, lens):
            return separator_coloured.join(items)

    # Fallback passed to the JSON encoder as default= (see dumps below).
    def json_default(obj):
        if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
            return str(obj)
        elif isinstance(obj, complex):
            return [obj.real, obj.imag]
        else:
            return repr(obj)

    if args.format == 'csv':

        # Quote items that contain the separator or start/end with a quote
        # character (doubling embedded double quotes); newlines are always
        # written as the two characters backslash + n.
        def escape_item(item):
            if item and (separator in item or item[0] in '\'"'
                         or item[-1] in '\'"'):
                return '"' + item.replace('\n', '\\n').replace('"', '""') + '"'
            else:
                return item.replace('\n', '\\n')

        errors = 'surrogatepass'
    else:
        escape_item = None
        errors = 'replace' if PY2 else 'surrogateescape'

    # Search one slice of one dataset and write every matching line.
    def grep(ds, sliceno):
        def no_conv(v):
            return v

        # Pick the conversion applied to a value before the regex search:
        # string-typed columns without None support are searched as-is,
        # everything else goes through unicode (not defined here; presumably
        # the text type from a compatibility module - TODO confirm).
        def mk_conv(col):
            if ds.columns[col].type in (
                    'bytes',
                    'unicode',
                    'ascii',
            ):
                if not ds.columns[col].none_support:
                    return no_conv
            return unicode

        chk = pat_s.search

        # Build the value iterator for one column. ascii columns are read
        # with _type='unicode' and bytes columns are decoded as utf-8.
        def mk_iter(col):
            if ds.columns[col].type == 'ascii':
                it = ds._column_iterator(sliceno, col, _type='unicode')
            else:
                it = ds._column_iterator(sliceno, col)
            if ds.columns[col].type == 'bytes':
                # Local to mk_iter; does not affect the outer errors variable.
                errors = 'replace' if PY2 else 'surrogateescape'
                if ds.columns[col].none_support:
                    it = (None if v is None else v.decode('utf-8', errors)
                          for v in it)
                else:
                    it = (v.decode('utf-8', errors) for v in it)
            return it

        # Wrap every match of the pattern inside item with colour.red.
        def colour_item(item):
            pos = 0
            parts = []
            for m in pat_s.finditer(item):
                a, b = m.span()
                parts.extend((item[pos:a], colour.red(item[a:b])))
                pos = b
            parts.append(item[pos:])
            return ''.join(parts)

        # Both show() variants read items and lineno from the loop at the
        # end of grep() and return the encoded output line (without newline).
        if args.format == 'json':
            prefix = {}
            dumps = json.JSONEncoder(ensure_ascii=False,
                                     default=json_default).encode
            if args.show_dataset:
                prefix['dataset'] = ds
            if args.show_sliceno:
                prefix['sliceno'] = sliceno

            def show():
                d = dict(zip(used_columns, items))
                if args.show_lineno:
                    prefix['lineno'] = lineno
                if prefix:
                    # The same prefix dict is reused (and updated) for every
                    # line; the column values end up under the 'data' key.
                    prefix['data'] = d
                    d = prefix
                return dumps(d).encode('utf-8', 'surrogatepass')
        else:
            prefix = []
            if args.show_dataset:
                prefix.append(ds)
            if args.show_sliceno:
                prefix.append(str(sliceno))
            prefix = tuple(prefix)

            def show():
                data = list(prefix)
                if args.show_lineno:
                    data.append(unicode(lineno))
                if PY2:
                    show_items = (v if isinstance(v, unicode) else
                                  str(v).decode('utf-8', 'replace')
                                  for v in items)
                else:
                    show_items = map(str, items)
                show_items = list(show_items)
                # Lengths are taken before colouring, so highlighting does
                # not count towards the width used for padding.
                lens = (len(item) for item in data + show_items)
                if highlight_matches:
                    show_items = list(map(colour_item, show_items))
                if escape_item:
                    # Add the number of characters escaping added to each
                    # item to its (uncoloured) length.
                    lens_unesc = (len(item) for item in data + show_items)
                    show_items = list(map(escape_item, show_items))
                    lens_esc = (len(item) for item in data + show_items)
                    lens = (
                        l + esc - unesc
                        for l, unesc, esc in zip(lens, lens_unesc, lens_esc))
                data.extend(show_items)
                return separate(data, lens).encode('utf-8', errors)

        used_columns = columns or sorted(ds.columns)
        if grep_columns and grep_columns != set(used_columns):
            # Searched columns differ from the shown ones: iterate them too.
            grep_iter = izip(*(mk_iter(col) for col in grep_columns))
            conv_items = [mk_conv(col) for col in grep_columns]
        else:
            # Search the shown columns; grep_items is None for every line.
            grep_iter = repeat(None)
            conv_items = [mk_conv(col) for col in used_columns]
        lines_iter = izip(*(mk_iter(col) for col in used_columns))
        for lineno, (grep_items,
                     items) in enumerate(izip(grep_iter, lines_iter)):
            if any(
                    chk(conv(item))
                    for conv, item in izip(conv_items, grep_items or items)):
                # This will be atomic if the line is not too long
                # (at least up to PIPE_BUF bytes, should be at least 512).
                write(1, show() + b'\n')

    # Entry point for a child process: search all datasets for one slice.
    # q (a JoinableQueue or None) is used to pause before each dataset in
    # wait_for until the parent has printed that dataset's headers.
    def one_slice(sliceno, q, wait_for):
        try:
            if q:
                # Consume the initial token the parent put in the queue.
                q.get()
            for ds in datasets:
                if ds in wait_for:
                    # Tell the parent we are done with the previous datasets,
                    # then block until it has printed the new headers.
                    q.task_done()
                    q.get()
                grep(ds, sliceno)
        except KeyboardInterrupt:
            return
        except IOError as e:
            if e.errno == errno.EPIPE:
                return
            else:
                raise
        finally:
            # Make sure we are joinable
            # (q may be None, in which case the error is ignored)
            try:
                q.task_done()
            except Exception:
                pass

    # Header names for the optional dataset/slice/line prefix fields.
    headers_prefix = []
    if args.show_dataset:
        headers_prefix.append('[DATASET]')
    if args.show_sliceno:
        headers_prefix.append('[SLICE]')
    if args.show_lineno:
        headers_prefix.append('[LINE]')

    # Maps a dataset to the header list to print before it; only datasets
    # whose sorted column names differ from the previous dataset's get an entry.
    headers = {}
    if args.headers:
        if columns:
            current_headers = columns
        else:
            current_headers = None
            for ds in datasets:
                candidate_headers = sorted(ds.columns)
                if candidate_headers != current_headers:
                    headers[ds] = current_headers = candidate_headers
            # The first dataset's headers are printed right away below.
            current_headers = headers.pop(datasets[0])

        # Prints nothing when the output format is json.
        def show_headers(headers):
            if args.format != 'json':
                show_items = headers_prefix + headers
                if escape_item:
                    show_items = list(map(escape_item, show_items))
                print(
                    separate(map(colour.blue, show_items),
                             map(len, show_items)))

        show_headers(current_headers)

    queues = []
    children = []
    if not args.ordered:
        # Start one child process per wanted slice except the first; the
        # parent keeps only the first slice for itself.
        q = None
        wait_for = set(headers)
        for sliceno in want_slices[1:]:
            if wait_for:
                # Each child gets its own queue, pre-loaded with one token.
                q = JoinableQueue()
                q.put(None)
                queues.append(q)
            p = Process(
                target=one_slice,
                args=(sliceno, q, wait_for),
                name='slice-%d' % (sliceno, ),
            )
            p.daemon = True
            p.start()
            children.append(p)
        want_slices = want_slices[:1]

    try:
        for ds in datasets:
            if ds in headers:
                # Wait for all children to reach this dataset, print the
                # changed headers, then let the children continue.
                for q in queues:
                    q.join()
                show_headers(headers.pop(ds))
                for q in queues:
                    q.put(None)
            for sliceno in want_slices:
                grep(ds, sliceno)
        for c in children:
            c.join()
    except KeyboardInterrupt:
        print()
Esempio n. 5
0
def main(argv, cfg):
    """Print information about one or more datasets.

    The first element of argv is removed and used as the program name; the
    rest is parsed as options and dataset names. With -l/-L the datasets of
    each named job are listed with their line counts. Otherwise each dataset
    is described (parent, method, columns, line counts, chain and slice
    balance, depending on options).

    Args:
        argv: Argument list (mutated: the first element is popped).
        cfg: Configuration object handed to name2ds and name2job.

    Every path through this function ends in finish(), which calls exit();
    exit is not defined in this block - presumably it terminates the
    process, TODO confirm.
    """
    usage = "%(prog)s [options] ds [ds [...]]"
    parser = ArgumentParser(prog=argv.pop(0), usage=usage)
    parser.add_argument('-c',
                        '--chain',
                        action='store_true',
                        help='list all datasets in a chain')
    parser.add_argument('-C',
                        '--non-empty-chain',
                        action='store_true',
                        help='list all non-empty datasets in a chain')
    parser.add_argument('-l',
                        '--list',
                        action='store_true',
                        help='list all datasets in a job with number of rows')
    parser.add_argument(
        '-L',
        '--chainedlist',
        action='store_true',
        help='list all datasets in a job with number of chained rows')
    parser.add_argument('-m',
                        '--suppress-minmax',
                        action='store_true',
                        help='do not print min/max column values')
    parser.add_argument('-n',
                        '--suppress-columns',
                        action='store_true',
                        help='do not print columns')
    parser.add_argument('-q',
                        '--suppress-errors',
                        action='store_true',
                        help='silently ignores bad input datasets/jobids')
    parser.add_argument(
        '-s',
        '--slices',
        action='store_true',
        help='list relative number of lines per slice in sorted order')
    parser.add_argument('-S',
                        '--chainedslices',
                        action='store_true',
                        help='same as -s but for full chain')
    parser.add_argument('-w',
                        '--location',
                        action='store_true',
                        help='show where (ds/filename) each column is stored')
    parser.add_argument(
        "dataset",
        nargs='+',
        help=
        'the job part of the dataset name can be specified in the same ways as for "ax job". you can use ds~ or ds~N to follow the chain N steps backwards, or ^ to follow .parent. this requires specifying the ds-name, so wd-1~ will not do this, but wd-1/default~ will.'
    )
    args = parser.parse_intermixed_args(argv)
    # -C implies -c.
    args.chain = args.chain or args.non_empty_chain

    # Report names that could not be resolved (unless -q was given) and exit;
    # exit(1) is used only when there were failures and -q was not given.
    def finish(badinput):
        if badinput and not args.suppress_errors:
            print('Error, failed to resolve datasets:', file=sys.stderr)
            for n, e in badinput:
                print('    %r: %s' % (
                    n,
                    e,
                ), file=sys.stderr)
            exit(1)
        exit()

    # (name, exception) pairs for arguments that could not be resolved.
    badinput = []

    if args.list or args.chainedlist:
        # List mode: print the datasets of each job with their line counts.
        for n in args.dataset:
            try:
                # Try the argument as a dataset first (using its job's
                # datasets), then fall back to treating it as a job.
                try:
                    dsvec = name2ds(cfg, n).job.datasets
                except NoSuchWhateverError:
                    dsvec = name2job(cfg, n).datasets
            except Exception as e:
                badinput.append((n, e))
                dsvec = None
            if dsvec:
                print('%s' % (dsvec[0].job, ))
                v = []
                for ds in dsvec:
                    if args.chainedlist:
                        # Total over every dataset in the chain.
                        lines = sum(sum(x.lines) for x in ds.chain())
                    else:
                        lines = sum(ds.lines)
                    v.append((ds.name, '{:n}'.format(lines)))
                len_n, len_l = colwidth(v)
                template = "{0:%d}  ({1:>%d})" % (len_n, len_l)
                for name, numlines in sorted(v):
                    print('    ' + template.format(name, numlines))
        finish(badinput)

    for n in args.dataset:
        try:
            ds = name2ds(cfg, n)
        except NoSuchWhateverError as e:
            badinput.append((n, e))
            continue

        print(ds.quoted)
        if ds.parent:
            if isinstance(ds.parent, tuple):
                # Several parents: print them column-wise, sorted by name.
                print("    Parents:")
                max_n = max(len(x.quoted) for x in ds.parent)
                template = "{1:%d}" % (max_n, )
                # Only the second tuple element is used by the template.
                data = tuple(
                    (None, x.quoted) for ix, x in enumerate(ds.parent))
                data = sorted(data, key=lambda x: x[1])
                printcolwise(data, template, lambda x: x, minrows=8, indent=8)
            else:
                print("    Parent:", ds.parent.quoted)
        print("    Method:", quote(ds.job.method))
        if ds.filename:
            print("    Filename:", quote(ds.filename))
        if ds.previous:
            print("    Previous:", ds.previous.quoted)
        if ds.hashlabel is not None:
            print("    Hashlabel:", quote(ds.hashlabel))

        # Format a column's min/max as "[min, max]", each right-aligned to
        # MINMAXWIDTH characters. Returns '' when -m was given or minval is None.
        def prettyminmax(minval, maxval):
            if args.suppress_minmax:
                return ''
            s = '[%%%ds, %%%ds]' % (MINMAXWIDTH, MINMAXWIDTH)
            if minval is None:
                return ''
            elif isinstance(minval, float):

                # Number of digits before the decimal point, capped at
                # MINMAXWIDTH - 2; 3 for inf/nan and (MINMAXWIDTH - 2) // 2
                # for zero.
                def intdigits(x):
                    if isinf(x) or isnan(x):
                        return 3
                    return min(MINMAXWIDTH -
                               2, floor(log10(abs(x)) +
                                        1)) if x else (MINMAXWIDTH - 2) // 2

                ints = max(intdigits(minval), intdigits(maxval))
                # Choose a float format based on the magnitude of the values.
                if ints > 0:
                    format = "%% %d.%df" % (ints, MINMAXWIDTH - ints - 2)
                elif ints < -4:
                    format = "%% .%de" % (MINMAXWIDTH - 7, )
                else:
                    format = "%% .%df" % (MINMAXWIDTH - 3, )

                # Show floats with an integral value as plain integers;
                # inf/nan (which int() rejects) use the float format.
                def format_or_int(v):
                    try:
                        i = int(v)
                        if v == i:
                            return i
                    except (OverflowError, ValueError):
                        pass
                    return locale.format_string(format, v)

                return s % (format_or_int(minval), format_or_int(maxval))
            # The remaining branches all format the values the same way.
            elif isinstance(minval, int):
                return s % (minval, maxval)
            elif isinstance(minval, (date, time, datetime)):
                return s % (minval, maxval)
            else:
                return s % (minval, maxval)

        if not args.suppress_columns:
            print("    Columns:")
            # Column type, with "+None" appended when the column supports None.
            name2typ = {
                n: c.type + '+None' if c.none_support else c.type
                for n, c in ds.columns.items()
            }
            len_n, len_t = colwidth(
                (quote(n), name2typ[n]) for n, c in ds.columns.items())
            # Template fields: {0} name, {1} type, {2} hashlabel marker,
            # {3} min/max, {4} location, {5} compression.
            if args.location:
                len_l = max(
                    len(quote(c.location)) for c in ds.columns.values())
                len_c = max(len(c.compression) for c in ds.columns.values())
                template = '        {2} {0:%d}  {1:%d}  {4:%d} {5:%d}  {3}' % (
                    len_n,
                    len_t,
                    len_l,
                    len_c,
                )
            else:
                template = '        {2} {0:%d}  {1:%d}  {3}' % (
                    len_n,
                    len_t,
                )
            # With -c/-C/-S min and max are taken over the chain instead of
            # from this dataset's columns.
            chain = False
            if args.chainedslices or args.chain:
                chain = ds.chain()
            for n, c in sorted(ds.columns.items()):
                if chain:
                    minval, maxval = chain.min(n), chain.max(n)
                else:
                    minval, maxval = c.min, c.max
                # Mark the hashlabel column with a highlighted "*".
                hashdot = colour("*",
                                 "ds/highlight") if n == ds.hashlabel else " "
                print(
                    template.format(quote(n), name2typ[n], hashdot,
                                    prettyminmax(minval, maxval),
                                    quote(c.location), c.compression).rstrip())
            print("    {0:n} columns".format(len(ds.columns)))
        print("    {0:n} lines".format(sum(ds.lines)))

        if ds.previous or args.chain:
            chain = ds.chain()
            if args.non_empty_chain:
                print("    Full chain length {0:n}, from {1} to {2}".format(
                    len(chain), chain[0], chain[-1]))
                # Keep only datasets that contain at least one line.
                chain = [ds for ds in chain if sum(ds.lines)]
                print("    Filtered chain length {0:n}".format(len(chain)))
            if chain:
                if not args.non_empty_chain:
                    print("    Chain length {0:n}, from {1} to {2}".format(
                        len(chain), chain[0], chain[-1]))
                if args.chain:
                    # One entry per chain member: index, job/name, line count.
                    data = tuple((ix, "%s/%s" % (x.job, x.name),
                                  "{:n}".format(sum(x.lines)))
                                 for ix, x in enumerate(chain))
                    max_n, max_l = colwidth(x[1:] for x in data)
                    template = "{0:3}: {1:%d} ({2:>%d})" % (max_n, max_l)
                    printcolwise(data,
                                 template,
                                 lambda x: (x[0], x[1], x[2]),
                                 minrows=8,
                                 indent=8)

        if args.slices or args.chainedslices:
            # data holds (slice index, formatted line count, line count).
            if args.chainedslices and ds.previous:
                # Per-slice totals summed over every dataset in the chain.
                data = (
                    (ix, '{:n}'.format(sum(x)), sum(x))
                    for ix, x in enumerate(zip(*(x.lines
                                                 for x in ds.chain()))))
                print('    Balance, lines per slice, full chain:')
            else:
                data = ((ix, '{:n}'.format(x), x)
                        for ix, x in enumerate(ds.lines))
                if ds.previous:
                    print('    Balance, lines per slice, tip dataset:')
                else:
                    print('    Balance, lines per slice:')
            # Largest slice first.
            data = sorted(data, key=lambda x: -x[2])
            s = sum(x[2] for x in data)
            len_n = max(len(x[1]) for x in data)
            template = "{0:3}: {1!s}%% ({2:>%d})" % (len_n, )
            # "s or 1e20" avoids dividing by zero when there are no lines.
            printcolwise(
                data,
                template,
                lambda x:
                (x[0], locale.format_string("%6.2f", (100 * x[2] /
                                                      (s or 1e20))), x[1]),
                minrows=8,
                indent=8)
            print("    Max to average ratio: " +
                  locale.format_string("%2.3f", (max(x[2] for x in data) /
                                                 ((s or 1e20) / len(data)), )))

        if ds.previous:
            # chain was assigned in the "ds.previous or args.chain" section
            # above (and is the filtered list when -C was given).
            print("    {0:n} total lines in chain".format(
                sum(sum(ds.lines) for ds in chain)))

    finish(badinput)
Esempio n. 6
0
def main(argv, cfg):
    # -C overrides -A and -B (which in turn override -C)
    class ContextAction(Action):
        """argparse action that stores one value as both context sizes.

        The parsed value is written to both ``before_context`` and
        ``after_context`` on the namespace.
        """

        def __call__(self, parser, namespace, values, option_string=None):
            for dest in ('before_context', 'after_context'):
                setattr(namespace, dest, values)

    parser = ArgumentParser(
        usage=
        "%(prog)s [options] [-e] pattern [...] [-d] ds [...] [[-n] column [...]]",
        description="""positional arguments:
  pattern               (-e, --regexp)
  dataset               (-d, --dataset) can be specified as for "ax ds"
  columns               (-n, --column)""",
        prog=argv.pop(0),
        formatter_class=RawTextHelpFormatter,
    )
    parser.add_argument(
        '-c',
        '--chain',
        action='store_true',
        help="follow dataset chains",
    )
    parser.add_argument(
        '--colour',
        '--color',
        nargs='?',
        const='always',
        choices=['auto', 'never', 'always'],
        type=str.lower,
        help="colour matched text. can be auto, never or always",
        metavar='WHEN',
    )
    parser.add_argument(
        '-i',
        '--ignore-case',
        action='store_true',
        help="case insensitive pattern",
    )
    parser.add_argument(
        '-v',
        '--invert-match',
        action='store_true',
        help="select non-matching lines",
    )
    parser.add_argument(
        '-o',
        '--only-matching',
        action='store_true',
        help="only print matching part (or columns with -l)",
    )
    parser.add_argument(
        '-l',
        '--list-matching',
        action='store_true',
        help=
        "only print matching datasets (or slices with -S)\nwhen used with -o, only print matching columns",
    )
    parser.add_argument(
        '-H',
        '--headers',
        action='store_true',
        help="print column names before output (and on each change)",
    )
    parser.add_argument(
        '-O',
        '--ordered',
        action='store_true',
        help="output in order (one slice at a time)",
    )
    parser.add_argument(
        '-M',
        '--allow-missing-columns',
        action='store_true',
        help="datasets are allowed to not have (some) columns",
    )
    parser.add_argument(
        '-g',
        '--grep',
        action='append',
        help="grep this column only, can be specified multiple times",
        metavar='COLUMN')
    parser.add_argument(
        '-s',
        '--slice',
        action='append',
        help="grep this slice only, can be specified multiple times",
        type=int)
    parser.add_argument(
        '-D',
        '--show-dataset',
        action='store_true',
        help="show dataset on matching lines",
    )
    parser.add_argument(
        '-S',
        '--show-sliceno',
        action='store_true',
        help="show sliceno on matching lines",
    )
    parser.add_argument(
        '-L',
        '--show-lineno',
        action='store_true',
        help="show lineno (per slice) on matching lines",
    )
    supported_formats = (
        'csv',
        'raw',
        'json',
    )
    parser.add_argument(
        '-f',
        '--format',
        default='csv',
        choices=supported_formats,
        help="output format, csv (default) / " +
        ' / '.join(supported_formats[1:]),
        metavar='FORMAT',
    )
    parser.add_argument(
        '-t',
        '--separator',
        help="field separator, default tab / tab-like spaces",
    )
    parser.add_argument(
        '-T',
        '--tab-length',
        type=int,
        metavar='LENGTH',
        help="field alignment, always uses spaces as separator",
    )
    parser.add_argument(
        '-B',
        '--before-context',
        type=int,
        default=0,
        metavar='NUM',
        help="print NUM lines of leading context",
    )
    parser.add_argument(
        '-A',
        '--after-context',
        type=int,
        default=0,
        metavar='NUM',
        help="print NUM lines of trailing context",
    )
    parser.add_argument(
        '-C',
        '--context',
        type=int,
        default=0,
        metavar='NUM',
        action=ContextAction,
        help="print NUM lines of context\n" +
        "context is only taken from the same slice of the same\n" +
        "dataset, and may intermix with output from other\n" +
        "slices. Use -O to avoid that, or -S -L to see it.",
    )
    parser.add_argument('-e',
                        '--regexp',
                        default=[],
                        action='append',
                        dest='patterns',
                        help=SUPPRESS)
    parser.add_argument('-d',
                        '--dataset',
                        default=[],
                        action='append',
                        dest='datasets',
                        help=SUPPRESS)
    parser.add_argument('-n',
                        '--column',
                        default=[],
                        action='append',
                        dest='columns',
                        help=SUPPRESS)
    parser.add_argument('words', nargs='*', help=SUPPRESS)
    args = parser.parse_intermixed_args(argv)

    if args.before_context < 0 or args.after_context < 0:
        print('Context must be >= 0', file=sys.stderr)
        return 1

    columns = args.columns

    try:
        args.datasets = [name2ds(cfg, ds) for ds in args.datasets]
    except NoSuchWhateverError as e:
        print(e, file=sys.stderr)
        return 1

    for word in args.words:
        if not args.patterns:
            args.patterns.append(word)
        elif columns and args.datasets:
            columns.append(word)
        else:
            try:
                args.datasets.append(name2ds(cfg, word))
            except NoSuchWhateverError as e:
                if not args.datasets:
                    print(e, file=sys.stderr)
                    return 1
                columns.append(word)

    if not args.patterns or not args.datasets:
        parser.print_help(file=sys.stderr)
        return 1

    datasets = args.datasets
    patterns = []
    for pattern in args.patterns:
        try:
            patterns.append(
                re.compile(pattern, re.IGNORECASE if args.ignore_case else 0))
        except re.error as e:
            print("Bad pattern %r:\n%s" % (
                pattern,
                e,
            ), file=sys.stderr)
            return 1

    grep_columns = set(args.grep or ())
    if grep_columns == set(columns):
        grep_columns = set()

    # Decide which slices to process: the ones given with -s (in the order
    # given, without duplicates), or all of them.
    if args.slice:
        want_slices = []
        for s in args.slice:
            # Checked explicitly (not with an assert statement) so the
            # validation still happens when python runs with -O. The
            # exception type and message are the same as before.
            if not 0 <= s < g.slices:
                raise AssertionError("Slice %d not available" % (s, ))
            if s not in want_slices:
                want_slices.append(s)
    else:
        want_slices = list(range(g.slices))

    if len(want_slices) == 1:
        # it will be automatically ordered, so let's not work for it.
        args.ordered = False

    if args.only_matching:
        if args.list_matching:
            args.list_matching = False
            only_matching = 'columns'
        else:
            only_matching = 'part'
    else:
        only_matching = False

    if args.chain:
        datasets = list(chain.from_iterable(ds.chain() for ds in datasets))

    def columns_for_ds(ds, columns=columns):
        """Return the column names to use for ds.

        With a non-empty columns this is the names from it that exist in
        ds.columns, in the iteration order of columns. Otherwise it is
        every column name in ds, sorted.
        """
        if not columns:
            return sorted(ds.columns)
        return [name for name in columns if name in ds.columns]

    if columns or grep_columns:
        if args.allow_missing_columns:
            keep_datasets = []
            for ds in datasets:
                if not columns_for_ds(ds):
                    continue
                if grep_columns and not columns_for_ds(ds, grep_columns):
                    continue
                keep_datasets.append(ds)
            if not keep_datasets:
                return 0
            datasets = keep_datasets
        else:
            bad = False
            need_cols = set(columns)
            if grep_columns:
                need_cols.update(grep_columns)
            for ds in datasets:
                missing = need_cols - set(ds.columns)
                if missing:
                    print('ERROR: %s does not have columns %r' % (
                        ds,
                        missing,
                    ),
                          file=sys.stderr)
                    bad = True
            if bad:
                return 1

    # For the status reporting, this gives how many lines have been processed
    # when reaching each ds ix, per slice. Ends with an extra fictional ds,
    # i.e. the total number of lines for that slice. And then the same again,
    # to simplify the code in the status shower.
    total_lines_per_slice_at_ds = [[0] * g.slices]
    for ds in datasets:
        total_lines_per_slice_at_ds.append(
            [a + b for a, b in zip(total_lines_per_slice_at_ds[-1], ds.lines)])
    total_lines_per_slice_at_ds.append(total_lines_per_slice_at_ds[-1])
    status_interval = {
        # twice per percent, but not too often or too seldom
        sliceno: min(max(total_lines_per_slice_at_ds[-1][sliceno] // 200, 10),
                     5000)
        for sliceno in want_slices
    }

    # never and always override env settings, auto (default) sets from env/tty
    if args.colour == 'never':
        colour.disable()
        highlight_matches = False
    elif args.colour == 'always':
        colour.enable()
        highlight_matches = True
    else:
        args.colour = 'auto'
        highlight_matches = colour.enabled

    # Don't highlight everything when just trying to cat
    if args.patterns == ['']:
        highlight_matches = False
    # Don't highlight anything with -l
    if args.list_matching:
        highlight_matches = False

    if args.format == 'json':
        # headers was just a mistake, ignore it
        args.headers = False

    separator = args.separator
    if args.tab_length:
        separator = None
    elif separator is None and not sys.stdout.isatty():
        separator = '\t'

    if separator is None:
        # special case where we try to be like a tab, but with spaces.
        # this is useful because terminals typically don't style tabs.
        # and also so you can change the length of tabs.
        if (args.tab_length or 0) < 1:
            args.tab_length = 8

        def separate(items, lens):
            # Join items, padding each one with (coloured) spaces up to the
            # next multiple of args.tab_length. lens gives the length to
            # count for each item.
            tab = args.tab_length
            parts = []
            for text, width in zip(items, lens):
                parts.append(text)
                padding = ' ' * (tab - width % tab)
                parts.append(colour(padding, 'grep/separator'))
            # The padding after the last item is not wanted.
            return ''.join(parts[:-1])

        separator = '\t'
    else:
        separator_coloured = colour(separator, 'grep/separator')

        def separate(items, lens):
            # Join items with the (coloured) separator. lens is not used
            # here, it is accepted so both variants of separate take the
            # same arguments.
            return separator_coloured.join(items)

    def json_default(obj):
        """Fallback serialisation for values json can't encode by itself.

        datetime, date and time values become their str(), complex numbers
        become a [real, imag] list and anything else becomes its repr().
        """
        if isinstance(obj, complex):
            return [obj.real, obj.imag]
        temporal = (datetime.datetime, datetime.date, datetime.time)
        return str(obj) if isinstance(obj, temporal) else repr(obj)

    if args.format == 'csv':

        def escape_item(item):
            # Newlines are always written as a literal \n.
            escaped = item.replace('\n', '\\n')
            # Quote items that contain the separator or that start or end
            # with a quote character. Empty items are never quoted.
            needs_quotes = bool(item) and (separator in item
                                           or item[0] in '\'"'
                                           or item[-1] in '\'"')
            if needs_quotes:
                return '"' + escaped.replace('"', '""') + '"'
            return escaped

        errors = 'surrogatepass'
    else:
        escape_item = None
        errors = 'replace' if PY2 else 'surrogateescape'

    # This is for the ^T handling. Each slice sends an update when finishing
    # a dataset, and every status_interval[sliceno] lines while iterating.
    # To minimise the data sent the only information sent over the queue
    # is (sliceno, finished_dataset).
    # Status printing is triggered by ^T (or SIGINFO if that is available)
    # or by SIGUSR1.
    # Pressing it again within two seconds prints stats per slice too.
    q_status = mp.LockFreeQueue()

    def status_collector():
        """Collect progress messages from the slices and show status on request.

        This is the target of the status process. It reads
        (sliceno, finished_dataset) messages from q_status until the queue
        raises QueueEmpty, tracking how far each wanted slice has come.
        A signal (SIGINFO where available, and SIGUSR1) prints a summary
        to stderr, and a second signal within two seconds also prints one
        line per slice. Where SIGINFO does not exist and stdin is a tty,
        the terminal is switched out of canonical mode so a ^T keypress
        can be read and turned into SIGUSR1. The original terminal
        settings are restored on exit.
        """
        q_status.make_reader()
        status = {sliceno: [0, 0] for sliceno in want_slices}
        #            [ds_ix, done_lines]
        # Only the wanted slices are processed (and tracked in status), so
        # only their lines count towards the total. Counting every slice
        # would keep the summary from reaching 100% when -s is used.
        total_lines = sum(total_lines_per_slice_at_ds[-1][sliceno]
                          for sliceno in want_slices)
        # Time of the previous status request, in a list so show can
        # update it.
        previous = [0]
        # Status is written to stderr, so in auto mode base the colour
        # configuration on stderr being a tty, not stdout.
        if args.colour == 'auto':
            colour.configure_from_environ(stdout=sys.stderr)

        def show(sig, frame):
            # Signal handler: print current progress to fd 2.
            t = monotonic()
            verbose = (previous[0] + 2 > t)  # within 2 seconds of previous
            previous[0] = t
            ds_ixes = []
            progress_lines = []
            progress_fraction = []
            for sliceno in want_slices:
                ds_ix, done_lines = status[sliceno]
                ds_ixes.append(ds_ix)
                # The slice may have come up to one status interval further
                # than last reported, but not past the end of its current
                # dataset.
                max_possible = min(
                    done_lines + status_interval[sliceno],
                    total_lines_per_slice_at_ds[ds_ix + 1][sliceno])
                done_lines = (done_lines +
                              max_possible) / 2  # middle of the possibilities
                progress_lines.append(done_lines)
                total = total_lines_per_slice_at_ds[-1][sliceno]
                if total == 0:
                    progress_fraction.append(1)
                else:
                    progress_fraction.append(done_lines / total)
            progress_total = sum(progress_lines) / (total_lines or 1)
            # Slices more than ten percentage points behind the overall
            # progress are highlighted.
            bad_cutoff = progress_total - 0.1
            if verbose:
                show_ds = (len(datasets) > 1 and min(ds_ixes) != max(ds_ixes))
                for sliceno, ds_ix, p in zip(want_slices, ds_ixes,
                                             progress_fraction):
                    if ds_ix == len(datasets):
                        msg = 'DONE'
                    else:
                        msg = '{0:d}% of {1:n} lines'.format(
                            round(p * 100),
                            total_lines_per_slice_at_ds[-1][sliceno])
                        if show_ds:
                            msg = '%s (in %s)' % (
                                msg,
                                datasets[ds_ix].quoted,
                            )
                    msg = '%9d: %s' % (
                        sliceno,
                        msg,
                    )
                    if p < bad_cutoff:
                        msg = colour(msg, 'grep/infohighlight')
                    else:
                        msg = colour(msg, 'grep/info')
                    write(2, msg.encode('utf-8') + b'\n')
            msg = '{0:d}% of {1:n} lines'.format(round(progress_total * 100),
                                                 total_lines)
            if len(datasets) > 1:
                min_ds = min(ds_ixes)
                max_ds = max(ds_ixes)
                if min_ds < len(datasets):
                    ds_name = datasets[min_ds].quoted
                    # ' ++' marks that some slices are in later datasets.
                    extra = '' if min_ds == max_ds else ' ++'
                    msg = '%s (in %s%s)' % (
                        msg,
                        ds_name,
                        extra,
                    )
            worst = min(progress_fraction)
            if worst < bad_cutoff:
                msg = '%s, worst %d%%' % (
                    msg,
                    round(worst * 100),
                )
            msg = colour('  SUMMARY: %s' % (msg, ), 'grep/info')
            write(2, msg.encode('utf-8') + b'\n')

        for signame in ('SIGINFO', 'SIGUSR1'):
            if hasattr(signal, signame):
                sig = getattr(signal, signame)
                signal.signal(sig, show)
                if hasattr(signal, 'pthread_sigmask'):
                    signal.pthread_sigmask(signal.SIG_UNBLOCK, {sig})
        tc_original = None
        using_stdin = False
        if not hasattr(signal, 'SIGINFO') and sys.stdin.isatty():
            # ^T won't work automatically on this OS, so we need to handle
            # it as terminal input.
            import termios
            from accelerator.compat import selectors
            sel = selectors.DefaultSelector()
            sel.register(0, selectors.EVENT_READ)
            sel.register(q_status.r, selectors.EVENT_READ)
            try:
                tc_original = termios.tcgetattr(0)
                tc_changed = list(tc_original)
                tc_changed[3] &= ~(termios.ICANON | termios.IEXTEN)
                termios.tcsetattr(0, termios.TCSADRAIN, tc_changed)
                using_stdin = True
            except Exception:
                # Best effort: without the terminal change only signals
                # trigger status output.
                pass
            # we can't set stdin nonblocking, because it's probably the same
            # file description as stdout, so work around that with alarms.
            def got_alarm(sig, frame):
                raise IOError()

            signal.signal(signal.SIGALRM, got_alarm)
        try:
            while True:
                if using_stdin:
                    do_q = False
                    for key, _ in sel.select():
                        if key.fd == 0:
                            try:
                                # in case something else read it we block
                                # for max 1 second
                                signal.alarm(1)
                                try:
                                    pressed = ord(os.read(0, 1))
                                finally:
                                    signal.alarm(0)
                                if pressed == 20:  # ^T
                                    # "^T" shows in the terminal
                                    write(2, b'\n')
                                    os.kill(os.getpid(), signal.SIGUSR1)
                            except Exception:
                                # Best effort: a failed or interrupted read
                                # must not stop the status collection.
                                pass
                        elif key.fd == q_status.r:
                            do_q = True
                    if not do_q:
                        continue
                try:
                    sliceno, finished_dataset = q_status.get()
                except QueueEmpty:
                    return
                if finished_dataset:
                    # Move to the next dataset, with the done line count
                    # set to exactly the lines up to there.
                    ds_ix = status[sliceno][0] + 1
                    status[sliceno] = [
                        ds_ix, total_lines_per_slice_at_ds[ds_ix][sliceno]
                    ]
                else:
                    status[sliceno][1] += status_interval[sliceno]
        finally:
            if tc_original is not None:
                try:
                    termios.tcsetattr(0, termios.TCSADRAIN, tc_original)
                except Exception:
                    pass

    status_process = mp.SimplifiedProcess(target=status_collector,
                                          name='ax grep status')
    # everything else will write, so make it a writer right away
    q_status.make_writer()

    # Output is only allowed while holding this lock, so that long lines
    # do not get intermixed. (Or when alone in producing output.)
    io_lock = Lock()

    # This contains some extra stuff to be a better base for the other
    # outputters.
    # When used directly it enforces no ordering, but merges smaller writes
    # to keep the number of syscalls down.

    class Outputter:
        """Unordered outputter that merges small writes.

        Data given to put is collected in merge_buffer and written to fd 1
        (while holding io_lock) once at least 1024 bytes are collected, or
        when a dataset ends. The queues and buffer are not used by this
        class itself, they are here for the subclasses.
        """

        def __init__(self, q_in, q_out):
            self.q_in, self.q_out = q_in, q_out
            self.buffer = []
            self.merge_buffer = b''

        def put(self, data):
            """Add data to the merge buffer, writing it out when big enough."""
            self.merge_buffer += data
            if len(self.merge_buffer) < 1024:
                return
            self.move_merge()

        def move_merge(self):
            """Write out the merge buffer (if there is anything in it)."""
            if not self.merge_buffer:
                return
            with io_lock:
                write(1, self.merge_buffer)
            self.merge_buffer = b''

        def start(self, ds):
            """Called when starting on ds. Nothing to do without ordering."""

        def end(self, ds):
            """Called when done with ds, writes out whatever is pending."""
            self.move_merge()

        def finish(self):
            """Called when all output is produced. Nothing to do here."""

        def full(self):
            """True when more than 5000 entries are waiting in buffer."""
            return len(self.buffer) > 5000

        def excite(self):
            """Write out pending data and poll the queue if anything waits."""
            self.move_merge()
            if self.buffer:
                self.pump(False)

    # Partially ordered output, each header change acts as a fence.
    # This is used in all slices except the first.
    #
    # The queue gets True when the previous slice is ready for the next
    # header change, and None when the header is printed (and it's ok
    # to resume output).

    class HeaderWaitOutputter(Outputter):
        """Outputter that holds back output at each header change.

        While waiting at a sync point, self.buffer holds None (the sync
        point) followed by the data produced after it. Messages arrive on
        q_in and are passed on to q_out: True is forwarded as is, anything
        else is forwarded as None and lets the buffered output through.
        """

        def start(self, ds):
            """Add a sync point if ds is in headers, else flush as usual."""
            if ds in headers:
                self.add_wait()
            else:
                self.excite()

        def add_wait(self):
            """Add a sync point to the buffer and check the queue."""
            # Each sync point is separated by None in the buffer
            self.buffer.append(None)
            self.buffer.append(b'')  # Avoid need for special case in .drain
            self.pump()

        def move_merge(self):
            """Move the merge buffer to the buffer if waiting, else write it."""
            data = self.merge_buffer
            self.merge_buffer = b''
            if self.buffer:
                # Check the queue first, we may no longer have to wait.
                self.pump()
                if self.buffer:
                    self.buffer.append(data)
                    return
            with io_lock:
                write(1, data)

        def pump(self, wait=None):
            """Handle one message from q_in (more if it is True).

            wait is passed to q_in.get and defaults to self.full().
            QueueEmpty is re-raised if wait is true, otherwise it just
            ends the call.
            """
            if wait is None:
                wait = self.full()
            try:
                got = self.q_in.get(wait)
            except QueueEmpty:
                if wait:
                    # previous slice has exited without sending all messages
                    raise
                return
            if got is True:
                # since pump is only called when we have outputted all
                # currently allowed output or when the next message is an
                # unblock for such output we can just unconditionally send
                # the True on to the next slice here.
                self.q_out.put(True)
                self.pump(wait)
                return
            else:
                self.q_out.put(None)
                self.drain()

        def drain(self):
            """Write the buffered data up to the next sync point (or the end).

            What was written is removed from the buffer, together with the
            sync point before it. A following sync point is left first in
            the buffer.
            """
            assert self.buffer[
                0] is None, 'The buffer must always stop at a sync point (or empty)'
            with io_lock:
                for pos, data in enumerate(self.buffer[1:], 1):
                    if data is None:
                        break
                    elif data:
                        write(1, data)
                else:
                    # We did not reach the next fence, so last item is real data
                    # and needs to be removed. (The buffer will then be empty and
                    # output will continue directly until reaching the sync point.)
                    pos += 1
            self.buffer[:pos] = ()

        def finish(self):
            """Keep pumping with wait=True until the buffer is empty."""
            while self.buffer:
                self.pump(True)

    # Partially ordered output, each header change acts as a fence.
    # This is used only in the first slice, and outputs the headers.
    #
    # When it is ready to output headers it sends True in the queue.
    # When the True has travelled around the queue ring all slices are
    # ready, the headers are printed, and None is sent to let the other
    # slices resume output.
    # (When the None returns it is ignored, because output is resumed
    # as soon as the headers are printed.)

    class HeaderOutputter(HeaderWaitOutputter):
        """HeaderWaitOutputter variant that also prints the headers.

        It puts True on q_out when it reaches a sync point, and when True
        arrives on q_in it writes the next entry from headers_iter and
        puts None on q_out.
        """

        def add_wait(self):
            """Add a sync point, announcing it with True unless already waiting."""
            # With a non-empty buffer we are already waiting at an earlier
            # sync point, and drain sends the True when reaching this one.
            if not self.buffer:
                self.q_out.put(True)
            self.buffer.append(None)
            self.buffer.append(
                b'')  # Avoid need for special case in .drain/.put
            self.pump()

        def drain(self):
            """Write the buffered data up to the next sync point (or the end).

            If another sync point is reached True is put on q_out for it,
            and it is left first in the buffer.
            """
            assert self.buffer[
                0] is None, 'The buffer must always stop at a sync point (or empty)'
            with io_lock:
                for pos, data in enumerate(self.buffer[1:], 1):
                    if data is None:
                        self.q_out.put(True)
                        break
                    elif data:
                        write(1, data)
                else:
                    # No further sync point, so everything is removed.
                    pos += 1
            self.buffer[:pos] = ()

        def pump(self, wait=None):
            """Handle messages from q_in.

            wait is passed to q_in.get and defaults to self.full().
            QueueEmpty is re-raised if wait is true, otherwise it ends the
            call. Without wait this keeps going until the queue is empty.
            """
            if wait is None:
                wait = self.full()
            try:
                got = self.q_in.get(wait)
            except QueueEmpty:
                if wait:
                    # previous slice has exited without sending all messages
                    raise
                return
            if got is True:
                # The True we put in when reaching the fence has travelled
                # all the way around the queue ring, it's time to print the
                # new headers
                write(1, next(headers_iter))
                # and then unblock the other slices
                self.q_out.put(None)
                self.drain()
                # No else, when the None comes back we just drop it.
            if not wait:
                self.pump(False)

    # Fully ordered output, each slice waits for the previous slice.
    # For each ds, waits for None (anything really) before starting,
    # sends None when done.

    class OrderedOutputter(Outputter):
        """Outputter where output for each ds waits for a message on q_in.

        self.buffer holds None at the start of each ds that is still
        waiting, followed by the data produced for it. When a message
        arrives on q_in the data for the first waiting ds is written, and
        None is put on q_out once that ds is completely written.
        """

        def start(self, ds):
            """Mark the start of a ds in the buffer and check the queue."""
            # Each ds is separated by None in the buffer
            self.buffer.append(None)
            self.buffer.append(b'')  # Avoid need for special case in .drain
            self.pump()

        def end(self, ds):
            """Write out what is allowed, and signal q_out if nothing is left."""
            self.move_merge()
            if not self.buffer:
                # We are done with this ds, so let next slice continue
                self.q_out.put(None)

        def pump(self, wait=None):
            """Take one message from q_in and drain the buffer if there was one.

            wait is passed to q_in.get and defaults to self.full().
            QueueEmpty is re-raised if wait is true, otherwise it just
            ends the call. The value of the message is not looked at.
            """
            if wait is None:
                wait = self.full()
            try:
                self.q_in.get(wait)
            except QueueEmpty:
                if wait:
                    # previous slice has exited without sending all messages
                    raise
                return
            self.drain()

        def move_merge(self):
            """Move the merge buffer to the buffer if waiting, else write it."""
            data = self.merge_buffer
            self.merge_buffer = b''
            if self.buffer:
                # Check the queue first, we may no longer have to wait.
                self.pump()
                if self.buffer:
                    self.buffer.append(data)
                    return
            # No need for a lock, the other slices aren't writing concurrently.
            write(1, data)

        def drain(self):
            """Write the buffered data for the first ds in the buffer.

            If the start of another ds is reached None is put on q_out,
            and that marker is left first in the buffer.
            """
            assert self.buffer[0] is None
            for pos, data in enumerate(self.buffer[1:], 1):
                if data is None:
                    # We are done with this ds, so let next slice continue
                    self.q_out.put(None)
                    break
                elif data:
                    write(1, data)
            else:
                # We did not reach the next ds, so last item is real data and
                # needs to be removed. (The buffer will then be empty and
                # output will continue directly until reaching the next ds.)
                pos += 1
            self.buffer[:pos] = ()

        def finish(self):
            """Pump with wait=True until the buffer is empty.

            If anything was buffered when called, None is put on q_out
            afterwards (for the last ds).
            """
            not_finished = bool(self.buffer)
            while self.buffer:
                self.pump(True)
            if not_finished:
                self.q_out.put(None)

    # Same as above but for the first slice so it prints headers when needed.

    class OrderedHeaderOutputter(OrderedOutputter):
        """OrderedOutputter that puts the headers first in a ds when needed."""

        def start(self, ds):
            # None marks the start of each ds in the buffer.
            self.buffer.append(None)
            # It is followed by the next headers if ds is in headers,
            # and otherwise by an empty entry so .drain needs no special
            # case for a marker that is last in the buffer.
            first = next(headers_iter) if ds in headers else b''
            self.buffer.append(first)
            self.pump()

    # Choose the right outputter for the kind of sync we need.
    def outputter(q_in, q_out, first_slice=False):
        """Return an outputter with the kind of sync the options call for.

        -l needs no sync at all, -O gets fully ordered output, and
        otherwise output is only synced if there are headers. With
        first_slice set, the header printing variant is used.
        """
        if args.list_matching:
            return Outputter(q_in, q_out)
        if args.ordered:
            if first_slice:
                return OrderedHeaderOutputter(q_in, q_out)
            return OrderedOutputter(q_in, q_out)
        if headers:
            if first_slice:
                return HeaderOutputter(q_in, q_out)
            return HeaderWaitOutputter(q_in, q_out)
        return Outputter(q_in, q_out)

    # Make printer for the selected output options
    def make_show(prefix, used_columns):
        """Return a show(lineno, items) function for the selected options.

        show returns one encoded output line (bytes ending in a newline)
        for the values in items.

        prefix holds the values to put before the data: a dict for the
        json format (where the data then ends up under a 'data' key) and
        a sequence of strings for the other formats.
        used_columns are the column names matching items, only used for
        the json format.
        """
        def matching_ranges(item):
            # Yields (start, stop) for each part of item matched by any
            # pattern, in order, with overlapping/adjacent matches merged.
            ranges = []
            for p in patterns:
                ranges.extend(m.span() for m in p.finditer(item))
            if not ranges:
                return
            # merge overlapping/adjacent ranges
            ranges.sort()
            ranges = iter(ranges)
            start, stop = next(ranges)
            for a, b in ranges:
                if a <= stop:
                    stop = max(stop, b)
                else:
                    yield start, stop
                    start, stop = a, b
            yield start, stop

        def filter_item(item):
            # Only the matched parts of item, '' if nothing matched.
            return ''.join(item[a:b] for a, b in matching_ranges(item))

        if args.format == 'json':
            dumps = json.JSONEncoder(ensure_ascii=False,
                                     default=json_default).encode

            def show(lineno, items):
                if only_matching == 'part':
                    items = [filter_item(unicode(item)) for item in items]
                if only_matching == 'columns':
                    # Only keep columns where something matched, but keep
                    # their values as they are.
                    d = {
                        k: v
                        for k, v in zip(used_columns, items)
                        if filter_item(unicode(v))
                    }
                else:
                    d = dict(zip(used_columns, items))
                # The same prefix dict is reused for every line, lineno
                # and data are overwritten each time.
                if args.show_lineno:
                    prefix['lineno'] = lineno
                if prefix:
                    prefix['data'] = d
                    d = prefix
                return dumps(d).encode('utf-8', 'surrogatepass') + b'\n'
        else:

            def colour_item(item):
                # item with all matched parts highlighted.
                pos = 0
                parts = []
                for a, b in matching_ranges(item):
                    parts.extend(
                        (item[pos:a], colour(item[a:b], 'grep/highlight')))
                    pos = b
                parts.append(item[pos:])
                return ''.join(parts)

            def show(lineno, items):
                data = list(prefix)
                if args.show_lineno:
                    data.append(unicode(lineno))
                show_items = map(unicode, items)
                if only_matching:
                    if only_matching == 'columns':
                        show_items = (item if filter_item(item) else ''
                                      for item in show_items)
                    else:
                        show_items = map(filter_item, show_items)
                show_items = list(show_items)
                # The list a generator expression iterates over is built
                # right here, so lens measures the items as they are now
                # (before colouring and escaping) even though it is only
                # consumed later.
                lens = (len(item) for item in data + show_items)
                if highlight_matches:
                    show_items = list(map(colour_item, show_items))
                if escape_item:
                    # Adjust the lengths by what escaping added, without
                    # counting what colouring added.
                    lens_unesc = (len(item) for item in data + show_items)
                    show_items = list(map(escape_item, show_items))
                    lens_esc = (len(item) for item in data + show_items)
                    lens = (
                        l + esc - unesc
                        for l, unesc, esc in zip(lens, lens_unesc, lens_esc))
                data.extend(show_items)
                return separate(data, lens).encode('utf-8', errors) + b'\n'

        return show

    # This is called for each slice in each dataset.
    # Each slice has a separate process (the same for all datasets).
    # The first slice runs in the main process (unless -l), everything
    # else runs from one_slice.

    def grep(ds, sliceno, out):
        """Search slice sliceno of ds and hand the lines to show to out.

        Brackets the work with out.start(ds) and out.end(ds). Every line
        that should be printed (matches plus any before/after context) is
        formatted with the function from make_show and passed to out.put.

        When q_list is set, (ds, sliceno) is put on it at the first match
        and the function returns immediately, without calling out.end.
        """
        out.start(ds)

        if len(patterns) == 1:
            matches = patterns[0].search
        else:

            def matches(text):
                return any(pattern.search(text) for pattern in patterns)

        # One element list so column_iterator can clear the flag
        # (presumably because nonlocal is not available on PY2).
        status_pending = [True]

        def column_iterator(col):
            options = {}
            if status_pending[0]:
                # Only the first iterator created here reports progress.
                status_pending[0] = False
                line_count = ds.lines[sliceno]
                if line_count > status_interval[sliceno]:

                    def report(n):
                        q_status.put((sliceno, False))
                        out.excite()

                    options['callback'] = report
                    options['callback_interval'] = status_interval[sliceno]
            column = ds.columns[col]
            if column.type == 'ascii':
                options['_type'] = 'unicode'
            values = ds._column_iterator(sliceno, col, **options)
            if column.type != 'bytes':
                return values
            # bytes columns are decoded so the patterns can search them.
            errors = 'replace' if PY2 else 'surrogateescape'
            if column.none_support:
                return (None if v is None else v.decode('utf-8', errors)
                        for v in values)
            return (v.decode('utf-8', errors) for v in values)

        used_columns = columns_for_ds(ds)
        used_grep_columns = grep_columns and columns_for_ds(ds, grep_columns)
        # The grep iterators (if any) are created before the line iterators,
        # so the status callback ends up on the same iterator as before.
        if grep_columns and set(used_grep_columns) != set(used_columns):
            grep_iter = izip(
                *[column_iterator(col) for col in used_grep_columns])
        else:
            # No separate grep columns: search the shown items instead.
            grep_iter = repeat(None)
        lines_iter = izip(*[column_iterator(col) for col in used_columns])

        # Bounded buffer of not (yet) shown lines, for before-context.
        before = (
            deque((), args.before_context) if args.before_context else None)

        if args.format == 'json':
            prefix = {}
            if args.show_dataset:
                prefix['dataset'] = ds
            if args.show_sliceno:
                prefix['sliceno'] = sliceno
        else:
            prefix = []
            if args.show_dataset:
                prefix.append(ds)
            if args.show_sliceno:
                prefix.append(str(sliceno))
            prefix = tuple(prefix)
        show = make_show(prefix, used_columns)

        maybe_invert = operator.not_ if args.invert_match else bool

        # Number of lines still to print (the match and its after-context).
        remaining = 0
        numbered = enumerate(izip(grep_iter, lines_iter))
        for lineno, (grep_items, items) in numbered:
            found = any(
                matches(unicode(item)) for item in grep_items or items)
            if maybe_invert(found):
                if q_list:
                    q_list.put((ds, sliceno))
                    return
                # Flush buffered before-context ahead of the match itself.
                while before:
                    out.put(show(*before.popleft()))
                remaining = 1 + args.after_context
            if remaining:
                out.put(show(lineno, items))
                remaining -= 1
            elif before is not None:
                before.append((lineno, items))
        out.end(ds)

    # This runs in a separate process for each slice except the first
    # one (unless -l), which is handled specially in the main process.

    def one_slice(sliceno, q_in, q_out, q_to_close):
        """Process target: grep all datasets for one slice.

        Queues that are set are first prepared for this process
        (q_to_close is closed, q_in becomes a reader, q_out and q_list
        become writers). Then each dataset is searched, skipping those
        already in seen_list, and q_status is told after every dataset.
        Exits with status 1 if QueueEmpty is raised.
        """
        queue_setup = (
            (q_to_close, 'close'),
            (q_in, 'make_reader'),
            (q_out, 'make_writer'),
            (q_list, 'make_writer'),
        )
        for queue, method_name in queue_setup:
            if queue:
                getattr(queue, method_name)()
        try:
            out = outputter(q_in, q_out)
            for ds in datasets:
                already_listed = seen_list is not None and ds in seen_list
                if not already_listed:
                    grep(ds, sliceno, out)
                # Reported even when the dataset was skipped.
                q_status.put((sliceno, True))
            out.finish()
        except QueueEmpty:
            # some other process died, no need to print an error here
            sys.exit(1)

    headers_prefix = []
    if args.show_dataset:
        headers_prefix.append('[DATASET]')
    if args.show_sliceno:
        headers_prefix.append('[SLICE]')
    if args.show_lineno:
        headers_prefix.append('[LINE]')

    # {ds: headers} for each ds where headers change (not including the first).
    # this is every ds where sync between slices has to happen when not --ordered.
    headers = OrderedDict()
    if args.headers:
        current_headers = None
        for ds in datasets:
            candidate_headers = columns_for_ds(ds)
            if candidate_headers != current_headers:
                headers[ds] = current_headers = candidate_headers

        def gen_headers(headers):
            """Return the header line for headers as encoded bytes.

            The line is headers_prefix followed by headers, escaped with
            escape_item (when set), coloured as 'grep/header' and joined
            by separate(). Lengths passed to separate() are those of the
            (escaped) uncoloured labels. A newline is appended.
            """
            labels = headers_prefix + headers
            if escape_item:
                labels = [escape_item(label) for label in labels]
            widths = map(len, labels)
            coloured = (colour(label, 'grep/header') for label in labels)
            line = separate(coloured, widths)
            return line.encode('utf-8', 'surrogatepass') + b'\n'

        # remove the starting ds, so no header changes means no special handling.
        current_headers = headers.pop(datasets[0])
        if not args.list_matching:
            write(1, gen_headers(current_headers))
        headers_iter = iter(map(gen_headers, headers.values()))

    q_in = q_out = first_q_out = q_to_close = q_list = None
    children = [status_process]
    seen_list = None
    if args.list_matching:
        # in this case all slices get their own process
        # and the main process just prints the matching slices
        q_list = mp.LockFreeQueue()
        separate_process_slices = want_slices
        if not args.show_sliceno:
            seen_list = mp.MpSet()
    else:
        separate_process_slices = want_slices[1:]
        if args.ordered or headers:
            # needs to sync in some way
            q_in = first_q_out = mp.LockFreeQueue()
    for sliceno in separate_process_slices:
        if q_in:
            q_out = mp.LockFreeQueue()
        p = mp.SimplifiedProcess(
            target=one_slice,
            args=(
                sliceno,
                q_in,
                q_out,
                q_to_close,
            ),
            name='slice-%d' % (sliceno, ),
        )
        children.append(p)
        if q_in and q_in is not first_q_out:
            q_in.close()
        q_to_close = first_q_out
        q_in = q_out
    if q_in:
        q_out = first_q_out
        q_in.make_reader()
        q_out.make_writer()
        if args.ordered:
            q_in.put_local(None)
    del q_to_close
    del first_q_out

    try:
        if args.list_matching:
            if args.headers:
                headers_prefix = ['[DATASET]']
                if seen_list is None:
                    headers_prefix.append('[SLICE]')
                write(1, gen_headers([]))
            ordered_res = defaultdict(set)
            q_list.make_reader()
            if seen_list is None:
                used_columns = ['dataset', 'sliceno']
            else:
                used_columns = ['dataset']
            inner_show = make_show({} if args.format == 'json' else [],
                                   used_columns)

            def show(ds, sliceno=None):
                """Format ds (and sliceno, unless None) and write(1, ...) it."""
                items = [ds] if sliceno is None else [ds, sliceno]
                write(1, inner_show(None, items))

            while True:
                try:
                    ds, sliceno = q_list.get()
                except QueueEmpty:
                    break
                if seen_list is None:
                    if args.ordered:
                        ordered_res[ds].add(sliceno)
                    else:
                        show(ds, sliceno)
                elif ds not in seen_list:
                    seen_list.add(ds)
                    if not args.ordered:
                        show(ds)
            if args.ordered:
                for ds in datasets:
                    if seen_list is None:
                        for sliceno in sorted(ordered_res[ds]):
                            show(ds, sliceno)
                    else:
                        if ds in seen_list:
                            show(ds)
        else:
            out = outputter(q_in, q_out, first_slice=True)
            sliceno = want_slices[0]
            for ds in datasets:
                grep(ds, sliceno, out)
                q_status.put((sliceno, True))
            out.finish()
    except QueueEmpty:
        # don't print an error, probably a subprocess died from EPIPE before
        # the main process. (or the subprocess already printed an error.)
        return 1

    q_status.close()
    for c in children:
        c.join()
        if c.exitcode:
            return 1
Esempio n. 7
0
def main(argv, cfg):
    usage = "%(prog)s [options] pattern ds [ds [...]] [column [column [...]]"
    parser = ArgumentParser(usage=usage, prog=argv.pop(0))
    parser.add_argument(
        '-c',
        '--chain',
        action='store_true',
        help="follow dataset chains",
    )
    parser.add_argument(
        '-C',
        '--color',
        action='store_true',
        help="color matched text",
    )
    parser.add_argument(
        '-i',
        '--ignore-case',
        action='store_true',
        help="case insensitive pattern",
    )
    parser.add_argument(
        '-H',
        '--headers',
        action='store_true',
        help="print column names before output (and on each change)",
    )
    parser.add_argument(
        '-o',
        '--ordered',
        action='store_true',
        help="output in order (one slice at a time)",
    )
    parser.add_argument(
        '-g',
        '--grep',
        action='append',
        help="grep this column only, can be specified multiple times",
        metavar='COLUMN')
    parser.add_argument(
        '-s',
        '--slice',
        action='append',
        help="grep this slice only, can be specified multiple times",
        type=int)
    parser.add_argument('-t',
                        '--separator',
                        help="field separator (default tab)",
                        default='\t')
    parser.add_argument(
        '-D',
        '--show-dataset',
        action='store_true',
        help="show dataset on matching lines",
    )
    parser.add_argument(
        '-S',
        '--show-sliceno',
        action='store_true',
        help="show sliceno on matching lines",
    )
    parser.add_argument(
        '-L',
        '--show-lineno',
        action='store_true',
        help="show lineno (per slice) on matching lines",
    )
    parser.add_argument('pattern')
    parser.add_argument(
        'dataset', help='can be specified in the same ways as for "ax ds"')
    parser.add_argument('columns', nargs='*', default=[])
    args = parser.parse_intermixed_args(argv)

    pat_s = re.compile(args.pattern, re.IGNORECASE if args.ignore_case else 0)
    pat_b = re.compile(args.pattern.encode('utf-8'),
                       re.IGNORECASE if args.ignore_case else 0)
    datasets = [name2ds(cfg, args.dataset)]
    columns = []

    separator_s = args.separator
    separator_b = separator_s.encode('utf-8')

    for ds_or_col in args.columns:
        if columns:
            columns.append(ds_or_col)
        else:
            try:
                datasets.append(name2ds(cfg, ds_or_col))
            except Exception:
                columns.append(ds_or_col)

    if not datasets:
        parser.print_help(file=sys.stderr)
        return 1

    grep_columns = set(args.grep or ())
    if grep_columns == set(columns):
        grep_columns = None

    # Decide which slices to search: the ones given with -s (in the order
    # given, duplicates dropped), or all slices when -s was not used.
    if args.slice:
        want_slices = []
        for s in args.slice:
            # Validate explicitly instead of with assert: asserts are
            # removed under "python -O", and bad user input should give a
            # normal argument error (usage + message, exit status 2)
            # rather than a traceback.
            if not 0 <= s < g.slices:
                parser.error("Slice %d not available" % (s, ))
            if s not in want_slices:
                want_slices.append(s)
    else:
        want_slices = list(range(g.slices))

    if args.chain:
        datasets = list(chain.from_iterable(ds.chain() for ds in datasets))

    if columns:
        bad = False
        for ds in datasets:
            missing = set(columns) - set(ds.columns)
            if missing:
                print('ERROR: %s does not have columns %r' % (
                    ds,
                    missing,
                ),
                      file=sys.stderr)
                bad = True
        if bad:
            return 1

    def grep(ds, sliceno):
        """Write every line in slice sliceno of ds that matches the pattern.

        The pattern is checked against the grep columns if those differ
        from the shown columns, otherwise against the shown columns.
        Matching lines are written with write(1, ...), with the fields
        joined by the separator and preceded by the dataset, sliceno and
        lineno when those options are set.
        """
        # Use bytes for everything if anything is bytes, str otherwise. (For speed.)
        searched_columns = grep_columns or columns or ds.columns
        use_bytes = any(ds.columns[col].backing_type == 'bytes'
                        for col in searched_columns)
        if use_bytes:
            # These backing types are read directly as bytes.
            direct_types = ('bytes', 'unicode', 'ascii')
            direct_as = 'bytes'
            chk = pat_b.search

            def convert(v):
                return str(v).encode('utf-8', 'replace')
        else:
            # These backing types are read directly as unicode.
            direct_types = ('unicode', 'ascii')
            direct_as = 'unicode'
            chk = pat_s.search
            convert = str

        def mk_iter(col):
            if ds.columns[col].backing_type in direct_types:
                return ds._column_iterator(sliceno, col, _type=direct_as)
            # Everything else is converted value by value.
            return imap(convert, ds._column_iterator(sliceno, col))

        def fmt(v):
            # Output is bytes: stringify other types, encode text.
            if not isinstance(v, (unicode, bytes)):
                v = str(v)
            return v.encode('utf-8', 'replace') if isinstance(v, unicode) else v

        def color(item):
            # Wrap each match in the (already encoded) item in red.
            pieces = []
            end = 0
            for m in pat_b.finditer(item):
                start, stop = m.span()
                pieces += [
                    item[end:start], b'\x1b[31m', item[start:stop], b'\x1b[m'
                ]
                end = stop
            pieces.append(item[end:])
            return b''.join(pieces)

        prefix = ()
        if args.show_dataset:
            prefix += (ds.encode('utf-8'), )
        if args.show_sliceno:
            prefix += (str(sliceno).encode('utf-8'), )

        def show(line_prefix, items):
            fields = map(fmt, items)
            if args.color:
                fields = map(color, fields)
            # This will be atomic if the line is not too long
            # (at least up to PIPE_BUF bytes, should be at least 512).
            write(1, separator_b.join(line_prefix + tuple(fields)) + b'\n')

        if grep_columns and grep_columns != set(columns or ds.columns):
            # Search one set of columns, show another.
            grep_iter = izip(*[mk_iter(col) for col in grep_columns])
            lines_iter = ds.iterate(sliceno, columns)
        else:
            # Search what is shown (grep_items will be None).
            grep_iter = repeat(None)
            shown_columns = columns or sorted(ds.columns)
            lines_iter = izip(*[mk_iter(col) for col in shown_columns])

        for lineno, (grep_items, items) in enumerate(
                izip(grep_iter, lines_iter)):
            if not any(imap(chk, grep_items or items)):
                continue
            if args.show_lineno:
                show(prefix + (str(lineno).encode('utf-8'), ), items)
            else:
                show(prefix, items)

    def one_slice(sliceno, q, wait_for):
        """Process target: grep every dataset for one slice.

        When q is set, waits for an item on it before starting, and for
        each dataset in wait_for marks the previous item done and waits
        for the next one before searching that dataset.
        KeyboardInterrupt and IOError with errno EPIPE end the function
        quietly, any other IOError is re-raised.
        """
        try:
            if q:
                q.get()
            for ds in datasets:
                if ds in wait_for:
                    q.task_done()
                    q.get()
                grep(ds, sliceno)
        except KeyboardInterrupt:
            pass
        except IOError as e:
            if e.errno != errno.EPIPE:
                raise
        finally:
            # Make sure we are joinable
            if q is not None:
                try:
                    q.task_done()
                except Exception:
                    # Best effort, failures here are deliberately ignored.
                    pass

    headers_prefix = []
    if args.show_dataset:
        headers_prefix.append('[DATASET]')
    if args.show_sliceno:
        headers_prefix.append('[SLICE]')
    if args.show_lineno:
        headers_prefix.append('[LINE]')

    headers = {}
    if args.headers:
        if columns:
            current_headers = columns
        else:
            current_headers = None
            for ds in datasets:
                candidate_headers = sorted(ds.columns)
                if candidate_headers != current_headers:
                    headers[ds] = current_headers = candidate_headers
            current_headers = headers.pop(datasets[0])

        def show_headers(headers):
            """Print one header line in blue and flush it right away.

            headers is the list of column names; headers_prefix is put in
            front of it and everything is joined with the separator.
            """
            print('\x1b[34m' + separator_s.join(headers_prefix + headers) +
                  '\x1b[m')
            # Matching lines are emitted with write(1, ...) (presumably
            # os.write, which does not go through the sys.stdout buffer).
            # Without a flush the header can come out after those lines
            # when stdout is not a terminal, and an unflushed header could
            # also be inherited (and printed again) by slice processes
            # started after this call.
            sys.stdout.flush()

        show_headers(current_headers)

    queues = []
    children = []
    if not args.ordered:
        q = None
        wait_for = set(headers)
        for sliceno in want_slices[1:]:
            if wait_for:
                q = JoinableQueue()
                q.put(None)
                queues.append(q)
            p = Process(
                target=one_slice,
                args=(sliceno, q, wait_for),
                name='slice-%d' % (sliceno, ),
            )
            p.daemon = True
            p.start()
            children.append(p)
        want_slices = want_slices[:1]

    try:
        for ds in datasets:
            if ds in headers:
                for q in queues:
                    q.join()
                show_headers(headers.pop(ds))
                for q in queues:
                    q.put(None)
            for sliceno in want_slices:
                grep(ds, sliceno)
        for c in children:
            c.join()
    except KeyboardInterrupt:
        print()