Example #1
def column_iterator(d, label, first):
    col = d.columns[label]
    f = format.get(col.type, str)
    it = d.iterate(sliceno, label, status_reporting=first)
    none_as = resolve_none(label, col)
    if none_as is not None:
        # None values become the (pre-quoted) none_as string.
        none_as = quote_func(none_as)
        if needs_quoting(col.type):
            if f:
                it = (none_as if v is None else quote_func(f(v)) for v in it)
            else:
                it = (none_as if v is None else quote_func(v) for v in it)
        else:
            if f:
                it = (none_as if v is None else f(v) for v in it)
            else:
                it = (none_as if v is None else v for v in it)
    elif f:
        if needs_quoting(col.type):
            it = (quote_func(f(v)) for v in it)
        else:
            it = imap(f, it)
    elif needs_quoting(col.type):
        it = imap(quote_func, it)
    return it
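
column_iterator relies on helpers the excerpt omits (format, needs_quoting, quote_func, resolve_none). A minimal sketch of plausible stand-ins, just to make the control flow readable; the names come from the code above, the bodies are assumptions:

# Hypothetical stand-ins for the helpers column_iterator uses above; the
# real definitions live elsewhere in the module.
format = {'float32': repr, 'float64': repr, 'number': repr}  # type -> formatter

def needs_quoting(coltype):
    # Assume text-like columns may contain the separator and need quoting.
    return coltype in ('unicode', 'ascii', 'bytes')

def quote_func(v):
    # Wrap a field in quotes, doubling embedded quote characters.
    q = '"'
    return q + v.replace(q, q + q) + q

def resolve_none(label, col):
    # The replacement string for None values, or None if the column
    # cannot contain None.
    return 'None' if getattr(col, 'none_support', False) else None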
def csvexport(sliceno, filename, labelsonfirstline):
    assert len(options.separator) == 1
    assert options.quote_fields in ('', "'", '"',)
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        # Expand each source dataset into its full chain, stopping where
        # the previous job's corresponding source left off.
        if jobids.previous:
            prev_source = job_params(jobids.previous).datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        mkwrite = mkwrite_gz
    elif filename.lower().endswith('.csv'):
        mkwrite = mkwrite_uncompressed
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    iters = []
    first = True
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        # Convert each column's values to text suitable for CSV output.
        t = d.columns[label].type
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
        elif t in ('float32', 'float64', 'number'):
            it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with mkwrite(filename) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            # Quote every field, doubling embedded quote characters.
            qq = q + q
            if labelsonfirstline:
                write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))
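
mkwrite_gz and mkwrite_uncompressed are not part of the excerpt. A minimal Python 3 sketch with the shape the with-statement above expects (a context manager yielding a per-line write callable), mirroring the open_func construction in Example #5 below; the newline handling, and enc() being a no-op on Python 3, are assumptions:

# Hypothetical sketches of the mkwrite helpers used above; each yields a
# function that writes one line. The real implementations live elsewhere.
import gzip
from contextlib import contextmanager

@contextmanager
def mkwrite_uncompressed(filename):
    # 'x' mode refuses to overwrite an existing file.
    with open(filename, 'xt', encoding='utf-8') as fh:
        yield lambda line: fh.write(line + '\n')

@contextmanager
def mkwrite_gz(filename):
    with gzip.open(filename, 'xt', encoding='utf-8') as fh:
        yield lambda line: fh.write(line + '\n')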
Example #3
def mk_iter(col):
    if ds.columns[col].backing_type in ('unicode', 'ascii'):
        return ds._column_iterator(sliceno, col, _type='unicode')
    else:
        return imap(str, ds._column_iterator(sliceno, col))
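
This mk_iter is the str variant of the helper nested inside grep in Example #7 below, shown out of context; ds and sliceno come from the enclosing function's scope.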
Example #4
def synthesis(slices, analysis_res, prepare_res):
    dw, dws, lines, _, column2type, columns, rev_rename = prepare_res
    analysis_res = list(analysis_res)
    if options.filter_bad:
        bad_line_count_per_slice = [sum(data[1]) for data in analysis_res]
        lines = [num - b for num, b in zip(lines, bad_line_count_per_slice)]
        bad_line_count_total = sum(bad_line_count_per_slice)
        if bad_line_count_total:
            print('Slice   Bad line count')
            for sliceno, cnt in enumerate(bad_line_count_per_slice):
                print('%5d   %d' % (
                    sliceno,
                    cnt,
                ))
            print('total   %d' % (bad_line_count_total, ))
            print()
            print('Slice   Bad line number')
            reported_count = 0
            for sliceno, data in enumerate(analysis_res):
                if sum(data[1]) and reported_count < 32:
                    # badmap<sliceno> is a bitmap with one bit per input line;
                    # a set bit marks a line that failed typing.
                    with open('badmap%d' % (sliceno, ), 'rb') as fh:
                        badmap = mmap(fh.fileno(), 0, prot=PROT_READ)
                        for ix, v in enumerate(imap(ord, badmap)):
                            if v:
                                for jx in range(8):
                                    if v & (1 << jx):
                                        print('%5d   %d' % (
                                            sliceno,
                                            ix * 8 + jx,
                                        ))
                                        reported_count += 1
                                        if reported_count >= 32:
                                            break
                                if reported_count >= 32:
                                    break
                        badmap.close()
            if reported_count >= 32:
                print('...')
            print()
            print('Bad line count   Column')
            for colname in columns:
                cnt = sum(sum(data[0].get(colname, ())) for data in analysis_res)
                if cnt:
                    print('%14d   %s' % (
                        cnt,
                        colname,
                    ))
            print()
        for sliceno in range(slices):
            unlink('badmap%d' % (sliceno, ))
    # data[2] maps column name to the number of lines in that slice where
    # the default value was used.
    if options.defaults and sum(sum(data[2].values()) for data in analysis_res):
        print('Defaulted values')
        for colname in sorted(options.defaults):
            defaulted = [data[2][colname] for data in analysis_res]
            if sum(defaulted):
                print('    %s:' % (colname, ))
                print('        Slice   Defaulted line count')
                slicecnt = 0
                for sliceno, cnt in enumerate(defaulted):
                    if cnt:
                        print('        %5d   %d' % (
                            sliceno,
                            cnt,
                        ))
                        slicecnt += 1
                if slicecnt > 1:
                    print('        total   %d' % (sum(defaulted), ))
    if dws:  # rehashing
        if dw:  # not as a chain
            final_bad_count = [data[1] for data in analysis_res]
            hash_lines = [data[4] for data in analysis_res]
            # Concatenate the per-analysis-slice rehashed files into one
            # output file per column and target slice.
            for colname in dw.columns:
                for sliceno in range(slices):
                    out_fn = dw.column_filename(colname, sliceno=sliceno)
                    with open(out_fn, 'wb') as out_fh:
                        for s in range(slices):
                            # Skip source slices that contributed no good lines.
                            if hash_lines[s][sliceno] - final_bad_count[s][sliceno]:
                                src_fn = dws[s].column_filename(colname, sliceno=sliceno)
                                with open(src_fn, 'rb') as in_fh:
                                    copyfileobj(in_fh, out_fh)
            for sliced_dw in dws:
                if sliced_dw:
                    sliced_dw.discard()
            for sliceno, counts in enumerate(
                    zip(*[data[4] for data in analysis_res])):
                bad_counts = (data[1][sliceno] for data in analysis_res)
                dw.set_lines(sliceno, sum(counts) - sum(bad_counts))
            for sliceno, data in enumerate(analysis_res):
                dw.set_minmax(sliceno, data[3])
        else:
            for sliceno, data in enumerate(analysis_res):
                if dws[sliceno]:
                    dws[sliceno].set_minmax(-1, data[3])
                    for s, count in enumerate(data[4]):
                        dws[sliceno].set_lines(s, count - data[1][s])
    else:
        for sliceno, count in enumerate(lines):
            dw.set_lines(sliceno, count)
        for sliceno, data in enumerate(analysis_res):
            dw.set_minmax(sliceno, data[3])
    used = {rev_rename.get(colname, colname) for colname in column2type}
    discarded = set(datasets.source.columns) - used
    if discarded:
        print('Discarded columns:')
        template = '    %%-%ds  %%s' % (max(len(colname) for colname in discarded),)
        for colname in discarded:
            print(template % (
                colname,
                datasets.source.columns[colname].type,
            ))
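
The badmap files above pack eight bad-line flags per byte. A self-contained sketch of the same bit decoding, with hypothetical flag data:

def bad_line_numbers(badmap_bytes):
    # Bit jx of byte ix being set marks line ix * 8 + jx as bad.
    for ix, v in enumerate(badmap_bytes):
        if v:
            for jx in range(8):
                if v & (1 << jx):
                    yield ix * 8 + jx

# Bits 0 and 2 of byte 0, bit 7 of byte 2 -> lines 0, 2 and 23.
assert list(bad_line_numbers(bytes([0b101, 0, 0x80]))) == [0, 2, 23]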
Example #5
def csvexport(sliceno, filename, labelsonfirstline):
    d = datasets.source[0]
    if not options.labels:
        options.labels = sorted(d.columns)
    if options.chain_source:
        if jobs.previous:
            prev_source = jobs.previous.params.datasets.source
            assert len(datasets.source) == len(prev_source)
        else:
            prev_source = [None] * len(datasets.source)
        lst = []
        for src, stop in zip(datasets.source, prev_source):
            lst.extend(src.chain(stop_ds=stop))
        datasets.source = lst
    if filename.lower().endswith('.gz'):
        open_func = partial(gzip.open, compresslevel=options.compression)
    elif filename.lower().endswith('.csv'):
        open_func = open
    else:
        raise Exception("Filename should end with .gz for compressed or .csv for uncompressed")
    if PY2:
        open_func = partial(open_func, mode='wb')
    else:
        # 'x' mode refuses to overwrite an existing output file.
        open_func = partial(open_func, mode='xt', encoding='utf-8')
    iters = []
    first = True
    dumps = JSONEncoder(
        sort_keys=True,
        ensure_ascii=True,
        check_circular=False,
    ).encode
    for label in options.labels:
        it = d.iterate_list(sliceno, label, datasets.source, status_reporting=first)
        first = False
        t = d.columns[label].type
        if d.columns[label].none_support:
            # Replace None before the text conversions below (the nonefix_*
            # helpers are defined elsewhere in the module).
            if t == 'bytes' or (PY2 and t == 'ascii'):
                it = imap(nonefix_b, it)
            elif t in ('ascii', 'unicode'):
                it = imap(nonefix_u, it)
        if t == 'unicode' and PY2:
            it = imap(enc, it)
        elif t == 'bytes' and PY3:
            it = imap(lambda s: s.decode('utf-8', errors='backslashreplace'), it)
        elif t in ('float32', 'float64'):
            it = imap(repr, it)
        elif t == 'number':
            if PY2:
                # repr() of a Python 2 long ends in 'L', so use str() for those.
                it = imap(lambda n: str(n) if isinstance(n, long) else repr(n), it)
            else:
                it = imap(repr, it)
        elif t == 'json':
            it = imap(dumps, it)
        elif t not in ('unicode', 'ascii', 'bytes'):
            it = imap(str, it)
        iters.append(it)
    it = izip(*iters)
    with writer(open_func(filename)) as write:
        q = options.quote_fields
        sep = options.separator
        if q:
            qq = q + q
            if labelsonfirstline:
                write(enc(sep.join(q + n.replace(q, qq) + q for n in options.labels)))
            for data in it:
                write(sep.join(q + n.replace(q, qq) + q for n in data))
        else:
            if labelsonfirstline:
                write(enc(sep.join(options.labels)))
            for data in it:
                write(sep.join(data))
def analysis(sliceno):
    chain = datasets.source.chain(stop_ds={jobs.previous: 'source'},
                                  length=options.length)
    return set(imap(unicode, chain.iterate(sliceno, options.column)))
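
The csvexport above calls nonefix_b and nonefix_u, which the excerpt omits. A plausible sketch, assuming they substitute a fixed marker for None (only the names come from the code above; the bodies and the 'None' marker are guesses):

# Hypothetical: replace None with a fixed marker before text conversion.
none_marker_u = 'None'  # assumed replacement text
none_marker_b = b'None'

def nonefix_u(v):
    return none_marker_u if v is None else v

def nonefix_b(v):
    return none_marker_b if v is None else v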
Example #7
    def grep(ds, sliceno):
        # Use bytes for everything if anything is bytes, str otherwise. (For speed.)
        if any(ds.columns[col].backing_type == 'bytes'
               for col in (grep_columns or columns or ds.columns)):

            def strbytes(v):
                return str(v).encode('utf-8', 'replace')

            def mk_iter(col):
                if ds.columns[col].backing_type in ('bytes', 'unicode', 'ascii'):
                    return ds._column_iterator(sliceno, col, _type='bytes')
                else:
                    return imap(strbytes, ds._column_iterator(sliceno, col))

            chk = pat_b.search
        else:

            def mk_iter(col):
                if ds.columns[col].backing_type in ('unicode', 'ascii'):
                    return ds._column_iterator(sliceno, col, _type='unicode')
                else:
                    return imap(str, ds._column_iterator(sliceno, col))

            chk = pat_s.search

        def fmt(v):
            # Normalize any value to utf-8 bytes for output.
            if not isinstance(v, (unicode, bytes)):
                v = str(v)
            if isinstance(v, unicode):
                v = v.encode('utf-8', 'replace')
            return v

        def color(item):
            # Wrap every regex match in ANSI red escapes.
            pos = 0
            parts = []
            for m in pat_b.finditer(item):
                a, b = m.span()
                parts.extend((item[pos:a], b'\x1b[31m', item[a:b], b'\x1b[m'))
                pos = b
            parts.append(item[pos:])
            return b''.join(parts)

        prefix = []
        if args.show_dataset:
            prefix.append(ds.encode('utf-8'))
        if args.show_sliceno:
            prefix.append(str(sliceno).encode('utf-8'))
        prefix = tuple(prefix)

        def show(prefix, items):
            items = map(fmt, items)
            if args.color:
                items = map(color, items)
            # This will be atomic if the line is not too long
            # (at least up to PIPE_BUF bytes, should be at least 512).
            write(1, separator_b.join(prefix + tuple(items)) + b'\n')

        if grep_columns and grep_columns != set(columns or ds.columns):
            # Match against grep_columns but print the (different) output columns.
            grep_iter = izip(*(mk_iter(col) for col in grep_columns))
            lines_iter = ds.iterate(sliceno, columns)
        else:
            grep_iter = repeat(None)
            lines_iter = izip(*(mk_iter(col) for col in (columns or sorted(ds.columns))))
        lines = izip(grep_iter, lines_iter)
        if args.show_lineno:
            for lineno, (grep_items, items) in enumerate(lines):
                if any(imap(chk, grep_items or items)):
                    show(prefix + (str(lineno).encode('utf-8'), ), items)
        else:
            for grep_items, items in lines:
                if any(imap(chk, grep_items or items)):
                    show(prefix, items)
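
grep above closes over several names the excerpt does not define (pat_s, pat_b, separator_b, write). A plausible minimal setup, assuming the pattern and separator arrive as command-line arguments; everything here except the names is inferred:

# Hypothetical setup for the grep() excerpt above; only the names are
# taken from the code, the construction is a guess.
import re
from os import write  # write(1, ...) emits raw bytes on stdout

pattern = 'example'  # assumed CLI argument
pat_s = re.compile(pattern)  # matcher for str values
pat_b = re.compile(pattern.encode('utf-8'))  # matcher for bytes values
separator_b = b'\t'  # assumed output field separator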