Beispiel #1
0
def synthesis(prepare_res, params):
    """Concatenate the per-part writers into one dataset when not chaining.

    When ``options.as_chain`` is set this function does nothing.  Otherwise
    it creates a single meta-only ``DatasetWriter`` and, for every slice,
    byte-concatenates each column file of the part writers into the merged
    writer's column file, then discards the part writers.

    Args:
        prepare_res: 5-tuple ``(dws, names, prev_source, caption, filename)``
            produced by the prepare step; ``prev_source`` is not used here.
        params: job parameters; only ``params.slices`` is read.
    """
    if options.as_chain:
        return

    # Without a chain we rely on knowledge of dataset internals to avoid
    # recompressing.  Don't do this stuff yourself.
    part_writers, column_names, _prev_source, caption, filename = prepare_res
    merged_dw = DatasetWriter(
        caption=caption,
        hashlabel=options.hashlabel,
        filename=filename,
        previous=datasets.previous,
        meta_only=True,
        columns=datasets.source.columns,
    )
    for sliceno in range(params.slices):
        # The merged slice holds the lines of every part, in part order.
        total_lines = 0
        for part in part_writers:
            total_lines += part._lens[sliceno]
        merged_dw.set_lines(sliceno, total_lines)

        for part_no, part in enumerate(part_writers):
            merged_dw.set_minmax((sliceno, part_no), part._minmax[sliceno])

        for column in column_names:
            merged_path = merged_dw.column_filename(column, sliceno=sliceno)
            with open(merged_path, "wb") as out_fh:
                for part in part_writers:
                    part_path = part.column_filename(column, sliceno=sliceno)
                    with open(part_path, "rb") as in_fh:
                        copyfileobj(in_fh, out_fh)

    for part in part_writers:
        part.discard()
def analysis_lap(sliceno, badmap_fh, first_lap):
    """Convert every column listed in ``options.column2type`` for one slice.

    Each column is converted either by a C backend function, by one of the
    backend string filters, or by a Python conversion function, and written
    to the file name the new ``DatasetWriter`` assigns to it.

    Args:
        sliceno: index of the slice to process.
        badmap_fh: open file object backing the bad-line bitmap.  It is
            resized with ``truncate()`` and used (by fd / mmap) only when
            ``options.filter_bad`` is set.
        first_lap: when true, bad lines are recorded in the bitmap (if
            ``options.filter_bad``); otherwise lines already marked bad
            are skipped.

    Returns:
        A 4-tuple ``(res_bad_count, res_default_count, res_minmax,
        link_candidates)``.  The first three are dicts keyed by column
        name; ``res_minmax`` has no entry for plain string columns.
        ``link_candidates`` is a list of ``(in_fn, out_fn)`` pairs for
        string columns that were not written on this lap.

    Raises:
        AssertionError: if a backend conversion reports failure, a source
            column is not bytes/ascii, or line counts disagree.
        Exception: if a value fails Python conversion and there is neither
            a default nor bad-line recording.
    """
    known_line_count = 0
    badmap_size = 0
    badmap_fd = -1
    res_bad_count = {}
    res_default_count = {}
    res_minmax = {}
    link_candidates = []
    if first_lap:
        record_bad = options.filter_bad
        skip_bad = 0
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (sliceno, )
    dw = DatasetWriter()
    # Sentinel distinguishing "no default configured" from a default of None.
    nodefault = object()
    for colname, coltype in options.column2type.iteritems():
        out_fn = dw.column_filename(options.rename.get(
            colname, colname)).encode('ascii')
        if ':' in coltype and not coltype.startswith('number:'):
            # Parameterised type, e.g. "type:format".
            coltype, fmt = coltype.split(':', 1)
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype + ':*']
            if '%f' in fmt:
                # needs to fall back to python version
                cfunc = None
            if not cfunc:
                pyfunc = pyfunc(coltype, fmt)
        else:
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
            fmt = ffi.NULL
        d = datasets.source
        assert d.columns[colname].type in (
            'bytes',
            'ascii',
        ), colname
        if options.filter_bad:
            line_count = d.lines[sliceno]
            if known_line_count:
                assert line_count == known_line_count, (colname, line_count,
                                                        known_line_count)
            else:
                known_line_count = line_count
                # One bit per line, rounded up to a whole number of pages.
                pagesize = getpagesize()
                badmap_size = (line_count // 8 // pagesize + 1) * pagesize
                badmap_fh.truncate(badmap_size)
                badmap_fd = badmap_fh.fileno()
        if d.columns[colname].backing_type.startswith('_v2_'):
            backing_format = 2
        else:
            backing_format = 3
        in_fn = d.column_filename(colname, sliceno).encode('ascii')
        if d.columns[colname].offsets:
            offset = d.columns[colname].offsets[sliceno]
            max_count = d.lines[sliceno]
        else:
            offset = 0
            max_count = -1
        if coltype == 'number':
            cfunc = True
        if coltype == 'number:int':
            coltype = 'number'
            cfunc = True
            fmt = "int"
        if cfunc:
            # C backend conversion.
            default_value = options.defaults.get(colname, ffi.NULL)
            if default_value is None:
                default_value = ffi.NULL
                default_value_is_None = True
            else:
                default_value_is_None = False
            bad_count = ffi.new('uint64_t [1]', [0])
            default_count = ffi.new('uint64_t [1]', [0])
            c = getattr(backend, 'convert_column_' + coltype)
            res = c(in_fn, out_fn, minmax_fn, default_value,
                    default_value_is_None, fmt, record_bad, skip_bad,
                    badmap_fd, badmap_size, bad_count, default_count, offset,
                    max_count, backing_format)
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = bad_count[0]
            res_default_count[colname] = default_count[0]
            # Min/max are read back from the file named by minmax_fn.
            with type2iter[dataset_typing.typerename.get(
                    coltype, coltype)](minmax_fn) as it:
                res_minmax[colname] = list(it)
            unlink(minmax_fn)
        elif pyfunc is str:
            # We skip it the first time around, and link it from
            # the source dataset if there were no bad lines.
            # (That happens at the end of analysis.)
            # We can't do that if the file is not slice-specific though.
            # And we also can't do it if the column is in the wrong (old) format.
            if skip_bad or '%s' not in d.column_filename(
                    colname, '%s') or backing_format != 3:
                res = backend.filter_strings(in_fn, out_fn, badmap_fd,
                                             badmap_size, offset, max_count,
                                             backing_format)
                assert not res, 'Failed to convert ' + colname
            else:
                link_candidates.append((
                    in_fn,
                    out_fn,
                ))
            res_bad_count[colname] = 0
            res_default_count[colname] = 0
        elif pyfunc is str.strip:
            res = backend.filter_stringstrip(in_fn, out_fn, badmap_fd,
                                             badmap_size, offset, max_count,
                                             backing_format)
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = 0
            res_default_count[colname] = 0
        else:
            # python func
            if colname in options.defaults:
                if options.defaults[colname] is None:
                    default_value = None
                else:
                    default_value = pyfunc(options.defaults[colname])
            else:
                default_value = nodefault
            badmap = None
            if options.filter_bad:
                badmap = mmap(badmap_fd, badmap_size)
            bad_count = 0
            default_count = 0
            # try/finally so the mapping is released even when conversion
            # raises part-way through the column.
            try:
                with typed_writer(dataset_typing.typerename.get(
                        coltype, coltype))(out_fn) as fh:
                    col_min = col_max = None
                    for ix, v in enumerate(d.iterate(sliceno, colname)):
                        if skip_bad:
                            # Bit ix of the bitmap marks line ix as bad.
                            if ord(badmap[ix // 8]) & (1 << (ix % 8)):
                                bad_count += 1
                                continue
                        try:
                            v = pyfunc(v)
                        except ValueError:
                            if default_value is not nodefault:
                                v = default_value
                                default_count += 1
                            elif record_bad:
                                bad_count += 1
                                bv = ord(badmap[ix // 8])
                                badmap[ix // 8] = chr(bv | (1 << (ix % 8)))
                                continue
                            else:
                                raise Exception(
                                    "Invalid value %r with no default in %s" % (
                                        v,
                                        colname,
                                    ))
                        # None and string values do not take part in min/max.
                        if not isinstance(v, (
                                NoneType,
                                str,
                                unicode,
                        )):
                            if col_min is None:
                                col_min = col_max = v
                            if v < col_min: col_min = v
                            if v > col_max: col_max = v
                        fh.write(v)
            finally:
                if badmap is not None:
                    badmap.close()
            res_bad_count[colname] = bad_count
            res_default_count[colname] = default_count
            res_minmax[colname] = [col_min, col_max]
    return res_bad_count, res_default_count, res_minmax, link_candidates
Beispiel #3
0
def analysis_lap(sliceno, badmap_fh, first_lap):
    """Convert every column listed in ``options.column2type`` for one slice.

    Each column is converted either by a C backend function
    (``backend.convert_column_*``) or by a Python conversion function, and
    written to the file name the new ``DatasetWriter`` assigns to it.

    Args:
        sliceno: index of the slice to process.
        badmap_fh: open file object backing the bad-line bitmap.  It is
            resized with ``truncate()`` and used (by fd / mmap) only when
            ``options.filter_bad`` is set.
        first_lap: when true, bad lines are recorded in the bitmap (if
            ``options.filter_bad``); otherwise lines already marked bad
            are skipped.

    Returns:
        A 4-tuple ``(res_bad_count, res_default_count, res_minmax,
        link_candidates)``.  The first three are dicts keyed by column
        name.  ``link_candidates`` is never appended to in this function
        and is therefore always an empty list.

    Raises:
        AssertionError: if a type has neither a C nor a Python converter,
            a backend conversion reports failure, a source column is not
            bytes-like, or line counts disagree.
        Exception: if a value fails Python conversion and there is neither
            a default nor bad-line recording.
    """
    known_line_count = 0
    badmap_size = 0
    badmap_fd = -1
    res_bad_count = {}
    res_default_count = {}
    res_minmax = {}
    link_candidates = []
    if first_lap:
        record_bad = options.filter_bad
        skip_bad = 0
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (sliceno, )
    dw = DatasetWriter()
    # Sentinel distinguishing "no default configured" from a default of None.
    nodefault = object()
    # Types for which no min/max is tracked in the Python conversion path.
    dont_minmax_types = {'bytes', 'ascii', 'unicode', 'json'}
    for colname, coltype in iteritems(options.column2type):
        out_fn = dw.column_filename(options.rename.get(colname, colname))
        fmt = fmt_b = None
        if coltype in dataset_typing.convfuncs:
            shorttype = coltype
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
        else:
            # Parameterised type, e.g. "type:format".
            shorttype, fmt = coltype.split(':', 1)
            _, cfunc, pyfunc = dataset_typing.convfuncs[shorttype + ':*']
        if cfunc:
            # From here on cfunc is the suffix of the backend function name.
            cfunc = shorttype.replace(':', '_')
        if pyfunc:
            # pyfunc is a factory: it returns either a callable converter
            # or a (cfunc, fmt, fmt_b) triple selecting a C converter.
            tmp = pyfunc(coltype)
            if callable(tmp):
                pyfunc = tmp
                cfunc = None
            else:
                pyfunc = None
                cfunc, fmt, fmt_b = tmp
        if coltype == 'number':
            cfunc = 'number'
        elif coltype == 'number:int':
            # coltype itself is replaced by shorttype just below, so only
            # cfunc and fmt need to be set here.
            cfunc = 'number'
            fmt = "int"
        assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
        coltype = shorttype
        d = datasets.source
        assert d.columns[colname].type in byteslike_types, colname
        if options.filter_bad:
            line_count = d.lines[sliceno]
            if known_line_count:
                assert line_count == known_line_count, (colname, line_count,
                                                        known_line_count)
            else:
                known_line_count = line_count
                # One bit per line, rounded up to a whole number of pages.
                pagesize = getpagesize()
                badmap_size = (line_count // 8 // pagesize + 1) * pagesize
                badmap_fh.truncate(badmap_size)
                badmap_fd = badmap_fh.fileno()
        if d.columns[colname].backing_type.startswith('_v2_'):
            backing_format = 2
        else:
            backing_format = 3
        in_fn = d.column_filename(colname, sliceno)
        if d.columns[colname].offsets:
            offset = d.columns[colname].offsets[sliceno]
            max_count = d.lines[sliceno]
        else:
            offset = 0
            max_count = -1
        if cfunc:
            # C backend conversion.
            default_value = options.defaults.get(colname, ffi.NULL)
            default_len = 0
            if default_value is None:
                default_value = ffi.NULL
                default_value_is_None = True
            else:
                default_value_is_None = False
                if default_value != ffi.NULL:
                    if isinstance(default_value, unicode):
                        default_value = default_value.encode("utf-8")
                    default_len = len(default_value)
            bad_count = ffi.new('uint64_t [1]', [0])
            default_count = ffi.new('uint64_t [1]', [0])
            c = getattr(backend, 'convert_column_' + cfunc)
            res = c(*bytesargs(in_fn, out_fn, minmax_fn, default_value,
                               default_len, default_value_is_None, fmt, fmt_b,
                               record_bad, skip_bad, badmap_fd, badmap_size,
                               bad_count, default_count, offset, max_count,
                               backing_format))
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = bad_count[0]
            res_default_count[colname] = default_count[0]
            # Min/max are read back from the file named by minmax_fn,
            # using the base type (anything after ':' dropped).
            coltype = coltype.split(':', 1)[0]
            with type2iter[dataset_typing.typerename.get(
                    coltype, coltype)](minmax_fn) as it:
                res_minmax[colname] = list(it)
            unlink(minmax_fn)
        else:
            # python func
            if colname in options.defaults:
                default_value = options.defaults[colname]
                if default_value is not None:
                    if isinstance(default_value, unicode):
                        default_value = default_value.encode('utf-8')
                    default_value = pyfunc(default_value)
            else:
                default_value = nodefault
            badmap = None
            if options.filter_bad:
                badmap = mmap(badmap_fd, badmap_size)
                if PY2:
                    # Wrapped so that indexing yields/accepts integers on
                    # Python 2 as well (the loop below relies on that).
                    badmap = IntegerBytesWrapper(badmap)
            bad_count = 0
            default_count = 0
            real_coltype = dataset_typing.typerename.get(coltype, coltype)
            do_minmax = real_coltype not in dont_minmax_types
            # try/finally so the mapping is released even when conversion
            # raises part-way through the column.
            try:
                with typed_writer(real_coltype)(out_fn) as fh:
                    col_min = col_max = None
                    for ix, v in enumerate(
                            d._column_iterator(sliceno,
                                               colname,
                                               _type='bytes' if backing_format == 3
                                               else '_v2_bytes')):
                        if skip_bad:
                            # Bit ix of the bitmap marks line ix as bad.
                            if badmap[ix // 8] & (1 << (ix % 8)):
                                bad_count += 1
                                continue
                        try:
                            v = pyfunc(v)
                        except ValueError:
                            if default_value is not nodefault:
                                v = default_value
                                default_count += 1
                            elif record_bad:
                                bad_count += 1
                                bv = badmap[ix // 8]
                                badmap[ix // 8] = bv | (1 << (ix % 8))
                                continue
                            else:
                                raise Exception(
                                    "Invalid value %r with no default in %s" % (
                                        v,
                                        colname,
                                    ))
                        if do_minmax and not isinstance(v, NoneType):
                            if col_min is None:
                                col_min = col_max = v
                            if v < col_min: col_min = v
                            if v > col_max: col_max = v
                        fh.write(v)
            finally:
                if badmap is not None:
                    badmap.close()
            res_bad_count[colname] = bad_count
            res_default_count[colname] = default_count
            res_minmax[colname] = [col_min, col_max]
    return res_bad_count, res_default_count, res_minmax, link_candidates