Example #1
def analysis_lap(vars):
    if vars.rehashing:
        if vars.first_lap:
            out_fn = 'hashtmp.%d' % (vars.sliceno, )
            colname = vars.rev_rename.get(vars.dw.hashlabel, vars.dw.hashlabel)
            coltype = vars.column2type[options.rename.get(colname, colname)]
            vars.rehashing = False
            real_coltype = one_column(vars, colname, coltype, [out_fn], True)
            vars.rehashing = True
            assert vars.res_bad_count[colname] == [0]  # implicitly has a default
            vars.slicemap_fd = map_init(vars, 'slicemap%d' % (vars.sliceno, ),
                                        'slicemap_size')
            slicemap = mmap(vars.slicemap_fd, vars.slicemap_size)
            slicemap = Int16BytesWrapper(slicemap)
            hash = typed_writer(real_coltype).hash
            slices = vars.slices
            vars.hash_lines = hash_lines = [0] * slices
            for ix, value in enumerate(typed_reader(real_coltype)(out_fn)):
                dest_slice = hash(value) % slices
                slicemap[ix] = dest_slice
                hash_lines[dest_slice] += 1
            unlink(out_fn)
    for colname, coltype in vars.column2type.items():
        if vars.rehashing:
            out_fns = [
                vars.dw.column_filename(colname, sliceno=s)
                for s in range(vars.slices)
            ]
        else:
            out_fns = [vars.dw.column_filename(colname)]
        one_column(vars, vars.rev_rename.get(colname, colname), coltype,
                   out_fns)
    return vars.res_bad_count, vars.res_default_count, vars.res_minmax
def analysis(sliceno, params):
    assert list(datasets.source.iterate(sliceno, "a")) == [sliceno, 42]
    assert list(datasets.source.iterate(sliceno, "b")) == ["a", str(sliceno)]
    named = Dataset(datasets.source, "named")
    assert list(named.iterate(sliceno, "c")) == [True, False]
    assert list(named.iterate(sliceno, "d")) == [
        date(1536, 12, min(sliceno + 1, 31)),
        date(2236, 5, min(sliceno + 1, 31))
    ]
    if sliceno < test_data.value_cnt:
        passed = Dataset(datasets.source, "passed")
        good = tuple(v[sliceno] for _, v in sorted(test_data.data.items()))
        assert list(passed.iterate(sliceno)) == [good]
    synthesis_split = Dataset(datasets.source, "synthesis_split")
    values = zip((1, 2, 3), "abc")
    hash = typed_writer("int32").hash
    good = [v for v in values if hash(v[0]) % params.slices == sliceno]
    assert list(synthesis_split.iterate(sliceno)) == good
    synthesis_manual = Dataset(datasets.source, "synthesis_manual")
    assert list(synthesis_manual.iterate(sliceno, "sliceno")) == [sliceno]
    nonetest = Dataset(datasets.source, "nonetest")
    good = tuple(v[0] if k in test_data.not_none_capable else None
                 for k, v in sorted(test_data.data.items()))
    assert list(nonetest.iterate(sliceno)) == [good]
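
A minimal, self-contained sketch of the slicemap bookkeeping that analysis_lap performs above: a plain list stands in for the mmap-backed Int16BytesWrapper, and Python's built-in hash for typed_writer(type).hash (both are assumptions for illustration only).

def build_slicemap(values, slices, hash_func):
    slicemap = [0] * len(values)   # row index -> destination slice
    hash_lines = [0] * slices      # how many rows each slice will receive
    for ix, value in enumerate(values):
        dest = hash_func(value) % slices
        slicemap[ix] = dest
        hash_lines[dest] += 1
    return slicemap, hash_lines

slicemap, hash_lines = build_slicemap([10, 11, 12, 13], slices=3, hash_func=hash)
assert sum(hash_lines) == 4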
Example #3
def verify(slices, data, source, previous=None, **options):
    jid = subjobs.build(
        "dataset_hashpart",
        datasets=dict(source=source, previous=previous),
        options=options,
    )
    hl = options["hashlabel"]
    h = typed_writer(columns[hl][0]).hash
    ds = Dataset(jid)
    good = {row[hl]: row for row in data}
    names = list(source.columns)
    for slice in range(slices):
        for row in ds.iterate_chain(slice, names):
            row = dict(zip(names, row))
            assert h(
                row[hl]
            ) % slices == slice, "row %r is incorrectly in slice %d in %s" % (
                row, slice, ds)
            want = good[row[hl]]
            assert row == want, '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (
                ds, source, hl, want, row)
    return ds
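
verify checks the core hashpart invariant: after partitioning on hashlabel hl, every row found in slice s must satisfy hash(row[hl]) % slices == s. A self-contained sketch of that invariant, with Python's built-in hash standing in for typed_writer(coltype).hash (an assumption for illustration):

def simulate_hashpart(rows, hashlabel, slices, hash_func):
    parts = [[] for _ in range(slices)]
    for row in rows:
        parts[hash_func(row[hashlabel]) % slices].append(row)
    return parts

rows = [{'id': n, 'payload': chr(65 + n)} for n in range(10)]
for sliceno, part in enumerate(simulate_hashpart(rows, 'id', 3, hash)):
    for row in part:
        assert hash(row['id']) % 3 == sliceno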
Example #4
def one_column(vars, colname, coltype, out_fns, for_hasher=False):
    if for_hasher:
        record_bad = skip_bad = False
    elif vars.first_lap:
        record_bad = options.filter_bad
        skip_bad = False
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (vars.sliceno, )

    fmt = fmt_b = None
    is_null_converter = False
    if coltype in dataset_type.convfuncs:
        shorttype = coltype
        _, cfunc, pyfunc = dataset_type.convfuncs[coltype]
    elif coltype.startswith('null_'):
        shorttype = coltype
        pyfunc = False
        cfunc = True
        is_null_converter = True
    else:
        shorttype, fmt = coltype.split(':', 1)
        _, cfunc, pyfunc = dataset_type.convfuncs[shorttype + ':*']
    if cfunc:
        cfunc = shorttype.replace(':', '_')
    if pyfunc:
        tmp = pyfunc(coltype)
        if callable(tmp):
            pyfunc = tmp
            cfunc = None
        else:
            pyfunc = None
            cfunc, fmt, fmt_b = tmp
    if coltype == 'number':
        cfunc = 'number'
    elif coltype == 'number:int':
        coltype = 'number'
        cfunc = 'number'
        fmt = "int"
    assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
    coltype = shorttype
    in_fns = []
    offsets = []
    max_counts = []
    for d in vars.chain:
        assert colname in d.columns, '%s not in %s' % (
            colname,
            d,
        )
        if not is_null_converter:
            assert d.columns[
                colname].type in byteslike_types, '%s has bad type in %s' % (
                    colname,
                    d,
                )
        in_fns.append(d.column_filename(colname, vars.sliceno))
        if d.columns[colname].offsets:
            offsets.append(d.columns[colname].offsets[vars.sliceno])
            max_counts.append(d.lines[vars.sliceno])
        else:
            offsets.append(0)
            max_counts.append(-1)
    if cfunc:
        default_value = options.defaults.get(colname, cstuff.NULL)
        if for_hasher and default_value is cstuff.NULL:
            if coltype.startswith('bits'):
                # No None-support.
                default_value = '0'
            else:
                default_value = None
        default_len = 0
        if default_value is None:
            default_value = cstuff.NULL
            default_value_is_None = True
        else:
            default_value_is_None = False
            if default_value != cstuff.NULL:
                if isinstance(default_value, unicode):
                    default_value = default_value.encode("utf-8")
                default_len = len(default_value)
        c = getattr(cstuff.backend, 'convert_column_' + cfunc)
        if vars.rehashing:
            c_slices = vars.slices
        else:
            c_slices = 1
        bad_count = cstuff.mk_uint64(c_slices)
        default_count = cstuff.mk_uint64(c_slices)
        gzip_mode = "wb%d" % (options.compression, )
        safe_to_skip_write = vars.rehashing and not options.as_chain
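        # One call to the C converter handles the whole chained column: it
        # writes one output file per slice when rehashing, counts bad and
        # defaulted lines per slice, and records min/max in minmax_fn.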
        res = c(*cstuff.bytesargs(
            in_fns, len(in_fns), out_fns, gzip_mode, minmax_fn, default_value,
            default_len, default_value_is_None, fmt, fmt_b, record_bad,
            skip_bad, vars.badmap_fd, vars.badmap_size, c_slices,
            vars.slicemap_fd, vars.slicemap_size, bad_count, default_count,
            offsets, max_counts, safe_to_skip_write))
        assert not res, 'Failed to convert ' + colname
        vars.res_bad_count[colname] = list(bad_count)
        vars.res_default_count[colname] = sum(default_count)
        coltype = coltype.split(':', 1)[0]
        if is_null_converter:
            real_coltype = vars.chain[0].columns[colname].backing_type
            mins = []
            maxs = []
            # Some lines may have been filtered out, so these minmax values
            # could be wrong. There's no easy/cheap way to fix that though,
            # and they will never be wrong in the bad direction.
            for d in vars.chain:
                col = d.columns[colname]
                if col.min is not None:
                    mins.append(col.min)
                    maxs.append(col.max)
            if mins:
                vars.res_minmax[colname] = [min(mins), max(maxs)]
        else:
            real_coltype = dataset_type.typerename.get(coltype, coltype)
            with type2iter[real_coltype](minmax_fn) as it:
                vars.res_minmax[colname] = list(it)
            unlink(minmax_fn)
    else:
        # python func
        if for_hasher:
            raise Exception("Can't hash on column of type %s." % (coltype, ))
        nodefault = object()
        if colname in options.defaults:
            default_value = options.defaults[colname]
            if default_value is not None:
                if isinstance(default_value, unicode):
                    default_value = default_value.encode('utf-8')
                default_value = pyfunc(default_value)
        else:
            default_value = nodefault
        if options.filter_bad:
            badmap = mmap(vars.badmap_fd, vars.badmap_size)
            if PY2:
                badmap = IntegerBytesWrapper(badmap)
        if vars.rehashing:
            slicemap = mmap(vars.slicemap_fd, vars.slicemap_size)
            slicemap = Int16BytesWrapper(slicemap)
            bad_count = [0] * vars.slices
        else:
            bad_count = [0]
            chosen_slice = 0
        default_count = 0
        dont_minmax_types = {
            'bytes', 'ascii', 'unicode', 'json', 'complex32', 'complex64'
        }
        real_coltype = dataset_type.typerename.get(coltype, coltype)
        do_minmax = real_coltype not in dont_minmax_types
        fhs = [typed_writer(real_coltype)(fn) for fn in out_fns]
        write = fhs[0].write
        col_min = col_max = None
        it = itertools.chain.from_iterable(
            d._column_iterator(vars.sliceno, colname, _type='bytes')
            for d in vars.chain)
        for ix, v in enumerate(it):
            if vars.rehashing:
                chosen_slice = slicemap[ix]
                write = fhs[chosen_slice].write
            if skip_bad:
                if badmap[ix // 8] & (1 << (ix % 8)):
                    bad_count[chosen_slice] += 1
                    continue
            try:
                v = pyfunc(v)
            except ValueError:
                if default_value is not nodefault:
                    v = default_value
                    default_count += 1
                elif record_bad:
                    bad_count[chosen_slice] += 1
                    bv = badmap[ix // 8]
                    badmap[ix // 8] = bv | (1 << (ix % 8))
                    continue
                else:
                    raise Exception("Invalid value %r with no default in %s" %
                                    (
                                        v,
                                        colname,
                                    ))
            if do_minmax and not isinstance(v, NoneType):
                if col_min is None:
                    col_min = col_max = v
                if v < col_min: col_min = v
                if v > col_max: col_max = v
            write(v)
        for fh in fhs:
            fh.close()
        if vars.rehashing:
            slicemap.close()
        if options.filter_bad:
            badmap.close()
        vars.res_bad_count[colname] = bad_count
        vars.res_default_count[colname] = default_count
        vars.res_minmax[colname] = [col_min, col_max]
    return real_coltype
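
The python-func branch of one_column tracks bad rows in a shared bitmap: row ix lives in byte ix // 8 at bit ix % 8. A minimal sketch of that addressing, with a plain bytearray standing in for the mmap'd badmap (an assumption for illustration):

def mark_bad(badmap, ix):
    badmap[ix // 8] |= 1 << (ix % 8)

def is_bad(badmap, ix):
    return bool(badmap[ix // 8] & (1 << (ix % 8)))

badmap = bytearray(2)   # room for 16 rows
mark_bad(badmap, 9)
assert is_bad(badmap, 9) and not is_bad(badmap, 8)
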
def synthesis(job, slices):
    # Test keeping untyped columns.
    dw = job.datasetwriter(name='a',
                           columns={
                               'a': 'unicode',
                               'b': ('bytes', True),
                               'c': ('ascii', True),
                               'd': ('number', True)
                           })
    write = dw.get_split_write()
    write('A', None, None, None)
    write('a', b'b', 'c', 0)
    a = dw.finish()
    assert a.hashlabel == None
    typed_a = subjobs.build('dataset_type',
                            options=dict(hashlabel='a',
                                         column2type={'a': 'ascii'}),
                            datasets=dict(source=a)).dataset()
    assert typed_a.hashlabel == 'a'
    assert set(typed_a.iterate(None)) == {('A', None, None, None),
                                          ('a', b'b', 'c', 0)}, typed_a

    # Test hashing on a column not explicitly typed.
    dw = job.datasetwriter(name='b',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           previous=a)
    write = dw.get_split_write()
    write('A', 'B', b'C', '1')
    b = dw.finish()
    assert b.hashlabel == None
    typed_b = subjobs.build('dataset_type',
                            options=dict(hashlabel='a',
                                         column2type={'b': 'ascii'}),
                            datasets=dict(source=b)).dataset()
    assert typed_b.hashlabel == 'a'
    assert set(typed_b.iterate(None)) == {('a', 'b'), ('A', None),
                                          ('A', 'B')}, typed_b

    # Test renaming over the original hashlabel
    dw = job.datasetwriter(name='c',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           hashlabel='a')
    write = dw.get_split_write()
    write('\xe5', 'b', b'c', '0')
    c = dw.finish()
    assert c.hashlabel == 'a'
    typed_c = subjobs.build('dataset_type',
                            options=dict(column2type={
                                'a': 'ascii',
                                'd': 'number'
                            },
                                         rename={'c': 'a'}),
                            datasets=dict(source=c)).dataset()
    assert typed_c.hashlabel == None
    assert list(typed_c.iterate(None)) == [('c', 'b', b'c', 0)], typed_c

    # Test using the original names but for different columns (keeping hashlabel under new name)
    dw = job.datasetwriter(name='d',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           hashlabel='a')
    write = dw.get_split_write()
    write('\xc5', 'B', b'C', '1')
    d = dw.finish()
    assert d.hashlabel == 'a'
    typed_d = subjobs.build('dataset_type',
                            options=dict(column2type={
                                'a': 'bytes',
                                'b': 'ascii',
                                'c': 'int32_10',
                                'd': 'bytes'
                            },
                                         rename={
                                             'b': 'a',
                                             'c': 'b',
                                             'd': 'c',
                                             'a': 'd'
                                         }),
                            datasets=dict(source=d)).dataset()
    assert typed_d.hashlabel == 'd'
    assert list(typed_d.iterate(None)) == [(b'B', 'C', 1, b'\xc3\x85')], typed_d

    # Test various types for hashing and discarding of bad lines.
    for hl in (None, 'a', 'b', 'c'):
        dw = job.datasetwriter(name='hashed on %s' % (hl, ),
                               columns={
                                   'a': 'unicode',
                                   'b': 'unicode',
                                   'c': 'unicode'
                               },
                               hashlabel=hl)
        w = dw.get_split_write()
        for ix in range(1000):
            w(unicode(ix), '%d.%d' % (ix, ix % 5 == 0),
              ('{"a": %s}' if ix % 3 else '%d is bad') % (ix, ))
        src_ds = dw.finish()
        assert src_ds.hashlabel == hl
        test(
            src_ds,
            dict(column2type={
                'a': 'int32_10',
                'b': 'number:int'
            },
                 filter_bad=True), 800)
        test(
            src_ds,
            dict(column2type={
                'a': 'int64_10',
                'b': 'number',
                'c': 'json'
            },
                 filter_bad=True), 666)
        test(
            src_ds,
            dict(column2type={
                'a': 'floatint32ei',
                'b': 'number:int',
                'c': 'json'
            },
                 filter_bad=True), 533)
        test(
            src_ds,
            dict(column2type={
                'from_a': 'number',
                'from_b': 'float64',
                'from_c': 'ascii'
            },
                 rename=dict(a='from_a', b='from_b', c='from_c')), 1000)
        test(
            src_ds,
            dict(column2type={
                'c': 'bits32_16',
                'a': 'float32',
                'b': 'bytes'
            },
                 rename=dict(a='c', b='a', c='b')), 1000)

    # this doesn't test as many permutations, it's just to test more column types.
    dw = job.datasetwriter(name='more types')
    cols = {
        'floatbooli':
        cycle(['1.42 or so', '0 maybe', '1 (exactly)']),
        'datetime:%Y%m%d %H:%M': [
            '2019%02d%02d 17:%02d' % (t % 12 + 1, t % 28 + 1, t % 60)
            for t in range(1000)
        ],
        'date:%Y%m%d':
        ['2019%02d%02d' % (
            t % 12 + 1,
            t % 28 + 1,
        ) for t in range(1000)],
        'time:%H:%M': ['%02d:%02d' % (t // 60, t % 60) for t in range(1000)],
        'timei:%H:%M': [
            '%02d:%02d%c' % (t // 60, t % 60, chr(t % 26 + 65))
            for t in range(1000)
        ],
    }
    gens = []
    for coltype, gen in cols.items():
        dw.add(coltype.split(':')[0], 'ascii')
        gens.append(iter(gen))
    dw.add('half', 'bytes')
    gens.append(cycle([b'1', b'no']))
    w = dw.get_split_write()
    for _ in range(1000):
        w(*map(next, gens))
    src_ds = dw.finish()
    assert src_ds.hashlabel == None
    column2type = {t.split(':')[0]: t for t in cols}
    for hl in column2type:
        hashed = subjobs.build('dataset_type',
                               options=dict(column2type=column2type,
                                            hashlabel=hl),
                               datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type',
                                 options=dict(column2type=column2type),
                                 datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_hashpart',
                                 options=dict(hashlabel=hl),
                                 datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 1000
        assert set(hashed.columns.keys()) == set(
            unhashed.columns.keys()) == set(rehashed.columns.keys())
        # and again with a bad column
        column2type['half'] = 'float32'
        hashed = subjobs.build('dataset_type',
                               options=dict(column2type=column2type,
                                            hashlabel=hl,
                                            filter_bad=True),
                               datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type',
                                 options=dict(column2type=column2type,
                                              filter_bad=True),
                                 datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_hashpart',
                                 options=dict(hashlabel=hl),
                                 datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        del column2type['half']
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 500
        assert set(hashed.columns.keys()) == set(
            unhashed.columns.keys()) == set(rehashed.columns.keys())

    # test rehashing on a column we don't type, over all types.
    dw = job.datasetwriter(name='rehash all types',
                           columns={
                               '2type': ('ascii', True),
                               'ascii': ('ascii', True),
                               'bits32': ('bits32', False),
                               'bits64': ('bits64', False),
                               'bool': ('bool', True),
                               'bytes': ('bytes', True),
                               'date': ('date', True),
                               'datetime': ('datetime', True),
                               'float32': ('float32', True),
                               'float64': ('float64', True),
                               'int32': ('int32', True),
                               'int64': ('int64', True),
                               'json': ('json', True),
                               'number': ('number', True),
                               'time': ('time', True),
                               'unicode': ('unicode', True),
                           })
    write = dw.get_split_write()
    data = {
        '42':
        ('ascii string', 100, 1000, True, b'bytes string', date(2019, 12, 11),
         datetime(2019, 12, 11, 20, 7, 21), 1.5, 0.00000001, 99, -11, {
             "a": "b"
         }, 1e100, time(20, 7, 21), 'unicode string'),
        None: (None, 0, 0, None, None, None, None, None, None, None, None,
               None, None, None, None),
        '18': ('ASCII STRING', 111, 1111, False, b'BYTES STRING',
               date(1868, 1, 3), datetime(1868, 1, 3, 13, 14,
                                          5), 2.5, -0.0000001, 67, -99,
               [42, ".."], 5e100, time(13, 14, 5), 'UNICODE STRING'),
    }
    write('42', *data['42'])
    write(None, *data[None])
    write('18', *data['18'])
    src_ds = dw.finish()
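    # The row written with 2type=None is looked up under the string 'None' in
    # the verification loop below, so move the expected data to that key.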
    data['None'] = data.pop(None)
    type2type = {
        'ascii': 'unicode:ascii',
        'bool': 'unicode:ascii',
        'date': 'unicode:ascii',
        'datetime': 'unicode:ascii',
        'time': 'unicode:ascii',
        'bits32': 'bits32_10',
        'bits64': 'bits64_10',
        'bytes': 'bytes',
        'float32': 'float32',
        'float64': 'float64',
        'int32': 'int32_10',
        'int64': 'int64_10',
        'number': 'number',
        'unicode': 'unicode:ascii',
    }
    for hl, typeas in sorted(type2type.items()):
        ds = subjobs.build('dataset_type',
                           column2type={
                               '2type': typeas
                           },
                           hashlabel=hl,
                           source=src_ds).dataset()
        seen = set()
        hl_hash = typed_writer(hl).hash
        for sliceno in range(slices):
            for line in ds.iterate(sliceno, None):
                key = line[0] or None
                if isinstance(key, float):
                    key = int(key)
                if isinstance(key, bytes):
                    key = key.decode('ascii')
                else:
                    key = unicode(key)
                assert data.get(key) == line[
                    1:], "%s (hl %s) didn't have the right data for line %r" % (
                        ds,
                        hl,
                        line[0],
                    )
                hv = line[sorted(src_ds.columns).index(hl)]
                assert hl_hash(
                    hv
                ) % slices == sliceno, "%s (hl %s) didn't hash %r correctly" % (
                    ds,
                    hl,
                    hv,
                )
                assert key not in seen, "%s (hl %s) repeated line %s" % (
                    ds,
                    hl,
                    line[0],
                )
                seen.add(key)
        assert seen == {'42', 'None',
                        '18'}, "%s didn't have all lines (%r)" % (
                            ds,
                            seen,
                        )
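
The rename tests above rely on options.rename mapping a source column name to its name in the typed dataset (column2type keys refer to the post-rename names). A sketch of that mapping, with plain dicts standing in for the datasets (an illustration, not the framework's implementation):

def apply_rename(row, rename):
    # rename maps source name -> new name; unrenamed columns keep their name.
    return {rename.get(name, name): value for name, value in row.items()}

row = {'a': '\xc5', 'b': 'B', 'c': b'C', 'd': '1'}
renamed = apply_rename(row, {'b': 'a', 'c': 'b', 'd': 'c', 'a': 'd'})
assert renamed == {'a': 'B', 'b': b'C', 'c': '1', 'd': '\xc5'}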
Example #6
def synthesis(prepare_res, params, job, slices):
    dws = prepare_res
    for dw in (
            dws.unhashed_split,
            dws.up_split,
    ):
        w = dw.get_split_write_list()
        for row in all_data:
            w(row)
    for dw in dws.values():
        dw.finish()

    # Verify that the different ways of writing gave the same result
    for names in (
        ("unhashed_split", "unhashed_manual"),
        ("up_checked", "up_split"),
        ("down_checked", "down_discarded", "down_discarded_list",
         "down_discarded_dict"),
    ):
        dws = {name: job.dataset(name) for name in names}
        assert dws == {name: Dataset((params.jobid, name))
                       for name in names
                       }, "Old style Dataset((params.jobid, name)) broken"
        for sliceno in range(slices):
            data = {name: list(dws[name].iterate(sliceno)) for name in names}
            good = data[names[0]]
            for name in names[1:]:
                assert data[
                    name] == good, "%s doesn't match %s in slice %d" % (
                        names[0],
                        name,
                        sliceno,
                    )

    # Verify that both up and down hashed on the expected column
    hash = typed_writer("int32").hash
    for colname in ("up", "down"):
        ds = job.dataset(colname + "_checked")
        for sliceno in range(slices):
            for value in ds.iterate(sliceno, colname):
                assert hash(
                    value
                ) % slices == sliceno, "Bad hashing on %s in slice %d" % (
                    colname,
                    sliceno,
                )

    # Verify that up and down are not the same, to catch hashing
    # not actually hashing.
    up = list(job.dataset("up_checked").iterate(None))
    down = list(job.dataset("down_checked").iterate(None))
    assert up != down, "Hashlabel did not change slice distribution"
    # And check that the data is still the same.
    assert sorted(up) == sorted(
        down) == all_data, "Hashed datasets have wrong data"

    # Verify that rehashing works.
    # (Can't use sliceno None, because that won't rehash, and even if it did
    # the order wouldn't match. Order doesn't even match in the rehashed
    # individual slices.)
    up = job.dataset("up_checked")
    down = job.dataset("down_checked")
    unhashed = job.dataset("unhashed_manual")
    for sliceno in range(slices):
        a = list(up.iterate(sliceno))
        b = list(down.iterate(sliceno, hashlabel="up", rehash=True))
        c = list(unhashed.iterate(sliceno, hashlabel="up", rehash=True))
        assert sorted(a) == sorted(b) == sorted(
            c), "Rehashing is broken (slice %d)" % (sliceno, )

    # And finally verify that we are not allowed to specify the wrong hashlabel
    good = True
    try:
        up.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    try:
        unhashed.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    assert good, "Iteration allowed on the wrong hashlabel"
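
A simplified model of the rehash-on-iterate behaviour verified above: reading slice s with rehash=True yields only the rows whose hashlabel value hashes to s, regardless of which slice they were originally written to. Python's built-in hash stands in for typed_writer("int32").hash here (an assumption for illustration):

def rehash_iterate(rows, hashlabel_index, sliceno, slices, hash_func):
    for row in rows:
        if hash_func(row[hashlabel_index]) % slices == sliceno:
            yield row

rows = [(n, -n) for n in range(10)]
parts = [list(rehash_iterate(rows, 0, s, 3, hash)) for s in range(3)]
assert sorted(r for part in parts for r in part) == rows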