Example #1
def analysis_lap(vars):
    if vars.rehashing:
        if vars.first_lap:
            out_fn = 'hashtmp.%d' % (vars.sliceno, )
            colname = vars.rev_rename.get(vars.dw.hashlabel, vars.dw.hashlabel)
            coltype = vars.column2type[options.rename.get(colname, colname)]
            vars.rehashing = False
            real_coltype = one_column(vars, colname, coltype, [out_fn], True)
            vars.rehashing = True
            assert vars.res_bad_count[colname] == [0]  # implicitly has a default
            vars.slicemap_fd = map_init(vars, 'slicemap%d' % (vars.sliceno, ),
                                        'slicemap_size')
            slicemap = mmap(vars.slicemap_fd, vars.slicemap_size)
            slicemap = Int16BytesWrapper(slicemap)
            hash = typed_writer(real_coltype).hash
            slices = vars.slices
            vars.hash_lines = hash_lines = [0] * slices
            for ix, value in enumerate(typed_reader(real_coltype)(out_fn)):
                dest_slice = hash(value) % slices
                slicemap[ix] = dest_slice
                hash_lines[dest_slice] += 1
            unlink(out_fn)
    for colname, coltype in vars.column2type.items():
        if vars.rehashing:
            out_fns = [
                vars.dw.column_filename(colname, sliceno=s)
                for s in range(vars.slices)
            ]
        else:
            out_fns = [vars.dw.column_filename(colname)]
        one_column(vars, vars.rev_rename.get(colname, colname), coltype,
                   out_fns)
    return vars.res_bad_count, vars.res_default_count, vars.res_minmax
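
# The first-lap block above is the core of rehashing: hash the already-typed
# hashlabel column once, record each row's destination slice in a slicemap,
# and count how many lines each slice will receive. A stripped-down,
# pure-Python sketch of that pass (toy names, no mmap or accelerator I/O):
def build_slicemap(values, slices, hashfunc):
    # Map row index -> destination slice and count rows per slice,
    # mirroring the loop over typed_reader(real_coltype)(out_fn) above.
    slicemap = []
    hash_lines = [0] * slices
    for value in values:
        dest_slice = hashfunc(value) % slices
        slicemap.append(dest_slice)
        hash_lines[dest_slice] += 1
    # The slicemap is later consulted per row index when each column is
    # rewritten into one file per destination slice.
    return slicemap, hash_lines
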
def analysis(sliceno, params):
    assert list(datasets.source.iterate(sliceno, "a")) == [sliceno, 42]
    assert list(datasets.source.iterate(sliceno, "b")) == ["a", str(sliceno)]
    named = Dataset(datasets.source, "named")
    assert list(named.iterate(sliceno, "c")) == [True, False]
    assert list(named.iterate(sliceno, "d")) == [
        date(1536, 12, min(sliceno + 1, 31)),
        date(2236, 5, min(sliceno + 1, 31))
    ]
    if sliceno < test_data.value_cnt:
        passed = Dataset(datasets.source, "passed")
        good = tuple(v[sliceno] for _, v in sorted(test_data.data.items()))
        assert list(passed.iterate(sliceno)) == [good]
        if version_info > (3, 6, 0):
            want_fold = (sliceno == 1)
            assert next(passed.iterate(sliceno, "datetime")).fold == want_fold
            assert next(passed.iterate(sliceno, "time")).fold == want_fold
    synthesis_split = Dataset(datasets.source, "synthesis_split")
    values = zip((1, 2, 3), "abc")
    hash = typed_writer("int32").hash
    good = [v for v in values if hash(v[0]) % params.slices == sliceno]
    assert list(synthesis_split.iterate(sliceno)) == good
    synthesis_manual = Dataset(datasets.source, "synthesis_manual")
    assert list(synthesis_manual.iterate(sliceno, "sliceno")) == [sliceno]
    nonetest = Dataset(datasets.source, "nonetest")
    good = tuple(v[0] if k in test_data.not_none_capable else None
                 for k, v in sorted(test_data.data.items()))
    assert list(nonetest.iterate(sliceno)) == [good]
def hashfilter(typ, values, sliceno):
	from accelerator.g import slices
	if typ == 'json':
		return values[sliceno::slices]
	else:
		from accelerator.dsutil import typed_writer
		h = typed_writer(typ).hash
		return [v for v in values if h(v) % slices == sliceno]
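
# A minimal sketch of how a test's analysis step might use hashfilter to
# predict which input values land in a given slice. The dataset name 'typed'
# and the column 'a' are assumptions for illustration, not from the snippet
# above; json columns are split round-robin instead of hashed, which the
# helper's first branch reproduces.
def analysis_example(sliceno):
    want = hashfilter('int32', [1, 2, 3, 4, 5], sliceno)
    assert list(datasets.typed.iterate(sliceno, 'a')) == want
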
Example #4
def test_rehash_with_empty_slices():
    dw = DatasetWriter(name='rehash with empty slices', hashlabel='a')
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    w = dw.get_split_write()
    w('a', '42')
    w('42', 'b')
    source = dw.finish()
    hashfunc = typed_writer('int32').hash

    def verify_hashing(caption, want_values, **kw):
        ds = subjobs.build('dataset_type',
                           source=source,
                           column2type=dict(a='int32_10'),
                           caption=caption,
                           **kw).dataset()
        got_values = set()
        for sliceno in range(g.slices):
            for got in ds.iterate(sliceno):
                assert hashfunc(got[0]) % g.slices == sliceno
                assert got not in got_values
                got_values.add(got)
        assert want_values == got_values

    verify_hashing('with discard', {(42, 'b')}, filter_bad=True)
    # using defaults uses some different code paths
    verify_hashing('with default=0 (probably two slices)',
                   {(0, '42'), (42, 'b')},
                   defaults=dict(a='0'))
    verify_hashing('with default=42 (one slice)',
                   {(42, '42'), (42, 'b')},
                   defaults=dict(a='42'))
Example #5
def verify(slices, data, source, previous=None, **options):
    jid = subjobs.build(
        "dataset_hashpart",
        datasets=dict(source=source, previous=previous),
        options=options,
    )
    hl = options["hashlabel"]
    h = typed_writer(columns[hl][0]).hash
    ds = Dataset(jid)
    good = {row[hl]: row for row in data}
    names = list(source.columns)
    for slice in range(slices):
        for row in ds.iterate_chain(slice, names):
            row = dict(zip(names, row))
            assert h(row[hl]) % slices == slice, \
                "row %r is incorrectly in slice %d in %s" % (row, slice, ds)
            want = good[row[hl]]
            assert row == want, \
                '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (
                    ds, source, hl, want, row)
    return ds
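
# A sketch of how verify() might be driven: `columns` is the module-level
# dict it reads hashlabel types from (name -> (type, ...)), and the rows in
# `data` are made-up values for illustration only.
def example_usage(params, source):
    data = [
        {'a': 1, 'b': 'x'},
        {'a': 2, 'b': 'y'},
    ]
    # Hash-partition `source` on column 'a' and check every row landed in
    # the slice its hash selects.
    return verify(params.slices, data, source, hashlabel='a')
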
Example #6
def synthesis(job, slices):
    # Test keeping untyped columns.
    dw = job.datasetwriter(name='a',
                           columns={
                               'a': 'unicode',
                               'b': ('bytes', True),
                               'c': ('ascii', True),
                               'd': ('number', True)
                           })
    write = dw.get_split_write()
    write('A', None, None, None)
    write('a', b'b', 'c', 0)
    a = dw.finish()
    assert a.hashlabel == None
    typed_a = subjobs.build('dataset_type',
                            options=dict(hashlabel='a',
                                         column2type={'a': 'ascii'}),
                            datasets=dict(source=a)).dataset()
    assert typed_a.hashlabel == 'a'
    assert set(typed_a.iterate(None)) == {('A', None, None, None),
                                          ('a', b'b', 'c', 0)}, typed_a

    # Test hashing on a column not explicitly typed.
    dw = job.datasetwriter(name='b',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           previous=a)
    write = dw.get_split_write()
    write('A', 'B', b'C', '1')
    b = dw.finish()
    assert b.hashlabel == None
    typed_b = subjobs.build('dataset_type',
                            options=dict(hashlabel='a',
                                         column2type={'b': 'ascii'}),
                            datasets=dict(source=b)).dataset()
    assert typed_b.hashlabel == 'a'
    assert set(typed_b.iterate(None)) == {('a', 'b', b'c'), ('A', None, None),
                                          ('A', 'B', b'C')}, typed_b

    # Test renaming over the original hashlabel
    dw = job.datasetwriter(name='c',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           hashlabel='a')
    write = dw.get_split_write()
    write('\xe5', 'b', b'c', '0')
    c = dw.finish()
    assert c.hashlabel == 'a'
    typed_c = subjobs.build('dataset_type',
                            options=dict(column2type={
                                'a': 'ascii',
                                'd': 'number'
                            },
                                         rename={'c': 'a'}),
                            datasets=dict(source=c)).dataset()
    assert typed_c.hashlabel == None
    assert list(typed_c.iterate(None)) == [('c', 'b', b'c', 0)], typed_c

    # Test using the original names but for different columns (keeping hashlabel under new name)
    dw = job.datasetwriter(name='d',
                           columns={
                               'a': 'unicode',
                               'b': 'ascii',
                               'c': 'bytes',
                               'd': 'unicode'
                           },
                           hashlabel='a')
    write = dw.get_split_write()
    write('\xc5', 'B', B'C', '1')
    d = dw.finish()
    assert d.hashlabel == 'a'
    typed_d = subjobs.build('dataset_type',
                            options=dict(column2type={
                                'a': 'bytes',
                                'b': 'ascii',
                                'c': 'int32_10',
                                'd': 'bytes'
                            },
                                         rename={
                                             'b': 'a',
                                             'c': 'b',
                                             'd': 'c',
                                             'a': 'd'
                                         }),
                            datasets=dict(source=d)).dataset()
    assert typed_d.hashlabel == 'd'
    assert list(typed_d.iterate(None)) == [(b'B', 'C', 1, b'\xc3\x85')], typed_d

    # Test various types for hashing and discarding of bad lines.
    for hl in (None, 'a', 'b', 'c'):
        dw = job.datasetwriter(name='hashed on %s' % (hl, ),
                               columns={
                                   'a': 'unicode',
                                   'b': 'unicode',
                                   'c': 'unicode'
                               },
                               hashlabel=hl)
        w = dw.get_split_write()
        for ix in range(1000):
            w(unicode(ix), '%d.%d' % (ix, ix % 5 == 0),
              ('{"a": %s}' if ix % 3 else '%d is bad') % (ix, ))
        src_ds = dw.finish()
        assert src_ds.hashlabel == hl
        test(
            src_ds,
            dict(column2type={
                'a': 'int32_10',
                'b': 'number:int'
            },
                 filter_bad=True), 800)
        test(
            src_ds,
            dict(column2type={
                'a': 'int64_10',
                'b': 'number',
                'c': 'json'
            },
                 filter_bad=True), 666)
        test(
            src_ds,
            dict(column2type={
                'a': 'floatint32ei',
                'b': 'number:int',
                'c': 'json'
            },
                 filter_bad=True), 533)
        test(
            src_ds,
            dict(column2type={
                'from_a': 'number',
                'from_b': 'float64',
                'from_c': 'ascii'
            },
                 rename=dict(a='from_a', b='from_b', c='from_c')), 1000)
        test(
            src_ds,
            dict(column2type={
                'c': 'bits32_16',
                'a': 'float32',
                'b': 'bytes'
            },
                 rename=dict(a='c', b='a', c='b')), 1000)

    # this doesn't test as many permutations, it's just to test more column types.
    dw = job.datasetwriter(name='more types')
    cols = {
        'floatbooli':
        cycle(['1.42 or so', '0 maybe', '1 (exactly)']),
        'datetime:%Y%m%d %H:%M': [
            '2019%02d%02d 17:%02d' % (t % 12 + 1, t % 28 + 1, t % 60)
            for t in range(1000)
        ],
        'date:%Y%m%d':
        ['2019%02d%02d' % (
            t % 12 + 1,
            t % 28 + 1,
        ) for t in range(1000)],
        'time:%H:%M': ['%02d:%02d' % (t // 60, t % 60) for t in range(1000)],
        'timei:%H:%M': [
            '%02d:%02d%c' % (t // 60, t % 60, chr(t % 26 + 65))
            for t in range(1000)
        ],
    }
    gens = []
    for coltype, gen in cols.items():
        dw.add(coltype.split(':')[0], 'ascii')
        gens.append(iter(gen))
    dw.add('half', 'bytes')
    gens.append(cycle([b'1', b'no']))
    w = dw.get_split_write()
    for _ in range(1000):
        w(*map(next, gens))
    src_ds = dw.finish()
    assert src_ds.hashlabel == None
    column2type = {t.split(':')[0]: t for t in cols}
    for hl in column2type:
        hashed = subjobs.build('dataset_type',
                               options=dict(column2type=column2type,
                                            hashlabel=hl),
                               datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type',
                                 options=dict(column2type=column2type),
                                 datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_hashpart',
                                 options=dict(hashlabel=hl),
                                 datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 1000
        assert set(hashed.columns.keys()) == set(
            unhashed.columns.keys()) == set(rehashed.columns.keys())
        # and again with a bad column
        column2type['half'] = 'float32'
        hashed = subjobs.build('dataset_type',
                               options=dict(column2type=column2type,
                                            hashlabel=hl,
                                            filter_bad=True),
                               datasets=dict(source=src_ds)).dataset()
        assert hashed.hashlabel == hl
        unhashed = subjobs.build('dataset_type',
                                 options=dict(column2type=column2type,
                                              filter_bad=True),
                                 datasets=dict(source=src_ds)).dataset()
        assert unhashed.hashlabel == None
        rehashed = subjobs.build('dataset_hashpart',
                                 options=dict(hashlabel=hl),
                                 datasets=dict(source=unhashed)).dataset()
        assert rehashed.hashlabel == hl
        del column2type['half']
        assert hashed.lines == rehashed.lines
        assert sum(hashed.lines) == 500
        assert set(hashed.columns.keys()) == set(
            unhashed.columns.keys()) == set(rehashed.columns.keys())

    # test rehashing on a column we don't type, over all types.
    dw = job.datasetwriter(name='rehash all types',
                           columns={
                               '2type': ('ascii', True),
                               'ascii': ('ascii', True),
                               'bits32': ('bits32', False),
                               'bits64': ('bits64', False),
                               'bool': ('bool', True),
                               'bytes': ('bytes', True),
                               'date': ('date', True),
                               'datetime': ('datetime', True),
                               'float32': ('float32', True),
                               'float64': ('float64', True),
                               'int32': ('int32', True),
                               'int64': ('int64', True),
                               'json': ('json', True),
                               'number': ('number', True),
                               'time': ('time', True),
                               'unicode': ('unicode', True),
                           })
    write = dw.get_split_write()
    data = {
        '42': ('ascii string', 100, 1000, True, b'bytes string',
               date(2019, 12, 11), datetime(2019, 12, 11, 20, 7, 21),
               1.5, 0.00000001, 99, -11, {"a": "b"}, 1e100,
               time(20, 7, 21), 'unicode string'),
        None: (None, 0, 0, None, None, None, None, None, None, None, None,
               None, None, None, None),
        '18': ('ASCII STRING', 111, 1111, False, b'BYTES STRING',
               date(1868, 1, 3), datetime(1868, 1, 3, 13, 14, 5),
               2.5, -0.0000001, 67, -99, [42, ".."], 5e100,
               time(13, 14, 5), 'UNICODE STRING'),
    }
    write('42', *data['42'])
    write(None, *data[None])
    write('18', *data['18'])
    src_ds = dw.finish()
    data['None'] = data.pop(None)
    type2type = {
        'ascii': 'unicode:ascii',
        'bool': 'unicode:ascii',
        'date': 'unicode:ascii',
        'datetime': 'unicode:ascii',
        'time': 'unicode:ascii',
        'bits32': 'bits32_10',
        'bits64': 'bits64_10',
        'bytes': 'bytes',
        'float32': 'float32',
        'float64': 'float64',
        'int32': 'int32_10',
        'int64': 'int64_10',
        'number': 'number',
        'unicode': 'unicode:ascii',
    }
    for hl, typeas in sorted(type2type.items()):
        ds = subjobs.build('dataset_type',
                           column2type={
                               '2type': typeas
                           },
                           hashlabel=hl,
                           source=src_ds).dataset()
        seen = set()
        hl_hash = typed_writer(hl).hash
        for sliceno in range(slices):
            for line in ds.iterate(sliceno, None):
                key = line[0] or None
                if isinstance(key, float):
                    key = int(key)
                if isinstance(key, bytes):
                    key = key.decode('ascii')
                else:
                    key = unicode(key)
                assert data.get(key) == line[1:], \
                    "%s (hl %s) didn't have the right data for line %r" % (ds, hl, line[0])
                hv = line[sorted(src_ds.columns).index(hl)]
                assert hl_hash(hv) % slices == sliceno, \
                    "%s (hl %s) didn't hash %r correctly" % (ds, hl, hv)
                assert key not in seen, \
                    "%s (hl %s) repeated line %s" % (ds, hl, line[0])
                seen.add(key)
        assert seen == {'42', 'None', '18'}, \
            "%s didn't have all lines (%r)" % (ds, seen)
def one_column(vars, colname, coltype, out_fns, for_hasher=False):
    if for_hasher:
        record_bad = skip_bad = False
    elif vars.first_lap:
        record_bad = options.filter_bad
        skip_bad = False
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (vars.sliceno, )

    fmt = fmt_b = None
    is_null_converter = False
    if coltype in dataset_type.convfuncs:
        shorttype = coltype
        _, cfunc, pyfunc = dataset_type.convfuncs[coltype]
    elif coltype.startswith('null_'):
        shorttype = coltype
        pyfunc = False
        cfunc = True
        is_null_converter = True
    else:
        shorttype, fmt = coltype.split(':', 1)
        _, cfunc, pyfunc = dataset_type.convfuncs[shorttype + ':*']
    if cfunc:
        cfunc = shorttype.replace(':', '_')
    if pyfunc:
        tmp = pyfunc(coltype)
        if callable(tmp):
            pyfunc = tmp
            cfunc = None
        else:
            pyfunc = None
            cfunc, fmt, fmt_b = tmp
    if coltype == 'number':
        cfunc = 'number'
    elif coltype == 'number:int':
        coltype = 'number'
        cfunc = 'number'
        fmt = "int"
    assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
    coltype = shorttype
    in_fns = []
    offsets = []
    max_counts = []
    dest_colname = options.rename.get(colname, colname)
    for d in vars.chain:
        assert colname in d.columns, '%s not in %s' % (colname, d)
        if not d.lines[vars.sliceno]:
            continue
        if not is_null_converter:
            assert d.columns[colname].type in byteslike_types, \
                '%s has bad type in %s' % (colname, d)
        in_fns.append(d.column_filename(colname, vars.sliceno))
        if d.columns[colname].offsets:
            offsets.append(d.columns[colname].offsets[vars.sliceno])
            max_counts.append(d.lines[vars.sliceno])
        else:
            offsets.append(0)
            max_counts.append(-1)
    if cfunc:
        default_value = options.defaults.get(dest_colname, cstuff.NULL)
        if for_hasher and default_value is cstuff.NULL:
            if coltype.startswith('bits'):
                # No None-support.
                default_value = '0'
            else:
                default_value = None
        default_len = 0
        if default_value is None:
            default_value = cstuff.NULL
            default_value_is_None = True
        else:
            default_value_is_None = False
            if default_value != cstuff.NULL:
                if isinstance(default_value, unicode):
                    default_value = default_value.encode("utf-8")
                default_len = len(default_value)
        c = getattr(cstuff.backend, 'convert_column_' + cfunc)
        if vars.rehashing:
            c_slices = vars.slices
        else:
            c_slices = 1
        bad_count = cstuff.mk_uint64(c_slices)
        default_count = cstuff.mk_uint64(c_slices)
        gzip_mode = "wb%d" % (options.compression, )
        if in_fns:
            assert len(out_fns) == c_slices + vars.save_bad
            res = c(*cstuff.bytesargs(
                in_fns, len(in_fns), out_fns, gzip_mode, minmax_fn,
                default_value, default_len, default_value_is_None, fmt, fmt_b,
                record_bad, skip_bad, vars.badmap_fd, vars.badmap_size,
                vars.save_bad, c_slices, vars.slicemap_fd, vars.slicemap_size,
                bad_count, default_count, offsets, max_counts))
            assert not res, 'Failed to convert ' + colname
        vars.res_bad_count[dest_colname] = list(bad_count)
        vars.res_default_count[dest_colname] = sum(default_count)
        coltype = coltype.split(':', 1)[0]
        if is_null_converter:
            real_coltype = vars.chain[0].columns[colname].type
            # Some lines may have been filtered out, so these minmax values
            # could be wrong. There's no easy/cheap way to fix that though,
            # and they will never be wrong in the bad direction.
            vars.res_minmax[dest_colname] = [
                vars.chain.min(colname),
                vars.chain.max(colname)
            ]
        else:
            real_coltype = dataset_type.typerename.get(coltype, coltype)
            if exists(minmax_fn):
                with typed_reader(real_coltype)(minmax_fn) as it:
                    vars.res_minmax[dest_colname] = list(it)
                unlink(minmax_fn)
    else:
        # python func
        if for_hasher:
            raise Exception("Can't hash on column of type %s." % (coltype, ))
        nodefault = object()
        if dest_colname in options.defaults:
            default_value = options.defaults[dest_colname]
            if default_value is not None:
                if isinstance(default_value, unicode):
                    default_value = default_value.encode('utf-8')
                default_value = pyfunc(default_value)
        else:
            default_value = nodefault
        if options.filter_bad:
            badmap = mmap(vars.badmap_fd, vars.badmap_size)
            if PY2:
                badmap = IntegerBytesWrapper(badmap)
        if vars.rehashing:
            slicemap = mmap(vars.slicemap_fd, vars.slicemap_size)
            slicemap = Int16BytesWrapper(slicemap)
            bad_count = [0] * vars.slices
        else:
            bad_count = [0]
            chosen_slice = 0
        default_count = 0
        dont_minmax_types = {
            'bytes', 'ascii', 'unicode', 'json', 'complex32', 'complex64'
        }
        real_coltype = dataset_type.typerename.get(coltype, coltype)
        do_minmax = real_coltype not in dont_minmax_types
        if vars.save_bad:
            bad_fh = typed_writer('bytes')(out_fns.pop(), none_support=True)
        fhs = [typed_writer(real_coltype)(fn) for fn in out_fns]
        if vars.save_bad:
            fhs.append(bad_fh)
        write = fhs[0].write
        col_min = col_max = None
        it = itertools.chain.from_iterable(
            d._column_iterator(vars.sliceno, colname, _type='bytes')
            for d in vars.chain)
        for ix, v in enumerate(it):
            if vars.rehashing:
                chosen_slice = slicemap[ix]
                write = fhs[chosen_slice].write
            if skip_bad:
                if badmap[ix // 8] & (1 << (ix % 8)):
                    bad_count[chosen_slice] += 1
                    if vars.save_bad:
                        bad_fh.write(v)
                    continue
            try:
                v = pyfunc(v)
            except ValueError:
                if default_value is not nodefault:
                    v = default_value
                    default_count += 1
                elif record_bad:
                    bad_count[chosen_slice] += 1
                    bv = badmap[ix // 8]
                    badmap[ix // 8] = bv | (1 << (ix % 8))
                    continue
                else:
                    raise Exception("Invalid value %r with no default in %s" %
                                    (
                                        v,
                                        colname,
                                    ))
            if do_minmax and not isinstance(v, NoneType):
                if col_min is None:
                    col_min = col_max = v
                if v < col_min: col_min = v
                if v > col_max: col_max = v
            write(v)
        for fh in fhs:
            fh.close()
        if vars.rehashing:
            slicemap.close()
        if options.filter_bad:
            badmap.close()
        vars.res_bad_count[dest_colname] = bad_count
        vars.res_default_count[dest_colname] = default_count
        vars.res_minmax[dest_colname] = [col_min, col_max]
    return real_coltype
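
# The Python fallback above records and skips bad lines through a shared
# bitmap (one bit per row index, mmapped via badmap_fd). A minimal sketch of
# the same bit arithmetic on a plain bytearray:
def mark_bad(badmap, ix):
    # Set the bit for row ix, as the ValueError branch does.
    badmap[ix // 8] |= 1 << (ix % 8)

def is_bad(badmap, ix):
    # Test the bit for row ix, as the skip_bad branch does.
    return bool(badmap[ix // 8] & (1 << (ix % 8)))

_badmap = bytearray(2)  # room for 16 rows
mark_bad(_badmap, 10)
assert is_bad(_badmap, 10) and not is_bad(_badmap, 9)
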
Example #8
def synthesis(job, slices):
    # All the representations we want to verify.
    values = [
        # 1 byte values
        [i, '=B', i + 128 + 5] for i in range(-5, 123)
    ] + [
        # 3 bytes values
        [-6, '=bh', 2, -6],
        [123, '=bh', 2, 123],
        [-0x8000, '=bh', 2, -0x8000],
        [0x7fff, '=bh', 2, 0x7fff],
        # 5 byte values
        [-0x8001, '=bi', 4, -0x8001],
        [0x8000, '=bi', 4, 0x8000],
        [-0x80000000, '=bi', 4, -0x80000000],
        [0x7fffffff, '=bi', 4, 0x7fffffff],
        # 9 byte values
        [-0x80000001, '=bq', 8, -0x80000001],
        [0x80000000, '=bq', 8, 0x80000000],
        [-0x8000000000000000, '=bq', 8, -0x8000000000000000],
        [0x7fffffffffffffff, '=bq', 8, 0x7fffffffffffffff],
        # special values
        [None, '=b', 0],
        [0.1, '=bd', 1, 0.1],
    ]

    # Verify each value through a manual typed_writer.
    # Also write to a dataset, a csv and a value2bytes dict.
    value2bytes = {}
    dw = job.datasetwriter()
    dw.add('num', 'number', none_support=True)
    write = dw.get_split_write()
    with job.open('data.csv', 'wt') as csv_fh:
        csv_fh.write('num\n')
        for v in values:
            value = v[0]
            write(value)
            csv_fh.write('%s\n' % (value, ))
            want_bytes = struct.pack(*v[1:])
            value2bytes[value] = want_bytes
            with typed_writer('number')('tmp',
                                        compression='gzip',
                                        none_support=True) as w:
                w.write(value)
            with gzip.open('tmp', 'rb') as fh:
                got_bytes = fh.read()
            assert want_bytes == got_bytes, "%r gave %r, wanted %r" % (
                value,
                got_bytes,
                want_bytes,
            )

    # Make sure we get the same representation through a dataset.
    # Assumes that the column is merged (a single file for all slices).
    ds = dw.finish()
    just_values = set(v[0] for v in values)
    assert set(ds.iterate(
        None, 'num')) == just_values, "Dataset contains wrong values"
    want_bytes = b''.join(value2bytes[v] for v in ds.iterate(None, 'num'))
    with gzip.open(ds.column_filename('num'), 'rb') as fh:
        got_bytes = fh.read()
    assert want_bytes == got_bytes, "All individual encoding are right, but not in a dataset?"

    # csvimport and dataset_type the same data,
    # then verify we get the same bytes.
    jid = build('csvimport', filename=job.filename('data.csv'))
    jid = build('dataset_type',
                source=jid,
                column2type={'num': 'number'},
                defaults={'num': None})
    with gzip.open(jid.dataset().column_filename('num'), 'rb') as fh:
        got_bytes = fh.read()
    assert want_bytes == got_bytes, "csvimport + dataset_type (%s) gave different bytes" % (
        jid, )
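
# The values table above implies the layout the 'number' type is expected to
# use on disk: small ints -5..122 become a single byte (value + 133), larger
# ints get a length marker (2, 4 or 8) followed by an integer of that width,
# None is marker 0 alone, and floats are marker 1 plus a double. A
# hypothetical helper reproducing those expectations (derived from the test
# data above, not taken from the library's own encoder):
import struct

def expected_number_bytes(value):
    if value is None:
        return struct.pack('=b', 0)
    if isinstance(value, float):
        return struct.pack('=bd', 1, value)
    if -5 <= value <= 122:
        return struct.pack('=B', value + 128 + 5)
    if -0x8000 <= value <= 0x7fff:
        return struct.pack('=bh', 2, value)
    if -0x80000000 <= value <= 0x7fffffff:
        return struct.pack('=bi', 4, value)
    return struct.pack('=bq', 8, value)

assert expected_number_bytes(None) == struct.pack('=b', 0)
assert expected_number_bytes(123) == struct.pack('=bh', 2, 123)
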
Example #9
def synthesis(prepare_res, params, job, slices):
    dws = prepare_res
    for dw in (
            dws.unhashed_split,
            dws.up_split,
    ):
        w = dw.get_split_write_list()
        for row in all_data:
            w(row)
    hl2ds = {None: [], "up": [], "down": []}
    all_ds = {}
    special_cases = {
        "up_datetime",
        "down_time",
        "up_date",
        "down_date",
        "unhashed_bytes",
        "up_ascii",
        "down_unicode",
    }
    for name, dw in dws.items():
        ds = dw.finish()
        all_ds[ds.name] = ds
        if ds.name not in special_cases:
            hl2ds[ds.hashlabel].append(ds)

    # Verify that the different ways of writing gave the same result
    for hashlabel in (None, "up", "down"):
        for sliceno in range(slices):
            data = [(ds, list(ds.iterate(sliceno))) for ds in hl2ds[hashlabel]]
            good = data[0][1]
            for name, got in data:
                assert got == good, "%s doesn't match %s in slice %d" % (
                    data[0][0],
                    name,
                    sliceno,
                )

    # Verify that both up and down hashed on the expected column
    hash = typed_writer("int32").hash
    for colname in ("up", "down"):
        ds = all_ds[colname + "_checked"]
        for sliceno in range(slices):
            for value in ds.iterate(sliceno, colname):
                assert hash(int(value)) % slices == sliceno, \
                    "Bad hashing on %s in slice %d" % (colname, sliceno)

    # Verify that up and down are not the same, to catch hashing
    # not actually hashing.
    for up_name, down_name in (
        ("up_checked", "down_checked"),
        ("up_datetime", "down_time"),
        ("up_date", "down_date"),
        ("up_ascii", "down_unicode"),
    ):
        up = cleanup(all_ds[up_name].iterate(None))
        down = cleanup(all_ds[down_name].iterate(None))
        assert up != down, "Hashlabel did not change slice distribution (%s vs %s)" % (
            up_name, down_name)
        # And check that the data is still the same.
        assert sorted(up) == sorted(
            down) == all_data, "Hashed datasets have wrong data (%s vs %s)" % (
                up_name, down_name)

    # Verify that rehashing works.
    # (Can't use sliceno None, because that won't rehash, and even if it did
    # the order wouldn't match. Order doesn't even match in the rehashed
    # individual slices.)
    def test_rehash(want_ds, chk_ds_lst):
        want_ds = all_ds[want_ds]
        for sliceno in range(slices):
            want = sorted(cleanup(want_ds.iterate(sliceno)))
            for chk_ds in chk_ds_lst:
                assert chk_ds.hashlabel != want_ds.hashlabel
                got = chk_ds.iterate(sliceno,
                                     hashlabel=want_ds.hashlabel,
                                     rehash=True)
                got = sorted(cleanup(got))
                assert want == got, "Rehashing is broken for %s (slice %d of %s)" % (
                    chk_ds.columns[want_ds.hashlabel].type,
                    sliceno,
                    chk_ds,
                )

    test_rehash("up_checked", hl2ds[None] + hl2ds["down"])
    test_rehash("down_checked", hl2ds[None] + hl2ds["up"])
    test_rehash("up_datetime", [all_ds["down_time"]])
    test_rehash("down_time", [all_ds["up_datetime"]])
    test_rehash("down_date", [all_ds["up_date"]])
    test_rehash("up_ascii", [all_ds["unhashed_bytes"], all_ds["down_unicode"]])
    test_rehash("down_unicode", [all_ds["unhashed_bytes"], all_ds["up_ascii"]])

    # And finally verify that we are not allowed to specify the wrong hashlabel
    good = True
    try:
        all_ds["up_checked"].iterate(None, hashlabel="down")
        good = False
    except DatasetUsageError:
        pass
    try:
        all_ds["unhashed_manual"].iterate(None, hashlabel="down")
        good = False
    except DatasetUsageError:
        pass
    assert good, "Iteration allowed on the wrong hashlabel"

    # verify that non-integral floats hash the same in the five types that can have them
    # using + 0.5 is safe for the values we use, it can be exactly represented in 32 bit floats.
    float_data = [v + 0.5 for v, _ in all_data]
    float_ds_lst = []
    for typ in ("float32", "float64", "complex32", "complex64", "number"):
        dw = job.datasetwriter(name="floattest_" + typ,
                               columns={"value": typ},
                               hashlabel="value")
        write = dw.get_split_write()
        for v in float_data:
            write(v)
        float_ds_lst.append(dw.finish())
    for sliceno in range(slices):
        values = [(ds, list(ds.iterate(sliceno, "value")))
                  for ds in float_ds_lst]
        want_ds, want = values.pop()
        for ds, got in values:
            assert got == want, "%s did not match %s in slice %d" % (
                ds,
                want_ds,
                sliceno,
            )
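
# The float test above relies on float32, float64, complex32, complex64 and
# number hashing a non-integral value identically. A standalone sketch of
# that property check, using the same typed_writer hash functions the tests
# use elsewhere (assuming the complex writers expose .hash like the float,
# int and number writers shown above):
from accelerator.dsutil import typed_writer

def check_float_hash_agreement(values):
    hashers = [typed_writer(t).hash for t in
               ('float32', 'float64', 'complex32', 'complex64', 'number')]
    for v in values:
        hashes = {h(v) for h in hashers}
        assert len(hashes) == 1, 'hash disagreement for %r: %r' % (v, hashes)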