Example #1
def _test():
	from gzwrite import typed_writer
	for key, data in convfuncs.iteritems():
		key = key.split(":")[0]
		typed_writer(typerename.get(key, key))
		assert data.size in (0, 1, 4, 8,), (key, data)
		assert isinstance(data.conv_code_str, (str, NoneType)), (key, data)
		if data.conv_code_str:
			assert typerename.get(key, key) in minmaxfuncs
		assert data.pyfunc is None or callable(data.pyfunc), (key, data)
	for key, mm in minmaxfuncs.iteritems():
		for v in mm:
			assert isinstance(v, str), key
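
(Example #1 is Python 2 code: iteritems() and the bare NoneType come from the surrounding module.)

For orientation, a minimal sketch of typed_writer on its own, reconstructed only from the calls visible in these examples; the filename "mycol" is an assumption:

from gzwrite import typed_writer

wt = typed_writer("int32")  # look up the writer class for a type name
with wt("mycol") as fh:     # instances take a filename and act as context managers
    for v in (1, 2, 3):
        fh.write(v)         # one typed value per call
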
Example #2
def add(self, colname, coltype, default=_nodefault):
    from g import running
    assert running == self._running, "Add all columns in the same step as creation"
    assert not self._started, "Add all columns before setting slice"
    colname = uni(colname)
    coltype = uni(coltype)
    assert colname not in self.columns, colname
    assert colname
    typed_writer(coltype)  # gives error for unknown types
    self.columns[colname] = (coltype, default)
    self._order.append(colname)
    if colname in self._pcolumns:
        self._clean_names[colname] = self._pcolumns[colname].name
    else:
        self._clean_names[colname] = _clean_name(colname, self._seen_n)
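
A hedged usage sketch of add() as defined above; the column names and types are invented, and dw stands for a DatasetWriter instance as seen in Examples #6 and #7:

dw = DatasetWriter()
dw.add("count", "int64")               # the type name is validated via typed_writer
dw.add("name", "unicode", default="")  # the default is passed through to the column's writer (see Example #4)
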
Example #3
def analysis(sliceno, params):
	assert list(datasets.source.iterate(sliceno, "a")) == [sliceno, 42]
	assert list(datasets.source.iterate(sliceno, "b")) == ["a", str(sliceno)]
	named = Dataset(datasets.source, "named")
	assert list(named.iterate(sliceno, "c")) == [True, False]
	assert list(named.iterate(sliceno, "d")) == [date(1536, 12, min(sliceno + 1, 31)), date(2236, 5, min(sliceno + 1, 31))]
	if sliceno < test_data.value_cnt:
		passed = Dataset(datasets.source, "passed")
		good = tuple(v[sliceno] for _, v in sorted(test_data.data.items()))
		assert list(passed.iterate(sliceno)) == [good]
	synthesis_split = Dataset(datasets.source, "synthesis_split")
	values = zip((1, 2, 3,), "abc")
	hash = typed_writer("int32").hash
	good = [v for v in values if hash(v[0]) % params.slices == sliceno]
	assert list(synthesis_split.iterate(sliceno)) == good
	synthesis_manual = Dataset(datasets.source, "synthesis_manual")
	assert list(synthesis_manual.iterate(sliceno, "sliceno")) == [sliceno]
	nonetest = Dataset(datasets.source, "nonetest")
	good = tuple(v[0] if k in test_data.not_none_capable else None for k, v in sorted(test_data.data.items()))
	assert list(nonetest.iterate(sliceno)) == [good]
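
The invariant this test keeps checking is that a row lives in the slice selected by the per-type hash that typed_writer exposes. As a sketch (the helper name is made up):

hash = typed_writer("int32").hash  # per-type hash function, also used in Examples #5 and #8

def slice_for(value, slices):
    return hash(value) % slices    # the slice a value belongs to
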
Example #4
	def _mkwriters(self, sliceno, filtered=True):
		assert self.columns, "No columns in dataset"
		if self.hashlabel:
			assert self.hashlabel in self.columns, "Hashed column (%s) missing" % (self.hashlabel,)
		self._started = 2 - filtered
		if self.meta_only:
			return
		writers = {}
		for colname, (coltype, default) in self.columns.items():
			wt = typed_writer(coltype)
			kw = {} if default is _nodefault else {'default': default}
			fn = self.column_filename(colname, sliceno)
			if filtered and colname == self.hashlabel:
				from g import SLICES
				w = wt(fn, hashfilter=(sliceno, SLICES), **kw)
				self.hashcheck = w.hashcheck
			else:
				w = wt(fn, **kw)
			writers[colname] = w
		return writers
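
The hashfilter argument above is what makes a writer slice-aware: it keeps only values whose hash maps to the writer's own slice, and the writer then exposes hashcheck (stored on the dataset above). A sketch, with fn, sliceno and SLICES as assumptions:

wt = typed_writer("int32")
w = wt(fn, hashfilter=(sliceno, SLICES))  # drops values where hash(v) % SLICES != sliceno
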
Example #5
def verify(slices, data, source, previous=None, **options):
    jid = subjobs.build(
        "dataset_rehash",
        datasets=dict(source=source, previous=previous),
        options=options,
    )
    hl = options["hashlabel"]
    h = typed_writer(columns[hl]).hash
    ds = Dataset(jid)
    good = {row[hl]: row for row in data}
    names = list(data[0])
    for slice in range(slices):
        for row in ds.iterate_chain(slice, names):
            row = dict(zip(names, row))
            assert h(row[hl]) % slices == slice, \
                "row %r is incorrectly in slice %d in %s" % (row, slice, ds)
            want = good[row[hl]]
            assert row == want, '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (
                ds, source, hl, want, row)
    return ds
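
Note that columns (used for typed_writer(columns[hl]).hash) is a global of the enclosing test module, not defined in this snippet. A hedged sketch of calling verify(); all argument values are invented:

ds = verify(params.slices, test_rows, source_ds, hashlabel="mycol")  # options must include "hashlabel"
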
Example #6
def analysis_lap(sliceno, badmap_fh, first_lap):
    known_line_count = 0
    badmap_size = 0
    badmap_fd = -1
    res_bad_count = {}
    res_default_count = {}
    res_minmax = {}
    link_candidates = []
    if first_lap:
        record_bad = options.filter_bad
        skip_bad = 0
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (sliceno, )
    dw = DatasetWriter()
    for colname, coltype in options.column2type.iteritems():
        out_fn = dw.column_filename(options.rename.get(colname, colname)).encode('ascii')
        if ':' in coltype and not coltype.startswith('number:'):
            coltype, fmt = coltype.split(':', 1)
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype + ':*']
            if '%f' in fmt:
                # needs to fall back to python version
                cfunc = None
            if not cfunc:
                pyfunc = pyfunc(coltype, fmt)
        else:
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
            fmt = ffi.NULL
        d = datasets.source
        assert d.columns[colname].type in (
            'bytes',
            'ascii',
        ), colname
        if options.filter_bad:
            line_count = d.lines[sliceno]
            if known_line_count:
                assert line_count == known_line_count, (colname, line_count,
                                                        known_line_count)
            else:
                known_line_count = line_count
                pagesize = getpagesize()
                badmap_size = (line_count // 8 // pagesize + 1) * pagesize
                badmap_fh.truncate(badmap_size)
                badmap_fd = badmap_fh.fileno()
        if d.columns[colname].backing_type.startswith('_v2_'):
            backing_format = 2
        else:
            backing_format = 3
        in_fn = d.column_filename(colname, sliceno).encode('ascii')
        if d.columns[colname].offsets:
            offset = d.columns[colname].offsets[sliceno]
            max_count = d.lines[sliceno]
        else:
            offset = 0
            max_count = -1
        if coltype == 'number':
            cfunc = True
        if coltype == 'number:int':
            coltype = 'number'
            cfunc = True
            fmt = "int"
        if cfunc:
            default_value = options.defaults.get(colname, ffi.NULL)
            if default_value is None:
                default_value = ffi.NULL
                default_value_is_None = True
            else:
                default_value_is_None = False
            bad_count = ffi.new('uint64_t [1]', [0])
            default_count = ffi.new('uint64_t [1]', [0])
            c = getattr(backend, 'convert_column_' + coltype)
            res = c(in_fn, out_fn, minmax_fn, default_value,
                    default_value_is_None, fmt, record_bad, skip_bad,
                    badmap_fd, badmap_size, bad_count, default_count, offset,
                    max_count, backing_format)
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = bad_count[0]
            res_default_count[colname] = default_count[0]
            with type2iter[dataset_typing.typerename.get(
                    coltype, coltype)](minmax_fn) as it:
                res_minmax[colname] = list(it)
            unlink(minmax_fn)
        elif pyfunc is str:
            # We skip it the first time around, and link it from
            # the source dataset if there were no bad lines.
            # (That happens at the end of analysis.)
            # We can't do that if the file is not slice-specific though.
            # And we also can't do it if the column is in the wrong (old) format.
            if skip_bad or '%s' not in d.column_filename(
                    colname, '%s') or backing_format != 3:
                res = backend.filter_strings(in_fn, out_fn, badmap_fd,
                                             badmap_size, offset, max_count,
                                             backing_format)
                assert not res, 'Failed to convert ' + colname
            else:
                link_candidates.append((
                    in_fn,
                    out_fn,
                ))
            res_bad_count[colname] = 0
            res_default_count[colname] = 0
        elif pyfunc is str.strip:
            res = backend.filter_stringstrip(in_fn, out_fn, badmap_fd,
                                             badmap_size, offset, max_count,
                                             backing_format)
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = 0
            res_default_count[colname] = 0
        else:
            # python func
            nodefault = object()
            if colname in options.defaults:
                if options.defaults[colname] is None:
                    default_value = None
                else:
                    default_value = pyfunc(options.defaults[colname])
            else:
                default_value = nodefault
            if options.filter_bad:
                badmap = mmap(badmap_fd, badmap_size)
            bad_count = 0
            default_count = 0
            with typed_writer(dataset_typing.typerename.get(
                    coltype, coltype))(out_fn) as fh:
                col_min = col_max = None
                for ix, v in enumerate(d.iterate(sliceno, colname)):
                    if skip_bad:
                        if ord(badmap[ix // 8]) & (1 << (ix % 8)):
                            bad_count += 1
                            continue
                    try:
                        v = pyfunc(v)
                    except ValueError:
                        if default_value is not nodefault:
                            v = default_value
                            default_count += 1
                        elif record_bad:
                            bad_count += 1
                            bv = ord(badmap[ix // 8])
                            badmap[ix // 8] = chr(bv | (1 << (ix % 8)))
                            continue
                        else:
                            raise Exception("Invalid value %r with no default in %s" % (v, colname))
                    if not isinstance(v, (
                            NoneType,
                            str,
                            unicode,
                    )):
                        if col_min is None:
                            col_min = col_max = v
                        if v < col_min: col_min = v
                        if v > col_max: col_max = v
                    fh.write(v)
            if options.filter_bad:
                badmap.close()
            res_bad_count[colname] = bad_count
            res_default_count[colname] = default_count
            res_minmax[colname] = [col_min, col_max]
    return res_bad_count, res_default_count, res_minmax, link_candidates
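
The badmap above packs one flag bit per input line. Example #6 is Python 2, which is why the mmap bytes go through ord()/chr(); in Python 3 terms the same bit arithmetic is (a sketch over any mutable byte buffer):

def is_bad(badmap, ix):
    return bool(badmap[ix // 8] & (1 << (ix % 8)))  # byte ix // 8, bit ix % 8

def mark_bad(badmap, ix):
    badmap[ix // 8] |= 1 << (ix % 8)
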
Example #7
def analysis_lap(sliceno, badmap_fh, first_lap):
    known_line_count = 0
    badmap_size = 0
    badmap_fd = -1
    res_bad_count = {}
    res_default_count = {}
    res_minmax = {}
    link_candidates = []
    if first_lap:
        record_bad = options.filter_bad
        skip_bad = 0
    else:
        record_bad = 0
        skip_bad = options.filter_bad
    minmax_fn = 'minmax%d' % (sliceno, )
    dw = DatasetWriter()
    for colname, coltype in iteritems(options.column2type):
        out_fn = dw.column_filename(options.rename.get(colname, colname))
        fmt = fmt_b = None
        if coltype in dataset_typing.convfuncs:
            shorttype = coltype
            _, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
        else:
            shorttype, fmt = coltype.split(':', 1)
            _, cfunc, pyfunc = dataset_typing.convfuncs[shorttype + ':*']
        if cfunc:
            cfunc = shorttype.replace(':', '_')
        if pyfunc:
            tmp = pyfunc(coltype)
            if callable(tmp):
                pyfunc = tmp
                cfunc = None
            else:
                pyfunc = None
                cfunc, fmt, fmt_b = tmp
        if coltype == 'number':
            cfunc = 'number'
        elif coltype == 'number:int':
            coltype = 'number'
            cfunc = 'number'
            fmt = "int"
        assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
        coltype = shorttype
        d = datasets.source
        assert d.columns[colname].type in byteslike_types, colname
        if options.filter_bad:
            line_count = d.lines[sliceno]
            if known_line_count:
                assert line_count == known_line_count, (colname, line_count,
                                                        known_line_count)
            else:
                known_line_count = line_count
                pagesize = getpagesize()
                badmap_size = (line_count // 8 // pagesize + 1) * pagesize
                badmap_fh.truncate(badmap_size)
                badmap_fd = badmap_fh.fileno()
        if d.columns[colname].backing_type.startswith('_v2_'):
            backing_format = 2
        else:
            backing_format = 3
        in_fn = d.column_filename(colname, sliceno)
        if d.columns[colname].offsets:
            offset = d.columns[colname].offsets[sliceno]
            max_count = d.lines[sliceno]
        else:
            offset = 0
            max_count = -1
        if cfunc:
            default_value = options.defaults.get(colname, ffi.NULL)
            default_len = 0
            if default_value is None:
                default_value = ffi.NULL
                default_value_is_None = True
            else:
                default_value_is_None = False
                if default_value != ffi.NULL:
                    if isinstance(default_value, unicode):
                        default_value = default_value.encode("utf-8")
                    default_len = len(default_value)
            bad_count = ffi.new('uint64_t [1]', [0])
            default_count = ffi.new('uint64_t [1]', [0])
            c = getattr(backend, 'convert_column_' + cfunc)
            res = c(*bytesargs(in_fn, out_fn, minmax_fn, default_value,
                               default_len, default_value_is_None, fmt, fmt_b,
                               record_bad, skip_bad, badmap_fd, badmap_size,
                               bad_count, default_count, offset, max_count,
                               backing_format))
            assert not res, 'Failed to convert ' + colname
            res_bad_count[colname] = bad_count[0]
            res_default_count[colname] = default_count[0]
            coltype = coltype.split(':', 1)[0]
            with type2iter[dataset_typing.typerename.get(
                    coltype, coltype)](minmax_fn) as it:
                res_minmax[colname] = list(it)
            unlink(minmax_fn)
        else:
            # python func
            nodefault = object()
            if colname in options.defaults:
                default_value = options.defaults[colname]
                if default_value is not None:
                    if isinstance(default_value, unicode):
                        default_value = default_value.encode('utf-8')
                    default_value = pyfunc(default_value)
            else:
                default_value = nodefault
            if options.filter_bad:
                badmap = mmap(badmap_fd, badmap_size)
                if PY2:
                    badmap = IntegerBytesWrapper(badmap)
            bad_count = 0
            default_count = 0
            dont_minmax_types = {'bytes', 'ascii', 'unicode', 'json'}
            real_coltype = dataset_typing.typerename.get(coltype, coltype)
            do_minmax = real_coltype not in dont_minmax_types
            with typed_writer(real_coltype)(out_fn) as fh:
                col_min = col_max = None
                in_type = 'bytes' if backing_format == 3 else '_v2_bytes'
                for ix, v in enumerate(d._column_iterator(sliceno, colname, _type=in_type)):
                    if skip_bad:
                        if badmap[ix // 8] & (1 << (ix % 8)):
                            bad_count += 1
                            continue
                    try:
                        v = pyfunc(v)
                    except ValueError:
                        if default_value is not nodefault:
                            v = default_value
                            default_count += 1
                        elif record_bad:
                            bad_count += 1
                            bv = badmap[ix // 8]
                            badmap[ix // 8] = bv | (1 << (ix % 8))
                            continue
                        else:
                            raise Exception("Invalid value %r with no default in %s" % (v, colname))
                    if do_minmax and not isinstance(v, NoneType):
                        if col_min is None:
                            col_min = col_max = v
                        if v < col_min: col_min = v
                        if v > col_max: col_max = v
                    fh.write(v)
            if options.filter_bad:
                badmap.close()
            res_bad_count[colname] = bad_count
            res_default_count[colname] = default_count
            res_minmax[colname] = [col_min, col_max]
    return res_bad_count, res_default_count, res_minmax, link_candidates
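
Example #7 is the Python 2/3-compatible rewrite of Example #6 (the iteritems() helper, bytesargs, and IntegerBytesWrapper under PY2). Both versions run the same two-lap protocol when filter_bad is set, per the flags at the top of the function:

# lap 1 (first_lap=True):  record_bad = options.filter_bad, skip_bad = 0
#     convert everything, flagging unconvertible lines in the badmap
# lap 2 (first_lap=False): record_bad = 0, skip_bad = options.filter_bad
#     convert again, skipping the lines flagged in lap 1
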
Example #8
def synthesis(prepare_res, params):
    dws = prepare_res
    for dw in (
            dws.unhashed_split,
            dws.up_split,
    ):
        w = dw.get_split_write_list()
        for row in all_data:
            w(row)
    for dw in dws.values():
        dw.finish()

    # Verify that the different ways of writing gave the same result
    for names in (
        ("unhashed_split", "unhashed_manual"),
        ("up_checked", "up_split"),
        ("down_checked", "down_discarded", "down_discarded_list",
         "down_discarded_dict"),
    ):
        dws = {name: Dataset((params.jobid, name)) for name in names}
        for sliceno in range(params.slices):
            data = {name: list(dws[name].iterate(sliceno)) for name in names}
            good = data[names[0]]
            for name in names[1:]:
                assert data[name] == good, \
                    "%s doesn't match %s in slice %d" % (names[0], name, sliceno)

    # Verify that both up and down hashed on the expected column
    hash = typed_writer("int32").hash
    for colname in ("up", "down"):
        ds = Dataset((params.jobid, colname + "_checked"))
        for sliceno in range(params.slices):
            for value in ds.iterate(sliceno, colname):
                assert hash(value) % params.slices == sliceno, \
                    "Bad hashing on %s in slice %d" % (colname, sliceno)

    # Verify that up and down are not the same, to catch hashing
    # not actually hashing.
    up = list(Dataset((params.jobid, "up_checked")).iterate(None))
    down = list(Dataset((params.jobid, "down_checked")).iterate(None))
    assert up != down, "Hashlabel did not change slice distribution"
    # And check that the data is still the same.
    assert sorted(up) == sorted(down) == all_data, "Hashed datasets have wrong data"

    # Verify that rehashing works.
    # (Can't use sliceno None, because that won't rehash, and even if it did
    # the order wouldn't match. Order doesn't even match in the rehashed
    # individual slices.)
    up = Dataset((params.jobid, "up_checked"))
    down = Dataset((params.jobid, "down_checked"))
    unhashed = Dataset((params.jobid, "unhashed_manual"))
    for sliceno in range(params.slices):
        a = list(up.iterate(sliceno))
        b = list(down.iterate(sliceno, hashlabel="up", rehash=True))
        c = list(unhashed.iterate(sliceno, hashlabel="up", rehash=True))
        assert sorted(a) == sorted(b) == sorted(c), \
            "Rehashing is broken (slice %d)" % (sliceno,)

    # And finally verify that we are not allowed to specify the wrong hashlabel
    good = True
    try:
        up.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    try:
        unhashed.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    assert good, "Iteration allowed on the wrong hashlabel"
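
Taken together, the examples exercise these iterate() call shapes (a summary of what appears above, not the full API):

ds.iterate(sliceno)                               # all columns of one slice, rows as tuples
ds.iterate(sliceno, "colname")                    # one column, bare values
ds.iterate(None)                                  # all slices
ds.iterate(sliceno, hashlabel="up", rehash=True)  # redistribute rows by hash while reading
ds.iterate(None, hashlabel="wrong")               # raises AssertionError on a hashlabel mismatch
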