def _test():
	from gzwrite import typed_writer
	for key, data in convfuncs.iteritems():
		key = key.split(":")[0]
		typed_writer(typerename.get(key, key))
		assert data.size in (0, 1, 4, 8,), (key, data)
		assert isinstance(data.conv_code_str, (str, NoneType)), (key, data)
		if data.conv_code_str:
			assert typerename.get(key, key) in minmaxfuncs
		assert data.pyfunc is None or callable(data.pyfunc), (key, data)
	for key, mm in minmaxfuncs.iteritems():
		for v in mm:
			assert isinstance(v, str), key
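# A standalone sketch of the shape _test() expects (entries below are
# illustrative, not the real convfuncs contents): each value carries a size
# in bytes (0 presumably meaning variable width), an optional C conversion
# snippet (conv_code_str) and an optional python fallback (pyfunc).
from collections import namedtuple
_ConvSketch = namedtuple('_ConvSketch', 'size conv_code_str pyfunc')
_sketch_convfuncs = {
	'examplefixed': _ConvSketch(4, None, int),            # hypothetical fixed 4-byte type
	'examplevar:*': _ConvSketch(0, None, lambda v: v),    # hypothetical variable-width type
}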
def add(self, colname, coltype, default=_nodefault):
	from g import running
	assert running == self._running, "Add all columns in the same step as creation"
	assert not self._started, "Add all columns before setting slice"
	colname = uni(colname)
	coltype = uni(coltype)
	assert colname not in self.columns, colname
	assert colname
	typed_writer(coltype) # gives error for unknown types
	self.columns[colname] = (coltype, default)
	self._order.append(colname)
	if colname in self._pcolumns:
		self._clean_names[colname] = self._pcolumns[colname].name
	else:
		self._clean_names[colname] = _clean_name(colname, self._seen_n)
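# Hypothetical usage sketch for add(): columns are declared in the same step
# that created the writer, before anything is written. The column names and
# types here are made up for illustration.
#
#	dw = DatasetWriter()
#	dw.add("movie", "unicode")
#	dw.add("score", "int32")             # no default: unconvertible values raise
#	dw.add("year", "int32", default=0)   # default is passed through to the typed writer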
def analysis(sliceno, params):
	assert list(datasets.source.iterate(sliceno, "a")) == [sliceno, 42]
	assert list(datasets.source.iterate(sliceno, "b")) == ["a", str(sliceno)]
	named = Dataset(datasets.source, "named")
	assert list(named.iterate(sliceno, "c")) == [True, False]
	assert list(named.iterate(sliceno, "d")) == [date(1536, 12, min(sliceno + 1, 31)), date(2236, 5, min(sliceno + 1, 31))]
	if sliceno < test_data.value_cnt:
		passed = Dataset(datasets.source, "passed")
		good = tuple(v[sliceno] for _, v in sorted(test_data.data.items()))
		assert list(passed.iterate(sliceno)) == [good]
	synthesis_split = Dataset(datasets.source, "synthesis_split")
	values = zip((1, 2, 3,), "abc")
	hash = typed_writer("int32").hash
	good = [v for v in values if hash(v[0]) % params.slices == sliceno]
	assert list(synthesis_split.iterate(sliceno)) == good
	synthesis_manual = Dataset(datasets.source, "synthesis_manual")
	assert list(synthesis_manual.iterate(sliceno, "sliceno")) == [sliceno]
	nonetest = Dataset(datasets.source, "nonetest")
	good = tuple(v[0] if k in test_data.not_none_capable else None for k, v in sorted(test_data.data.items()))
	assert list(nonetest.iterate(sliceno)) == [good]
def _mkwriters(self, sliceno, filtered=True):
	assert self.columns, "No columns in dataset"
	if self.hashlabel:
		assert self.hashlabel in self.columns, "Hashed column (%s) missing" % (self.hashlabel,)
	self._started = 2 - filtered
	if self.meta_only:
		return
	writers = {}
	for colname, (coltype, default) in self.columns.items():
		wt = typed_writer(coltype)
		kw = {} if default is _nodefault else {'default': default}
		fn = self.column_filename(colname, sliceno)
		if filtered and colname == self.hashlabel:
			from g import SLICES
			w = wt(fn, hashfilter=(sliceno, SLICES), **kw)
			self.hashcheck = w.hashcheck
		else:
			w = wt(fn, **kw)
		writers[colname] = w
	return writers
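# Sketch of what the hashfilter branch above gives the caller (an assumption
# read off this call site, not documented gzwrite API): the hashlabel writer
# only keeps values whose hash lands in this slice, and exposes hashcheck so
# the other columns of a row can be written (or skipped) consistently.
#
#	w = typed_writer("int32")(fn, hashfilter=(sliceno, SLICES))
#	if w.hashcheck(value):
#		...  # this row belongs in this slice; write the other columns too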
def verify(slices, data, source, previous=None, **options):
	jid = subjobs.build(
		"dataset_rehash",
		datasets=dict(source=source, previous=previous),
		options=options,
	)
	hl = options["hashlabel"]
	h = typed_writer(columns[hl]).hash
	ds = Dataset(jid)
	good = {row[hl]: row for row in data}
	names = list(data[0])
	for slice in range(slices):
		for row in ds.iterate_chain(slice, names):
			row = dict(zip(names, row))
			assert h(row[hl]) % slices == slice, "row %r is incorrectly in slice %d in %s" % (row, slice, ds)
			want = good[row[hl]]
			assert row == want, '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (ds, source, hl, want, row)
	return ds
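# Hypothetical invocation of verify(), mirroring the signature above. The
# column names and rows are made-up test data; note that verify() reads the
# column types from a module-level `columns` mapping.
#
#	data = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
#	ds = verify(params.slices, data, datasets.source, hashlabel="id")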
def analysis_lap(sliceno, badmap_fh, first_lap):
	known_line_count = 0
	badmap_size = 0
	badmap_fd = -1
	res_bad_count = {}
	res_default_count = {}
	res_minmax = {}
	link_candidates = []
	if first_lap:
		record_bad = options.filter_bad
		skip_bad = 0
	else:
		record_bad = 0
		skip_bad = options.filter_bad
	minmax_fn = 'minmax%d' % (sliceno,)
	dw = DatasetWriter()
	for colname, coltype in options.column2type.iteritems():
		out_fn = dw.column_filename(options.rename.get(colname, colname)).encode('ascii')
		if ':' in coltype and not coltype.startswith('number:'):
			coltype, fmt = coltype.split(':', 1)
			_, cfunc, pyfunc = dataset_typing.convfuncs[coltype + ':*']
			if '%f' in fmt:
				# needs to fall back to python version
				cfunc = None
			if not cfunc:
				pyfunc = pyfunc(coltype, fmt)
		else:
			_, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
			fmt = ffi.NULL
		d = datasets.source
		assert d.columns[colname].type in ('bytes', 'ascii',), colname
		if options.filter_bad:
			line_count = d.lines[sliceno]
			if known_line_count:
				assert line_count == known_line_count, (colname, line_count, known_line_count)
			else:
				known_line_count = line_count
				pagesize = getpagesize()
				badmap_size = (line_count // 8 // pagesize + 1) * pagesize
				badmap_fh.truncate(badmap_size)
				badmap_fd = badmap_fh.fileno()
		if d.columns[colname].backing_type.startswith('_v2_'):
			backing_format = 2
		else:
			backing_format = 3
		in_fn = d.column_filename(colname, sliceno).encode('ascii')
		if d.columns[colname].offsets:
			offset = d.columns[colname].offsets[sliceno]
			max_count = d.lines[sliceno]
		else:
			offset = 0
			max_count = -1
		if coltype == 'number':
			cfunc = True
		if coltype == 'number:int':
			coltype = 'number'
			cfunc = True
			fmt = "int"
		if cfunc:
			default_value = options.defaults.get(colname, ffi.NULL)
			if default_value is None:
				default_value = ffi.NULL
				default_value_is_None = True
			else:
				default_value_is_None = False
			bad_count = ffi.new('uint64_t [1]', [0])
			default_count = ffi.new('uint64_t [1]', [0])
			c = getattr(backend, 'convert_column_' + coltype)
			res = c(in_fn, out_fn, minmax_fn, default_value, default_value_is_None, fmt, record_bad, skip_bad, badmap_fd, badmap_size, bad_count, default_count, offset, max_count, backing_format)
			assert not res, 'Failed to convert ' + colname
			res_bad_count[colname] = bad_count[0]
			res_default_count[colname] = default_count[0]
			with type2iter[dataset_typing.typerename.get(coltype, coltype)](minmax_fn) as it:
				res_minmax[colname] = list(it)
			unlink(minmax_fn)
		elif pyfunc is str:
			# We skip it the first time around, and link it from
			# the source dataset if there were no bad lines.
			# (That happens at the end of analysis.)
			# We can't do that if the file is not slice-specific though.
			# And we also can't do it if the column is in the wrong (old) format.
			if skip_bad or '%s' not in d.column_filename(colname, '%s') or backing_format != 3:
				res = backend.filter_strings(in_fn, out_fn, badmap_fd, badmap_size, offset, max_count, backing_format)
				assert not res, 'Failed to convert ' + colname
			else:
				link_candidates.append((in_fn, out_fn,))
			res_bad_count[colname] = 0
			res_default_count[colname] = 0
		elif pyfunc is str.strip:
			res = backend.filter_stringstrip(in_fn, out_fn, badmap_fd, badmap_size, offset, max_count, backing_format)
			assert not res, 'Failed to convert ' + colname
			res_bad_count[colname] = 0
			res_default_count[colname] = 0
		else:
			# python func
			nodefault = object()
			if colname in options.defaults:
				if options.defaults[colname] is None:
					default_value = None
				else:
					default_value = pyfunc(options.defaults[colname])
			else:
				default_value = nodefault
			if options.filter_bad:
				badmap = mmap(badmap_fd, badmap_size)
			bad_count = 0
			default_count = 0
			with typed_writer(dataset_typing.typerename.get(coltype, coltype))(out_fn) as fh:
				col_min = col_max = None
				for ix, v in enumerate(d.iterate(sliceno, colname)):
					if skip_bad:
						if ord(badmap[ix // 8]) & (1 << (ix % 8)):
							bad_count += 1
							continue
					try:
						v = pyfunc(v)
					except ValueError:
						if default_value is not nodefault:
							v = default_value
							default_count += 1
						elif record_bad:
							bad_count += 1
							bv = ord(badmap[ix // 8])
							badmap[ix // 8] = chr(bv | (1 << (ix % 8)))
							continue
						else:
							raise Exception("Invalid value %r with no default in %s" % (v, colname,))
					if not isinstance(v, (NoneType, str, unicode,)):
						if col_min is None:
							col_min = col_max = v
						if v < col_min: col_min = v
						if v > col_max: col_max = v
					fh.write(v)
			if options.filter_bad:
				badmap.close()
			res_bad_count[colname] = bad_count
			res_default_count[colname] = default_count
			res_minmax[colname] = [col_min, col_max]
	return res_bad_count, res_default_count, res_minmax, link_candidates
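# The badmap used above holds one bit per input line, eight lines to a byte.
# A minimal sketch of the mapping both laps rely on:
def _badmap_pos(ix):
	# line ix lives at bit (ix % 8) of byte (ix // 8)
	return ix // 8, 1 << (ix % 8)
# Lap one sets the bit when a line fails to convert (record_bad); lap two
# skips lines whose bit is set (skip_bad), so every column drops the same rows.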
def analysis_lap(sliceno, badmap_fh, first_lap):
	known_line_count = 0
	badmap_size = 0
	badmap_fd = -1
	res_bad_count = {}
	res_default_count = {}
	res_minmax = {}
	link_candidates = []
	if first_lap:
		record_bad = options.filter_bad
		skip_bad = 0
	else:
		record_bad = 0
		skip_bad = options.filter_bad
	minmax_fn = 'minmax%d' % (sliceno,)
	dw = DatasetWriter()
	for colname, coltype in iteritems(options.column2type):
		out_fn = dw.column_filename(options.rename.get(colname, colname))
		fmt = fmt_b = None
		if coltype in dataset_typing.convfuncs:
			shorttype = coltype
			_, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
		else:
			shorttype, fmt = coltype.split(':', 1)
			_, cfunc, pyfunc = dataset_typing.convfuncs[shorttype + ':*']
		if cfunc:
			cfunc = shorttype.replace(':', '_')
		if pyfunc:
			tmp = pyfunc(coltype)
			if callable(tmp):
				pyfunc = tmp
				cfunc = None
			else:
				pyfunc = None
				cfunc, fmt, fmt_b = tmp
		if coltype == 'number':
			cfunc = 'number'
		elif coltype == 'number:int':
			coltype = 'number'
			cfunc = 'number'
			fmt = "int"
		assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
		coltype = shorttype
		d = datasets.source
		assert d.columns[colname].type in byteslike_types, colname
		if options.filter_bad:
			line_count = d.lines[sliceno]
			if known_line_count:
				assert line_count == known_line_count, (colname, line_count, known_line_count)
			else:
				known_line_count = line_count
				pagesize = getpagesize()
				badmap_size = (line_count // 8 // pagesize + 1) * pagesize
				badmap_fh.truncate(badmap_size)
				badmap_fd = badmap_fh.fileno()
		if d.columns[colname].backing_type.startswith('_v2_'):
			backing_format = 2
		else:
			backing_format = 3
		in_fn = d.column_filename(colname, sliceno)
		if d.columns[colname].offsets:
			offset = d.columns[colname].offsets[sliceno]
			max_count = d.lines[sliceno]
		else:
			offset = 0
			max_count = -1
		if cfunc:
			default_value = options.defaults.get(colname, ffi.NULL)
			default_len = 0
			if default_value is None:
				default_value = ffi.NULL
				default_value_is_None = True
			else:
				default_value_is_None = False
				if default_value != ffi.NULL:
					if isinstance(default_value, unicode):
						default_value = default_value.encode("utf-8")
					default_len = len(default_value)
			bad_count = ffi.new('uint64_t [1]', [0])
			default_count = ffi.new('uint64_t [1]', [0])
			c = getattr(backend, 'convert_column_' + cfunc)
			res = c(*bytesargs(in_fn, out_fn, minmax_fn, default_value, default_len, default_value_is_None, fmt, fmt_b, record_bad, skip_bad, badmap_fd, badmap_size, bad_count, default_count, offset, max_count, backing_format))
			assert not res, 'Failed to convert ' + colname
			res_bad_count[colname] = bad_count[0]
			res_default_count[colname] = default_count[0]
			coltype = coltype.split(':', 1)[0]
			with type2iter[dataset_typing.typerename.get(coltype, coltype)](minmax_fn) as it:
				res_minmax[colname] = list(it)
			unlink(minmax_fn)
		else:
			# python func
			nodefault = object()
			if colname in options.defaults:
				default_value = options.defaults[colname]
				if default_value is not None:
					if isinstance(default_value, unicode):
						default_value = default_value.encode('utf-8')
					default_value = pyfunc(default_value)
			else:
				default_value = nodefault
			if options.filter_bad:
				badmap = mmap(badmap_fd, badmap_size)
				if PY2:
					badmap = IntegerBytesWrapper(badmap)
			bad_count = 0
			default_count = 0
			dont_minmax_types = {'bytes', 'ascii', 'unicode', 'json'}
			real_coltype = dataset_typing.typerename.get(coltype, coltype)
			do_minmax = real_coltype not in dont_minmax_types
			with typed_writer(real_coltype)(out_fn) as fh:
				col_min = col_max = None
				for ix, v in enumerate(d._column_iterator(sliceno, colname, _type='bytes' if backing_format == 3 else '_v2_bytes')):
					if skip_bad:
						if badmap[ix // 8] & (1 << (ix % 8)):
							bad_count += 1
							continue
					try:
						v = pyfunc(v)
					except ValueError:
						if default_value is not nodefault:
							v = default_value
							default_count += 1
						elif record_bad:
							bad_count += 1
							bv = badmap[ix // 8]
							badmap[ix // 8] = bv | (1 << (ix % 8))
							continue
						else:
							raise Exception("Invalid value %r with no default in %s" % (v, colname,))
					if do_minmax and not isinstance(v, NoneType):
						if col_min is None:
							col_min = col_max = v
						if v < col_min: col_min = v
						if v > col_max: col_max = v
					fh.write(v)
			if options.filter_bad:
				badmap.close()
			res_bad_count[colname] = bad_count
			res_default_count[colname] = default_count
			res_minmax[colname] = [col_min, col_max]
	return res_bad_count, res_default_count, res_minmax, link_candidates
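# On python 2, indexing an mmap yields 1-char strings while python 3 yields
# ints; IntegerBytesWrapper evens that out so the bit arithmetic above works
# on both. A minimal sketch of such a wrapper (the real class lives elsewhere
# in the codebase and may differ):
class _IntegerBytesWrapperSketch(object):
	def __init__(self, inner):
		self.inner = inner
	def __getitem__(self, ix):
		return ord(self.inner[ix])
	def __setitem__(self, ix, value):
		self.inner[ix] = chr(value)
	def close(self):
		self.inner.close()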
def synthesis(prepare_res, params):
	dws = prepare_res
	for dw in (dws.unhashed_split, dws.up_split,):
		w = dw.get_split_write_list()
		for row in all_data:
			w(row)
	for dw in dws.values():
		dw.finish()
	# Verify that the different ways of writing gave the same result
	for names in (
		("unhashed_split", "unhashed_manual"),
		("up_checked", "up_split"),
		("down_checked", "down_discarded", "down_discarded_list", "down_discarded_dict"),
	):
		dws = {name: Dataset((params.jobid, name)) for name in names}
		for sliceno in range(params.slices):
			data = {name: list(dws[name].iterate(sliceno)) for name in names}
			good = data[names[0]]
			for name in names[1:]:
				assert data[name] == good, "%s doesn't match %s in slice %d" % (names[0], name, sliceno,)
	# Verify that both up and down hashed on the expected column
	hash = typed_writer("int32").hash
	for colname in ("up", "down"):
		ds = Dataset((params.jobid, colname + "_checked"))
		for sliceno in range(params.slices):
			for value in ds.iterate(sliceno, colname):
				assert hash(value) % params.slices == sliceno, "Bad hashing on %s in slice %d" % (colname, sliceno,)
	# Verify that up and down are not the same, to catch hashing
	# not actually hashing.
	up = list(Dataset((params.jobid, "up_checked")).iterate(None))
	down = list(Dataset((params.jobid, "down_checked")).iterate(None))
	assert up != down, "Hashlabel did not change slice distribution"
	# And check that the data is still the same.
	assert sorted(up) == sorted(down) == all_data, "Hashed datasets have wrong data"
	# Verify that rehashing works.
	# (Can't use sliceno None, because that won't rehash, and even if it did
	# the order wouldn't match. Order doesn't even match in the rehashed
	# individual slices.)
	up = Dataset((params.jobid, "up_checked"))
	down = Dataset((params.jobid, "down_checked"))
	unhashed = Dataset((params.jobid, "unhashed_manual"))
	for sliceno in range(params.slices):
		a = list(up.iterate(sliceno))
		b = list(down.iterate(sliceno, hashlabel="up", rehash=True))
		c = list(unhashed.iterate(sliceno, hashlabel="up", rehash=True))
		assert sorted(a) == sorted(b) == sorted(c), "Rehashing is broken (slice %d)" % (sliceno,)
	# And finally verify that we are not allowed to specify the wrong hashlabel
	good = True
	try:
		up.iterate(None, hashlabel="down")
		good = False
	except AssertionError:
		pass
	try:
		unhashed.iterate(None, hashlabel="down")
		good = False
	except AssertionError:
		pass
	assert good, "Iteration allowed on the wrong hashlabel"
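# All the slice checks in synthesis() reduce to one predicate (sketch; value
# and sliceno are whatever the test is iterating over):
#
#	typed_writer("int32").hash(value) % params.slices == sliceno
#
# i.e. a value belongs in the slice its hash selects, which is what both
# get_split_write_list() and rehash=True iteration are expected to enforce.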