def prepare(params):
	d = datasets.source
	caption = options.caption % dict(caption=d.caption, hashlabel=options.hashlabel)
	prev_p = job_params(datasets.previous, default_empty=True)
	prev_source = prev_p.datasets.source
	if len(d.chain(stop_jobid=prev_source, length=options.length)) == 1:
		filename = d.filename
	else:
		filename = None
	dws = []
	previous = datasets.previous
	for sliceno in range(params.slices):
		if options.as_chain and sliceno == params.slices - 1:
			name = "default"
		else:
			name = str(sliceno)
		dw = DatasetWriter(
			caption="%s (slice %d)" % (caption, sliceno),
			hashlabel=options.hashlabel,
			filename=filename,
			previous=previous,
			name=name,
			for_single_slice=sliceno,
		)
		previous = (params.jobid, name)
		dws.append(dw)
	names = []
	for n, c in d.columns.items():
		# names has to be in the same order as the add calls
		# so the iterator returns the same order the writer expects.
		names.append(n)
		for dw in dws:
			dw.add(n, c.type)
	return dws, names, prev_source, caption, filename
def prepare():
	from dataset import DatasetWriter
	# previous allows chaining this method, should you wish to do so
	dw = DatasetWriter(previous=datasets.previous)
	dw.add('a string', 'ascii')       # ascii is not "any string", use 'unicode' for that
	dw.add('large number', 'number')  # number is any (real) number, a float or int of any size
	dw.add('small number', 'number')
	dw.add('small integer', 'int32')  # int32 is a signed 32 bit number
	dw.add('gauss number', 'number')
	dw.add('gauss float', 'float64')  # float64 is what many other languages call double
	return dw
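A minimal sketch (not from the original example) of an analysis() feeding the writer returned by the prepare() above. The values are made up, and it assumes column names are cleaned to valid identifiers (spaces become underscores), as the column-name example later in this collection describes.

def analysis(sliceno, prepare_res):
	dw = prepare_res
	# Hypothetical row, just to show the shape of a write; a real job would
	# generate its own data here.
	dw.write(
		a_string="hello",       # 'a string' assumed cleaned to a_string
		large_number=2 ** 70,   # 'number' accepts arbitrarily large ints
		small_number=1.5,
		small_integer=sliceno,
		gauss_number=0.5,
		gauss_float=0.5,
	)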
def prepare(params):
	columns = dict(
		bytes="bytes",
		float="float64",
		int="int64",
		json="json",
		unicode="unicode",
	)
	a = DatasetWriter(name="a", columns=columns)
	b = DatasetWriter(name="b", columns=columns, previous=(params.jobid, "a"))
	c = DatasetWriter(name="c", columns=columns)
	return a, b, c
def prepare():
	d = datasets.source
	columns = {}
	for colname, coltype in options.column2type.iteritems():
		assert d.columns[colname].type in ('bytes', 'ascii',), colname
		coltype = coltype.split(':', 1)[0]
		columns[options.rename.get(colname, colname)] = dataset_typing.typerename.get(coltype, coltype)
	if options.filter_bad or options.discard_untyped:
		assert options.discard_untyped is not False, "Can't keep untyped when filtering bad"
		parent = None
	else:
		parent = datasets.source
	return DatasetWriter(
		columns=columns,
		caption=options.caption,
		hashlabel=options.rename.get(d.hashlabel, d.hashlabel),
		hashlabel_override=True,
		parent=parent,
		previous=datasets.previous,
		meta_only=True,
	)
def prepare(params):
	assert params.slices >= test_data.value_cnt
	dw_default = DatasetWriter()
	dw_default.add("a", "number")
	dw_default.add("b", "ascii")
	DatasetWriter(name="named", columns={"c": "bool", "d": "date"})
	dw_passed = DatasetWriter(name="passed", columns={t: t for t in test_data.data})
	return dw_passed, 42
def synthesis(prepare_res, params):
	if not options.as_chain:
		# If we don't want a chain we abuse our knowledge of dataset internals
		# to avoid recompressing. Don't do this stuff yourself.
		dws, names, prev_source, caption, filename = prepare_res
		merged_dw = DatasetWriter(
			caption=caption,
			hashlabel=options.hashlabel,
			filename=filename,
			previous=datasets.previous,
			meta_only=True,
			columns=datasets.source.columns,
		)
		for sliceno in range(params.slices):
			merged_dw.set_lines(sliceno, sum(dw._lens[sliceno] for dw in dws))
			for dwno, dw in enumerate(dws):
				merged_dw.set_minmax((sliceno, dwno), dw._minmax[sliceno])
			for n in names:
				fn = merged_dw.column_filename(n, sliceno=sliceno)
				with open(fn, "wb") as out_fh:
					for dw in dws:
						fn = dw.column_filename(n, sliceno=sliceno)
						with open(fn, "rb") as in_fh:
							copyfileobj(in_fh, out_fh)
		for dw in dws:
			dw.discard()
def synthesis(jobid):
	manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
	manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
	# build a local abf chain
	prev = None
	for ix, ds in enumerate(manual_abf):
		name = "abf%d" % (ix,)
		ds.link_to_here(name, override_previous=prev)
		prev = (jobid, name,)
	manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
	local_abf_data = list(Dataset(jobid, "abf2").iterate_chain(None, None))
	assert manual_abf_data == local_abf_data
	# disconnect h, verify there is no chain
	manual_chain[-1].link_to_here("alone", override_previous=None)
	assert len(Dataset(jobid, "alone").chain()) == 1
	# check that the original chain is unhurt
	assert manual_chain == manual_chain[-1].chain()

	# So far so good, now make a chain long enough to have a cache.
	prev = None
	ix = 0
	going = True
	while going:
		if prev and "cache" in Dataset(prev)._data:
			going = False
		name = "longchain%d" % (ix,)
		dw = DatasetWriter(name=name, previous=prev)
		dw.add("ix", "number")
		dw.get_split_write()(ix)
		dw.finish()
		prev = (jobid, name,)
		ix += 1
	# we now have a chain that goes one past the first cache point
	full_chain = Dataset(prev).chain()
	assert "cache" in full_chain[-2]._data  # just to check the above logic is correct
	assert "cache" not in full_chain[-1]._data  # just to be sure..
	full_chain[-2].link_to_here("nocache", override_previous=None)
	full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
	assert "cache" not in Dataset(jobid, "nocache")._data
	assert "cache" in Dataset(jobid, "withcache")._data
	# And make sure they both get the right data too.
	assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
	assert list(Dataset(jobid, "nocache").iterate_chain(None, "ix")) == [ix - 2]
	assert list(Dataset(jobid, "withcache").iterate_chain(None, "ix")) == list(range(ix - 2)) + [ix - 1]
def real_prepare(d, previous, options):
	column_types = {n: c.type for n, c in d.columns.items()}
	column_sizes = []
	column_names = list(column_types)
	column_names.remove(options.date_column)
	column_names.insert(0, options.date_column)
	minmax_typeidx = []
	for colname in column_names:
		typ = column_types[colname]
		column_sizes.append(dataset_typing.typesizes[typ])
		minmax_typeidx.append(minmax_type2idx.get(typ, -1))
	minmax_typeidx = ffi.new('int []', minmax_typeidx)
	kw = dict(
		columns=column_types,
		hashlabel=d.hashlabel,
		caption=options.caption,
		previous=previous,
		meta_only=True,
	)
	dw = DatasetWriter(**kw)
	dw_spill = DatasetWriter(name='SPILL', **kw)
	return dw, dw_spill, column_names, column_sizes, column_types, minmax_typeidx
def prepare(params):
	d = datasets.source
	ds_list = d.chain(stop_ds={datasets.previous: 'source'})
	if options.sort_across_slices:
		columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
		sort_idx = sort(columniter)
		total = len(sort_idx)
		per_slice = [total // params.slices] * params.slices
		extra = total % params.slices
		if extra:
			# spread the left over length over pseudo-randomly selected slices
			# (using the start of sort_idx to select slices).
			# this will always select the first slices if data is already sorted
			# but at least it's deterministic.
			selector = sorted(range(min(params.slices, total)), key=sort_idx.__getitem__)
			for sliceno in selector[:extra]:
				per_slice[sliceno] += 1
		# change per_slice to be the actual sort indexes
		start = 0
		for ix, num in enumerate(per_slice):
			end = start + num
			per_slice[ix] = sort_idx[start:end]
			start = end
		assert sum(len(part) for part in per_slice) == total  # all rows used
		assert len(set(len(part) for part in per_slice)) < 3  # only 1 or 2 lengths possible
		sort_idx = per_slice
	else:
		sort_idx = None
	if options.sort_across_slices:
		hashlabel = None
	else:
		hashlabel = d.hashlabel
	if len(ds_list) == 1:
		filename = d.filename
	else:
		filename = None
	dw = DatasetWriter(
		columns=d.columns,
		caption=params.caption,
		hashlabel=hashlabel,
		filename=filename,
	)
	return dw, ds_list, sort_idx
def prepare(params):
	dws = {}
	prev = None
	for name in "abcdefgh":
		dw = DatasetWriter(name=name, previous=prev)
		dw.add("ds", "ascii")
		dw.add("num", "number")
		dws[name] = dw
		prev = "%s/%s" % (params.jobid, name,)
	return dws
def prepare(params):
	d = datasets.source
	jobs = d.chain(stop_jobid={datasets.previous: 'source'})
	if options.sort_across_slices:
		columniter = partial(Dataset.iterate_list, None, jobids=jobs)
		sort_idx = sort(columniter)
	else:
		sort_idx = None
	if options.sort_across_slices:
		hashlabel = None
	else:
		hashlabel = d.hashlabel
	if len(jobs) == 1:
		filename = d.filename
	else:
		filename = None
	dw = DatasetWriter(
		columns=d.columns,
		caption=params.caption,
		hashlabel=hashlabel,
		filename=filename,
	)
	return dw, jobs, sort_idx
def prepare(params):
	assert params.slices >= 2, "Hashing won't do anything with just one slice"
	dws = DotDict()
	for name, hashlabel in (
		("unhashed_manual", None),          # manually interleaved
		("unhashed_split", None),           # split_write interleaved
		("up_checked", "up"),               # hashed on up using dw.hashcheck
		("up_split", "up"),                 # hashed on up using split_write
		("down_checked", "down"),           # hashed on down using dw.hashcheck
		("down_discarded", "down"),         # hashed on down using discarding writes
		("down_discarded_list", "down"),    # hashed on down using discarding list writes
		("down_discarded_dict", "down"),    # hashed on down using discarding dict writes
	):
		dw = DatasetWriter(name=name, hashlabel=hashlabel)
		dw.add("up", "int32")
		dw.add("down", "int32")
		dws[name] = dw
	return dws
def synthesis(params): dw = DatasetWriter(name="parent") in_parent = [ # list because order matters "-", # becomes _ because everything must be a valid python identifier. "a b", # becomes a_b because everything must be a valid python identifier. "42", # becomes _42 because everything must be a valid python identifier. "print", # becomes print_ because print is a keyword (in py2). "print@", # becomes print__ because print_ is taken. "None", # becomes None_ because None is a keyword (in py3). ] for colname in in_parent: dw.add(colname, "unicode") w = dw.get_split_write() w(_="- 1", a_b="a b 1", _42="42 1", print_="print 1", None_="None 1", print__="Will be overwritten 1") w(_="- 2", a_b="a b 2", _42="42 2", print_="print 2", None_="None 2", print__="Will be overwritten 2") parent = dw.finish() dw = DatasetWriter(name="child", parent=parent) in_child = [ # order still matters "print_*", # becomes print___ because print__ is taken. "print_", # becomes print____ because all shorter are taken. "normal", # no collision. "Normal", # no collision. "print@", # re-uses print__ from the parent dataset. ] for colname in in_child: dw.add(colname, "unicode") w = dw.get_split_write() w(print__="print@ 1", print___="print_* 1", print____="print_ 1", normal="normal 1", Normal="Normal 1") w(print__="print@ 2", print___="print_* 2", print____="print_ 2", normal="normal 2", Normal="Normal 2") child = dw.finish() for colname in in_parent + in_child: data = set(child.iterate(None, colname)) assert data == {colname + " 1", colname + " 2"}, "Bad data for %s: %r" % (colname, data)
def prepare(): dw = DatasetWriter() dw.add("str", "ascii") dw.add("num", "number") return dw
def prepare():
	dw = DatasetWriter(parent=datasets.source)
	dw.add('prod', 'number')  # works for float as well as int
	return dw
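A minimal sketch (an assumption, not taken from the source) of an analysis() that appends the new 'prod' column to the parent dataset; 'x' and 'y' are hypothetical numeric columns assumed to exist in datasets.source.

def analysis(sliceno, prepare_res):
	dw = prepare_res
	# Iterate the source in this slice and write only the new column;
	# 'x' and 'y' are hypothetical column names.
	for x, y in datasets.source.iterate(sliceno, ["x", "y"]):
		dw.write(prod=x * y)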
def prepare(SOURCE_DIRECTORY):
	separator = options.separator
	assert len(separator) == 1
	filename = os.path.join(SOURCE_DIRECTORY, options.filename)
	orig_filename = filename
	if filename.lower().endswith('.zip'):
		from zipfile import ZipFile
		filename = 'extracted'
		with ZipFile(orig_filename, 'r') as z:
			infos = z.infolist()
			assert len(infos) == 1, 'There is only support for ZIP files with exactly one member.'
			# Wouldn't it be nice if ZipFile.extract let me choose the filename?
			with open(filename, 'wb') as ofh:
				zfh = z.open(infos[0])
				while True:
					data = zfh.read(1024 * 1024)
					if not data:
						break
					ofh.write(data)
	if options.labelsonfirstline:
		with gzutil.GzBytesLines(filename, strip_bom=True) as fh:
			labels_str = next(fh).decode('ascii', 'replace').encode('ascii', 'replace')  # garbage -> '?'
		if options.quote_support:
			labels = []
			sep = options.separator
			while labels_str is not None:
				if labels_str.startswith(('"', "'",)):
					q = labels_str[0]
					pos = 1
					while pos + 1 < len(labels_str):
						pos = labels_str.find(q, pos)
						if pos == -1:
							# all is lost
							pos = len(labels_str) - 1
						if pos + 1 == len(labels_str):
							# eol
							break
						if labels_str[pos + 1] == sep:
							break
						# we'll just assume it was a quote, because what else to do?
						labels_str = labels_str[:pos] + labels_str[pos + 1:]
						pos += 1
					labels.append(labels_str[1:pos])
					if len(labels_str) > pos + 1:
						labels_str = labels_str[pos + 2:]
					else:
						labels_str = None
				else:
					if sep in labels_str:
						field, labels_str = labels_str.split(sep, 1)
					else:
						field, labels_str = labels_str, None
					labels.append(field)
		else:
			labels = labels_str.split(options.separator)
	labels = options.labels or labels  # only from file if not specified in options
	assert labels, "No labels"
	labels = [options.rename.get(x, x) for x in labels]
	assert '' not in labels, "Empty label for column %d" % (labels.index(''),)
	assert len(labels) == len(set(labels)), "Duplicate labels: %r" % (labels,)
	dw = DatasetWriter(
		columns={n: 'bytes' for n in labels},
		filename=orig_filename,
		hashlabel=options.hashlabel,
		caption='csvimport of ' + orig_filename,
		previous=datasets.previous,
		meta_only=True,
	)
	return separator, filename, orig_filename, labels, dw,
def analysis_lap(sliceno, badmap_fh, first_lap):
	known_line_count = 0
	badmap_size = 0
	badmap_fd = -1
	res_bad_count = {}
	res_default_count = {}
	res_minmax = {}
	link_candidates = []
	if first_lap:
		record_bad = options.filter_bad
		skip_bad = 0
	else:
		record_bad = 0
		skip_bad = options.filter_bad
	minmax_fn = 'minmax%d' % (sliceno,)
	dw = DatasetWriter()
	for colname, coltype in iteritems(options.column2type):
		out_fn = dw.column_filename(options.rename.get(colname, colname))
		fmt = fmt_b = None
		if coltype in dataset_typing.convfuncs:
			shorttype = coltype
			_, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
		else:
			shorttype, fmt = coltype.split(':', 1)
			_, cfunc, pyfunc = dataset_typing.convfuncs[shorttype + ':*']
		if cfunc:
			cfunc = shorttype.replace(':', '_')
		if pyfunc:
			tmp = pyfunc(coltype)
			if callable(tmp):
				pyfunc = tmp
				cfunc = None
			else:
				pyfunc = None
				cfunc, fmt, fmt_b = tmp
		if coltype == 'number':
			cfunc = 'number'
		elif coltype == 'number:int':
			coltype = 'number'
			cfunc = 'number'
			fmt = "int"
		assert cfunc or pyfunc, coltype + " didn't have cfunc or pyfunc"
		coltype = shorttype
		d = datasets.source
		assert d.columns[colname].type in byteslike_types, colname
		if options.filter_bad:
			line_count = d.lines[sliceno]
			if known_line_count:
				assert line_count == known_line_count, (colname, line_count, known_line_count)
			else:
				known_line_count = line_count
				pagesize = getpagesize()
				badmap_size = (line_count // 8 // pagesize + 1) * pagesize
				badmap_fh.truncate(badmap_size)
				badmap_fd = badmap_fh.fileno()
		if d.columns[colname].backing_type.startswith('_v2_'):
			backing_format = 2
		else:
			backing_format = 3
		in_fn = d.column_filename(colname, sliceno)
		if d.columns[colname].offsets:
			offset = d.columns[colname].offsets[sliceno]
			max_count = d.lines[sliceno]
		else:
			offset = 0
			max_count = -1
		if cfunc:
			default_value = options.defaults.get(colname, ffi.NULL)
			default_len = 0
			if default_value is None:
				default_value = ffi.NULL
				default_value_is_None = True
			else:
				default_value_is_None = False
				if default_value != ffi.NULL:
					if isinstance(default_value, unicode):
						default_value = default_value.encode("utf-8")
					default_len = len(default_value)
			bad_count = ffi.new('uint64_t [1]', [0])
			default_count = ffi.new('uint64_t [1]', [0])
			c = getattr(backend, 'convert_column_' + cfunc)
			res = c(*bytesargs(in_fn, out_fn, minmax_fn, default_value, default_len, default_value_is_None,
				fmt, fmt_b, record_bad, skip_bad, badmap_fd, badmap_size,
				bad_count, default_count, offset, max_count, backing_format))
			assert not res, 'Failed to convert ' + colname
			res_bad_count[colname] = bad_count[0]
			res_default_count[colname] = default_count[0]
			coltype = coltype.split(':', 1)[0]
			with type2iter[dataset_typing.typerename.get(coltype, coltype)](minmax_fn) as it:
				res_minmax[colname] = list(it)
			unlink(minmax_fn)
		else:
			# python func
			nodefault = object()
			if colname in options.defaults:
				default_value = options.defaults[colname]
				if default_value is not None:
					if isinstance(default_value, unicode):
						default_value = default_value.encode('utf-8')
					default_value = pyfunc(default_value)
			else:
				default_value = nodefault
			if options.filter_bad:
				badmap = mmap(badmap_fd, badmap_size)
				if PY2:
					badmap = IntegerBytesWrapper(badmap)
			bad_count = 0
			default_count = 0
			dont_minmax_types = {'bytes', 'ascii', 'unicode', 'json'}
			real_coltype = dataset_typing.typerename.get(coltype, coltype)
			do_minmax = real_coltype not in dont_minmax_types
			with typed_writer(real_coltype)(out_fn) as fh:
				col_min = col_max = None
				for ix, v in enumerate(d._column_iterator(sliceno, colname, _type='bytes' if backing_format == 3 else '_v2_bytes')):
					if skip_bad:
						if badmap[ix // 8] & (1 << (ix % 8)):
							bad_count += 1
							continue
					try:
						v = pyfunc(v)
					except ValueError:
						if default_value is not nodefault:
							v = default_value
							default_count += 1
						elif record_bad:
							bad_count += 1
							bv = badmap[ix // 8]
							badmap[ix // 8] = bv | (1 << (ix % 8))
							continue
						else:
							raise Exception("Invalid value %r with no default in %s" % (v, colname,))
					if do_minmax and not isinstance(v, NoneType):
						if col_min is None:
							col_min = col_max = v
						if v < col_min:
							col_min = v
						if v > col_max:
							col_max = v
					fh.write(v)
			if options.filter_bad:
				badmap.close()
			res_bad_count[colname] = bad_count
			res_default_count[colname] = default_count
			res_minmax[colname] = [col_min, col_max]
	return res_bad_count, res_default_count, res_minmax, link_candidates
def write(data, **kw):
	dw = DatasetWriter(columns=columns, **kw)
	w = dw.get_split_write_dict()
	for values in data:
		w(values)
	return dw.finish()
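A hedged usage sketch of the write() helper above. It assumes a module-level `columns` dict is in scope and that the finished dataset can be passed as previous to chain a second one; both column names and values are invented for illustration.

# columns = {"num": "number", "name": "unicode"}  # assumed to be defined nearby
first = write([{"num": 1, "name": "one"}, {"num": 2, "name": "two"}], name="first")
second = write([{"num": 3, "name": "three"}], name="second", previous=first)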
def analysis(sliceno, prepare_res):
	dw_default = DatasetWriter()
	dw_named = DatasetWriter(name="named")
	dw_passed, num = prepare_res
	dw_default.write(a=sliceno, b="a")
	dw_default.write_list([num, str(sliceno)])
	dw_named.write(True, date(1536, 12, min(sliceno + 1, 31)))
	dw_named.write_dict({"c": False, "d": date(2236, 5, min(sliceno + 1, 31))})
	# slice 0 is written in synthesis
	if 0 < sliceno < test_data.value_cnt:
		dw_passed.write_dict({k: v[sliceno] for k, v in test_data.data.items()})
def synthesis(prepare_res, params):
	dw_passed, _ = prepare_res
	# Using set_slice on a dataset that was written in analysis is not
	# actually supported, but since it currently works (as long as that
	# particular slice wasn't written in analysis) let's test it.
	dw_passed.set_slice(0)
	dw_passed.write(**{k: v[0] for k, v in test_data.data.items()})
	dw_synthesis_split = DatasetWriter(name="synthesis_split", hashlabel="a")
	dw_synthesis_split.add("a", "int32")
	dw_synthesis_split.add("b", "unicode")
	dw_synthesis_split.get_split_write()(1, "a")
	dw_synthesis_split.get_split_write_list()([2, "b"])
	dw_synthesis_split.get_split_write_dict()({"a": 3, "b": "c"})
	dw_synthesis_manual = DatasetWriter(name="synthesis_manual", columns={"sliceno": "int32"})
	dw_nonetest = DatasetWriter(name="nonetest", columns={t: t for t in test_data.data})
	for sliceno in range(params.slices):
		dw_synthesis_manual.set_slice(sliceno)
		dw_synthesis_manual.write(sliceno)
		dw_nonetest.set_slice(sliceno)
		dw_nonetest.write(**{k: v[0] if k in test_data.not_none_capable else None for k, v in test_data.data.items()})
def _verify(name, types, data, coltype, want, default, want_fail, kw):
	if callable(want):
		check = want
	else:
		def check(got, fromstr, filtered=False):
			want1 = want if isinstance(want, list) else want[typ]
			if filtered:
				want1 = want1[::2]
			assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
	dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
	dw.set_slice(0)
	for ix, v in enumerate(data):
		dw.write(v, b'1' if ix % 2 == 0 else b'skip')
	for sliceno in range(1, g.SLICES):
		dw.set_slice(sliceno)
	bytes_ds = dw.finish()
	for typ in types:
		opts = dict(column2type=dict(data=typ))
		opts.update(kw)
		if default is not no_default:
			opts['defaults'] = {'data': default}
		try:
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
		except JobError:
			if want_fail:
				continue
			raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
		assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, 'data'))
		check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
		if 'filter_bad' not in opts and not callable(want):
			opts['filter_bad'] = True
			opts['column2type']['extra'] = 'int32_10'
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
			typed_ds = Dataset(jid)
			got = list(typed_ds.iterate(0, 'data'))
			check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
		used_type(typ)
def test_filter_bad_across_types():
	columns = {
		'bytes': 'bytes',
		'float64': 'bytes',
		'int32_10': 'ascii',
		'json': 'unicode',
		'number:int': 'unicode',
		'unicode:utf-8': 'bytes',
	}
	# all_good, *values
	# Make sure all those types (except bytes) can filter other lines,
	# and be filtered by other lines. And that several filtering values
	# is not a problem (line 11).
	data = [
		(True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
		(True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
		(True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
		(False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),   # number:int bad
		(False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),    # int32_10 bad
		(False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),    # float64 bad
		[False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],    # json bad
		(False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),  # unicode:utf-8 bad
		(True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
		(True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
		(False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),   # float64, int32_10 and number:int bad
		(True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
	]
	dw = DatasetWriter(name="filter bad across types", columns=columns)
	dw.set_slice(0)
	want = []
	def add_want(v):
		want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
	for v in data:
		if v[0]:
			add_want(v)
		dw.write(*v[1:])
	for sliceno in range(1, g.SLICES):
		dw.set_slice(sliceno)
	source_ds = dw.finish()
	# Once with just filter_bad, once with some defaults too.
	defaults = {}
	for _ in range(2):
		jid = subjobs.build(
			'dataset_type',
			datasets=dict(source=source_ds),
			options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
		)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
		assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
		# make more lines "ok" for the second lap
		defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
		add_want(data[3])
		add_want(data[5])
		data[6][4] = '"replacement"'
		add_want(data[6])
		want.sort()  # adding them out of order, int32_10 sorts correctly.
def analysis_lap(sliceno, badmap_fh, first_lap):
	known_line_count = 0
	badmap_size = 0
	badmap_fd = -1
	res_bad_count = {}
	res_default_count = {}
	res_minmax = {}
	link_candidates = []
	if first_lap:
		record_bad = options.filter_bad
		skip_bad = 0
	else:
		record_bad = 0
		skip_bad = options.filter_bad
	minmax_fn = 'minmax%d' % (sliceno,)
	dw = DatasetWriter()
	for colname, coltype in options.column2type.iteritems():
		out_fn = dw.column_filename(options.rename.get(colname, colname)).encode('ascii')
		if ':' in coltype and not coltype.startswith('number:'):
			coltype, fmt = coltype.split(':', 1)
			_, cfunc, pyfunc = dataset_typing.convfuncs[coltype + ':*']
			if '%f' in fmt:
				# needs to fall back to python version
				cfunc = None
			if not cfunc:
				pyfunc = pyfunc(coltype, fmt)
		else:
			_, cfunc, pyfunc = dataset_typing.convfuncs[coltype]
			fmt = ffi.NULL
		d = datasets.source
		assert d.columns[colname].type in ('bytes', 'ascii',), colname
		if options.filter_bad:
			line_count = d.lines[sliceno]
			if known_line_count:
				assert line_count == known_line_count, (colname, line_count, known_line_count)
			else:
				known_line_count = line_count
				pagesize = getpagesize()
				badmap_size = (line_count // 8 // pagesize + 1) * pagesize
				badmap_fh.truncate(badmap_size)
				badmap_fd = badmap_fh.fileno()
		if d.columns[colname].backing_type.startswith('_v2_'):
			backing_format = 2
		else:
			backing_format = 3
		in_fn = d.column_filename(colname, sliceno).encode('ascii')
		if d.columns[colname].offsets:
			offset = d.columns[colname].offsets[sliceno]
			max_count = d.lines[sliceno]
		else:
			offset = 0
			max_count = -1
		if coltype == 'number':
			cfunc = True
		if coltype == 'number:int':
			coltype = 'number'
			cfunc = True
			fmt = "int"
		if cfunc:
			default_value = options.defaults.get(colname, ffi.NULL)
			if default_value is None:
				default_value = ffi.NULL
				default_value_is_None = True
			else:
				default_value_is_None = False
			bad_count = ffi.new('uint64_t [1]', [0])
			default_count = ffi.new('uint64_t [1]', [0])
			c = getattr(backend, 'convert_column_' + coltype)
			res = c(in_fn, out_fn, minmax_fn, default_value, default_value_is_None, fmt,
				record_bad, skip_bad, badmap_fd, badmap_size,
				bad_count, default_count, offset, max_count, backing_format)
			assert not res, 'Failed to convert ' + colname
			res_bad_count[colname] = bad_count[0]
			res_default_count[colname] = default_count[0]
			with type2iter[dataset_typing.typerename.get(coltype, coltype)](minmax_fn) as it:
				res_minmax[colname] = list(it)
			unlink(minmax_fn)
		elif pyfunc is str:
			# We skip it the first time around, and link it from
			# the source dataset if there were no bad lines.
			# (That happens at the end of analysis.)
			# We can't do that if the file is not slice-specific though.
			# And we also can't do it if the column is in the wrong (old) format.
			if skip_bad or '%s' not in d.column_filename(colname, '%s') or backing_format != 3:
				res = backend.filter_strings(in_fn, out_fn, badmap_fd, badmap_size, offset, max_count, backing_format)
				assert not res, 'Failed to convert ' + colname
			else:
				link_candidates.append((in_fn, out_fn,))
			res_bad_count[colname] = 0
			res_default_count[colname] = 0
		elif pyfunc is str.strip:
			res = backend.filter_stringstrip(in_fn, out_fn, badmap_fd, badmap_size, offset, max_count, backing_format)
			assert not res, 'Failed to convert ' + colname
			res_bad_count[colname] = 0
			res_default_count[colname] = 0
		else:
			# python func
			nodefault = object()
			if colname in options.defaults:
				if options.defaults[colname] is None:
					default_value = None
				else:
					default_value = pyfunc(options.defaults[colname])
			else:
				default_value = nodefault
			if options.filter_bad:
				badmap = mmap(badmap_fd, badmap_size)
			bad_count = 0
			default_count = 0
			with typed_writer(dataset_typing.typerename.get(coltype, coltype))(out_fn) as fh:
				col_min = col_max = None
				for ix, v in enumerate(d.iterate(sliceno, colname)):
					if skip_bad:
						if ord(badmap[ix // 8]) & (1 << (ix % 8)):
							bad_count += 1
							continue
					try:
						v = pyfunc(v)
					except ValueError:
						if default_value is not nodefault:
							v = default_value
							default_count += 1
						elif record_bad:
							bad_count += 1
							bv = ord(badmap[ix // 8])
							badmap[ix // 8] = chr(bv | (1 << (ix % 8)))
							continue
						else:
							raise Exception("Invalid value %r with no default in %s" % (v, colname,))
					if not isinstance(v, (NoneType, str, unicode,)):
						if col_min is None:
							col_min = col_max = v
						if v < col_min:
							col_min = v
						if v > col_max:
							col_max = v
					fh.write(v)
			if options.filter_bad:
				badmap.close()
			res_bad_count[colname] = bad_count
			res_default_count[colname] = default_count
			res_minmax[colname] = [col_min, col_max]
	return res_bad_count, res_default_count, res_minmax, link_candidates
def prepare():
	dw = DatasetWriter(previous=datasets.previous)
	dw.add('rflt', 'float64')
	dw.add('rint', 'int64')
	return dw
def prepare():
	return DatasetWriter(columns={t: t for t in test_data.data})
def do_one(params, name, data):
	dw = DatasetWriter(name=name, columns=columns)
	dw.set_slice(0)
	for v in data:
		if v is None:
			d = dict(
				ascii_new=None,
				ascii_old=None,
				bytes_new=None,
				bytes_old=None,
				unicode_new=None,
				unicode_old=None,
			)
		else:
			d = dict(
				ascii_new=v,
				ascii_old=v,
				bytes_new=uni(v).encode("ascii"),
				bytes_old=uni(v).encode("ascii"),
				unicode_new=uni(v),
				unicode_old=uni(v),
			)
		dw.write_dict(d)
	# We don't really want the other slices, but write one thing to
	# each, to make sure it doesn't show up in slice 0.
	# (Small slice merging will put it in the same file, so this is
	# a real risk.)
	for sliceno in range(1, params.slices):
		dw.set_slice(sliceno)
		dw.write_dict(d)
	dw.finish()

	# verify we got what we asked for
	me_ds = Dataset(params.jobid, name)
	for colname, coltype in columns.items():
		col = me_ds.columns[colname]
		assert col.type == coltype.split("_")[-1], colname
		assert col.backing_type == coltype, colname
		for want, got in zip(data, me_ds.iterate(0, colname)):
			if want is not None:
				if PY2 and "unicode" in coltype:
					want = uni(want)
				if PY3 and "bytes" in coltype:
					want = want.encode("ascii")
			assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)

	# check that both types of bytes filter correctly through typing
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="number",  # fails on the string, so that gets filtered out everywhere
			bytes_new="bytes",
			bytes_old="bytes",
		),
		filter_bad=True,
	))
	ds = Dataset(jid)
	# verify the number first
	data_it = iter(raw_data)
	next(data_it)  # skip the filtered out string
	for got in ds.iterate(0, "ascii_old"):
		want = next(data_it)
		if want is None:
			# Becomes 0 because the typer (unfortunately) sees it as an empty string
			want = 0
		assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
	# now verify all the bytes ones are ok, no longer containing the string.
	for colname in ("ascii_new", "bytes_new", "bytes_old",):
		data_it = iter(data)
		next(data_it)  # skip the filtered out string
		for got in ds.iterate(0, colname):
			want = next(data_it)
			if want is not None:
				want = want.encode("ascii")
			assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)

	# and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="bytes",
			bytes_new="unicode:ascii",
			bytes_old="unicode:ascii",
		),
	))
	ds = Dataset(jid)
	for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
		for want, got in ds.iterate(0, ["unicode_new", colname]):
			assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)