def synthesis(params):
	source = Dataset(subjobs.build("test_sorting_gendata"))
	# Test that all datatypes work for sorting
	for key in test_data.data:
		check_one(params.slices, key, source)
	# Check reverse sorting
	check_one(params.slices, "int32", source, reverse=True)
	# Check that sorting across slices and by two columns works
	jid = subjobs.build(
		"dataset_sort",
		options=dict(
			sort_columns=["int64", "int32"],
			sort_order="descending",
			sort_across_slices=True,
		),
		datasets=dict(source=source),
	)
	int64_off = sorted(test_data.data).index("int64")
	int32_off = sorted(test_data.data).index("int32")
	all_data = chain.from_iterable(test_data.sort_data_for_slice(sliceno) for sliceno in range(params.slices))
	good = sorted(all_data, key=lambda t: (t[int64_off], t[int32_off]), reverse=True)
	ds = Dataset(jid)
	check = list(ds.iterate(None))
	assert check == good, "Sorting across slices on [int64, int32] bad (%s)" % (jid,)
def check_one(params, line_sep, sep, data, want_res=None, prefix="", quotes=False, leave_bad=False):
	sep_c = chr(sep)
	# Can't have separator character in unquoted values
	if not quotes and not leave_bad:
		data = [[el.replace(sep_c, "") for el in line] for line in data]
	if not want_res:
		want_res = [tuple(s.encode("ascii") for s in line) for line in data[1:]]
	filename = "%s_csv.%d.%r.txt" % (prefix, sep, line_sep)
	with open(filename, "w") as fh:
		for line in data:
			if quotes:
				line = [quotes + el.replace(quotes, quotes + quotes) + quotes for el in line]
			fh.write(sep_c.join(line))
			fh.write(line_sep)
	try:
		jid = subjobs.build("csvimport", options=dict(
			filename=resolve_jobid_filename(params.jobid, filename),
			separator=sep_c,
			quote_support=bool(quotes),
		))
	except JobError as e:
		raise CSVImportException("Failed to csvimport for separator %d with line separator %r, csvimport error was:\n%s" % (sep, line_sep, e.format_msg()))
	ds = Dataset(jid)
	labels = sorted(ds.columns)
	if labels != data[0]:
		raise WrongLabelsException("csvimport gave wrong labels for separator %d with line separator %r: %r (expected %r)" % (sep, line_sep, labels, data[0]))
	res = list(ds.iterate(None, data[0]))
	if res != want_res:
		raise WrongDataException("csvimport gave wrong data for separator %d with line separator %r: %r (expected %r)" % (sep, line_sep, res, want_res))
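# The three exception classes raised above are not defined in this snippet.
# A minimal sketch, assuming they are plain Exception subclasses used only to
# tell the failure modes apart:
class CSVImportException(Exception):
	pass

class WrongLabelsException(Exception):
	pass

class WrongDataException(Exception):
	pass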
def synthesis(analysis_res):
	opts = DotDict(options)
	del opts.inside_filenames
	lst = analysis_res.merge_auto()
	for fn, dsn in lst:
		opts.filename = fn
		jid = subjobs.build('csvimport', options=opts)
		unlink(fn)
		Dataset(jid).link_to_here(dsn)
	if len(lst) == 1 and dsn != 'default':
		Dataset(jid).link_to_here('default')
def verify(zipname, inside_filenames, want_ds, **kw):
	opts = dict(
		filename=resolve_jobid_filename(g.jobid, zipname),
		inside_filenames=inside_filenames,
	)
	opts.update(kw)
	jid = subjobs.build('csvimport_zip', options=opts)
	for dsn, want_data in want_ds.items():
		got_data = list(Dataset(jid, dsn).iterate(None, '0'))
		assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (jid, dsn, zipname, want_data, got_data)
def synthesis():
	typerename = dict(
		int64="int64_10",
		int32="int32_10",
		bits64="bits64_10",
		bits32="bits32_10",
		bool="strbool",
		datetime="datetime:%Y-%m-%d %H:%M:%S.%f",
		date="date:%Y-%m-%d",
		time="time:%H:%M:%S.%f",
		unicode="unicode:utf-8",
	)
	columns = {k: typerename.get(v.type, v.type) for k, v in datasets.typed.columns.items()}
	retyped = subjobs.build("dataset_type", options=dict(column2type=columns), datasets=dict(source=datasets.untyped))
	subjobs.build("test_compare_datasets", datasets=dict(a=datasets.typed, b=retyped))
def synthesis():
	sum = 0
	jobs = datasets.source.chain(length=options.chain_length, stop_jobid=datasets.stop)
	for src in jobs:
		jid = build('dataset_checksum', options=dict(columns=options.columns, sort=options.sort), datasets=dict(source=src))
		data = blob.load(jobid=jid)
		sum ^= data.sum  # XOR, so the order of the sources doesn't matter
	print("Total: %016x" % (sum,))
	return DotDict(sum=sum, columns=data.columns, sort=options.sort, sources=jobs)
def synthesis(params, prepare_res):
	dw = prepare_res
	source = dw.finish()
	jid = subjobs.build(
		"dataset_sort",
		options=dict(
			sort_columns="num",
			sort_across_slices=True,
		),
		datasets=dict(source=source),
	)
	ds = Dataset(jid)
	data = list(ds.iterate(None, "str"))
	good = list("cghjabdefi") + [str(sliceno) for sliceno in range(params.slices)] * 64
	assert data == good
def check_no_separator(params):
	def write(data):
		fh.write(data + nl_b)
		wrote_c[data] += 1
		if q_b:
			data = q_b + data + q_b
			fh.write(q_b + data.replace(q_b, q_b + q_b) + q_b + nl_b)
			wrote_c[data] += 1
	for nl in (10, 0, 255):
		for q in (None, 0, 34, 13, 10, 228):
			if nl == q:
				continue
			filename = "no separator.%r.%r.txt" % (nl, q,)
			nl_b = bytechr(nl)
			q_b = bytechr(q) if q else b''
			wrote_c = Counter()
			with openx(filename) as fh:
				for splitpoint in range(256):
					write(byteline(0, splitpoint, nl, q))
					write(byteline(splitpoint, 256, nl, q))
			try:
				jid = subjobs.build("csvimport", options=dict(
					filename=resolve_jobid_filename(params.jobid, filename),
					quotes=q_b.decode("iso-8859-1"),
					newline=nl_b.decode("iso-8859-1"),
					separator='',
					labelsonfirstline=False,
					labels=["data"],
				))
			except JobError:
				raise Exception("Importing %r failed" % (filename,))
			got_c = Counter(Dataset(jid).iterate(None, "data"))
			assert got_c == wrote_c, "Importing %r (%s) gave wrong contents" % (filename, jid,)
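# The helpers bytechr, byteline and openx are not shown here. A minimal
# sketch of plausible (assumed, not the original) implementations: bytechr
# builds a single byte in a py2/py3-compatible way, byteline produces one
# line containing every byte value in [lo, hi) that would not terminate or
# quote the line, and openx opens a new file for exclusive binary writing.
def bytechr(i):
	# bytes([i]) on py3; going via bytearray keeps this working on py2 too.
	return bytes(bytearray([i]))

def byteline(lo, hi, nl, q):
	# Every byte value in [lo, hi) except the newline and quote characters.
	return b''.join(bytechr(c) for c in range(lo, hi) if c != nl and c != q)

def openx(filename):
	# 'x' mode (py3) fails if the file already exists, catching accidental reuse.
	return open(filename, 'xb')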
def check_one(slices, key, source, reverse=False):
	jid = subjobs.build(
		"dataset_sort",
		options=dict(
			sort_columns=key,
			sort_order="descending" if reverse else "ascending",
		),
		datasets=dict(source=source),
	)
	ds = Dataset(jid)
	key_off = sorted(test_data.data).index(key)
	# This provides better separation than the replacement values
	# used in the actual sort method (but this is slow).
	if 'date' in key or 'time' in key:
		nonepos = 1
	else:
		nonepos = -1
	def cmp(a, b):
		a = a[key_off]
		b = b[key_off]
		if a is None:
			if b is None:
				return 0
			return nonepos
		if b is None:
			return -nonepos
		if isinstance(a, float):
			if isnan(a):
				if isnan(b):
					return 0
				return 1
			if isnan(b):
				return -1
		if a < b:
			return -1
		return a > b
	keycmp = cmp_to_key(cmp)
	for sliceno in range(slices):
		good = sorted(test_data.sort_data_for_slice(sliceno), key=keycmp, reverse=reverse)
		check = list(ds.iterate(sliceno))
		assert unnan(check) == unnan(good), "Slice %d sorted on %s bad (%s)" % (sliceno, key, jid,)
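# unnan is not defined in this snippet. NaN never compares equal to itself,
# so rows containing NaN would fail a plain == check. A minimal sketch
# (assumed, not the original helper) that swaps NaN for a stable placeholder
# before comparison:
from math import isnan

def unnan(rows):
	def fix(v):
		return "NaN" if isinstance(v, float) and isnan(v) else v
	return [tuple(fix(v) for v in row) for row in rows]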
def check_one(slices, key, source, reverse=False):
	jid = subjobs.build(
		"dataset_sort",
		options=dict(
			sort_columns=key,
			sort_order="descending" if reverse else "ascending",
		),
		datasets=dict(source=source),
	)
	ds = Dataset(jid)
	key_off = sorted(test_data.data).index(key)
	for sliceno in range(slices):
		good = sorted(test_data.sort_data_for_slice(sliceno), key=itemgetter(key_off), reverse=reverse)
		check = list(ds.iterate(sliceno))
		assert check == good, "Slice %d sorted on %s bad (%s)" % (sliceno, key, jid,)
def verify_ds(options, d, filename):
	jid = subjobs.build("csvimport", options=options)
	# Order varies depending on slice count, so we use a dict {ix: data}
	for ix, a, b in Dataset(jid).iterate(None, ["ix", "0", "1"]):
		try:
			ix = int(ix)
		except ValueError:
			# We have a few non-numeric ones
			pass
		assert ix in d, "Bad index %r in %r (%s)" % (ix, filename, jid)
		assert a == b == d[ix], "Wrong data for line %r in %r (%s)" % (ix, filename, jid,)
		del d[ix]
	assert not d, "Not all lines returned from %r (%s), %r missing" % (filename, jid, set(d.keys()),)
def synthesis(analysis_res, params):
	badnesses = next(analysis_res)
	for tmp in analysis_res:
		badnesses = {k: max(badnesses[k], tmp[k]) for k in tmp}
	badness2type = {
		0: "number",  # this used to be int64_10
		1: "number",  # and this used to be float64
		2: "ascii:encode",
	}
	# .items() rather than the py2-only .iteritems()
	types = {k: badness2type[v] for k, v in badnesses.items()}
	types.update(options.column2type)
	sub_opts = dict(
		column2type=types,
		defaults=options.defaults,
		rename=options.rename,
		caption=options.caption,
		discard_untyped=options.discard_untyped,
		filter_bad=options.filter_bad,
		numeric_comma=options.numeric_comma,
	)
	jid = build("dataset_type", options=sub_opts, datasets=datasets)
	Dataset(jid).link_to_here()
def verify(slices, data, source, previous=None, **options):
	jid = subjobs.build(
		"dataset_rehash",
		datasets=dict(source=source, previous=previous),
		options=options,
	)
	hl = options["hashlabel"]
	h = typed_writer(columns[hl]).hash
	ds = Dataset(jid)
	good = {row[hl]: row for row in data}
	names = list(data[0])
	for slice in range(slices):
		for row in ds.iterate_chain(slice, names):
			row = dict(zip(names, row))
			assert h(row[hl]) % slices == slice, "row %r is incorrectly in slice %d in %s" % (row, slice, ds)
			want = good[row[hl]]
			assert row == want, '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (ds, source, hl, want, row)
	return ds
def do_one(params, name, data):
	dw = DatasetWriter(name=name, columns=columns)
	dw.set_slice(0)
	for v in data:
		if v is None:
			d = dict(
				ascii_new=None,
				ascii_old=None,
				bytes_new=None,
				bytes_old=None,
				unicode_new=None,
				unicode_old=None,
			)
		else:
			d = dict(
				ascii_new=v,
				ascii_old=v,
				bytes_new=uni(v).encode("ascii"),
				bytes_old=uni(v).encode("ascii"),
				unicode_new=uni(v),
				unicode_old=uni(v),
			)
		dw.write_dict(d)
	# We don't really want the other slices, but write one thing to
	# each, to make sure it doesn't show up in slice 0.
	# (Small slice merging will put it in the same file, so this is
	# a real risk.)
	for sliceno in range(1, params.slices):
		dw.set_slice(sliceno)
		dw.write_dict(d)
	dw.finish()
	# verify we got what we asked for
	me_ds = Dataset(params.jobid, name)
	for colname, coltype in columns.items():
		col = me_ds.columns[colname]
		assert col.type == coltype.split("_")[-1], colname
		assert col.backing_type == coltype, colname
		for want, got in zip(data, me_ds.iterate(0, colname)):
			if want is not None:
				if PY2 and "unicode" in coltype:
					want = uni(want)
				if PY3 and "bytes" in coltype:
					want = want.encode("ascii")
			assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)
	# check that both types of bytes filter correctly through typing
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="number",  # fails on the string, so that gets filtered out everywhere
			bytes_new="bytes",
			bytes_old="bytes",
		),
		filter_bad=True,
	))
	ds = Dataset(jid)
	# verify the number first
	data_it = iter(raw_data)
	next(data_it)  # skip the filtered out string
	for got in ds.iterate(0, "ascii_old"):
		want = next(data_it)
		if want is None:
			# Becomes 0 because the typer (unfortunately) sees it as an empty string
			want = 0
		assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
	# now verify all the bytes ones are ok, no longer containing the string.
	for colname in ("ascii_new", "bytes_new", "bytes_old",):
		data_it = iter(data)
		next(data_it)  # skip the filtered out string
		for got in ds.iterate(0, colname):
			want = next(data_it)
			if want is not None:
				want = want.encode("ascii")
			assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)
	# and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="bytes",
			bytes_new="unicode:ascii",
			bytes_old="unicode:ascii",
		),
	))
	ds = Dataset(jid)
	for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
		for want, got in ds.iterate(0, ["unicode_new", colname]):
			assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)
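# uni() is used above but not defined in this snippet. A minimal sketch,
# assuming it coerces bytes to unicode text and passes everything else
# (including None) through unchanged:
def uni(v):
	if isinstance(v, bytes):
		return v.decode("ascii")
	return v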
def verify_ds(options, d, d_bad, d_skipped, filename):
	jid = subjobs.build("csvimport", options=options)
	# Order varies depending on slice count, so we use a dict {ix: data}
	for ix, a, b in Dataset(jid).iterate(None, ["ix", "0", "1"]):
		try:
			ix = int(ix)
		except ValueError:
			# We have a few non-numeric ones
			pass
		assert ix in d, "Bad index %r in %r (%s)" % (ix, filename, jid,)
		assert a == b == d[ix], "Wrong data for line %r in %r (%s)" % (ix, filename, jid,)
		del d[ix]
	assert not d, "Not all lines returned from %r (%s), %r missing" % (filename, jid, set(d.keys()),)
	if options.get("allow_bad"):
		for ix, data in Dataset(jid, "bad").iterate(None, ["lineno", "data"]):
			assert ix in d_bad, "Bad bad_lineno %d in %r (%s/bad) %r" % (ix, filename, jid, data,)
			assert data == d_bad[ix], "Wrong saved bad line %d in %r (%s/bad).\nWanted %r.\nGot %r." % (ix, filename, jid, d_bad[ix], data,)
			del d_bad[ix]
		assert not d_bad, "Not all bad lines returned from %r (%s), %r missing" % (filename, jid, set(d_bad.keys()),)
	if options.get("comment") or options.get("skip_lines"):
		for ix, data in Dataset(jid, "skipped").iterate(None, ["lineno", "data"]):
			assert ix in d_skipped, "Bad skipped_lineno %d in %r (%s/skipped) %r" % (ix, filename, jid, data,)
			assert data == d_skipped[ix], "Wrong saved skipped line %d in %r (%s/skipped).\nWanted %r.\nGot %r." % (ix, filename, jid, d_skipped[ix], data,)
			del d_skipped[ix]
		assert not d_skipped, "Not all skipped lines returned from %r (%s), %r missing" % (filename, jid, set(d_skipped.keys()),)
def require_failure(name, options):
	try:
		subjobs.build("csvimport", options=options)
	except JobError:
		return
	raise Exception("File with %s was imported without error." % (name,))
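# Example use, with hypothetical option values (the option names match the
# csvimport options used elsewhere in these tests):
# require_failure("an unclosed quote", dict(
# 	filename=resolve_jobid_filename(params.jobid, "unclosed quote.txt"),
# 	quotes='"',
# ))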
def _verify(name, types, data, coltype, want, default, want_fail, kw):
	if callable(want):
		check = want
	else:
		def check(got, fromstr, filtered=False):
			want1 = want if isinstance(want, list) else want[typ]
			if filtered:
				want1 = want1[::2]
			assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
	dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
	dw.set_slice(0)
	for ix, v in enumerate(data):
		dw.write(v, b'1' if ix % 2 == 0 else b'skip')
	for sliceno in range(1, g.SLICES):
		dw.set_slice(sliceno)
	bytes_ds = dw.finish()
	for typ in types:
		opts = dict(column2type=dict(data=typ))
		opts.update(kw)
		if default is not no_default:
			opts['defaults'] = {'data': default}
		try:
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
		except JobError:
			if want_fail:
				continue
			raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
		assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, 'data'))
		check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
		if 'filter_bad' not in opts and not callable(want):
			opts['filter_bad'] = True
			opts['column2type']['extra'] = 'int32_10'
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
			typed_ds = Dataset(jid)
			got = list(typed_ds.iterate(0, 'data'))
			check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
		used_type(typ)
def test_filter_bad_across_types():
	columns = {
		'bytes': 'bytes',
		'float64': 'bytes',
		'int32_10': 'ascii',
		'json': 'unicode',
		'number:int': 'unicode',
		'unicode:utf-8': 'bytes',
	}
	# all_good, *values
	# Make sure all those types (except bytes) can filter other lines,
	# and be filtered by other lines. And that several filtering values
	# is not a problem (line 11).
	data = [
		(True, b'first', b'1.1', '1', '"a"', '001', b'ett',),
		(True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5',),
		(True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre',),
		(False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra',),      # number:int bad
		(False, b'fifth', b'5.5', '-', '"e"', '5', b'fem',),          # int32_10 bad
		(False, b'sixth', b'6.b', '6', '"f"', '6', b'sex',),          # float64 bad
		[False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju',],      # json bad (a list, so it can be patched below)
		(False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta',),  # unicode:utf-8 bad
		(True, b'ninth', b'9.9', '9', '"i"', '9', b'nio',),
		(True, b'tenth', b'10', '10', '"j"', '10', b'tio',),
		(False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva',),   # float64, int32_10 and number:int bad
		(True, b'twelfth', b'12', '12', '"l"', '12', b'tolv',),
	]
	dw = DatasetWriter(name="filter bad across types", columns=columns)
	dw.set_slice(0)
	want = []
	def add_want(v):
		want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
	for v in data:
		if v[0]:
			add_want(v)
		dw.write(*v[1:])
	for sliceno in range(1, g.SLICES):
		dw.set_slice(sliceno)
	source_ds = dw.finish()
	# Once with just filter_bad, once with some defaults too.
	defaults = {}
	for _ in range(2):
		jid = subjobs.build(
			'dataset_type',
			datasets=dict(source=source_ds),
			options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
		)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
		assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
		# make more lines "ok" for the second lap
		defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
		add_want(data[3])
		add_want(data[5])
		data[6][4] = '"replacement"'
		add_want(data[6])
		want.sort()  # adding them out of order, int32_10 sorts correctly.
def ck(jid, method="dataset_checksum", **kw):
	jid = subjobs.build(method, datasets=dict(source=jid), options=kw)
	return blob.load(jobid=jid).sum
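# Example use (hypothetical job ids), comparing checksums of two datasets,
# optionally restricted to some columns as in the chain checksum above:
# assert ck(jid_a) == ck(jid_b)
# assert ck(jid_a, columns=["num", "str"]) == ck(jid_b, columns=["num", "str"])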