def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # in one line are not a problem (line 11).
    data = [
        (True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
        (True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
        (True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
        (False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),         # number:int bad
        (False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),          # int32_10 bad
        (False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),          # float64 bad
        [False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],          # json bad (a list, so it can be patched below)
        (False, b'eighth',   b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),  # unicode:utf-8 bad
        (True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
        (True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
        (False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),         # float64, int32_10 and number:int bad
        (True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []
    def add_want(v):
        want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # Make more lines "ok" for the second lap (harmless after the last lap).
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # added out of order; sorting on int32_10 (the first element) restores line order.

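# A minimal sketch of the behaviour the test above relies on (assuming the
# standard dataset_type method): with filter_bad=True a row is dropped when
# any column fails to type, unless defaults supplies a fallback value for the
# failing column. The helper below is hypothetical, for illustration only.
def type_filtering_bad(source_ds, column2type, defaults=None):
    # Rows where a column without a default fails to type are filtered out;
    # rows where only defaulted columns fail keep the fallback values instead.
    jid = subjobs.build(
        'dataset_type',
        datasets=dict(source=source_ds),
        options=dict(column2type=column2type, filter_bad=True, defaults=defaults or {}),
    )
    return Dataset(jid)
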
def synthesis():
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    w = dw_a.get_split_write()
    w(3)
    w(2)
    w = dw_b.get_split_write()
    w(2)
    w(1)
    w = dw_c.get_split_write()
    w(0)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()
    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )
    # sort as a chain
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]
    # sort all as a single dataset
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]
    # merge b and c but not a
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=sorted_a))
    # test with new style job.dataset
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]

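# The chaining pattern above generalises: pass each sorted job as previous to
# the next dataset_sort, and iterate_chain() then yields all of them in build
# order. A hypothetical helper sketching that pattern:
def sort_as_chain(sources, sort_columns='num'):
    previous = None
    for source in sources:
        previous = subjobs.build(
            'dataset_sort',
            options=dict(sort_columns=sort_columns, sort_across_slices=True),
            datasets=dict(source=source, previous=previous),
        )
    return previous  # the last job in the sorted chain
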
def synthesis(params, prepare_res):
    dw = prepare_res
    source = dw.finish()
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns="num",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    data = list(ds.iterate(None, "str"))
    good = list("cghjabdefi") + [str(sliceno) for sliceno in range(params.slices)] * 64
    assert data == good

def synthesis(prepare_res):
    opts = DotDict((k, v) for k, v in options.items() if k in a_csvimport.options)
    lst = prepare_res
    previous = datasets.previous
    for fn, info, dsn in lst:
        opts.filename = fn
        jid = subjobs.build(
            'csvimport',
            options=opts,
            datasets=dict(previous=previous),
            caption="Import of %s from %s" % (info.filename, options.filename,),
        )
        previous = Dataset(jid).link_to_here(dsn)
        if options.chaining == 'off':
            previous = None
    if (len(lst) == 1 or options.chaining != 'off') and dsn != 'default':
        Dataset(jid).link_to_here('default')

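# link_to_here() republishes a subjob's dataset on the current job, which is
# what makes each imported file appear as a link in this job's chain. A
# minimal hypothetical sketch of that pattern (names are for illustration):
def import_chained(filenames):
    previous = None
    for ix, fn in enumerate(filenames):
        jid = subjobs.build('csvimport', options=dict(filename=fn), datasets=dict(previous=previous))
        previous = Dataset(jid).link_to_here('part%d' % (ix,))
    return previous
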
def verify(zipname, inside_filenames, want_ds, **kw):
    opts = dict(
        filename=g.job.filename(zipname),
        inside_filenames=inside_filenames,
    )
    opts.update(kw)
    jid = subjobs.build('csvimport_zip', options=opts)
    for dsn, want_data in want_ds.items():
        got_data = list(Dataset(jid, dsn).iterate(None, '0'))
        assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (jid, dsn, zipname, want_data, got_data)

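# Example use of verify() above; the zip name, member names and expected
# values here are hypothetical, for illustration only:
#
#     verify('flat.zip', ['a.csv', 'b.csv'], {'a.csv': ['1', '2'], 'b.csv': ['3']})
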
def main(urd):
    assert urd.info.slices >= 3, "The tests don't work with less than 3 slices (you have %d)." % (urd.info.slices,)

    print()
    print("Testing dataset creation, export, import")
    source = urd.build("test_datasetwriter")
    urd.build("test_datasetwriter_verify", datasets=dict(source=source))
    urd.build("test_dataset_in_prepare")
    ds = Dataset(source, "passed")
    csvname = "out.csv.gz"
    csvname_uncompressed = "out.csv"
    csv = urd.build("csvexport", options=dict(filename=csvname, separator="\t"), datasets=dict(source=ds))
    csv_uncompressed = urd.build("csvexport", options=dict(filename=csvname_uncompressed, separator="\t"), datasets=dict(source=ds))
    csv_quoted = urd.build("csvexport", options=dict(filename=csvname, quote_fields='"'), datasets=dict(source=ds))
    reimp_csv = urd.build("csvimport", options=dict(filename=csv.filename(csvname), separator="\t"))
    reimp_csv_uncompressed = urd.build("csvimport", options=dict(filename=csv_uncompressed.filename(csvname_uncompressed), separator="\t"))
    reimp_csv_quoted = urd.build("csvimport", options=dict(filename=csv_quoted.filename(csvname), quotes=True))
    urd.build("test_compare_datasets", datasets=dict(a=reimp_csv, b=reimp_csv_uncompressed))
    urd.build("test_compare_datasets", datasets=dict(a=reimp_csv, b=reimp_csv_quoted))
    urd.build("test_dataset_column_names")
    urd.build("test_dataset_merge")

    print()
    print("Testing csvimport with more difficult files")
    urd.build("test_csvimport_corner_cases")
    urd.build("test_csvimport_separators")

    print()
    print("Testing subjobs and dataset typing")
    urd.build("test_subjobs_type", datasets=dict(typed=ds, untyped=reimp_csv))
    urd.build("test_subjobs_nesting")
    try:
        # Test if numeric_comma is broken (presumably because no suitable locale
        # was found), since there are not actually any commas in the source dataset.
        urd.build("dataset_type", datasets=dict(source=source), options=dict(numeric_comma=True, column2type=dict(b="float64"), defaults=dict(b="0")))
        comma_broken = False
    except JobError as e:
        comma_broken = True
        urd.warn()
        urd.warn('SKIPPED NUMERIC COMMA TESTS')
        urd.warn('Follow the instructions in this error to enable numeric comma:')
        urd.warn()
        urd.warn(e.format_msg())
    urd.build("test_dataset_type_corner_cases", options=dict(numeric_comma=not comma_broken))

    print()
    print("Testing dataset chaining, filtering, callbacks and rechaining")
    selfchain = urd.build("test_selfchain")
    urd.build("test_rechain", jobids=dict(selfchain=selfchain))

    print()
    print("Testing dataset sorting and rehashing (with subjobs again)")
    urd.build("test_sorting")
    urd.build("test_sort_stability")
    urd.build("test_sort_chaining")
    urd.build("test_rehash")
    urd.build("test_dataset_type_hashing")
    urd.build("test_dataset_type_chaining")

    print()
    print("Test hashlabels")
    urd.build("test_hashlabel")

    print()
    print("Test dataset roundrobin iteration")
    urd.build("test_dataset_roundrobin")

    print()
    print("Test dataset_checksum")
    urd.build("test_dataset_checksum")

    print()
    print("Test csvimport_zip")
    urd.build("test_csvimport_zip")

    print()
    print("Test output handling")
    urd.build("test_output")

    print()
    print("Test datetime types in options")
    urd.build("test_datetime")

    print()
    print("Test various utility functions")
    urd.build("test_optionenum")
    urd.build("test_json")
    urd.build("test_jobwithfile")
    urd.build("test_report")

def synthesis(prepare_res, params, job, slices):
    dws = prepare_res
    for dw in (
        dws.unhashed_split,
        dws.up_split,
    ):
        w = dw.get_split_write_list()
        for row in all_data:
            w(row)
    for dw in dws.values():
        dw.finish()

    # Verify that the different ways of writing gave the same result
    for names in (
        ("unhashed_split", "unhashed_manual"),
        ("up_checked", "up_split"),
        ("down_checked", "down_discarded", "down_discarded_list", "down_discarded_dict"),
    ):
        dws = {name: job.dataset(name) for name in names}
        assert dws == {name: Dataset((params.jobid, name)) for name in names}, "Old style Dataset((params.jobid, name)) broken"
        for sliceno in range(slices):
            data = {name: list(dws[name].iterate(sliceno)) for name in names}
            good = data[names[0]]
            for name in names[1:]:
                assert data[name] == good, "%s doesn't match %s in slice %d" % (names[0], name, sliceno,)

    # Verify that both up and down hashed on the expected column
    hash = typed_writer("int32").hash
    for colname in ("up", "down"):
        ds = job.dataset(colname + "_checked")
        for sliceno in range(slices):
            for value in ds.iterate(sliceno, colname):
                assert hash(value) % slices == sliceno, "Bad hashing on %s in slice %d" % (colname, sliceno,)

    # Verify that up and down are not the same, to catch hashing
    # not actually hashing.
    up = list(job.dataset("up_checked").iterate(None))
    down = list(job.dataset("down_checked").iterate(None))
    assert up != down, "Hashlabel did not change slice distribution"
    # And check that the data is still the same.
    assert sorted(up) == sorted(down) == all_data, "Hashed datasets have wrong data"

    # Verify that rehashing works.
    # (Can't use sliceno None, because that won't rehash, and even if it did
    # the order wouldn't match. Order doesn't even match in the rehashed
    # individual slices.)
    up = job.dataset("up_checked")
    down = job.dataset("down_checked")
    unhashed = job.dataset("unhashed_manual")
    for sliceno in range(slices):
        a = list(up.iterate(sliceno))
        b = list(down.iterate(sliceno, hashlabel="up", rehash=True))
        c = list(unhashed.iterate(sliceno, hashlabel="up", rehash=True))
        assert sorted(a) == sorted(b) == sorted(c), "Rehashing is broken (slice %d)" % (sliceno,)

    # And finally verify that we are not allowed to specify the wrong hashlabel
    good = True
    try:
        up.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    try:
        unhashed.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    assert good, "Iteration allowed on the wrong hashlabel"

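# The hashing invariant verified above, distilled into a standalone
# hypothetical helper: a dataset hashed on column colname keeps each row in
# slice hash(value) % slices, so every slice must satisfy this check.
def hashes_correctly(ds, colname, slices):
    hashfunc = typed_writer("int32").hash
    return all(
        hashfunc(value) % slices == sliceno
        for sliceno in range(slices)
        for value in ds.iterate(sliceno, colname)
    )
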
def synthesis(params):
    ds = Dataset(params.jobid)
    assert set(ds.iterate(None, "data")) == {"foo", "bar"}

def analysis(sliceno, params):
    ds = Dataset(params.jobid)
    assert set(ds.iterate(None, "data")) == {"foo", "bar"}