def test_filter_bad_across_types():
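	# Each column is named after the type it will later be typed to;
	# the values here are the types the untyped source dataset is written with.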
	columns={
		'bytes': 'bytes',
		'float64': 'bytes',
		'int32_10': 'ascii',
		'json': 'unicode',
		'number:int': 'unicode',
		'unicode:utf-8': 'bytes',
	}
	# all_good, *values
	# Make sure all those types (except bytes) can filter other lines,
	# and be filtered by other lines. And that several bad values on the
	# same line are not a problem (line 11).
	data = [
		(True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
		(True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
		(True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
		(False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),       # number:int bad
		(False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),        # int32_10 bad
		(False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),        # float64 bad
		[False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],        # json bad
		(False, b'eighth',   b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),# unicode:utf-8 bad
		(True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
		(True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
		(False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),       # float64, int32_10 and number:int bad
		(True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
	]
	dw = DatasetWriter(name="filter bad across types", columns=columns)
	dw.set_slice(0)
	want = []
	def add_want(v):
		want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
	for v in data:
		if v[0]:
			add_want(v)
		dw.write(*v[1:])
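	# leave the remaining slices empty (all rows were written to slice 0)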
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	source_ds = dw.finish()
	# Once with just filter_bad, once with some defaults too.
	defaults = {}
	for _ in range(2):
		jid = subjobs.build(
			'dataset_type',
			datasets=dict(source=source_ds),
			options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
		)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
		assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
		# make more lines "ok" for the second pass
		defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
		add_want(data[3])
		add_want(data[5])
		data[6][4] = '"replacement"'
		add_want(data[6])
		want.sort() # they were added out of order; sorting on int32_10 (the first element) restores row order
Example #2
def synthesis():
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    w = dw_a.get_split_write()
    w(3)
    w(2)
    w = dw_b.get_split_write()
    w(2)
    w(1)
    w = dw_c.get_split_write()
    w(0)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()
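    # a <- b <- c now form a chain (linked via previous=), holding [3, 2], [2, 1] and [0]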

    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )

    # sort as a chain
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]

    # sort all as a single dataset
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]

    # merge b and c but not a
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=sorted_a))
    # test with new style job.dataset
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]
Example #3
def synthesis(params, prepare_res):
    dw = prepare_res
    source = dw.finish()
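    # sort_across_slices=True sorts the dataset as a whole rather than each slice separately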
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns="num",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    data = list(ds.iterate(None, "str"))
    good = list("cghjabdefi") + \
           [str(sliceno) for sliceno in range(params.slices)] * 64
    assert data == good
Example #4
def synthesis(prepare_res):
    opts = DotDict(
        (k, v) for k, v in options.items() if k in a_csvimport.options)
    lst = prepare_res
    previous = datasets.previous
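    # build one csvimport subjob per file, chaining each import to the previous one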
    for fn, info, dsn in lst:
        opts.filename = fn
        jid = subjobs.build('csvimport',
                            options=opts,
                            datasets=dict(previous=previous),
                            caption="Import of %s from %s" % (
                                info.filename,
                                options.filename,
                            ))
        previous = Dataset(jid).link_to_here(dsn)
        if options.chaining == 'off':
            previous = None
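    # when possible, also expose the last import as this job's default dataset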
    if (len(lst) == 1 or options.chaining != 'off') and dsn != 'default':
        Dataset(jid).link_to_here('default')
Example #5
def verify(zipname, inside_filenames, want_ds, **kw):
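    # build a csvimport_zip subjob and check that each produced dataset
    # holds the expected values in column '0'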
    opts = dict(
        filename=g.job.filename(zipname),
        inside_filenames=inside_filenames,
    )
    opts.update(kw)
    jid = subjobs.build('csvimport_zip', options=opts)
    for dsn, want_data in want_ds.items():
        got_data = list(Dataset(jid, dsn).iterate(None, '0'))
        assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (
            jid, dsn, zipname, want_data, got_data)
Example #6
def main(urd):
    assert urd.info.slices >= 3, "The tests don't work with less than 3 slices (you have %d)." % (
        urd.info.slices, )

    print()
    print("Testing dataset creation, export, import")
    source = urd.build("test_datasetwriter")
    urd.build("test_datasetwriter_verify", datasets=dict(source=source))
    urd.build("test_dataset_in_prepare")
    ds = Dataset(source, "passed")
    csvname = "out.csv.gz"
    csvname_uncompressed = "out.csv"
    csv = urd.build("csvexport",
                    options=dict(filename=csvname, separator="\t"),
                    datasets=dict(source=ds))
    csv_uncompressed = urd.build("csvexport",
                                 options=dict(filename=csvname_uncompressed,
                                              separator="\t"),
                                 datasets=dict(source=ds))
    csv_quoted = urd.build("csvexport",
                           options=dict(filename=csvname, quote_fields='"'),
                           datasets=dict(source=ds))
    reimp_csv = urd.build("csvimport",
                          options=dict(filename=csv.filename(csvname),
                                       separator="\t"))
    reimp_csv_uncompressed = urd.build(
        "csvimport",
        options=dict(filename=csv_uncompressed.filename(csvname_uncompressed),
                     separator="\t"))
    reimp_csv_quoted = urd.build("csvimport",
                                 options=dict(
                                     filename=csv_quoted.filename(csvname),
                                     quotes=True))
    urd.build("test_compare_datasets",
              datasets=dict(a=reimp_csv, b=reimp_csv_uncompressed))
    urd.build("test_compare_datasets",
              datasets=dict(a=reimp_csv, b=reimp_csv_quoted))
    urd.build("test_dataset_column_names")
    urd.build("test_dataset_merge")

    print()
    print("Testing csvimport with more difficult files")
    urd.build("test_csvimport_corner_cases")
    urd.build("test_csvimport_separators")

    print()
    print("Testing subjobs and dataset typing")
    urd.build("test_subjobs_type", datasets=dict(typed=ds, untyped=reimp_csv))
    urd.build("test_subjobs_nesting")
    try:
        # Test if numeric_comma is broken (presumably because no suitable locale
        # was found, since there are not actually any commas in the source dataset.)
        urd.build("dataset_type",
                  datasets=dict(source=source),
                  options=dict(numeric_comma=True,
                               column2type=dict(b="float64"),
                               defaults=dict(b="0")))
        comma_broken = False
    except JobError as e:
        comma_broken = True
        urd.warn()
        urd.warn('SKIPPED NUMERIC COMMA TESTS')
        urd.warn(
            'Follow the instructions in this error to enable numeric comma:')
        urd.warn()
        urd.warn(e.format_msg())
    urd.build("test_dataset_type_corner_cases",
              options=dict(numeric_comma=not comma_broken))

    print()
    print("Testing dataset chaining, filtering, callbacks and rechaining")
    selfchain = urd.build("test_selfchain")
    urd.build("test_rechain", jobids=dict(selfchain=selfchain))

    print()
    print("Testing dataset sorting and rehashing (with subjobs again)")
    urd.build("test_sorting")
    urd.build("test_sort_stability")
    urd.build("test_sort_chaining")
    urd.build("test_rehash")
    urd.build("test_dataset_type_hashing")
    urd.build("test_dataset_type_chaining")

    print()
    print("Test hashlabels")
    urd.build("test_hashlabel")

    print()
    print("Test dataset roundrobin iteration")
    urd.build("test_dataset_roundrobin")

    print()
    print("Test dataset_checksum")
    urd.build("test_dataset_checksum")

    print()
    print("Test csvimport_zip")
    urd.build("test_csvimport_zip")

    print()
    print("Test output handling")
    urd.build("test_output")

    print()
    print("Test datetime types in options")
    urd.build("test_datetime")

    print()
    print("Test various utility functions")
    urd.build("test_optionenum")
    urd.build("test_json")
    urd.build("test_jobwithfile")
    urd.build("test_report")
Example #7
def synthesis(prepare_res, params, job, slices):
    dws = prepare_res
    for dw in (
            dws.unhashed_split,
            dws.up_split,
    ):
        w = dw.get_split_write_list()
        for row in all_data:
            w(row)
    for dw in dws.values():
        dw.finish()

    # Verify that the different ways of writing gave the same result
    for names in (
        ("unhashed_split", "unhashed_manual"),
        ("up_checked", "up_split"),
        ("down_checked", "down_discarded", "down_discarded_list",
         "down_discarded_dict"),
    ):
        dws = {name: job.dataset(name) for name in names}
        assert dws == {name: Dataset((params.jobid, name))
                       for name in names
                       }, "Old style Dataset((params.jobid, name)) broken"
        for sliceno in range(slices):
            data = {name: list(dws[name].iterate(sliceno)) for name in names}
            good = data[names[0]]
            for name in names[1:]:
                assert data[name] == good, "%s doesn't match %s in slice %d" % (names[0], name, sliceno)

    # Verify that both up and down hashed on the expected column
    hash = typed_writer("int32").hash
    for colname in ("up", "down"):
        ds = job.dataset(colname + "_checked")
        for sliceno in range(slices):
            for value in ds.iterate(sliceno, colname):
                assert hash(value) % slices == sliceno, "Bad hashing on %s in slice %d" % (colname, sliceno)

    # Verify that up and down are not the same, to catch hashing
    # not actually hashing.
    up = list(job.dataset("up_checked").iterate(None))
    down = list(job.dataset("down_checked").iterate(None))
    assert up != down, "Hashlabel did not change slice distribution"
    # And check that the data is still the same.
    assert sorted(up) == sorted(down) == all_data, "Hashed datasets have wrong data"

    # Verify that rehashing works.
    # (Can't use sliceno None, because that won't rehash, and even if it did
    # the order wouldn't match. Order doesn't even match in the rehashed
    # individual slices.)
    up = job.dataset("up_checked")
    down = job.dataset("down_checked")
    unhashed = job.dataset("unhashed_manual")
    for sliceno in range(slices):
        a = list(up.iterate(sliceno))
        b = list(down.iterate(sliceno, hashlabel="up", rehash=True))
        c = list(unhashed.iterate(sliceno, hashlabel="up", rehash=True))
        assert sorted(a) == sorted(b) == sorted(c), "Rehashing is broken (slice %d)" % (sliceno,)

    # And finally verify that we are not allowed to specify the wrong hashlabel
    good = True
    try:
        up.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    try:
        unhashed.iterate(None, hashlabel="down")
        good = False
    except AssertionError:
        pass
    assert good, "Iteration allowed on the wrong hashlabel"
Example #8
def synthesis(params):
	ds = Dataset(params.jobid)
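	# Dataset(params.jobid) opens this job's own default dataset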
	assert set(ds.iterate(None, "data")) == {"foo", "bar"}
Example #9
def analysis(sliceno, params):
	ds = Dataset(params.jobid)
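	# sliceno None iterates all slices, so every analysis slice sees the full dataset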
	assert set(ds.iterate(None, "data")) == {"foo", "bar"}