# Example 1
def synthesis(prepare_res, params):
    """Write several test datasets during the synthesis step.

    Exercises: writing slice 0 of a dataset created in analysis, the
    three split-write flavours of a hashed writer, and per-slice manual
    writing (including None values for None-capable columns).
    """
    dw_passed, _ = prepare_res
    # Using set_slice on a dataset that was written in analysis is not
    # actually supported, but since it currently works (as long as that
    # particular slice wasn't written in analysis) let's test it.
    dw_passed.set_slice(0)
    first_row = {colname: values[0] for colname, values in test_data.data.items()}
    dw_passed.write(**first_row)
    dw_split = DatasetWriter(name="synthesis_split", hashlabel="a")
    dw_split.add("a", "int32")
    dw_split.add("b", "unicode")
    # Exercise all three split-write call styles.
    dw_split.get_split_write()(1, "a")
    dw_split.get_split_write_list()([2, "b"])
    dw_split.get_split_write_dict()({"a": 3, "b": "c"})
    dw_manual = DatasetWriter(name="synthesis_manual",
                              columns={"sliceno": "int32"})
    dw_none = DatasetWriter(name="nonetest",
                            columns={t: t for t in test_data.data})
    for sliceno in range(params.slices):
        dw_manual.set_slice(sliceno)
        dw_manual.write(sliceno)
        dw_none.set_slice(sliceno)
        # None-capable columns get a real value, the rest get None.
        row = {
            colname: (values[0] if colname in test_data.not_none_capable else None)
            for colname, values in test_data.data.items()
        }
        dw_none.write(**row)
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    """Write *data* to a fresh dataset as *coltype*, retype it as each
    type in *types* via a dataset_type subjob, and verify the result.

    want: a callable used directly as the checker, a list of expected
    values (shared by all types), or a mapping from type name to a list.
    default: typing default for the 'data' column unless it is no_default.
    want_fail: when true, the typing job is expected to raise JobError.
    kw: extra options merged into the dataset_type options.
    """
    if callable(want):
        check = want
    else:

        def check(got, fromstr, filtered=False):
            # typ is the loop variable of the "for typ in types" loop
            # below; late binding makes this pick the expectation for the
            # type currently being tested.
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                # filter_bad drops every other line (the b'skip' extras).
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (
                want1,
                got,
                fromstr,
            )

    # 'extra' alternates b'1' / b'skip' so a later filter_bad pass (typing
    # extra as int32_10) can drop every other line.
    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    # Touch the remaining slices without writing anything to them.
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type',
                                datasets=dict(source=bytes_ds),
                                options=opts)
        except JobError:
            # Failure was expected for this combination - move on.
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (
                bytes_ds,
                typ,
            ))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (
            bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (
            typed_ds,
            typ,
            bytes_ds,
        ))
        if 'filter_bad' not in opts and not callable(want):
            # Second pass: also type 'extra' as int32_10 with filter_bad,
            # so the b'skip' lines are dropped and only every other value
            # is expected (filtered=True in check).
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type',
                                datasets=dict(source=bytes_ds),
                                options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(
                got,
                '%s (typed as %s from %r with every other line skipped from filter_bad)'
                % (
                    typed_ds,
                    typ,
                    bytes_ds,
                ), True)
        used_type(typ)
def test_filter_bad_across_types():
    """Type six columns at once with filter_bad and verify that a bad
    value in any single column filters that whole line for all columns,
    and that supplying defaults on a second pass rescues lines that were
    previously filtered.

    Fix: corrected the typo "Exptected" in the assertion failure message.
    """
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        (
            True,
            b'first',
            b'1.1',
            '1',
            '"a"',
            '001',
            b'ett',
        ),
        (
            True,
            b'second',
            b'2.2',
            '2',
            '"b"',
            '02',
            b'tv\xc3\xa5',
        ),
        (
            True,
            b'third',
            b'3.3',
            '3',
            '["c"]',
            '3.0',
            b'tre',
        ),
        (
            False,
            b'fourth',
            b'4.4',
            '4',
            '"d"',
            '4.4',
            b'fyra',
        ),  # number:int bad
        (
            False,
            b'fifth',
            b'5.5',
            '-',
            '"e"',
            '5',
            b'fem',
        ),  # int32_10 bad
        (
            False,
            b'sixth',
            b'6.b',
            '6',
            '"f"',
            '6',
            b'sex',
        ),  # float64 bad
        [
            False,
            b'seventh',
            b'7.7',
            '7',
            '{"g"}',
            '7',
            b'sju',
        ],  # json bad (a list so the value can be replaced below)
        (
            False,
            b'eigth',
            b'8.8',
            '8',
            '"h"',
            '8',
            b'\xa5\xc3tta',
        ),  # unicode:utf-8 bad
        (
            True,
            b'ninth',
            b'9.9',
            '9',
            '"i"',
            '9',
            b'nio',
        ),
        (
            True,
            b'tenth',
            b'10',
            '10',
            '"j"',
            '10',
            b'tio',
        ),
        (
            False,
            b'eleventh',
            b'11a',
            '1-',
            '"k",',
            '1,',
            b'elva',
        ),  # float64, int32_10 and number:int bad
        (
            True,
            b'twelfth',
            b'12',
            '12',
            '"l"',
            '12',
            b'tolv',
        ),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []

    def add_want(v):
        # Expected post-typing tuple for a good line, in the column order
        # iterated below (int32_10, bytes, json, unicode:utf-8).
        want.append((
            int(v[3]),
            v[1],
            json.loads(v[4]),
            v[6].decode('utf-8'),
        ))

    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    # Touch the other slices without writing anything to them.
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t
                                      for t in columns},
                         filter_bad=True,
                         defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(
            typed_ds.iterate(0,
                             ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds,
            ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # adding them out of order, int32_10 sorts correctly.
# Example 4
def do_one(params, name, data):
	"""Build dataset *name* with the ascii/bytes/unicode old/new column
	variants (schema from the module-level columns dict) from *data*,
	then verify stored types and that both bytes column variants behave
	correctly through dataset_type.

	data items are strings (or None, which is stored as None in every
	column).  Assumes data is non-empty — TODO confirm with callers.
	"""
	dw = DatasetWriter(name=name, columns=columns)
	dw.set_slice(0)
	for v in data:
		if v is None:
			# None goes into every column unchanged.
			d = dict(
				ascii_new=None,
				ascii_old=None,
				bytes_new=None,
				bytes_old=None,
				unicode_new=None,
				unicode_old=None,
			)
		else:
			d = dict(
				ascii_new=v,
				ascii_old=v,
				bytes_new=uni(v).encode("ascii"),
				bytes_old=uni(v).encode("ascii"),
				unicode_new=uni(v),
				unicode_old=uni(v),
			)
		dw.write_dict(d)
	# We don't really want the other slices, but write one thing to
	# each, to make sure it doesn't show up in slice 0.
	# (Small slice merging will put it in the same file, so this is
	# a real risk.)
	# NOTE: reuses d left over from the loop above (last row of data).
	for sliceno in range(1, params.slices):
		dw.set_slice(sliceno)
		dw.write_dict(d)
	dw.finish()

	# verify we got what we asked for
	me_ds = Dataset(params.jobid, name)
	for colname, coltype in columns.items():
		col = me_ds.columns[colname]
		# coltype is presumably "<type>_old"/"<type>_new" style, so the
		# stored type is the part after the last underscore — TODO confirm
		# against the module-level columns definition.
		assert col.type == coltype.split("_")[-1], colname
		assert col.backing_type == coltype, colname
		for want, got in zip(data, me_ds.iterate(0, colname)):
			if want is not None:
				# Normalise the expectation for the Python 2/3 str split.
				if PY2 and "unicode" in coltype:
					want = uni(want)
				if PY3 and "bytes" in coltype:
					want = want.encode("ascii")
			assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)

	# check that both types of bytes filter correctly through typing
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="number", # fails on the string, so that gets filtered out everywhere
			bytes_new="bytes",
			bytes_old="bytes",
		),
		filter_bad=True,
	))
	ds = Dataset(jid)
	# verify the number first
	data_it = iter(raw_data)
	next(data_it) # skip the filtered out string
	for got in ds.iterate(0, "ascii_old"):
		want = next(data_it)
		if want is None:
			# Becomes 0 because the typer (unfortunately) sees it as an empty string
			want = 0
		assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
	# now verify all the bytes ones are ok, no longer containing the string.
	for colname in ("ascii_new", "bytes_new", "bytes_old",):
		data_it = iter(data)
		next(data_it) # skip the filtered out string
		for got in ds.iterate(0, colname):
			want = next(data_it)
			if want is not None:
				want = want.encode("ascii")
			assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)

	# and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="bytes",
			bytes_new="unicode:ascii",
			bytes_old="unicode:ascii",
		),
	))
	ds = Dataset(jid)
	for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
		# Compare each retyped column against unicode_new after uni()
		# normalisation, so bytes/unicode values compare equal.
		for want, got in ds.iterate(0, ["unicode_new", colname]):
			assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)