Example #1
def synthesis(params):
    source = Dataset(subjobs.build("test_sorting_gendata"))
    # Test that all datatypes work for sorting
    for key in test_data.data:
        check_one(params.slices, key, source)
    # Check reverse sorting
    check_one(params.slices, "int32", source, reverse=True)
    # Check that sorting across slices and by two columns works
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=["int64", "int32"],
            sort_order="descending",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    int64_off = sorted(test_data.data).index("int64")
    int32_off = sorted(test_data.data).index("int32")
    all_data = chain.from_iterable(
        test_data.sort_data_for_slice(sliceno)
        for sliceno in range(params.slices))
    good = sorted(all_data,
                  key=lambda t: (
                      t[int64_off],
                      t[int32_off],
                  ),
                  reverse=True)
    ds = Dataset(jid)
    check = list(ds.iterate(None))
    assert check == good, "Sorting across slices on [int64, int32] bad (%s)" % (
        jid, )
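The reference list above is built with a plain Python sort. As a standalone illustration of the same idea (made-up rows, not the test data), a tuple key sorts on two columns and reverse=True makes the order descending:

# Made-up (int64, int32) pairs, sorted on both columns, descending.
rows = [(1, 5), (2, 3), (1, 9), (2, 1)]
print(sorted(rows, key=lambda t: (t[0], t[1]), reverse=True))
# -> [(2, 3), (2, 1), (1, 9), (1, 5)]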
def check_one(params,
              line_sep,
              sep,
              data,
              want_res=None,
              prefix="",
              quotes=False,
              leave_bad=False):
    sep_c = chr(sep)
    # Can't have separator character in unquoted values
    if not quotes and not leave_bad:
        data = [[el.replace(sep_c, "") for el in line] for line in data]
    if not want_res:
        want_res = [
            tuple(s.encode("ascii") for s in line) for line in data[1:]
        ]
    filename = "%s_csv.%d.%r.txt" % (prefix, sep, line_sep)
    with open(filename, "w") as fh:
        for line in data:
            if quotes:
                line = [
                    quotes + el.replace(quotes, quotes + quotes) + quotes
                    for el in line
                ]
            fh.write(sep_c.join(line))
            fh.write(line_sep)
    try:
        jid = subjobs.build("csvimport",
                            options=dict(
                                filename=resolve_jobid_filename(
                                    params.jobid, filename),
                                separator=sep_c,
                                quote_support=bool(quotes),
                            ))
    except JobError as e:
        raise CSVImportException(
            "Failed to csvimport for separator %d with line separator %r, csvimport error was:\n%s"
            % (sep, line_sep, e.format_msg()))
    ds = Dataset(jid)
    labels = sorted(ds.columns)
    if labels != data[0]:
        raise WrongLabelsException(
            "csvimport gave wrong labels for separator %d with line separator %r: %r (expected %r)"
            % (
                sep,
                line_sep,
                labels,
                data[0],
            ))
    res = list(ds.iterate(None, data[0]))
    if res != want_res:
        raise WrongDataException(
            "csvimport gave wrong data for separator %d with line separator %r: %r (expected %r)"
            % (
                sep,
                line_sep,
                res,
                want_res,
            ))
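A hypothetical invocation of check_one (values invented for illustration): the first row of data is the label row, the remaining rows become the expected values, and the separator is passed as a character code.

# Hypothetical call: comma (code 44) as separator, "\n" as line separator.
check_one(params, "\n", 44, [["a", "b"], ["1", "2"], ["3", "4"]])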
Example #3
def synthesis(analysis_res):
    opts = DotDict(options)
    del opts.inside_filenames
    lst = analysis_res.merge_auto()
    for fn, dsn in lst:
        opts.filename = fn
        jid = subjobs.build('csvimport', options=opts)
        unlink(fn)
        Dataset(jid).link_to_here(dsn)
    if len(lst) == 1 and dsn != 'default':
        Dataset(jid).link_to_here('default')
Example #4
def verify(zipname, inside_filenames, want_ds, **kw):
    opts = dict(
        filename=resolve_jobid_filename(g.jobid, zipname),
        inside_filenames=inside_filenames,
    )
    opts.update(kw)
    jid = subjobs.build('csvimport_zip', options=opts)
    for dsn, want_data in want_ds.items():
        got_data = list(Dataset(jid, dsn).iterate(None, '0'))
        assert got_data == want_data, "%s/%s from %s didn't contain %r, instead contained %r" % (
            jid, dsn, zipname, want_data, got_data)
Example #5
def synthesis():
    typerename = dict(
        int64="int64_10",
        int32="int32_10",
        bits64="bits64_10",
        bits32="bits32_10",
        bool="strbool",
        datetime="datetime:%Y-%m-%d %H:%M:%S.%f",
        date="date:%Y-%m-%d",
        time="time:%H:%M:%S.%f",
        unicode="unicode:utf-8",
    )
    columns = {
        k: typerename.get(v.type, v.type)
        for k, v in datasets.typed.columns.items()
    }
    retyped = subjobs.build("dataset_type",
                            options=dict(column2type=columns),
                            datasets=dict(source=datasets.untyped))
    subjobs.build("test_compare_datasets",
                  datasets=dict(a=datasets.typed, b=retyped))
def synthesis():
    sum = 0
    jobs = datasets.source.chain(length=options.chain_length,
                                 stop_jobid=datasets.stop)
    for src in jobs:
        jid = build('dataset_checksum',
                    options=dict(columns=options.columns, sort=options.sort),
                    datasets=dict(source=src))
        data = blob.load(jobid=jid)
        sum ^= data.sum
    print("Total: %016x" % (sum, ))
    return DotDict(sum=sum,
                   columns=data.columns,
                   sort=options.sort,
                   sources=jobs)
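The per-source checksums are combined with XOR, which is order independent and self-inverse, so the total does not depend on chain order and identical sums cancel out. A standalone illustration with made-up values:

# XOR-combining checksums: order does not matter and equal values cancel.
parts = [0x1234, 0xBEEF, 0x1234]
total = 0
for p in parts:
    total ^= p
print("%016x" % (total,))  # 000000000000beef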
def synthesis(params, prepare_res):
    dw = prepare_res
    source = dw.finish()
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns="num",
            sort_across_slices=True,
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    data = list(ds.iterate(None, "str"))
    good = list("cghjabdefi") + \
           [str(sliceno) for sliceno in range(params.slices)] * 64
    assert data == good
Example #8
def check_no_separator(params):
    def write(data):
        fh.write(data + nl_b)
        wrote_c[data] += 1
        if q_b:
            # Also write a quoted variant; after the outer quotes are stripped
            # and doubled quotes undone, it reads back as the value wrapped in
            # the quote character, so that wrapped value is what gets counted.
            data = q_b + data + q_b
            fh.write(q_b + data.replace(q_b, q_b + q_b) + q_b + nl_b)
            wrote_c[data] += 1

    for nl in (10, 0, 255):
        for q in (None, 0, 34, 13, 10, 228):
            if nl == q:
                continue
            filename = "no separator.%r.%r.txt" % (
                nl,
                q,
            )
            nl_b = bytechr(nl)
            q_b = bytechr(q) if q else b''
            wrote_c = Counter()
            with openx(filename) as fh:
                for splitpoint in range(256):
                    write(byteline(0, splitpoint, nl, q))
                    write(byteline(splitpoint, 256, nl, q))
            try:
                jid = subjobs.build("csvimport",
                                    options=dict(
                                        filename=resolve_jobid_filename(
                                            params.jobid, filename),
                                        quotes=q_b.decode("iso-8859-1"),
                                        newline=nl_b.decode("iso-8859-1"),
                                        separator='',
                                        labelsonfirstline=False,
                                        labels=["data"],
                                    ))
            except JobError:
                raise Exception("Importing %r failed" % (filename, ))
            got_c = Counter(Dataset(jid).iterate(None, "data"))
            assert got_c == wrote_c, "Importing %r (%s) gave wrong contents" % (
                filename,
                jid,
            )
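The writer above escapes a quote character inside a quoted value by doubling it. A minimal standalone example of that convention:

# Doubling the quote character escapes it inside a quoted field.
q = b'"'
value = b'say "hi"'
print(q + value.replace(q, q + q) + q)  # b'"say ""hi"""'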
Example #9
def check_one(slices, key, source, reverse=False):
	jid = subjobs.build(
		"dataset_sort",
		options=dict(
			sort_columns=key,
			sort_order="descending" if reverse else "ascending",
		),
		datasets=dict(source=source),
	)
	ds = Dataset(jid)
	key_off = sorted(test_data.data).index(key)
	# This provides better separation than the replacement values
	# used in the actual sort method (but this is slow).
	if 'date' in key or 'time' in key:
		nonepos = 1
	else:
		nonepos = -1
	def cmp(a, b):
		a = a[key_off]
		b = b[key_off]
		if a is None:
			if b is None:
				return 0
			return nonepos
		if b is None:
			return -nonepos
		if isinstance(a, float):
			if isnan(a):
				if isnan(b):
					return 0
				return 1
			if isnan(b):
				return -1
		if a < b:
			return -1
		return a > b
	keycmp = cmp_to_key(cmp)
	for sliceno in range(slices):
		good = sorted(test_data.sort_data_for_slice(sliceno), key=keycmp, reverse=reverse)
		check = list(ds.iterate(sliceno))
		assert unnan(check) == unnan(good), "Slice %d sorted on %s bad (%s)" % (sliceno, key, jid,)
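The comparator is turned into a sort key with functools.cmp_to_key. A standalone sketch of the same pattern, simplified so that None always sorts last:

from functools import cmp_to_key

def cmp_none_last(a, b):
	# Three-way comparison with None sorted after everything else.
	if a is None:
		return 0 if b is None else 1
	if b is None:
		return -1
	return (a > b) - (a < b)

print(sorted([3, None, 1, 2], key=cmp_to_key(cmp_none_last)))
# -> [1, 2, 3, None]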
Example #10
def check_one(slices, key, source, reverse=False):
    jid = subjobs.build(
        "dataset_sort",
        options=dict(
            sort_columns=key,
            sort_order="descending" if reverse else "ascending",
        ),
        datasets=dict(source=source),
    )
    ds = Dataset(jid)
    key_off = sorted(test_data.data).index(key)
    for sliceno in range(slices):
        good = sorted(test_data.sort_data_for_slice(sliceno),
                      key=itemgetter(key_off),
                      reverse=reverse)
        check = list(ds.iterate(sliceno))
        assert check == good, "Slice %d sorted on %s bad (%s)" % (
            sliceno,
            key,
            jid,
        )
def verify_ds(options, d, filename):
    jid = subjobs.build("csvimport", options=options)
    # Order varies depending on slice count, so we use a dict {ix: data}
    for ix, a, b in Dataset(jid).iterate(None, ["ix", "0", "1"]):
        try:
            ix = int(ix)
        except ValueError:
            # We have a few non-numeric ones
            pass
        assert ix in d, "Bad index %r in %r (%s)" % (ix, filename, jid)
        assert a == b == d[ix], "Wrong data for line %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        del d[ix]
    assert not d, "Not all lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d.keys()),
    )
Example #12
def synthesis(analysis_res, params):
    badnesses = next(analysis_res)
    for tmp in analysis_res:
        badnesses = {k: max(badnesses[k], tmp[k]) for k in tmp}
    badness2type = {
        0: "number",  # this used to be int64_10
        1: "number",  # and this used to be float64
        2: "ascii:encode",
    }
    types = {k: badness2type[v] for k, v in badnesses.items()}
    types.update(options.column2type)
    sub_opts = dict(
        column2type=types,
        defaults=options.defaults,
        rename=options.rename,
        caption=options.caption,
        discard_untyped=options.discard_untyped,
        filter_bad=options.filter_bad,
        numeric_comma=options.numeric_comma,
    )
    jid = build("dataset_type", options=sub_opts, datasets=datasets)
    Dataset(jid).link_to_here()
Example #13
def verify(slices, data, source, previous=None, **options):
    jid = subjobs.build(
        "dataset_rehash",
        datasets=dict(source=source, previous=previous),
        options=options,
    )
    hl = options["hashlabel"]
    h = typed_writer(columns[hl]).hash
    ds = Dataset(jid)
    good = {row[hl]: row for row in data}
    names = list(data[0])
    for slice in range(slices):
        for row in ds.iterate_chain(slice, names):
            row = dict(zip(names, row))
            assert h(
                row[hl]
            ) % slices == slice, "row %r is incorrectly in slice %d in %s" % (
                row, slice, ds)
            want = good[row[hl]]
            assert row == want, '%s (rehashed from %s) did not contain the right data for "%s".\nWanted\n%r\ngot\n%r' % (
                ds, source, hl, want, row)
    return ds
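The inner assert checks the rehash invariant: every row must end up in the slice selected by hashing its hashlabel value modulo the slice count. A conceptual sketch in plain Python (built-in hash, not the accelerator's typed_writer hash):

# Partition rows by hash(key) % slices, then verify each row landed in the
# slice its hash selects, which is what the loop above asserts per row.
slices = 3
rows = [{"id": k, "v": k * 10} for k in range(9)]
parts = [[] for _ in range(slices)]
for row in rows:
    parts[hash(row["id"]) % slices].append(row)
for sliceno, part in enumerate(parts):
    for row in part:
        assert hash(row["id"]) % slices == sliceno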
Example #14
def do_one(params, name, data):
	dw = DatasetWriter(name=name, columns=columns)
	dw.set_slice(0)
	for v in data:
		if v is None:
			d = dict(
				ascii_new=None,
				ascii_old=None,
				bytes_new=None,
				bytes_old=None,
				unicode_new=None,
				unicode_old=None,
			)
		else:
			d = dict(
				ascii_new=v,
				ascii_old=v,
				bytes_new=uni(v).encode("ascii"),
				bytes_old=uni(v).encode("ascii"),
				unicode_new=uni(v),
				unicode_old=uni(v),
			)
		dw.write_dict(d)
	# We don't really want the other slices, but write one thing to
	# each, to make sure it doesn't show up in slice 0.
	# (Small slice merging will put it in the same file, so this is
	# a real risk.)
	for sliceno in range(1, params.slices):
		dw.set_slice(sliceno)
		dw.write_dict(d)
	dw.finish()

	# verify we got what we asked for
	me_ds = Dataset(params.jobid, name)
	for colname, coltype in columns.items():
		col = me_ds.columns[colname]
		assert col.type == coltype.split("_")[-1], colname
		assert col.backing_type == coltype, colname
		for want, got in zip(data, me_ds.iterate(0, colname)):
			if want is not None:
				if PY2 and "unicode" in coltype:
					want = uni(want)
				if PY3 and "bytes" in coltype:
					want = want.encode("ascii")
			assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)

	# check that both types of bytes filter correctly through typing
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="number", # fails on the string, so that gets filtered out everywhere
			bytes_new="bytes",
			bytes_old="bytes",
		),
		filter_bad=True,
	))
	ds = Dataset(jid)
	# verify the number first
	data_it = iter(raw_data)
	next(data_it) # skip the filtered out string
	for got in ds.iterate(0, "ascii_old"):
		want = next(data_it)
		if want is None:
			# Becomes 0 because the typer (unfortunately) sees it as an empty string
			want = 0
		assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
	# now verify all the bytes ones are ok, no longer containing the string.
	for colname in ("ascii_new", "bytes_new", "bytes_old",):
		data_it = iter(data)
		next(data_it) # skip the filtered out string
		for got in ds.iterate(0, colname):
			want = next(data_it)
			if want is not None:
				want = want.encode("ascii")
			assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)

	# and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
	jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
		column2type=dict(
			ascii_new="bytes",
			ascii_old="bytes",
			bytes_new="unicode:ascii",
			bytes_old="unicode:ascii",
		),
	))
	ds = Dataset(jid)
	for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
		for want, got in ds.iterate(0, ["unicode_new", colname]):
			assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)
Example #15
def verify_ds(options, d, d_bad, d_skipped, filename):
    jid = subjobs.build("csvimport", options=options)
    # Order varies depending on slice count, so we use a dict {ix: data}
    for ix, a, b in Dataset(jid).iterate(None, ["ix", "0", "1"]):
        try:
            ix = int(ix)
        except ValueError:
            # We have a few non-numeric ones
            pass
        assert ix in d, "Bad index %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        assert a == b == d[ix], "Wrong data for line %r in %r (%s)" % (
            ix,
            filename,
            jid,
        )
        del d[ix]
    assert not d, "Not all lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d.keys()),
    )
    if options.get("allow_bad"):
        for ix, data in Dataset(jid, "bad").iterate(None, ["lineno", "data"]):
            assert ix in d_bad, "Bad bad_lineno %d in %r (%s/bad) %r" % (
                ix,
                filename,
                jid,
                data,
            )
            assert data == d_bad[
                ix], "Wrong saved bad line %d in %r (%s/bad).\nWanted %r.\nGot    %r." % (
                    ix,
                    filename,
                    jid,
                    d_bad[ix],
                    data,
                )
            del d_bad[ix]
    assert not d_bad, "Not all bad lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d_bad.keys()),
    )

    if options.get("comment") or options.get("skip_lines"):
        for ix, data in Dataset(jid,
                                "skipped").iterate(None, ["lineno", "data"]):
            assert ix in d_skipped, "Bad skipped_lineno %d in %r (%s/skipped) %r" % (
                ix,
                filename,
                jid,
                data,
            )
            assert data == d_skipped[
                ix], "Wrong saved skipped line %d in %r (%s/skipped).\nWanted %r.\nGot    %r." % (
                    ix,
                    filename,
                    jid,
                    d_skipped[ix],
                    data,
                )
            del d_skipped[ix]
    assert not d_skipped, "Not all skipped lines returned from %r (%s), %r missing" % (
        filename,
        jid,
        set(d_skipped.keys()),
    )
Example #16
def require_failure(name, options):
    try:
        subjobs.build("csvimport", options=options)
    except JobError:
        return
    raise Exception("File with %s was imported without error." % (name, ))
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    if callable(want):
        check = want
    else:

        def check(got, fromstr, filtered=False):
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (
                want1,
                got,
                fromstr,
            )

    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type',
                                datasets=dict(source=bytes_ds),
                                options=opts)
        except JobError:
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (
                bytes_ds,
                typ,
            ))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (
            bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (
            typed_ds,
            typ,
            bytes_ds,
        ))
        if 'filter_bad' not in opts and not callable(want):
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type',
                                datasets=dict(source=bytes_ds),
                                options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(
                got,
                '%s (typed as %s from %r with every other line skipped from filter_bad)'
                % (
                    typed_ds,
                    typ,
                    bytes_ds,
                ), True)
        used_type(typ)
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        (
            True,
            b'first',
            b'1.1',
            '1',
            '"a"',
            '001',
            b'ett',
        ),
        (
            True,
            b'second',
            b'2.2',
            '2',
            '"b"',
            '02',
            b'tv\xc3\xa5',
        ),
        (
            True,
            b'third',
            b'3.3',
            '3',
            '["c"]',
            '3.0',
            b'tre',
        ),
        (
            False,
            b'fourth',
            b'4.4',
            '4',
            '"d"',
            '4.4',
            b'fyra',
        ),  # number:int bad
        (
            False,
            b'fifth',
            b'5.5',
            '-',
            '"e"',
            '5',
            b'fem',
        ),  # int32_10 bad
        (
            False,
            b'sixth',
            b'6.b',
            '6',
            '"f"',
            '6',
            b'sex',
        ),  # float64 bad
        [
            False,
            b'seventh',
            b'7.7',
            '7',
            '{"g"}',
            '7',
            b'sju',
        ],  # json bad
        (
            False,
            b'eigth',
            b'8.8',
            '8',
            '"h"',
            '8',
            b'\xa5\xc3tta',
        ),  # unicode:utf-8 bad
        (
            True,
            b'ninth',
            b'9.9',
            '9',
            '"i"',
            '9',
            b'nio',
        ),
        (
            True,
            b'tenth',
            b'10',
            '10',
            '"j"',
            '10',
            b'tio',
        ),
        (
            False,
            b'eleventh',
            b'11a',
            '1-',
            '"k",',
            '1,',
            b'elva',
        ),  # float64, int32_10 and number:int bad
        (
            True,
            b'twelfth',
            b'12',
            '12',
            '"l"',
            '12',
            b'tolv',
        ),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []

    def add_want(v):
        want.append((
            int(v[3]),
            v[1],
            json.loads(v[4]),
            v[6].decode('utf-8'),
        ))

    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    for sliceno in range(1, g.SLICES):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t
                                      for t in columns},
                         filter_bad=True,
                         defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(
            typed_ds.iterate(0,
                             ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds,
            ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # adding them out of order, int32_10 sorts correctly.
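filter_bad drops a whole line when any of its columns fails to type, unless a default rescues that column, which is what the second lap above relies on. A conceptual sketch in plain Python (not the dataset_type implementation):

# A line survives only if every column converts; None marks a failed column.
def try_int(s, default=None):
    try:
        return int(s)
    except ValueError:
        return default

lines = [("1", "2"), ("x", "3"), ("4", "5")]
typed = [tuple(try_int(v) for v in line) for line in lines]
print([line for line in typed if None not in line])  # [(1, 2), (4, 5)]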
def ck(jid, method="dataset_checksum", **kw):
    jid = subjobs.build(method, datasets=dict(source=jid), options=kw)
    return blob.load(jobid=jid).sum
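A hypothetical use of ck (job ids invented for illustration): two datasets with identical content should produce identical checksums.

assert ck("jid-a") == ck("jid-b")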