def synthesis(prepare_res, slices, job):
    dw_passed, _ = prepare_res
    # Using set_slice on a dataset that was written in analysis is not
    # actually supported, but since it currently works (as long as that
    # particular slice wasn't written in analysis), let's test it.
    dw_passed.set_slice(0)
    dw_passed.write(**{k: v[0] for k, v in test_data.data.items()})
    dw_synthesis_split = DatasetWriter(name="synthesis_split", hashlabel="a")
    dw_synthesis_split.add("a", "int32")
    dw_synthesis_split.add("b", "unicode")
    dw_synthesis_split.get_split_write()(1, "a")
    dw_synthesis_split.get_split_write_list()([2, "b"])
    dw_synthesis_split.get_split_write_dict()({"a": 3, "b": "c"})
    dw_synthesis_manual = job.datasetwriter(name="synthesis_manual",
                                            columns={"sliceno": "int32"})
    dw_nonetest = job.datasetwriter(name="nonetest",
                                    columns={t: t
                                             for t in test_data.data})
    for sliceno in range(slices):
        dw_synthesis_manual.set_slice(sliceno)
        dw_synthesis_manual.write(sliceno)
        dw_nonetest.set_slice(sliceno)
        dw_nonetest.write(
            **{
                k: v[0] if k in test_data.not_none_capable else None
                for k, v in test_data.data.items()
            })
def synthesis(params):
    ds = write(data)
    for colname in data[0]:
        verify(params.slices, data, ds, hashlabel=colname)
    # ok, all the hashing stuff works out, let's test the chaining options.
    bonus_ds = write(bonus_data, name="bonus", previous=ds)
    # no chaining options - full chain
    verify(params.slices, data + bonus_data, bonus_ds, hashlabel="date")
    # just the bonus ds
    verify(params.slices, bonus_data, bonus_ds, hashlabel="date", length=1)
    # built as a chain
    verify(params.slices,
           data + bonus_data,
           bonus_ds,
           hashlabel="date",
           as_chain=True)
    # normal chaining
    a = verify(params.slices, data, ds, hashlabel="date")
    b = verify(params.slices,
               data + bonus_data,
               bonus_ds,
               hashlabel="date",
               previous=a)
    assert b.chain() == [a, b], "chain of %s is not [%s, %s] as expected" % (b, a, b)
    # as_chain sparseness
    dw = DatasetWriter(columns=columns, name="empty")
    dw.get_split_write()
    ds = verify(params.slices, [],
                dw.finish(),
                hashlabel="date",
                as_chain=True)
    assert len(ds.chain()) == 1, (
        ds + ": dataset_hashpart on empty dataset with as_chain=True did not produce a single dataset"
    )
    # two populated slices (0 and 2) with the same data should end up in two datasets.
    dw = DatasetWriter(columns=columns, name="0 and 2")
    dw.set_slice(0)
    dw.write_dict(data[0])
    dw.set_slice(1)
    dw.set_slice(2)
    dw.write_dict(data[0])
    for s in range(3, params.slices):
        dw.set_slice(s)
    ds = verify(params.slices, [data[0]],
                dw.finish(),
                hashlabel="date",
                as_chain=True)
    got_slices = len(ds.chain())
    assert got_slices == 2, "%s (built with as_chain=True) has %d datasets in chain, expected 2." % (
        ds,
        got_slices,
    )
def synthesis(job):
	dw = DatasetWriter(name='empty', columns={'v': 'ascii'})
	dw.get_split_write()
	empty_ds = dw.finish()
	assert empty_ds.min('non-existent column') is empty_ds.max('non-existent column') is None, 'Dataset.min/max() broken for non-existent columns'
	for typ, groups in tests.items():
		t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=empty_ds).dataset()
		minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)

		if minmax != (None, None):
			raise Exception('Typing empty dataset as %s did not give minmax == None, gave %r' % (typ, minmax,))
		all_names = list(chain.from_iterable(groupdata[group].keys() for group in groups))
		# just 1 and 2, so we don't make way too many
		for num_groups in (1, 2,):
			for names in combinations(all_names, num_groups):
				ds, mn, mx = make_source(names)
				t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=ds).dataset()
				got_minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
				want_minmax = (mn, mx)
				chk_minmax(got_minmax, want_minmax, 'Typing %s as %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, typ, want_minmax, got_minmax, t_ds,))
				chk_minmax(got_minmax, (t_ds.min('v'), t_ds.max('v')), 'Dataset.min/max() broken on ' + t_ds)
				# verify writing the same data normally also gives the correct result
				dw = DatasetWriter(name='rewrite ' + t_ds, columns=t_ds.columns)
				write = dw.get_split_write()
				for v in t_ds.iterate(None, 'v'):
					write(v)
				re_ds = dw.finish()
				got_minmax = (re_ds.columns['v'].min, re_ds.columns['v'].max)
				want_minmax = (mn, mx)
				chk_minmax(got_minmax, want_minmax, 'Rewriting %s gave the wrong minmax: expected %r, got %r (in %s)' % (t_ds, want_minmax, got_minmax, re_ds,))

	# make sure renaming doesn't mix anything up
	dw = DatasetWriter(name='rename', columns={'a': 'ascii', 'b': 'ascii'})
	write = dw.get_split_write()
	write('5', '3')
	write('7', 'oops')
	ds = dw.finish()
	t_ds = subjobs.build(
		'dataset_type',
		column2type=dict(num='number', int='int32_10'),
		defaults=dict(num='1', int='2'),
		rename=dict(a='num', b='int'),
		source=ds,
	).dataset()
	for name, want_minmax in (
		('num', (5, 7)),
		('int', (2, 3)),
	):
		got_minmax = (t_ds.columns[name].min, t_ds.columns[name].max)
		msg = 'Typing %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, want_minmax, got_minmax, t_ds,)
		chk_minmax(got_minmax, want_minmax, msg)
def synthesis():
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    w = dw_a.get_split_write()
    w(3)
    w(2)
    w = dw_b.get_split_write()
    w(2)
    w(1)
    w = dw_c.get_split_write()
    w(0)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()

    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )

    # sort as a chain
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]

    # sort all as a single dataset
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]

    # merge b and c but not a
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=sorted_a))
    # test with new style job.dataset
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]
def mkds(name, columns, data, **kw):
    columns = dict.fromkeys(columns, 'int32')
    dw = DatasetWriter(name=name, columns=columns, **kw)
    write = dw.get_split_write()
    for v in data:
        write(*v)
    return dw.finish()
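For orientation, a hypothetical call to the mkds() helper above; the dataset names, column names and rows are made up, and any extra keyword arguments (such as previous=) are passed straight through to DatasetWriter:

# hypothetical usage of the mkds() helper; names and values are illustrative only
first = mkds("demo_a", ["x", "y"], [(1, 10), (2, 20)])
second = mkds("demo_b", ["x", "y"], [(3, 30)], previous=first)  # chain demo_b after demo_a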
def synthesis(job):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix, )
        prev = ds.link_to_here(name, override_previous=prev)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(job, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(job, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in prev._data:
            going = False
        name = "longchain%d" % (ix, )
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        prev = dw.finish()
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[
        -2]._data  # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data  # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(job, "nocache")._data
    assert "cache" in Dataset(job, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(job, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(job, "withcache").iterate_chain(
        None, "ix")) == list(range(ix - 2)) + [ix - 1]
def make_source(names):
    names = sorted(names)
    dsname = '+'.join(names)
    if dsname not in sources:
        dw = DatasetWriter(name=dsname, columns={'v': 'ascii'})
        write = dw.get_split_write()
        for name in names:
            for value in data[name][0]:
                write(value)
        sources[dsname] = (
            dw.finish(),
            min(unnan(data[name][1] for name in names)),
            max(unnan(data[name][2] for name in names)),
        )
    return sources[dsname]
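Similarly, a hypothetical call to the make_source() helper above; the group names are assumptions and must exist as keys in the module-level data dict the helper reads from:

# hypothetical usage; 'float' and 'int' are assumed group names in the module-level data dict
ds, mn, mx = make_source(['float', 'int'])  # dataset plus the expected min/max over both groups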
def test_column_discarding():
	dw = DatasetWriter(name='column discarding')
	dw.add('a', 'bytes')
	dw.add('b', 'bytes')
	dw.add('c', 'bytes')
	w = dw.get_split_write()
	w(b'a', b'b', b'c')
	source = dw.finish()

	# Discard b because it's not typed
	ac_implicit = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		discard_untyped=True,
	).dataset()
	assert sorted(ac_implicit.columns) == ['a', 'c'], '%s: %r' % (ac_implicit, sorted(ac_implicit.columns),)
	assert list(ac_implicit.iterate(None)) == [('a', 'c',)], ac_implicit

	# Discard b explicitly
	ac_explicit = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		rename=dict(b=None),
	).dataset()
	assert sorted(ac_explicit.columns) == ['a', 'c'], '%s: %r' % (ac_explicit, sorted(ac_explicit.columns),)
	assert list(ac_explicit.iterate(None)) == [('a', 'c',)], ac_explicit

	# Discard c by overwriting it with b. Keep untyped b.
	ac_bASc = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		rename=dict(b='c'),
	).dataset()
	assert sorted(ac_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (ac_bASc, sorted(ac_bASc.columns),)
	assert list(ac_bASc.iterate(None)) == [('a', b'b', 'b',)], ac_bASc

	# Discard c by overwriting it with b. Also type b as a different type.
	abc_bASc = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', b='strbool', c='ascii'),
		rename=dict(b='c'),
	).dataset()
	assert sorted(abc_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (abc_bASc, sorted(abc_bASc.columns),)
	assert list(abc_bASc.iterate(None)) == [('a', True, 'b',)], abc_bASc
def test_rehash_with_empty_slices():
    dw = DatasetWriter(name='rehash with empty slices', hashlabel='a')
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    w = dw.get_split_write()
    w('a', '42')
    w('42', 'b')
    source = dw.finish()
    hashfunc = typed_writer('int32').hash

    def verify_hashing(caption, want_values, **kw):
        ds = subjobs.build('dataset_type',
                           source=source,
                           column2type=dict(a='int32_10'),
                           caption=caption,
                           **kw).dataset()
        got_values = set()
        for sliceno in range(g.slices):
            for got in ds.iterate(sliceno):
                assert hashfunc(got[0]) % g.slices == sliceno
                assert got not in got_values
                got_values.add(got)
        assert want_values == got_values

    verify_hashing('with discard', {(42, 'b')}, filter_bad=True)
    # using defaults uses some different code paths
    verify_hashing(
        'with default=0 (probably two slices)',
        {(0, '42'), (42, 'b')},
        defaults=dict(a='0'),
    )
    verify_hashing(
        'with default=42 (one slice)',
        {(42, '42'), (42, 'b')},
        defaults=dict(a='42'),
    )
def prepare():
	dw = DatasetWriter(columns={"data": "ascii"})
	write = dw.get_split_write()
	write("foo")
	write("bar")