Example #1
def test_filter_bad_with_rename_and_chain():
    dw = DatasetWriter(name="filter bad with rename",
                       allow_missing_slices=True)
    dw.add('a', 'ascii')
    dw.add('b', 'bytes')
    dw.add('c', 'unicode')
    dw.set_slice(0)
    dw.write('0', b'1', '2')
    dw.write('9', b'A', 'B')
    dw.write('C', b'D', 'E')
    source_ds = dw.finish()
    jid = subjobs.build(
        'dataset_type',
        column2type=dict(b='int32_10', c='int64_16', d='int32_16'),
        filter_bad=True,
        rename=dict(a='b', b='c', c='d'),
        source=source_ds,
    )
    typed_ds = jid.dataset()
    coltypes = sorted(
        (name, col.type) for name, col in typed_ds.columns.items())
    assert coltypes == [('b', 'int32'), ('c', 'int64'),
                        ('d', 'int32')], coltypes
    assert list(typed_ds.iterate(0)) == [(0, 1, 2), (9, 10, 11)]
    bad_ds = jid.dataset('bad')
    coltypes = sorted((name, col.type) for name, col in bad_ds.columns.items())
    assert coltypes == [('b', 'ascii'), ('c', 'bytes'),
                        ('d', 'unicode')], coltypes
    assert list(bad_ds.iterate(0)) == [('C', b'D', 'E')]

    dw = DatasetWriter(name="filter bad with rename chain",
                       allow_missing_slices=True,
                       previous=source_ds)
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    dw.add('c', 'ascii')
    dw.set_slice(0)
    dw.write('3', '4', '5')
    dw.write('6', '7', 'eight')
    source_ds = dw.finish()
    jid = subjobs.build(
        'dataset_type',
        column2type=dict(a='number', b='int32_10', c='int64_10'),
        defaults=dict(a='8'),
        filter_bad=True,
        rename=dict(a='b', b='c', c='a'),
        source=source_ds,
    )
    typed_ds = jid.dataset()
    coltypes = sorted(
        (name, col.type) for name, col in typed_ds.columns.items())
    assert coltypes == [('a', 'number'), ('b', 'int32'),
                        ('c', 'int64')], coltypes
    assert list(typed_ds.iterate(0)) == [(2, 0, 1), (5, 3, 4), (8, 6, 7)]
    bad_ds = jid.dataset('bad')
    coltypes = sorted((name, col.type) for name, col in bad_ds.columns.items())
    assert coltypes == [('a', 'unicode'), ('b', 'ascii'),
                        ('c', 'bytes')], coltypes
    assert list(bad_ds.iterate(0)) == [('B', '9', b'A'), ('E', 'C', b'D')]
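
# The rename mapping above is applied to the source column names before
# column2type is consulted, which is why the typed columns come out as b, c
# and d. A minimal sketch of that lookup order using plain dicts (an
# illustration only, not the accelerator's internals):
rename = dict(a='b', b='c', c='d')
column2type = dict(b='int32_10', c='int64_16', d='int32_16')
for src in ('a', 'b', 'c'):
    dst = rename.get(src, src)    # source name -> name in the typed dataset
    typ = column2type.get(dst)    # the type is looked up under the new name
    print(src, '->', dst, 'typed as', typ)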
def synthesis(job):
	dw = DatasetWriter(name='empty', columns={'v': 'ascii'})
	dw.get_split_write()
	empty_ds = dw.finish()
	assert empty_ds.min('non-existent column') is empty_ds.max('non-existent column') is None, 'Dataset.min/max() broken for non-existent columns'
	for typ, groups in tests.items():
		t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=empty_ds).dataset()
		minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)

		if minmax != (None, None):
			raise Exception('Typing empty dataset as %s did not give minmax == None, gave %r' % (typ, minmax,))
		all_names = list(chain.from_iterable(groupdata[group].keys() for group in groups))
		# just 1 and 2, so we don't make way too many
		for num_groups in (1, 2,):
			for names in combinations(all_names, num_groups):
				ds, mn, mx = make_source(names)
				t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=ds).dataset()
				got_minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
				want_minmax = (mn, mx)
				chk_minmax(got_minmax, want_minmax, 'Typing %s as %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, typ, want_minmax, got_minmax, t_ds,))
				chk_minmax(got_minmax, (t_ds.min('v'), t_ds.max('v')), 'Dataset.min/max() broken on ' + t_ds)
				# verify writing the same data normally also gives the correct result
				dw = DatasetWriter(name='rewrite ' + t_ds, columns=t_ds.columns)
				write = dw.get_split_write()
				for v in t_ds.iterate(None, 'v'):
					write(v)
				re_ds = dw.finish()
				got_minmax = (re_ds.columns['v'].min, re_ds.columns['v'].max)
				want_minmax = (mn, mx)
				chk_minmax(got_minmax, want_minmax, 'Rewriting %s gave the wrong minmax: expected %r, got %r (in %s)' % (t_ds, want_minmax, got_minmax, re_ds,))

	# make sure renaming doesn't mix anything up
	dw = DatasetWriter(name='rename', columns={'a': 'ascii', 'b': 'ascii'})
	write = dw.get_split_write()
	write('5', '3')
	write('7', 'oops')
	ds = dw.finish()
	t_ds = subjobs.build(
		'dataset_type',
		column2type=dict(num='number', int='int32_10'),
		defaults=dict(num='1', int='2'),
		rename=dict(a='num', b='int'),
		source=ds,
	).dataset()
	for name, want_minmax in (
		('num', (5, 7)),
		('int', (2, 3)),
	):
		got_minmax = (t_ds.columns[name].min, t_ds.columns[name].max)
		msg = 'Typing %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, want_minmax, got_minmax, t_ds,)
		chk_minmax(got_minmax, want_minmax, msg)
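
# Typed datasets carry per-column min/max metadata, reachable both as
# ds.columns[name].min/.max and as ds.min(name)/ds.max(name), which is what
# the checks above rely on. A minimal sketch using the same DatasetWriter API
# (the dataset name and values here are made up):
dw = DatasetWriter(name='minmax demo', columns={'v': 'int32'})
write = dw.get_split_write()
for v in (3, 1, 2):
    write(v)
ds = dw.finish()
assert (ds.min('v'), ds.max('v')) == (1, 3)
assert (ds.columns['v'].min, ds.columns['v'].max) == (1, 3)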
Example #3
def synthesis():
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    w = dw_a.get_split_write()
    w(3)
    w(2)
    w = dw_b.get_split_write()
    w(2)
    w(1)
    w = dw_c.get_split_write()
    w(0)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()

    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )

    # sort as a chain
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]

    # sort all as a single dataset
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]

    # merge b and c but not a
    jid = subjobs.build('dataset_sort',
                        options=opts,
                        datasets=dict(source=c, previous=sorted_a))
    # test with new style job.dataset
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]
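
# iterate() reads only the dataset itself while iterate_chain() also follows
# the previous links, which is what the asserts above distinguish. A minimal
# sketch of the same behaviour (dataset names and values are made up):
dw = DatasetWriter(name='chain demo a', columns={'num': 'int32'})
dw.get_split_write()(1)
first = dw.finish()
dw = DatasetWriter(name='chain demo b', columns={'num': 'int32'}, previous=first)
dw.get_split_write()(2)
second = dw.finish()
assert list(second.iterate(None, 'num')) == [2]           # this dataset only
assert list(second.iterate_chain(None, 'num')) == [1, 2]  # follows previous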
def mkds(name, columns, data, **kw):
    columns = dict.fromkeys(columns, 'int32')
    dw = DatasetWriter(name=name, columns=columns, **kw)
    write = dw.get_split_write()
    for v in data:
        write(*v)
    return dw.finish()
def test_filter_bad_across_types():
	columns={
		'bytes': 'bytes',
		'float64': 'bytes',
		'int32_10': 'ascii',
		'json': 'unicode',
		'number:int': 'unicode',
		'unicode:utf-8': 'bytes',
	}
	# all_good, *values
	# Make sure all those types (except bytes) can filter other lines,
	# and be filtered by other lines. And that several filtering values
	# is not a problem (line 11).
	data = [
		(True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
		(True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
		(True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
		(False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),       # number:int bad
		(False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),        # int32_10 bad
		(False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),        # float64 bad
		[False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],        # json bad
		(False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),# unicode:utf-8 bad
		(True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
		(True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
		(False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),       # float64, int32_10 and number:int bad
		(True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
	]
	dw = DatasetWriter(name="filter bad across types", columns=columns)
	dw.set_slice(0)
	want = []
	def add_want(v):
		want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
	for v in data:
		if v[0]:
			add_want(v)
		dw.write(*v[1:])
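	# the remaining slices get no data, but still have to be visited before finish()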
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	source_ds = dw.finish()
	# Once with just filter_bad, once with some defaults too.
	defaults = {}
	for _ in range(2):
		jid = subjobs.build(
			'dataset_type',
			datasets=dict(source=source_ds),
			options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
		)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
		assert got == want, "Exptected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
		# make more lines "ok" for the second lap
		defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
		add_want(data[3])
		add_want(data[5])
		data[6][4] = '"replacement"'
		add_want(data[6])
		want.sort() # adding them out of order, int32_10 sorts correctly.
Example #6
def make_source(names):
    names = sorted(names)
    dsname = '+'.join(names)
    if dsname not in sources:
        dw = DatasetWriter(name=dsname, columns={'v': 'ascii'})
        write = dw.get_split_write()
        for name in names:
            for value in data[name][0]:
                write(value)
        sources[dsname] = (
            dw.finish(),
            min(unnan(data[name][1] for name in names)),
            max(unnan(data[name][2] for name in names)),
        )
    return sources[dsname]
def test_column_discarding():
	dw = DatasetWriter(name='column discarding')
	dw.add('a', 'bytes')
	dw.add('b', 'bytes')
	dw.add('c', 'bytes')
	w = dw.get_split_write()
	w(b'a', b'b', b'c')
	source = dw.finish()

	# Discard b because it's not typed
	ac_implicit = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		discard_untyped=True,
	).dataset()
	assert sorted(ac_implicit.columns) == ['a', 'c'], '%s: %r' % (ac_implicit, sorted(ac_implicit.columns),)
	assert list(ac_implicit.iterate(None)) == [('a', 'c',)], ac_implicit

	# Discard b explicitly
	ac_explicit = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		rename=dict(b=None),
	).dataset()
	assert sorted(ac_explicit.columns) == ['a', 'c'], '%s: %r' % (ac_explicit, sorted(ac_explicit.columns),)
	assert list(ac_explicit.iterate(None)) == [('a', 'c',)], ac_explicit

	# Discard c by overwriting it with b. Keep untyped b.
	ac_bASc = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', c='ascii'),
		rename=dict(b='c'),
	).dataset()
	assert sorted(ac_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (ac_bASc, sorted(ac_bASc.columns),)
	assert list(ac_bASc.iterate(None)) == [('a', b'b', 'b',)], ac_bASc

	# Discard c by overwriting it with b. Also type b as a different type.
	abc_bASc = subjobs.build(
		'dataset_type',
		source=source,
		column2type=dict(a='ascii', b='strbool', c='ascii'),
		rename=dict(b='c'),
	).dataset()
	assert sorted(abc_bASc.columns) == ['a', 'b', 'c'], '%s: %r' % (abc_bASc, sorted(abc_bASc.columns),)
	assert list(abc_bASc.iterate(None)) == [('a', True, 'b',)], abc_bASc
Example #8
def test_rehash_with_empty_slices():
    dw = DatasetWriter(name='rehash with empty slices', hashlabel='a')
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    w = dw.get_split_write()
    w('a', '42')
    w('42', 'b')
    source = dw.finish()
    hashfunc = typed_writer('int32').hash

    def verify_hashing(caption, want_values, **kw):
        ds = subjobs.build('dataset_type',
                           source=source,
                           column2type=dict(a='int32_10'),
                           caption=caption,
                           **kw).dataset()
        got_values = set()
        for sliceno in range(g.slices):
            for got in ds.iterate(sliceno):
                assert hashfunc(got[0]) % g.slices == sliceno
                assert got not in got_values
                got_values.add(got)
        assert want_values == got_values

    verify_hashing('with discard', {(42, 'b')}, filter_bad=True)
    # using defaults uses some different code paths
    verify_hashing('with default=0 (probably two slices)',
                   {(0, '42'), (42, 'b')},
                   defaults=dict(a='0'))
    verify_hashing('with default=42 (one slice)',
                   {(42, '42'), (42, 'b')},
                   defaults=dict(a='42'))
def synthesis(job):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix, )
        prev = ds.link_to_here(name, override_previous=prev)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(job, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(job, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in prev._data:
            going = False
        name = "longchain%d" % (ix, )
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        prev = dw.finish()
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[
        -2]._data  # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data  # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(job, "nocache")._data
    assert "cache" in Dataset(job, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(job, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(job, "withcache").iterate_chain(
        None, "ix")) == list(range(ix - 2)) + [ix - 1]
def _verify(name, types, data, coltype, want, default, want_fail, kw):
	if callable(want):
		check = want
	else:
		def check(got, fromstr, filtered=False):
			want1 = want if isinstance(want, list) else want[typ]
			if filtered:
				want1 = want1[::2]
			assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
	dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
	dw.set_slice(0)
	for ix, v in enumerate(data):
		dw.write(v, b'1' if ix % 2 == 0 else b'skip')
	for sliceno in range(1, g.slices):
		dw.set_slice(sliceno)
	bytes_ds = dw.finish()
	for typ in types:
		opts = dict(column2type=dict(data=typ))
		opts.update(kw)
		if default is not no_default:
			opts['defaults'] = {'data': default}
		try:
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
		except JobError:
			if want_fail:
				continue
			raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
		assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
		typed_ds = Dataset(jid)
		got = list(typed_ds.iterate(0, 'data'))
		check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
		if 'filter_bad' not in opts and not callable(want):
			opts['filter_bad'] = True
			opts['column2type']['extra'] = 'int32_10'
			jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
			typed_ds = Dataset(jid)
			got = list(typed_ds.iterate(0, 'data'))
			check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
		used_type(typ)
Example #11
def write(data, **kw):
    dw = DatasetWriter(columns=columns, **kw)
    w = dw.get_split_write_dict()
    for values in data:
        w(values)
    return dw.finish()
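
# The helper above assumes a module-level 'columns' dict. A hypothetical usage
# sketch (column names and values are made up):
columns = {'name': 'ascii', 'count': 'int32'}
ds = write([{'name': 'a', 'count': 1}, {'name': 'b', 'count': 2}], name='demo')
for name, count in ds.iterate(None, ['name', 'count']):
    print(name, count)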
Example #12
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        [True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett'],
        [True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5'],
        [True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre'],
        [False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra'],         # number:int bad
        [False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem'],          # int32_10 bad
        [False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex'],          # float64 bad
        [False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju'],          # json bad
        [False, b'eigth',    b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta'],  # unicode:utf-8 bad
        [True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio'],
        [True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio'],
        [False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva'],         # float64, int32_10 and number:int bad
        [True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv'],
    ]
    want_bad = [tuple(l[1:]) for l in data if not l[0]]
    dw = DatasetWriter(name="filter bad across types",
                       columns=columns,
                       allow_missing_slices=True)
    cols_to_check = ['int32_10', 'bytes', 'json', 'unicode:utf-8']
    if PY3:
        # z so it sorts last.
        dw.add('zpickle', 'pickle')
        cols_to_check.append('zpickle')
        for ix in range(len(data)):
            data[ix].append({ix})
    dw.set_slice(0)
    want = []

    def add_want(ix):
        v = data[ix]
        want.append((
            int(v[3]),
            v[1],
            json.loads(v[4]),
            v[6].decode('utf-8'),
        ))
        if PY3:
            want[-1] = want[-1] + (v[7], )

    for ix, v in enumerate(data):
        if v[0]:
            add_want(ix)
        dw.write(*v[1:])
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns},
                         filter_bad=True,
                         defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, cols_to_check))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds,
            ' with defaults' if defaults else '')
        bad_ds = Dataset(jid, 'bad')
        got_bad = list(bad_ds.iterate(0, sorted(columns)))
        assert got_bad == want_bad, "Expected %r, got %r from %s (from %r%s)" % (
            want_bad, got_bad, bad_ds, source_ds,
            ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        if not defaults:
            want_bad.pop(0)  # number:int
            want_bad.pop(1)  # float64
            want_bad.pop(1)  # json
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(3)
        add_want(5)
        data[6][4] = '"replacement"'
        add_want(6)
        want.sort()  # adding them out of order, int32_10 sorts correctly.
def synthesis(params):
    a0 = mkds('a0', ['0', '1'], [(1, 2), (3, 4), (5, 6)])
    a1 = mkds('a1', ['0', '1'], [(7, 8), (9, 10), (11, 12)], previous=a0)
    b0 = mkds('b0', ['1', '2'], [(13, 14), (15, 16), (17, 18)], parent=a0)
    b1 = mkds('b1', ['1', '2'], [(19, 20), (21, 22), (23, 24)],
              parent=a1,
              previous=b0)
    c0 = mkds('c0', ['3'], [(25, ), (26, ), (27, )], parent=a0)
    c1 = mkds('c1', ['3'], [(28, ), (29, ), (30, )], parent=a1, previous=c0)
    # a contains columns 0 and 1
    # b contains columns 0, 1 and 2 with 0 from a
    # c contains columns 0, 1 and 3 with 0 and 1 from a
    bc0 = merge('bc0', b0, c0)  # simple merge, one overlapping column
    # bc contains columns 0, 1, 2 and 3, with 0 and 1 from a (via c), 2 from b and 3 from c
    check(bc0, [(1, 2, 14, 25), (3, 4, 16, 26), (5, 6, 18, 27)])
    bc1 = merge('bc1', b1, c1, previous=bc0)  # chained
    check(bc1, [(1, 2, 14, 25), (3, 4, 16, 26), (5, 6, 18, 27), (7, 8, 20, 28),
                (9, 10, 22, 29), (11, 12, 24, 30)])
    cb0 = merge('cb0', c0, b0)  # other direction, getting the other "1" column.
    # cb contains columns 0, 1, 2 and 3, with 0 from a (via b), 1 and 2 from b and 3 from c
    check(cb0, [(1, 13, 14, 25), (3, 15, 16, 26), (5, 17, 18, 27)])
    d0 = mkds('d0', ['4'], [(37, ), (38, ), (39, )], parent=c0)
    bd0 = merge('bd0', b0, d0)  # immediate parents are not shared
    # bd contains columns 0, 1, 2, 3 and 4, with 0 and 1 from a (via d -> c -> a), 2 from b, 3 from c and 4 from d
    check(bd0, [(1, 2, 14, 25, 37), (3, 4, 16, 26, 38), (5, 6, 18, 27, 39)])
    # more than two datasets with complex parent relationship
    # merged in two stages here, but a single dataset_merge job later.
    cbd0 = merge('cbd0', c0, bd0)
    del merges['cbd0']
    cbdb0 = merge('cbdb0', cbd0, b0)
    merges['cbdb0'] = ((c0, bd0, b0), {})
    # cbdb contains columns 0, 1, 2, 3 and 4, with 0 from a (via d -> c -> a), 1 and 2 from b, 3 from c and 4 from d
    check(cbdb0, [(1, 13, 14, 25, 37), (3, 15, 16, 26, 38),
                  (5, 17, 18, 27, 39)])
    fail_merge(a0, a1)  # no parents
    fail_merge(b0, b1)  # parents not shared
    fail_merge(b0, b0)  # merge with self
    other = mkds('other', ['5'], [(31, ), (32, ), (33, )])
    fail_merge(a0, other)  # parents not shared
    aother = merge('aother', a0, other, allow_unrelated=True)
    # aother contains 0 and 1 from a, 5 from other
    check(aother, [(1, 2, 31), (3, 4, 32), (5, 6, 33)])

    # check hashed datasets too
    ab_a = mkds('ab_a', ['a', 'b'], [(1, 2), (3, 4), (5, 6)], hashlabel='a')
    ab_b = mkds('ab_b', ['a', 'b'], [(7, 8), (9, 10), (11, 12)], hashlabel='b')
    ac_a = mkds('ac_a', ['a', 'c'], [(1, 14), (3, 15), (5, 16)],
                hashlabel='a')  # a values must match ab_a
    fail_merge(ab_a, ab_b, allow_unrelated=True)  # different hashlabels
    abac_a = merge('abac_a', ab_a, ac_a, allow_unrelated=True)
    assert abac_a.hashlabel == 'a'
    check(abac_a, [(1, 2, 14), (3, 4, 15), (5, 6, 16)])

    # merge hashed with unhashed (which we align with the hashlabel manually)
    dw = DatasetWriter(name='d_none', columns={'d': 'number'})
    for sliceno in range(params.slices):
        dw.set_slice(sliceno)
        for v in ab_a.iterate(sliceno, 'a'):
            dw.write(v + 16)
    d_none = dw.finish()
    abd_a = merge('abd_a', ab_a, d_none, allow_unrelated=True)
    assert abd_a.hashlabel == 'a'
    check(abd_a, [(1, 2, 17), (3, 4, 19), (5, 6, 21)])
    # the other way round should affect nothing here
    dab_a = merge('dab_a', d_none, ab_a, allow_unrelated=True)
    assert dab_a.hashlabel == 'a'
    check(dab_a, [(1, 2, 17), (3, 4, 19), (5, 6, 21)])

    # the same test but with the lines in the wrong slices:
    dw = DatasetWriter(name='e_none', columns={'e': 'number'})
    e_done = False
    for sliceno in range(params.slices):
        dw.set_slice(sliceno)
        # there are 3 lines in total, some slice will not have all of them.
        if ab_a.lines[sliceno] != 3 and not e_done:
            dw.write(17)
            dw.write(19)
            dw.write(21)
            e_done = True
    assert e_done
    e_none = dw.finish()
    fail_merge(ab_a, e_none, allow_unrelated=True)

    # and finally test all we tested above using the dataset_merge method too
    for name, (parents, kw) in merges.items():
        a_ds = dict(source=parents)
        if 'previous' in kw:
            a_ds['previous'] = kw.pop('previous')
        jid = subjobs.build('dataset_merge', datasets=a_ds, options=kw)
        check(jid.dataset(), checks[name])
    for parents, kw in failed_merges:
        try:
            subjobs.build('dataset_merge',
                          datasets=dict(source=parents),
                          options=kw)
        except JobError:
            continue
        raise Exception("dataset_merge incorrectly allowed %r with %r" %
                        (parents, kw))