def test_filter_bad_with_rename_and_chain():
    """filter_bad + rename: bad lines go to the 'bad' dataset with their original (renamed) columns."""
    # Source dataset: three columns of three string-ish types, one slice used.
    dw = DatasetWriter(name="filter bad with rename", allow_missing_slices=True)
    dw.add('a', 'ascii')
    dw.add('b', 'bytes')
    dw.add('c', 'unicode')
    dw.set_slice(0)
    dw.write('0', b'1', '2')
    dw.write('9', b'A', 'B')
    dw.write('C', b'D', 'E')
    source_ds = dw.finish()
    # Rename a->b, b->c, c->d and type the renamed columns. The third line
    # does not parse as any of the requested types, so it should be filtered.
    jid = subjobs.build(
        'dataset_type',
        column2type=dict(b='int32_10', c='int64_16', d='int32_16'),
        filter_bad=True,
        rename=dict(a='b', b='c', c='d'),
        source=source_ds,
    )
    typed_ds = jid.dataset()
    coltypes = sorted((name, col.type) for name, col in typed_ds.columns.items())
    assert coltypes == [('b', 'int32'), ('c', 'int64'), ('d', 'int32')], coltypes
    assert list(typed_ds.iterate(0)) == [(0, 1, 2), (9, 10, 11)]
    # The bad dataset keeps the untyped (source) column types.
    bad_ds = jid.dataset('bad')
    coltypes = sorted((name, col.type) for name, col in bad_ds.columns.items())
    assert coltypes == [('b', 'ascii'), ('c', 'bytes'), ('d', 'unicode')], coltypes
    assert list(bad_ds.iterate(0)) == [('C', b'D', 'E')]
    # Chain a second dataset on and type again, now with a circular rename
    # (a->b, b->c, c->a) and a default for column a.
    dw = DatasetWriter(name="filter bad with rename chain", allow_missing_slices=True, previous=source_ds)
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    dw.add('c', 'ascii')
    dw.set_slice(0)
    dw.write('3', '4', '5')
    dw.write('6', '7', 'eight')
    source_ds = dw.finish()
    jid = subjobs.build(
        'dataset_type',
        column2type=dict(a='number', b='int32_10', c='int64_10'),
        defaults=dict(a='8'),
        filter_bad=True,
        rename=dict(a='b', b='c', c='a'),
        source=source_ds,
    )
    typed_ds = jid.dataset()
    coltypes = sorted((name, col.type) for name, col in typed_ds.columns.items())
    assert coltypes == [('a', 'number'), ('b', 'int32'), ('c', 'int64')], coltypes
    # 'eight' is replaced by the default 8; fully unparsable lines are filtered.
    assert list(typed_ds.iterate(0)) == [(2, 0, 1), (5, 3, 4), (8, 6, 7)]
    bad_ds = jid.dataset('bad')
    coltypes = sorted((name, col.type) for name, col in bad_ds.columns.items())
    assert coltypes == [('a', 'unicode'), ('b', 'ascii'), ('c', 'bytes')], coltypes
    assert list(bad_ds.iterate(0)) == [('B', '9', b'A'), ('E', 'C', b'D')]
def synthesis(job):
    # Verify that Dataset.min/max and the per-column min/max metadata are
    # correct after typing, for empty datasets, for every combination of
    # value groups, and after renaming columns.
    dw = DatasetWriter(name='empty', columns={'v': 'ascii'})
    dw.get_split_write()
    empty_ds = dw.finish()
    # min()/max() of a column that does not exist must be None, not an error.
    assert empty_ds.min('non-existant column') is empty_ds.max('non-existant column') is None, 'Dataset.min/max() broken for non-existant columns'
    # tests maps a type name to the groupdata groups it draws values from
    # (both defined elsewhere in this file).
    for typ, groups in tests.items():
        # Typing an empty dataset must give (None, None) as min/max.
        t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=empty_ds).dataset()
        minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
        if minmax != (None, None):
            raise Exception('Typing empty dataset as %s did not give minmax == None, gave %r' % (typ, minmax,))
        all_names = list(chain.from_iterable(groupdata[group].keys() for group in groups))
        # just 1 and 2, so we don't make way too many
        for num_groups in (1, 2,):
            for names in combinations(all_names, num_groups):
                # make_source memoizes and returns (dataset, expected min, expected max).
                ds, mn, mx = make_source(names)
                t_ds = subjobs.build('dataset_type', column2type={'v': typ}, source=ds).dataset()
                got_minmax = (t_ds.columns['v'].min, t_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(got_minmax, want_minmax, 'Typing %s as %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, typ, want_minmax, got_minmax, t_ds,))
                # The method accessors must agree with the column metadata.
                chk_minmax(got_minmax, (t_ds.min('v'), t_ds.max('v')), 'Dataset.min/max() broken on ' + t_ds)
                # verify writing the same data normally also gives the correct result
                dw = DatasetWriter(name='rewrite ' + t_ds, columns=t_ds.columns)
                write = dw.get_split_write()
                for v in t_ds.iterate(None, 'v'):
                    write(v)
                re_ds = dw.finish()
                got_minmax = (re_ds.columns['v'].min, re_ds.columns['v'].max)
                want_minmax = (mn, mx)
                chk_minmax(got_minmax, want_minmax, 'Rewriting %s gave the wrong minmax: expected %r, got %r (in %s)' % (t_ds, want_minmax, got_minmax, re_ds,))
    # make sure renaming doesn't mix anything up
    dw = DatasetWriter(name='rename', columns={'a': 'ascii', 'b': 'ascii'})
    write = dw.get_split_write()
    write('5', '3')
    # 'oops' does not parse as int32_10, so the default ('2') takes its place,
    # which is why the expected minmax for 'int' below is (2, 3).
    write('7', 'oops')
    ds = dw.finish()
    t_ds = subjobs.build(
        'dataset_type',
        column2type=dict(num='number', int='int32_10'),
        defaults=dict(num='1', int='2'),
        rename=dict(a='num', b='int'),
        source=ds,
    ).dataset()
    for name, want_minmax in (
        ('num', (5, 7)),
        ('int', (2, 3)),
    ):
        got_minmax = (t_ds.columns[name].min, t_ds.columns[name].max)
        msg = 'Typing %s gave wrong minmax: expected %r, got %r (in %s)' % (ds, want_minmax, got_minmax, t_ds,)
        chk_minmax(got_minmax, want_minmax, msg)
def synthesis():
    """Exercise dataset_sort over a three-dataset chain, per part and merged."""
    # Build the chain a <- b <- c with a few int32 values each.
    dw_a = DatasetWriter(name='a', columns={'num': 'int32'})
    dw_b = DatasetWriter(name='b', columns={'num': 'int32'}, previous=dw_a)
    dw_c = DatasetWriter(name='c', columns={'num': 'int32'}, previous=dw_b)
    for dw, values in ((dw_a, (3, 2)), (dw_b, (2, 1)), (dw_c, (0,))):
        write = dw.get_split_write()
        for value in values:
            write(value)
    a = dw_a.finish()
    b = dw_b.finish()
    c = dw_c.finish()
    opts = dict(
        sort_columns='num',
        sort_across_slices=True,
    )
    # sort as a chain
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=a, previous=None))
    assert list(Dataset(jid).iterate(None, 'num')) == [2, 3]
    sorted_a = jid
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=b, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2]
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=jid))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [2, 3, 1, 2, 0]
    # sort all as a single dataset
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=None))
    assert list(Dataset(jid).iterate_chain(None, 'num')) == [0, 1, 2, 2, 3]
    # merge b and c but not a
    jid = subjobs.build('dataset_sort', options=opts, datasets=dict(source=c, previous=sorted_a))
    # test with new style job.dataset
    assert list(jid.dataset().iterate(None, 'num')) == [0, 1, 2]
    assert list(jid.dataset().iterate_chain(None, 'num')) == [2, 3, 0, 1, 2]
def mkds(name, columns, data, **kw):
    """Build and finish a dataset where every listed column is int32.

    data is an iterable of row tuples; extra keyword args (previous, parent,
    hashlabel, ...) are passed straight to DatasetWriter.
    """
    dw = DatasetWriter(name=name, columns=dict.fromkeys(columns, 'int32'), **kw)
    write = dw.get_split_write()
    for row in data:
        write(*row)
    return dw.finish()
def test_filter_bad_across_types():
    """filter_bad must work when several different types reject different lines.

    Each typed column (except bytes, which accepts anything) gets at least one
    line that only it rejects, plus one line several types reject at once.
    """
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        (True, b'first', b'1.1', '1', '"a"', '001', b'ett',),
        (True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5',),
        (True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre',),
        (False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra',),  # number:int bad
        (False, b'fifth', b'5.5', '-', '"e"', '5', b'fem',),  # int32_10 bad
        (False, b'sixth', b'6.b', '6', '"f"', '6', b'sex',),  # float64 bad
        # a list, not a tuple, because its json value is patched below
        [False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju',],  # json bad
        (False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta',),  # unicode:utf-8 bad
        (True, b'ninth', b'9.9', '9', '"i"', '9', b'nio',),
        (True, b'tenth', b'10', '10', '"j"', '10', b'tio',),
        (False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva',),  # float64, int32_10 and number:int bad
        (True, b'twelfth', b'12', '12', '"l"', '12', b'tolv',),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []
    def add_want(v):
        # Expected (typed) representation of a good line, in iterate order.
        want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    # All data in slice 0; just mark the remaining slices as (empty) written.
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        # (typo fixed: was "Exptected")
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap; the mutations on the final
        # lap are harmless since want is not checked again after the loop.
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # adding them out of order, int32_10 sorts correctly.
def make_source(names):
    """Return (dataset, expected_min, expected_max) for the named value groups.

    Results are memoized in the module-level sources dict, keyed by the
    sorted, '+'-joined group names. Expected min/max come from
    data[name][1] / data[name][2], with NaN values dropped via unnan().
    """
    names = sorted(names)
    dsname = '+'.join(names)
    if dsname not in sources:
        dw = DatasetWriter(name=dsname, columns={'v': 'ascii'})
        write = dw.get_split_write()
        for name in names:
            for v in data[name][0]:
                write(v)
        sources[dsname] = (
            dw.finish(),
            min(unnan(data[name][1] for name in names)),
            max(unnan(data[name][2] for name in names)),
        )
    return sources[dsname]
def test_column_discarding():
    """dataset_type can drop source columns, via discard_untyped or rename."""
    dw = DatasetWriter(name='column discarding')
    for colname in ('a', 'b', 'c'):
        dw.add(colname, 'bytes')
    write = dw.get_split_write()
    write(b'a', b'b', b'c')
    source = dw.finish()

    def typed(**kw):
        # Run one dataset_type subjob over source, return its default dataset.
        return subjobs.build('dataset_type', source=source, **kw).dataset()

    def check_columns(ds, want):
        got = sorted(ds.columns)
        assert got == want, '%s: %r' % (ds, got,)

    # Discard b because it's not typed
    ac_implicit = typed(column2type=dict(a='ascii', c='ascii'), discard_untyped=True)
    check_columns(ac_implicit, ['a', 'c'])
    assert list(ac_implicit.iterate(None)) == [('a', 'c',)], ac_implicit
    # Discard b explicitly
    ac_explicit = typed(column2type=dict(a='ascii', c='ascii'), rename=dict(b=None))
    check_columns(ac_explicit, ['a', 'c'])
    assert list(ac_explicit.iterate(None)) == [('a', 'c',)], ac_explicit
    # Discard c by overwriting it with b. Keep untyped b.
    ac_bASc = typed(column2type=dict(a='ascii', c='ascii'), rename=dict(b='c'))
    check_columns(ac_bASc, ['a', 'b', 'c'])
    assert list(ac_bASc.iterate(None)) == [('a', b'b', 'b',)], ac_bASc
    # Discard c by overwriting it with b. Also type b as a different type.
    abc_bASc = typed(column2type=dict(a='ascii', b='strbool', c='ascii'), rename=dict(b='c'))
    check_columns(abc_bASc, ['a', 'b', 'c'])
    assert list(abc_bASc.iterate(None)) == [('a', True, 'b',)], abc_bASc
def test_rehash_with_empty_slices():
    """Typing with a hashlabel rehashes rows correctly, even into empty slices."""
    dw = DatasetWriter(name='rehash with empty slices', hashlabel='a')
    dw.add('a', 'ascii')
    dw.add('b', 'ascii')
    write = dw.get_split_write()
    write('a', '42')
    write('42', 'b')
    source = dw.finish()
    hashfunc = typed_writer('int32').hash

    def verify_hashing(caption, want_values, **kw):
        # Type column a as int32 and check each row landed in the slice its
        # hashed value demands, with exactly the expected set of rows.
        ds = subjobs.build('dataset_type', source=source, column2type=dict(a='int32_10'), caption=caption, **kw).dataset()
        got_values = set()
        for sliceno in range(g.slices):
            for got in ds.iterate(sliceno):
                assert hashfunc(got[0]) % g.slices == sliceno
                assert got not in got_values
                got_values.add(got)
        assert want_values == got_values

    verify_hashing('with discard', {(42, 'b',)}, filter_bad=True)
    # using defaults uses some different code paths
    verify_hashing('with default=0 (probably two slices)', {(0, '42',), (42, 'b',)}, defaults=dict(a='0'))
    verify_hashing('with default=42 (one slice)', {(42, '42',), (42, 'b',)}, defaults=dict(a='42'))
def synthesis(job):
    # Verify Dataset.link_to_here with override_previous, including how the
    # chain cache in _data is handled when re-linking.
    # The selfchain job holds datasets a..h chained together; pick a, b and f.
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix, )
        prev = ds.link_to_here(name, override_previous=prev)
    # Iterating the locally linked chain must give exactly the same data.
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(job, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(job, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        # Stop one dataset after the first whose _data contains a chain cache.
        if prev and "cache" in prev._data:
            going = False
        name = "longchain%d" % (ix, )
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        prev = dw.finish()
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[-2]._data # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data # just to be sure..
    # Re-linking must drop the cache when previous is removed, and keep/create
    # one when a suitable previous is supplied.
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(job, "nocache")._data
    assert "cache" in Dataset(job, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(job, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(job, "withcache").iterate_chain(None, "ix")) == list(range(ix - 2)) + [ix - 1]
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    """Write data as coltype, type it as each of types, and check the result.

    want is a list (same expectation for every type), a dict keyed by type
    name, or a callable(got, fromstr, filtered=False) doing its own checking.
    default is a default value for the 'data' column, or the no_default
    sentinel. want_fail says the typing subjob is expected to raise JobError.
    kw holds extra dataset_type options.
    """
    if callable(want):
        check = want
    else:
        def check(got, fromstr, filtered=False):
            # typ is picked up from the enclosing loop at call time.
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                # filter_bad drops every other line (the b'skip' ones below).
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
    # All data goes into slice 0; the remaining slices are left empty.
    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        # 'extra' parses as int32_10 only on even lines, used below to make
        # filter_bad drop the odd ones.
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    for sliceno in range(1, g.slices):
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
        except JobError:
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
        # Unless the caller asked for filter_bad themselves, retype with
        # filter_bad plus the 'extra' column so every other line is skipped.
        if 'filter_bad' not in opts and not callable(want):
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
        used_type(typ)
def write(data, **kw):
    """Create a dataset (columns from enclosing scope) from dict rows in data."""
    dw = DatasetWriter(columns=columns, **kw)
    write_dict = dw.get_split_write_dict()
    for row in data:
        write_dict(row)
    return dw.finish()
def test_filter_bad_across_types():
    # filter_bad must work when several different types reject different
    # lines, and the rejected lines must land intact in the 'bad' dataset.
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    # Lists (not tuples) so a pickle value can be appended and line 7 patched.
    data = [
        [True, b'first', b'1.1', '1', '"a"', '001', b'ett', ],
        [True, b'second', b'2.2', '2', '"b"', '02', b'tv\xc3\xa5', ],
        [True, b'third', b'3.3', '3', '["c"]', '3.0', b'tre', ],
        [False, b'fourth', b'4.4', '4', '"d"', '4.4', b'fyra', ],  # number:int bad
        [False, b'fifth', b'5.5', '-', '"e"', '5', b'fem', ],  # int32_10 bad
        [False, b'sixth', b'6.b', '6', '"f"', '6', b'sex', ],  # float64 bad
        [False, b'seventh', b'7.7', '7', '{"g"}', '7', b'sju', ],  # json bad
        [False, b'eigth', b'8.8', '8', '"h"', '8', b'\xa5\xc3tta', ],  # unicode:utf-8 bad
        [True, b'ninth', b'9.9', '9', '"i"', '9', b'nio', ],
        [True, b'tenth', b'10', '10', '"j"', '10', b'tio', ],
        [False, b'eleventh', b'11a', '1-', '"k",', '1,', b'elva', ],  # float64, int32_10 and number:int bad
        [True, b'twelfth', b'12', '12', '"l"', '12', b'tolv', ],
    ]
    # Every line marked not-all-good is expected in the 'bad' dataset.
    want_bad = [tuple(l[1:]) for l in data if not l[0]]
    dw = DatasetWriter(name="filter bad across types", columns=columns, allow_missing_slices=True)
    cols_to_check = ['int32_10', 'bytes', 'json', 'unicode:utf-8']
    if PY3:
        # z so it sorts last.
        dw.add('zpickle', 'pickle')
        cols_to_check.append('zpickle')
        for ix in range(len(data)):
            data[ix].append({ix})
    dw.set_slice(0)
    want = []
    def add_want(ix):
        # Expected typed representation of line ix, in cols_to_check order.
        v = data[ix]
        want.append((
            int(v[3]),
            v[1],
            json.loads(v[4]),
            v[6].decode('utf-8'),
        ))
        if PY3:
            want[-1] = want[-1] + (v[7], )
    for ix, v in enumerate(data):
        if v[0]:
            add_want(ix)
        dw.write(*v[1:])
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, cols_to_check))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (
            want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # The bad dataset keeps all source columns, iterated in sorted order.
        bad_ds = Dataset(jid, 'bad')
        got_bad = list(bad_ds.iterate(0, sorted(columns)))
        assert got_bad == want_bad, "Expected %r, got %r from %s (from %r%s)" % (
            want_bad, got_bad, bad_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        if not defaults:
            # Lines 3, 5 and 6 get defaults below, so they leave want_bad...
            want_bad.pop(0) # number:int
            want_bad.pop(1) # float64
            want_bad.pop(1) # json
            defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
            # ...and join the expected good lines instead.
            add_want(3)
            add_want(5)
            data[6][4] = '"replacement"'
            add_want(6)
            want.sort() # adding them out of order, int32_10 sorts correctly.
def synthesis(params):
    """Exercise dataset merging: parent sharing, chaining, hashlabels and failures.

    Fixes applied: the comment about line counts was split mid-sentence
    (leaving stray non-comment text), and dab_a now actually merges in the
    other order (d_none first) as its comment claims — it previously repeated
    abd_a's argument order exactly.
    """
    a0 = mkds('a0', ['0', '1'], [(1, 2), (3, 4), (5, 6)])
    a1 = mkds('a1', ['0', '1'], [(7, 8), (9, 10), (11, 12)], previous=a0)
    b0 = mkds('b0', ['1', '2'], [(13, 14), (15, 16), (17, 18)], parent=a0)
    b1 = mkds('b1', ['1', '2'], [(19, 20), (21, 22), (23, 24)], parent=a1, previous=b0)
    c0 = mkds('c0', ['3'], [(25, ), (26, ), (27, )], parent=a0)
    c1 = mkds('c1', ['3'], [(28, ), (29, ), (30, )], parent=a1, previous=c0)
    # a contains columns 0 and 1
    # b contains columns 0, 1 and 2 with 0 from a
    # c contains columns 0, 1 and 3 with 0 and 1 from a
    bc0 = merge('bc0', b0, c0)  # simple merge, one overlapping column
    # bc contains columns 0, 1, 2 and 3, with 0 and 1 from a (via c), 2 from b and 3 from c
    check(bc0, [(1, 2, 14, 25), (3, 4, 16, 26), (5, 6, 18, 27)])
    bc1 = merge('bc1', b1, c1, previous=bc0)  # chained
    check(bc1, [(1, 2, 14, 25), (3, 4, 16, 26), (5, 6, 18, 27), (7, 8, 20, 28), (9, 10, 22, 29), (11, 12, 24, 30)])
    cb0 = merge('cb0', c0, b0)  # other direction, getting the other "1" column.
    # cb contains columns 0, 1, 2 and 3, with 0 from a (via b), 1 and 2 from b and 3 from c
    check(cb0, [(1, 13, 14, 25), (3, 15, 16, 26), (5, 17, 18, 27)])
    d0 = mkds('d0', ['4'], [(37, ), (38, ), (39, )], parent=c0)
    bd0 = merge('bd0', b0, d0)  # immediate parents are not shared
    # bd contains columns 0, 1, 2, 3 and 4, with 0 and 1 from a (via d -> c -> a), 2 from b, 3 from c and 4 from d
    check(bd0, [(1, 2, 14, 25, 37), (3, 4, 16, 26, 38), (5, 6, 18, 27, 39)])
    # more than two datasets with complex parent relationship
    # merged in two stages here, but a single dataset_merge job later.
    cbd0 = merge('cbd0', c0, bd0)
    del merges['cbd0']
    cbdb0 = merge('cbdb0', cbd0, b0)
    merges['cbdb0'] = ((c0, bd0, b0), {})
    # cbdb contains columns 0, 1, 2, 3 and 4, with 0 from a (via d -> c -> a), 1 and 2 from b, 3 from c and 4 from d
    check(cbdb0, [(1, 13, 14, 25, 37), (3, 15, 16, 26, 38), (5, 17, 18, 27, 39)])
    fail_merge(a0, a1)  # no parents
    fail_merge(b0, b1)  # parents not shared
    fail_merge(b0, b0)  # merge with self
    other = mkds('other', ['5'], [(31, ), (32, ), (33, )])
    fail_merge(a0, other)  # parents not shared
    aother = merge('aother', a0, other, allow_unrelated=True)
    # aother contains 0 and 1 from a, 5 from other
    check(aother, [(1, 2, 31), (3, 4, 32), (5, 6, 33)])
    # check hashed datasets too
    ab_a = mkds('ab_a', ['a', 'b'], [(1, 2), (3, 4), (5, 6)], hashlabel='a')
    ab_b = mkds('ab_b', ['a', 'b'], [(7, 8), (9, 10), (11, 12)], hashlabel='b')
    ac_a = mkds('ac_a', ['a', 'c'], [(1, 14), (3, 15), (5, 16)], hashlabel='a')  # a values must match ab_a
    fail_merge(ab_a, ab_b, allow_unrelated=True)  # different hashlabels
    abac_a = merge('abac_a', ab_a, ac_a, allow_unrelated=True)
    assert abac_a.hashlabel == 'a'
    check(abac_a, [(1, 2, 14), (3, 4, 15), (5, 6, 16)])
    # merge hashed with unhashed (which we align with the hashlabel manually)
    dw = DatasetWriter(name='d_none', columns={'d': 'number'})
    for sliceno in range(params.slices):
        dw.set_slice(sliceno)
        for v in ab_a.iterate(sliceno, 'a'):
            dw.write(v + 16)
    d_none = dw.finish()
    abd_a = merge('abd_a', ab_a, d_none, allow_unrelated=True)
    assert abd_a.hashlabel == 'a'
    check(abd_a, [(1, 2, 17), (3, 4, 19), (5, 6, 21)])
    # other way round should affect nothing here
    dab_a = merge('dab_a', d_none, ab_a, allow_unrelated=True)
    assert dab_a.hashlabel == 'a'
    check(dab_a, [(1, 2, 17), (3, 4, 19), (5, 6, 21)])
    # the same test but with the lines in the wrong slices:
    dw = DatasetWriter(name='e_none', columns={'e': 'number'})
    e_done = False
    for sliceno in range(params.slices):
        dw.set_slice(sliceno)
        # there are 3 lines in total, some slice will not have all of them.
        if ab_a.lines[sliceno] != 3 and not e_done:
            dw.write(17)
            dw.write(19)
            dw.write(21)
            e_done = True
    assert e_done
    e_none = dw.finish()
    # Misaligned lines with a hashlabel must be rejected.
    fail_merge(ab_a, e_none, allow_unrelated=True)
    # and finally test all we tested above using the dataset_merge method too
    for name, (parents, kw) in merges.items():
        a_ds = dict(source=parents)
        if 'previous' in kw:
            a_ds['previous'] = kw.pop('previous')
        jid = subjobs.build('dataset_merge', datasets=a_ds, options=kw)
        check(jid.dataset(), checks[name])
    for parents, kw in failed_merges:
        try:
            subjobs.build('dataset_merge', datasets=dict(source=parents), options=kw)
        except JobError:
            continue
        raise Exception("dataset_merge incorrectly allowed %r with %r" % (parents, kw))