def synthesis(params):
    dw = DatasetWriter(name="parent")
    in_parent = [ # list because order matters
        "-",      # becomes _ because everything must be a valid python identifier.
        "a b",    # becomes a_b because everything must be a valid python identifier.
        "42",     # becomes _42 because everything must be a valid python identifier.
        "print",  # becomes print_ because print is a keyword (in py2).
        "print@", # becomes print__ because print_ is taken.
        "None",   # becomes None_ because None is a keyword (in py3).
    ]
    for colname in in_parent:
        dw.add(colname, "unicode")
    w = dw.get_split_write()
    w(_="- 1", a_b="a b 1", _42="42 1", print_="print 1", None_="None 1", print__="Will be overwritten 1")
    w(_="- 2", a_b="a b 2", _42="42 2", print_="print 2", None_="None 2", print__="Will be overwritten 2")
    parent = dw.finish()
    dw = DatasetWriter(name="child", parent=parent)
    in_child = [ # order still matters
        "print_*", # becomes print___ because print__ is taken.
        "print_",  # becomes print____ because all shorter are taken.
        "normal",  # no collision.
        "Normal",  # no collision.
        "print@",  # re-uses print__ from the parent dataset.
    ]
    for colname in in_child:
        dw.add(colname, "unicode")
    w = dw.get_split_write()
    w(print__="print@ 1", print___="print_* 1", print____="print_ 1", normal="normal 1", Normal="Normal 1")
    w(print__="print@ 2", print___="print_* 2", print____="print_ 2", normal="normal 2", Normal="Normal 2")
    child = dw.finish()
    for colname in in_parent + in_child:
        data = set(child.iterate(None, colname))
        assert data == {colname + " 1", colname + " 2"}, "Bad data for %s: %r" % (colname, data)
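# The renaming rule exercised above, as a minimal standalone sketch. This is
# reconstructed from the observed behavior, not the library's actual
# implementation; sanitize() and its mapping argument are assumptions.
import keyword
import re

PY2_ONLY_KEYWORDS = {'print', 'exec'}  # keywords in py2 but not in py3

def sanitize(name, mapping):
    # mapping: original name -> sanitized name, shared with the parent dataset,
    # so a repeated original name re-uses its earlier sanitized form.
    if name in mapping:
        return mapping[name]
    s = re.sub(r'[^0-9a-zA-Z_]', '_', name)  # only identifier characters survive
    if s[:1].isdigit():
        s = '_' + s  # identifiers must not start with a digit
    taken = set(mapping.values())
    while keyword.iskeyword(s) or s in PY2_ONLY_KEYWORDS or s in taken:
        s += '_'  # keywords and collisions grow a trailing underscore
    mapping[name] = s
    return s

# Replaying the parent's columns reproduces the names asserted in the test:
# m = {}
# [sanitize(n, m) for n in ["-", "a b", "42", "print", "print@", "None"]]
# == ["_", "a_b", "_42", "print_", "print__", "None_"]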
def synthesis(jobid):
    manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
    manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]] # datasets a, b and f
    # build a local abf chain
    prev = None
    for ix, ds in enumerate(manual_abf):
        name = "abf%d" % (ix,)
        ds.link_to_here(name, override_previous=prev)
        prev = (jobid, name,)
    manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
    local_abf_data = list(Dataset(jobid, "abf2").iterate_chain(None, None))
    assert manual_abf_data == local_abf_data
    # disconnect h, verify there is no chain
    manual_chain[-1].link_to_here("alone", override_previous=None)
    assert len(Dataset(jobid, "alone").chain()) == 1
    # check that the original chain is unhurt
    assert manual_chain == manual_chain[-1].chain()

    # So far so good, now make a chain long enough to have a cache.
    prev = None
    ix = 0
    going = True
    while going:
        if prev and "cache" in Dataset(prev)._data:
            going = False
        name = "longchain%d" % (ix,)
        dw = DatasetWriter(name=name, previous=prev)
        dw.add("ix", "number")
        dw.get_split_write()(ix)
        dw.finish()
        prev = (jobid, name,)
        ix += 1
    # we now have a chain that goes one past the first cache point
    full_chain = Dataset(prev).chain()
    assert "cache" in full_chain[-2]._data # just to check the above logic is correct
    assert "cache" not in full_chain[-1]._data # just to be sure..
    full_chain[-2].link_to_here("nocache", override_previous=None)
    full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
    assert "cache" not in Dataset(jobid, "nocache")._data
    assert "cache" in Dataset(jobid, "withcache")._data
    # And make sure they both get the right data too.
    assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
    assert list(Dataset(jobid, "nocache").iterate_chain(None, "ix")) == [ix - 2]
    assert list(Dataset(jobid, "withcache").iterate_chain(None, "ix")) == list(range(ix - 2)) + [ix - 1]
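# For reference, a rough model of the chain walk the asserts above rely on:
# each dataset records a reference to its previous dataset, and chain()
# follows it back to the root, returning oldest-first. A sketch under those
# assumptions (the real Dataset.chain() also honors the caches tested here):
def chain(ds):
    out = []
    while ds is not None:
        out.append(ds)
        ds = ds.previous  # assumed attribute name
    return out[::-1]  # oldest first, so chain[-1] is the dataset itself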
def _verify(name, types, data, coltype, want, default, want_fail, kw):
    if callable(want):
        check = want
    else:
        def check(got, fromstr, filtered=False):
            # want is either a list (same for all types) or a dict keyed on type.
            want1 = want if isinstance(want, list) else want[typ]
            if filtered:
                # filter_bad drops every other line (the ones with extra=b'skip').
                want1 = want1[::2]
            assert got == want1, 'Expected %r, got %r from %s.' % (want1, got, fromstr,)
    dw = DatasetWriter(name=name, columns={'data': coltype, 'extra': 'bytes'})
    dw.set_slice(0)
    for ix, v in enumerate(data):
        dw.write(v, b'1' if ix % 2 == 0 else b'skip')
    for sliceno in range(1, g.SLICES):
        # switch to each remaining slice so they all exist, but write nothing
        dw.set_slice(sliceno)
    bytes_ds = dw.finish()
    for typ in types:
        opts = dict(column2type=dict(data=typ))
        opts.update(kw)
        if default is not no_default:
            opts['defaults'] = {'data': default}
        try:
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
        except JobError:
            if want_fail:
                continue
            raise Exception('Typing %r as %s failed.' % (bytes_ds, typ,))
        assert not want_fail, "Typing %r as %s should have failed, but didn't (%s)." % (bytes_ds, typ, jid)
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, 'data'))
        check(got, '%s (typed as %s from %r)' % (typed_ds, typ, bytes_ds,))
        if 'filter_bad' not in opts and not callable(want):
            # Re-type with filter_bad, making the extra column unparsable on
            # every other line, and verify those lines get filtered out.
            opts['filter_bad'] = True
            opts['column2type']['extra'] = 'int32_10'
            jid = subjobs.build('dataset_type', datasets=dict(source=bytes_ds), options=opts)
            typed_ds = Dataset(jid)
            got = list(typed_ds.iterate(0, 'data'))
            check(got, '%s (typed as %s from %r with every other line skipped from filter_bad)' % (typed_ds, typ, bytes_ds,), True)
        used_type(typ)
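# A hypothetical call, to show how the parameters fit together; the values
# below are invented for illustration and are not from the real test suite:
# _verify(
#     'float_with_default',    # dataset name
#     ['float32', 'float64'],  # types to type the data column as
#     [b'1.5', b'nah', b'3'],  # raw lines written to the data column
#     'bytes',                 # untyped storage type of the data column
#     [1.5, -1.0, 3.0],        # expected result (b'nah' falls back to the default)
#     '-1',                    # default for unparsable values
#     False,                   # want_fail: this combination should succeed
#     {},                      # extra options passed through to dataset_type
# )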
def test_filter_bad_across_types():
    columns = {
        'bytes': 'bytes',
        'float64': 'bytes',
        'int32_10': 'ascii',
        'json': 'unicode',
        'number:int': 'unicode',
        'unicode:utf-8': 'bytes',
    }
    # all_good, *values
    # Make sure all those types (except bytes) can filter other lines,
    # and be filtered by other lines. And that several filtering values
    # is not a problem (line 11).
    data = [
        (True,  b'first',    b'1.1', '1',  '"a"',   '001', b'ett',),
        (True,  b'second',   b'2.2', '2',  '"b"',   '02',  b'tv\xc3\xa5',),
        (True,  b'third',    b'3.3', '3',  '["c"]', '3.0', b'tre',),
        (False, b'fourth',   b'4.4', '4',  '"d"',   '4.4', b'fyra',),    # number:int bad
        (False, b'fifth',    b'5.5', '-',  '"e"',   '5',   b'fem',),     # int32_10 bad
        (False, b'sixth',    b'6.b', '6',  '"f"',   '6',   b'sex',),     # float64 bad
        [False, b'seventh',  b'7.7', '7',  '{"g"}', '7',   b'sju',],     # json bad (a list so it can be patched below)
        (False, b'eighth',   b'8.8', '8',  '"h"',   '8',   b'\xa5\xc3tta',),  # unicode:utf-8 bad
        (True,  b'ninth',    b'9.9', '9',  '"i"',   '9',   b'nio',),
        (True,  b'tenth',    b'10',  '10', '"j"',   '10',  b'tio',),
        (False, b'eleventh', b'11a', '1-', '"k",',  '1,',  b'elva',),    # float64, int32_10 and number:int bad
        (True,  b'twelfth',  b'12',  '12', '"l"',   '12',  b'tolv',),
    ]
    dw = DatasetWriter(name="filter bad across types", columns=columns)
    dw.set_slice(0)
    want = []
    def add_want(v):
        want.append((int(v[3]), v[1], json.loads(v[4]), v[6].decode('utf-8'),))
    for v in data:
        if v[0]:
            add_want(v)
        dw.write(*v[1:])
    for sliceno in range(1, g.SLICES):
        # leave the remaining slices empty
        dw.set_slice(sliceno)
    source_ds = dw.finish()
    # Once with just filter_bad, once with some defaults too.
    defaults = {}
    for _ in range(2):
        jid = subjobs.build(
            'dataset_type',
            datasets=dict(source=source_ds),
            options=dict(column2type={t: t for t in columns}, filter_bad=True, defaults=defaults),
        )
        typed_ds = Dataset(jid)
        got = list(typed_ds.iterate(0, ['int32_10', 'bytes', 'json', 'unicode:utf-8']))
        assert got == want, "Expected %r, got %r from %s (from %r%s)" % (want, got, typed_ds, source_ds, ' with defaults' if defaults else '')
        # make more lines "ok" for the second lap
        defaults = {'number:int': '0', 'float64': '0', 'json': '"replacement"'}
        add_want(data[3])
        add_want(data[5])
        data[6][4] = '"replacement"'
        add_want(data[6])
        want.sort()  # adding them out of order, int32_10 sorts correctly.
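# A rough model of the filter_bad semantics this test depends on: a line is
# dropped when any column fails to type and has no default, so one bad value
# anywhere removes the whole line from every column. A sketch under those
# assumptions (not the real implementation; converters is hypothetical):
def keep_line(line, converters, defaults):
    for col, value in line.items():
        try:
            converters[col](value)  # attempt the conversion
        except (ValueError, TypeError):
            if col not in defaults:
                return False  # no fallback: the whole line is dropped
    return True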
def write(data, **kw):
    dw = DatasetWriter(columns=columns, **kw)
    w = dw.get_split_write_dict()
    for values in data:
        w(values)
    return dw.finish()
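# A hypothetical use of the helper above; columns is assumed to be a
# module-level dict (as in the surrounding tests) and the values are invented:
# ds = write([dict(a=1, b='x'), dict(a=2, b='y')], name='example')
# ds is the finished Dataset, with the rows distributed over the slices by
# the split writer.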
def do_one(params, name, data):
    dw = DatasetWriter(name=name, columns=columns)
    dw.set_slice(0)
    for v in data:
        if v is None:
            d = dict(
                ascii_new=None,
                ascii_old=None,
                bytes_new=None,
                bytes_old=None,
                unicode_new=None,
                unicode_old=None,
            )
        else:
            d = dict(
                ascii_new=v,
                ascii_old=v,
                bytes_new=uni(v).encode("ascii"),
                bytes_old=uni(v).encode("ascii"),
                unicode_new=uni(v),
                unicode_old=uni(v),
            )
        dw.write_dict(d)
    # We don't really want the other slices, but write one thing to
    # each, to make sure it doesn't show up in slice 0.
    # (Small slice merging will put it in the same file, so this is
    # a real risk.)
    for sliceno in range(1, params.slices):
        dw.set_slice(sliceno)
        dw.write_dict(d)
    dw.finish()

    # verify we got what we asked for
    me_ds = Dataset(params.jobid, name)
    for colname, coltype in columns.items():
        col = me_ds.columns[colname]
        assert col.type == coltype.split("_")[-1], colname
        assert col.backing_type == coltype, colname
        for want, got in zip(data, me_ds.iterate(0, colname)):
            if want is not None:
                if PY2 and "unicode" in coltype:
                    want = uni(want)
                if PY3 and "bytes" in coltype:
                    want = want.encode("ascii")
            assert want == got, "%s in %s did not contain the expected value. Wanted %r but got %r." % (colname, me_ds, want, got)

    # check that both types of bytes filter correctly through typing
    jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
        column2type=dict(
            ascii_new="bytes",
            ascii_old="number", # fails on the string, so that gets filtered out everywhere
            bytes_new="bytes",
            bytes_old="bytes",
        ),
        filter_bad=True,
    ))
    ds = Dataset(jid)
    # verify the number first
    data_it = iter(raw_data)
    next(data_it) # skip the filtered out string
    for got in ds.iterate(0, "ascii_old"):
        want = next(data_it)
        if want is None:
            # Becomes 0 because the typer (unfortunately) sees it as an empty string
            want = 0
        assert want == got, "ascii_old in %s did not type correctly as number. Wanted %r but got %r." % (ds, want, got)
    # now verify all the bytes ones are ok, no longer containing the string.
    for colname in ("ascii_new", "bytes_new", "bytes_old",):
        data_it = iter(data)
        next(data_it) # skip the filtered out string
        for got in ds.iterate(0, colname):
            want = next(data_it)
            if want is not None:
                want = want.encode("ascii")
            assert want == got, "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)
    # and now check that the Nones are ok after making bytes from ascii and unicode from bytes.
    jid = subjobs.build("dataset_type", datasets=dict(source=me_ds), options=dict(
        column2type=dict(
            ascii_new="bytes",
            ascii_old="bytes",
            bytes_new="unicode:ascii",
            bytes_old="unicode:ascii",
        ),
    ))
    ds = Dataset(jid)
    for colname in ("ascii_new", "ascii_old", "bytes_new", "bytes_old",):
        for want, got in ds.iterate(0, ["unicode_new", colname]):
            assert uni(want) == uni(got), "%s in %s did not roundtrip correctly as bytes. Wanted %r but got %r." % (colname, ds, want, got)