def prepare(params):
	"""Set up one DatasetWriter per slice, chained together.

	Reads datasets.source and builds a writer for each slice; each writer's
	previous points at the one before it so the result forms a chain ending
	in the last writer. Returns (dws, names, prev_source, caption, filename).
	"""
	d = datasets.source
	# caption template can reference the source caption and the hashlabel.
	caption = options.caption % dict(caption=d.caption, hashlabel=options.hashlabel)
	prev_p = job_params(datasets.previous, default_empty=True)
	prev_source = prev_p.datasets.source
	# Only propagate the source filename if exactly one dataset is consumed.
	if len(d.chain(stop_jobid=prev_source, length=options.length)) == 1:
		filename = d.filename
	else:
		filename = None
	dws = []
	previous = datasets.previous
	for sliceno in range(params.slices):
		# In as_chain mode the last slice gets the "default" name so the
		# job's default dataset is the end of the chain.
		if options.as_chain and sliceno == params.slices - 1:
			name = "default"
		else:
			name = str(sliceno)
		dw = DatasetWriter(
			caption="%s (slice %d)" % (caption, sliceno),
			hashlabel=options.hashlabel,
			filename=filename,
			previous=previous,
			name=name,
			for_single_slice=sliceno,
		)
		# Chain the next writer onto the one we just created.
		previous = (params.jobid, name)
		dws.append(dw)
	names = []
	for n, c in d.columns.items():
		# names has to be in the same order as the add calls
		# so the iterator returns the same order the writer expects.
		names.append(n)
		for dw in dws:
			dw.add(n, c.type)
	return dws, names, prev_source, caption, filename
def synthesis(params):
	"""Test column-name mangling into valid python identifiers.

	Writes a parent dataset whose column names need cleaning (invalid
	identifiers, keywords, collisions), then a child dataset that both adds
	new colliding names and re-uses a mangled name from the parent, and
	finally verifies that iterating by original column name returns the
	right data.
	"""
	dw = DatasetWriter(name="parent")
	in_parent = [
		# list because order matters
		"-",       # becomes _ because everything must be a valid python identifier.
		"a b",     # becomes a_b because everything must be a valid python identifier.
		"42",      # becomes _42 because everything must be a valid python identifier.
		"print",   # becomes print_ because print is a keyword (in py2).
		"print@",  # becomes print__ because print_ is taken.
		"None",    # becomes None_ because None is a keyword (in py3).
	]
	for colname in in_parent:
		dw.add(colname, "unicode")
	w = dw.get_split_write()
	# print__ is written here but the child dataset overrides that column.
	w(_="- 1", a_b="a b 1", _42="42 1", print_="print 1", None_="None 1", print__="Will be overwritten 1")
	w(_="- 2", a_b="a b 2", _42="42 2", print_="print 2", None_="None 2", print__="Will be overwritten 2")
	parent = dw.finish()
	dw = DatasetWriter(name="child", parent=parent)
	in_child = [
		# order still matters
		"print_*",  # becomes print___ because print__ is taken.
		"print_",   # becomes print____ because all shorter are taken.
		"normal",   # no collision.
		"Normal",   # no collision.
		"print@",   # re-uses print__ from the parent dataset.
	]
	for colname in in_child:
		dw.add(colname, "unicode")
	w = dw.get_split_write()
	w(print__="print@ 1", print___="print_* 1", print____="print_ 1", normal="normal 1", Normal="Normal 1")
	w(print__="print@ 2", print___="print_* 2", print____="print_ 2", normal="normal 2", Normal="Normal 2")
	child = dw.finish()
	# Every column (parent and child) must be reachable by its original
	# (unmangled) name and contain exactly the two rows written for it.
	for colname in in_parent + in_child:
		data = set(child.iterate(None, colname))
		assert data == {colname + " 1", colname + " 2"}, "Bad data for %s: %r" % (colname, data)
def synthesis(prepare_res, params): dw_passed, _ = prepare_res # Using set_slice on a dataset that was written in analysis is not # actually supported, but since it currently works (as long as that # particular slice wasn't written in analysis) let's test it. dw_passed.set_slice(0) dw_passed.write(**{k: v[0] for k, v in test_data.data.items()}) dw_synthesis_split = DatasetWriter(name="synthesis_split", hashlabel="a") dw_synthesis_split.add("a", "int32") dw_synthesis_split.add("b", "unicode") dw_synthesis_split.get_split_write()(1, "a") dw_synthesis_split.get_split_write_list()([2, "b"]) dw_synthesis_split.get_split_write_dict()({"a": 3, "b": "c"}) dw_synthesis_manual = DatasetWriter(name="synthesis_manual", columns={"sliceno": "int32"}) dw_nonetest = DatasetWriter(name="nonetest", columns={t: t for t in test_data.data}) for sliceno in range(params.slices): dw_synthesis_manual.set_slice(sliceno) dw_synthesis_manual.write(sliceno) dw_nonetest.set_slice(sliceno) dw_nonetest.write( **{ k: v[0] if k in test_data.not_none_capable else None for k, v in test_data.data.items() })
def prepare(params):
	"""Create the writers used by the rest of this test method.

	Builds the default (unnamed) dataset incrementally, a "named" dataset
	with columns given up front, and a "passed" dataset with one column per
	test data type. Returns the passed writer plus a marker value so the
	prepare_res unpacking is exercised too.
	"""
	# Need at least one slice per test value.
	assert params.slices >= test_data.value_cnt
	default_writer = DatasetWriter()
	default_writer.add("a", "number")
	default_writer.add("b", "ascii")
	# Created for its side effect only; nothing here writes to it.
	DatasetWriter(name="named", columns={"c": "bool", "d": "date"})
	passed_writer = DatasetWriter(name="passed", columns={t: t for t in test_data.data})
	return passed_writer, 42
def synthesis(jobid):
	"""Test dataset chaining, link_to_here overrides and chain caching.

	First re-links an a/b/f subset of the selfchain datasets locally and
	verifies it iterates the same as the manual list, then builds a chain
	long enough to get a chain cache and verifies that link_to_here with
	override_previous keeps/drops the cache and data correctly.
	"""
	manual_chain = [Dataset(jobids.selfchain, name) for name in "abcdefgh"]
	manual_abf = [manual_chain[0], manual_chain[1], manual_chain[5]]
	# build a local abf chain
	prev = None
	for ix, ds in enumerate(manual_abf):
		name = "abf%d" % (ix,)
		ds.link_to_here(name, override_previous=prev)
		prev = (jobid, name,)
	manual_abf_data = list(Dataset.iterate_list(None, None, manual_abf))
	local_abf_data = list(Dataset(jobid, "abf2").iterate_chain(None, None))
	assert manual_abf_data == local_abf_data
	# disconnect h, verify there is no chain
	manual_chain[-1].link_to_here("alone", override_previous=None)
	assert len(Dataset(jobid, "alone").chain()) == 1
	# check that the original chain is unhurt
	assert manual_chain == manual_chain[-1].chain()

	# So far so good, now make a chain long enough to have a cache.
	prev = None
	ix = 0
	going = True
	while going:
		# Stop one iteration after the first cached dataset appears,
		# so the final chain goes one past the first cache point.
		if prev and "cache" in Dataset(prev)._data:
			going = False
		name = "longchain%d" % (ix,)
		dw = DatasetWriter(name=name, previous=prev)
		dw.add("ix", "number")
		dw.get_split_write()(ix)
		dw.finish()
		prev = (jobid, name,)
		ix += 1
	# we now have a chain that goes one past the first cache point
	full_chain = Dataset(prev).chain()
	assert "cache" in full_chain[-2]._data  # just to check the above logic is correct
	assert "cache" not in full_chain[-1]._data  # just to be sure..
	# Re-linking without a previous must drop the cache; re-linking with a
	# (cached) previous must keep one.
	full_chain[-2].link_to_here("nocache", override_previous=None)
	full_chain[-1].link_to_here("withcache", override_previous=full_chain[-3])
	assert "cache" not in Dataset(jobid, "nocache")._data
	assert "cache" in Dataset(jobid, "withcache")._data
	# And make sure they both get the right data too.
	assert list(Dataset(prev).iterate_chain(None, "ix")) == list(range(ix))
	assert list(Dataset(jobid, "nocache").iterate_chain(None, "ix")) == [ix - 2]
	assert list(Dataset(jobid, "withcache").iterate_chain(None, "ix")) == list(range(ix - 2)) + [ix - 1]
def prepare(params):
	"""Build eight chained writers named "a" through "h".

	Each writer has an ascii "ds" column and a number "num" column, and
	each one's previous points at the dataset before it in the chain.
	Returns a dict mapping name -> writer.
	"""
	writers = {}
	previous = None
	for ds_name in "abcdefgh":
		writer = DatasetWriter(name=ds_name, previous=previous)
		writer.add("ds", "ascii")
		writer.add("num", "number")
		writers[ds_name] = writer
		# The next writer chains onto this dataset ("jobid/name").
		previous = "%s/%s" % (params.jobid, ds_name,)
	return writers
def prepare():
	"""Create the example dataset writer and declare its columns."""
	from dataset import DatasetWriter
	# previous allows chaining this method, should you wish to do so
	dw = DatasetWriter(previous=datasets.previous)
	# Column declarations, in write order.
	for colname, coltype in (
		('a string', 'ascii'),      # ascii is not "any string", use 'unicode' for that
		('large number', 'number'), # number is any (real) number, a float or int of any size
		('small number', 'number'),
		('small integer', 'int32'), # int32 is a signed 32 bit number
		('gauss number', 'number'),
		('gauss float', 'float64'), # float64 is what many other languages call double
	):
		dw.add(colname, coltype)
	return dw
def prepare(params):
	"""Create one writer per hashing scenario under test.

	Each entry pairs a dataset name with the hashlabel it uses (or None
	for the unhashed variants); all writers get identical up/down int32
	columns. Returns a DotDict of name -> writer.
	"""
	assert params.slices >= 2, "Hashing won't do anything with just one slice"
	writers = DotDict()
	scenarios = (
		("unhashed_manual", None),           # manually interleaved
		("unhashed_split", None),            # split_write interleaved
		("up_checked", "up"),                # hashed on up using dw.hashcheck
		("up_split", "up"),                  # hashed on up using split_write
		("down_checked", "down"),            # hashed on down using dw.hashcheck
		("down_discarded", "down"),          # hashed on down using discarding writes
		("down_discarded_list", "down"),     # hashed on down using discarding list writes
		("down_discarded_dict", "down"),     # hashed on down using discarding dict writes
	)
	for ds_name, hash_col in scenarios:
		writer = DatasetWriter(name=ds_name, hashlabel=hash_col)
		writer.add("up", "int32")
		writer.add("down", "int32")
		writers[ds_name] = writer
	return writers
def prepare():
	"""Create the default dataset writer with str/num columns."""
	writer = DatasetWriter()
	# Declare columns in write order.
	for colname, coltype in (("str", "ascii"), ("num", "number")):
		writer.add(colname, coltype)
	return writer
def prepare():
	"""Create a writer that extends datasets.source with a prod column."""
	writer = DatasetWriter(parent=datasets.source)
	# works for float as well as int
	writer.add('prod', 'number')
	return writer
def prepare():
	"""Create a chained writer with random float/int columns."""
	writer = DatasetWriter(previous=datasets.previous)
	# Declare columns in write order.
	for colname, coltype in (('rflt', 'float64'), ('rint', 'int64')):
		writer.add(colname, coltype)
	return writer