def finish(self, path, timestamp=None, caption=None):
    path = self._path(path)
    assert self._current, 'Tried to finish %s with nothing running' % (path,)
    assert path == self._current, 'Tried to finish %s while running %s' % (path, self._current,)
    user, build = path.split('/')
    self._current = None
    caption = caption or self._current_caption or ''
    timestamp = timestamp or self._current_timestamp
    assert timestamp, 'No timestamp specified in begin or finish for %s' % (path,)
    data = DotDict(
        user=user,
        build=build,
        joblist=self.joblist.as_tuples,
        deps=self._deps,
        caption=caption,
        timestamp=timestamp,
    )
    if self._update:
        data.flags = ['update']
    url = self._url + '/add'
    return self._call(url, data)
def read_methods_conf(dirname, autodiscover):
    """ read and parse the methods.conf file """
    db = {}
    if autodiscover:
        methods = glob(os.path.join(dirname, 'a_*.py'))
        for method in methods:
            if method not in db:
                db[os.path.basename(method)[2:-3]] = DotDict(version='DEFAULT')
    filename = os.path.join(dirname, 'methods.conf')
    if autodiscover and not os.path.exists(filename):
        return db
    with open(filename) as fh:
        for lineno, line in enumerate(fh, 1):
            data = line.split('#')[0].split()
            if not data:
                continue
            method = data.pop(0)
            if autodiscover and (method not in db):
                # in auto-discover, anything in methods.conf goes
                continue
            try:
                version = data.pop(0)
            except IndexError:
                version = 'DEFAULT'
            if data:
                raise AcceleratorError('Trailing garbage on %s:%d: %s' % (filename, lineno, line,))
            db[method] = DotDict(version=version)
    return db
def analysis(sliceno, slices, prepare_res):
    if options.numeric_comma:
        try_locales = [
            'da_DK', 'nb_NO', 'nn_NO', 'sv_SE', 'fi_FI', 'en_ZA',
            'es_ES', 'es_MX', 'fr_FR', 'ru_RU', 'de_DE', 'nl_NL', 'it_IT',
        ]
        for localename in try_locales:
            localename = localename.encode('ascii')
            if not cstuff.backend.numeric_comma(localename):
                break
            if not cstuff.backend.numeric_comma(localename + b'.UTF-8'):
                break
        else:
            raise Exception("Failed to enable numeric_comma, please install at least one of the following locales: " + " ".join(try_locales))
    dw, dws, lines, chain, column2type = prepare_res
    if dws:
        dw = dws[sliceno]
        rehashing = True
    else:
        rehashing = False
    vars = DotDict(
        sliceno=sliceno,
        slices=slices,
        known_line_count=0,
        badmap_size=0,
        badmap_fd=-1,
        slicemap_size=0,
        slicemap_fd=-1,
        map_fhs=[],
        res_bad_count={},
        res_default_count={},
        res_minmax={},
        first_lap=True,
        rehashing=rehashing,
        hash_lines=None,
        dw=dw,
        chain=chain,
        lines=lines,
        column2type=column2type,
        rev_rename={v: k for k, v in options.rename.items() if k in datasets.source.columns and v in column2type},
    )
    if options.filter_bad:
        vars.badmap_fd = map_init(vars, 'badmap%d' % (sliceno,))
        bad_count, default_count, minmax = analysis_lap(vars)
        if sum(sum(c) for c in itervalues(bad_count)):
            vars.first_lap = False
            vars.res_bad_count = {}
            final_bad_count, default_count, minmax = analysis_lap(vars)
            final_bad_count = [max(c) for c in zip(*final_bad_count.values())]
        else:
            final_bad_count = [0] * slices
    else:
        bad_count, default_count, minmax = analysis_lap(vars)
        final_bad_count = [0] * slices
    for fh in vars.map_fhs:
        fh.close()
    if rehashing:
        unlink('slicemap%d' % (sliceno,))
    return bad_count, final_bad_count, default_count, minmax, vars.hash_lines
def params2defaults(params):
    d = DotDict()
    for key in ('datasets', 'jobs',):
        r = {}
        for v in params[key]:
            if isinstance(v, list):
                r[v[0]] = []
            else:
                r[v] = None
        d[key] = r
    def fixup(item):
        if isinstance(item, dict):
            d = {k: fixup(v) for k, v in iteritems(item)}
            if len(d) == 1 and first_value(d) is None and first_value(item) is not None:
                return {}
            return d
        if isinstance(item, (list, tuple, set,)):
            l = [fixup(v) for v in item]
            if l == [None] and list(item) != [None]:
                l = []
            return type(item)(l)
        if isinstance(item, (type, OptionEnum)):
            return None
        assert isinstance(item, (bytes, unicode, int, float, long, bool, OptionEnum, NoneType, datetime.datetime, datetime.date, datetime.time, datetime.timedelta)), type(item)
        return item
    def fixup0(item):
        if isinstance(item, RequiredOption):
            item = item.value
        if isinstance(item, OptionDefault):
            item = item.default
        return fixup(item)
    d.options = {k: fixup0(v) for k, v in iteritems(params.options)}
    return d
def load_setup(jobid):
    """Loads but does not type setup.json from jobid.
    You probably want to use extras.job_params instead.
    """
    d = json_load('setup.json', jobid)
    version = d.version
    if version == 1:
        d.jobs = d.pop('jobids')
        version = 2
    if version == 2:
        if 'exectime' not in d and 'profile' in d:
            d.exectime = d.pop('profile')
        d.versions = DotDict()
        python_path = d.pop('python', None)
        if python_path:
            d.versions.python_path = python_path
        version = 3
    if version == 3:
        if '_typing' in d:
            d['_typing'] = {d.method: d['_typing']}
        d.params = {d.method: DotDict({k: d[k] for k in ('options', 'datasets', 'jobs')})}
    else:
        raise Exception("Don't know how to load setup.json version %d (in %s)" % (d.version, jobid,))
    return d
def synthesis(prepare_res, params):
    ds = DotDict()
    # Must be finished in order (.previous must be finished when .finish is called.)
    for name, dw in sorted(prepare_res.items()):
        ds[name] = dw.finish()
    last = ds.h
    assert last.chain() == sorted(ds.values())
    ds.last = last
    test_partial_chains(ds)
    test_filters(ds)
def test(src_ds, opts, expect_lines):
    opts = DotDict(opts)
    def rename(colname):
        return opts.get('rename', {}).get(colname, colname)
    cols = set(opts.column2type)
    opts.discard_untyped = True
    msg = 'Testing with types %s' % (', '.join(v for k, v in sorted(opts.column2type.items())),)
    expect_hl = None
    if src_ds.hashlabel and opts.column2type.get(src_ds.hashlabel) == 'json':
        # json is not hashable, so we have to override the hashlabel to nothing in this case.
        opts.hashlabel = ''
        msg += ' (clearing hashlabel)'
    elif src_ds.hashlabel:
        expect_hl = rename(src_ds.hashlabel)
        if expect_hl in opts.column2type:
            msg += ' (hashed on %s)' % (opts.column2type[expect_hl],)
        else:
            expect_hl = None
            msg += ' (hashed on <untyped column>)'
    print(msg)
    just_typed = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
    assert just_typed.hashlabel == expect_hl, just_typed
    assert set(just_typed.columns) == cols, just_typed
    assert sum(just_typed.lines) == expect_lines, just_typed
    if rename(src_ds.hashlabel) not in opts.column2type or opts.get('hashlabel') == '':
        assert just_typed.hashlabel is None, just_typed
    else:
        assert just_typed.hashlabel == rename(src_ds.hashlabel), just_typed
    del opts.discard_untyped
    rev_rename = {v: k for k, v in opts.get('rename', {}).items()}
    discard = set(src_ds.columns) - set(rev_rename.get(n, n) for n in cols)
    if discard:
        d = opts.get('rename', {})
        d.update({k: None for k in discard})
        opts.rename = d
    for hashlabel in cols:
        if opts.column2type[hashlabel] == 'json':
            # not hashable
            continue
        opts['hashlabel'] = hashlabel
        print('%s rehashed on %s' % (msg, opts.column2type[hashlabel],))
        hashed_by_type = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
        assert hashed_by_type.hashlabel == hashlabel, hashed_by_type
        assert set(hashed_by_type.columns) == cols, hashed_by_type
        assert sum(hashed_by_type.lines) == expect_lines, hashed_by_type
        hashed_after = subjobs.build('dataset_hashpart', options=dict(hashlabel=hashlabel), datasets=dict(source=just_typed)).dataset()
        assert hashed_after.hashlabel == hashlabel, hashed_after
        assert set(hashed_after.columns) == cols, hashed_after
        assert sum(hashed_after.lines) == expect_lines, hashed_after
        if src_ds.hashlabel:
            # if src_ds has a hashlabel then just_typed will actually already be hashed, so hashed_after
            # will have been hashed twice and therefore have a different order than hashed_by_type.
            if rename(src_ds.hashlabel) == hashlabel:
                # These should be the same though.
                subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=just_typed))
            hashed_by_type = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_by_type))
            hashed_after = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_after))
        subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=hashed_after))
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    # all the numeric types should hash the same (for values they have in common)
    for name, hashlabel, typ in (
        ("unhashed_manual", None, "int32"),            # manually interleaved
        ("unhashed_split", None, "int64"),             # split_write interleaved
        ("up_checked", "up", "float32"),               # hashed on up using dw.hashcheck
        ("up_split", "up", "float64"),                 # hashed on up using split_write
        ("down_checked", "down", "bits32"),            # hashed on down using dw.hashcheck
        ("down_discarded", "down", "bits64"),          # hashed on down using discarding writes
        ("down_discarded_list", "down", "number"),     # hashed on down using discarding list writes
        ("down_discarded_dict", "down", "complex32"),  # hashed on down using discarding dict writes
        # we have too many types, so we need more datasets
        ("unhashed_complex64", None, "complex64"),
        ("unhashed_bytes", None, "bytes"),
        ("up_ascii", "up", "ascii"),
        ("down_unicode", "down", "unicode"),
        # datetime on 1970-01-01 hashes like time
        ("up_datetime", "up", "datetime"),
        ("down_time", "down", "time"),
        # date doesn't hash the same as anything else, so compare it to itself
        ("up_date", "up", "date"),
        ("down_date", "down", "date"),
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", typ)
        dw.add("down", typ)
        dws[name] = dw
    return dws
def _validate_data(self, data, with_deps=True):
    if with_deps:
        assert set(data) == {'timestamp', 'joblist', 'caption', 'user', 'build', 'deps', 'flags',}
        assert isinstance(data.user, unicode)
        assert isinstance(data.build, unicode)
        assert isinstance(data.deps, dict)
        for v in itervalues(data.deps):
            assert isinstance(v, dict)
            self._validate_data(DotDict(v), False)
    else:
        assert set(data) == {'timestamp', 'joblist', 'caption',}
    assert joblistlike(data.joblist), data.joblist
    assert data.joblist
    assert isinstance(data.caption, unicode)
    data.timestamp = TimeStamp(data.timestamp)
def load_setup(jobid):
    """Loads but does not type setup.json from jobid.
    You probably want to use extras.job_params instead.
    """
    d = json_load('setup.json', jobid)
    version = d.version
    if version == 1:
        d.jobs = d.pop('jobids')
        version = 2
    if version == 2:
        if 'exectime' not in d and 'profile' in d:
            d.exectime = d.pop('profile')
        d.versions = DotDict()
        python_path = d.pop('python', None)
        if python_path:
            d.versions.python_path = python_path
        version = 3
    if version == 3:
        # no changes here, it's only used to know how to find datasets
        version = 4
    if version != 4:
        raise AcceleratorError("Don't know how to load setup.json version %d (in %s)" % (d.version, jobid,))
    return d
def synthesis(prepare_res, analysis_res):
    separator, _, _, filename, _, labels, dw, bad_dw, skipped_dw, fds, success_fd, _, = prepare_res
    # Analysis may have gotten a perfectly legitimate EOF if something
    # went wrong in the reader process, so we need to check that all
    # went well.
    try:
        reader_res = os.read(success_fd, 1)
    except OSError:
        reader_res = None
    if reader_res != b"\0":
        raise Exception("Reader process failed")
    good_counts = []
    bad_counts = []
    skipped_counts = []
    for sliceno, (good_count, bad_count, skipped_count) in enumerate(analysis_res):
        dw.set_lines(sliceno, good_count)
        if bad_dw:
            bad_dw.set_lines(sliceno, bad_count)
        if skipped_dw:
            skipped_dw.set_lines(sliceno, skipped_count)
        good_counts.append(good_count)
        bad_counts.append(bad_count)
        skipped_counts.append(skipped_count)
    res = DotDict(
        num_lines=sum(good_counts),
        lines_per_slice=good_counts,
        num_broken_lines=sum(bad_counts),
        broken_lines_per_slice=bad_counts,
        num_skipped_lines=sum(skipped_counts),
        skipped_lines_per_slice=skipped_counts,
    )
    blob.save(res, 'import')
    write_report(res, labels)
def add():
    body = request.body
    if PY3:
        body = TextIOWrapper(body, encoding='utf-8')
    data = DotDict(json.load(body))
    if data.user != request.auth[0]:
        abort(401, "Error: user does not match authentication!")
    result = db.add(data)
    return result
def run(jobidv, tlock):
    for jobid in jobidv:
        passed_cookie = None
        # This is not a race - all higher locks are locked too.
        while passed_cookie in job_tracking:
            passed_cookie = gen_cookie()
        concurrency_map = dict(data.concurrency_map)
        concurrency_map.update(setup.get('concurrency_map', ()))
        job_tracking[passed_cookie] = DotDict(
            lock=JLock(),
            last_error=None,
            last_time=0,
            workdir=workdir,
            concurrency_map=concurrency_map,
        )
        try:
            self.ctrl.run_job(
                jobid,
                subjob_cookie=passed_cookie,
                parent_pid=setup.get('parent_pid', 0),
                concurrency=setup.get('concurrency') or concurrency_map.get(setup.method) or concurrency_map.get('-default-'),
            )
            # update database since a new jobid was just created
            job = self.ctrl.add_single_jobid(jobid)
            with tlock:
                link2job[jobid]['make'] = 'DONE'
                link2job[jobid]['total_time'] = job.total
        except JobError as e:
            error.append([e.jobid, e.method, e.status])
            with tlock:
                link2job[jobid]['make'] = 'FAIL'
            return
        finally:
            del job_tracking[passed_cookie]
    # everything was built ok, update symlink
    try:
        dn = self.ctrl.workspaces[workdir].path
        ln = os.path.join(dn, workdir + "-LATEST_")
        try:
            os.unlink(ln)
        except OSError:
            pass
        os.symlink(jobid, ln)
        os.rename(ln, os.path.join(dn, workdir + "-LATEST"))
    except OSError:
        traceback.print_exc()
def synthesis(prepare_res):
    opts = DotDict((k, v) for k, v in options.items() if k in a_csvimport.options)
    lst = prepare_res
    previous = datasets.previous
    for fn, info, dsn in lst:
        opts.filename = fn
        jid = subjobs.build(
            'csvimport',
            options=opts,
            datasets=dict(previous=previous),
            caption="Import of %s from %s" % (info.filename, options.filename,),
        )
        previous = Dataset(jid).link_to_here(dsn)
        if options.chaining == 'off':
            previous = None
    if (len(lst) == 1 or options.chaining != 'off') and dsn != 'default':
        Dataset(jid).link_to_here('default')
def synthesis():
    if options.inner:
        res = DotDict()
        res.datetime = options.datetime + options.timedelta
        res.time = options.time.replace(minute=0)
        res.date = options.date.replace(month=1)
        return res
    else:
        opts = dict(
            datetime=datetime(2019, 11, 6, 17, 37, 2, 987654),
            time=time(17, 37, 2, 987654),
            date=date(2019, 11, 6),
            timedelta=timedelta(microseconds=987654),
            inner=True,
        )
        jid = subjobs.build('test_datetime', options=opts)
        res = jid.load()
        assert res.datetime == datetime(2019, 11, 6, 17, 37, 3, 975308)
        assert res.time == time(17, 0, 2, 987654)
        assert res.date == date(2019, 1, 6)
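# Illustrative sketch (not part of the original test): the expected values in
# the assertions above follow directly from stdlib datetime arithmetic. The
# function name below is local to this sketch.
from datetime import date, datetime, time, timedelta

def _datetime_expectations_sketch():
    # 987654 + 987654 microseconds = 1 second + 975308 microseconds
    assert datetime(2019, 11, 6, 17, 37, 2, 987654) + timedelta(microseconds=987654) == datetime(2019, 11, 6, 17, 37, 3, 975308)
    # replace() swaps a single field and keeps the others unchanged
    assert time(17, 37, 2, 987654).replace(minute=0) == time(17, 0, 2, 987654)
    assert date(2019, 11, 6).replace(month=1) == date(2019, 1, 6)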
def synthesis(prepare_res, analysis_res):
    all = chain.from_iterable(analysis_res)
    if options.sort:
        all = sorted(all)
    res = md5(b''.join(all)).hexdigest()
    print("%s: %s" % (datasets.source, res,))
    return DotDict(sum=int(res, 16), sort=options.sort, columns=prepare_res, source=datasets.source)
def _parse_add(self, line):
    key = line[1]
    user, build = key.split('/')
    flags = line[4].split(',') if line[4] else []
    data = DotDict(
        timestamp=line[0],
        user=user,
        build=build,
        deps=json.loads(line[2]),
        joblist=json.loads(line[3]),
        flags=flags,
        caption=line[5],
    )
    self.add(data)
def _urd_typeify(d):
    if isinstance(d, str):
        d = json.loads(d)
        if not d or isinstance(d, unicode):
            return d
    res = DotDict()
    for k, v in d.items():
        if k == 'joblist':
            v = JobList(Job(e[1], e[0]) for e in v)
        elif isinstance(v, dict):
            v = _urd_typeify(v)
        res[k] = v
    return res
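# Illustrative, standalone sketch of what _urd_typeify does structurally.
# Job and JobList are stood in for by a tuple and a list here, only to show
# that joblist entries arrive as [method, jobid] pairs and are re-ordered to
# (jobid, method) by Job(e[1], e[0]); all sample values are made up.
import json

def _urd_typeify_sketch():
    raw = json.dumps({
        'caption': 'example',
        'joblist': [['csvimport', 'dev-0'], ['dataset_type', 'dev-1']],
    })
    entry = json.loads(raw)
    # the real function wraps this in DotDict and builds Job/JobList objects
    entry['joblist'] = [(jobid, method) for method, jobid in entry['joblist']]
    assert entry['joblist'][0] == ('dev-0', 'csvimport')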
def synthesis():
    sum = 0
    jobs = datasets.source.chain(length=options.chain_length, stop_ds=datasets.stop)
    for src in jobs:
        jid = build('dataset_checksum', options=dict(columns=options.columns, sort=options.sort), datasets=dict(source=src))
        data = blob.load(jobid=jid)
        sum ^= data.sum
    print("Total: %016x" % (sum,))
    return DotDict(sum=sum, columns=data.columns, sort=options.sort, sources=jobs)
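# Illustrative sketch (not from the repo): how the per-dataset checksums above
# combine. Each dataset_checksum job reduces its data to an md5 hexdigest,
# which is read back as an integer and xor:ed into the running total, so the
# combined checksum does not depend on chain order. Sample values are made up.
from hashlib import md5

def _xor_checksum_sketch():
    per_dataset_blobs = [b'dataset one rows', b'dataset two rows']
    total = 0
    for blob_bytes in per_dataset_blobs:
        digest = md5(blob_bytes).hexdigest()  # what dataset_checksum returns as res
        total ^= int(digest, 16)              # what the chain job does with data.sum
    print("Total: %016x" % (total,))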
def generate(caption, method, params, package=None, python=None, why_build=False):
    data = DotDict()
    data.caption = caption
    data.method = method
    if package:
        data.package = package
    if python:
        data.python = python
    if why_build:
        data.why_build = why_build
    data.params = params
    return data
def generate(caption, method, params, package=None, description=None, why_build=False):
    data = DotDict()
    data.caption = caption
    data.method = method
    if package:
        data.package = package
    data.versions = DotDict()
    if description:
        data.versions.python_path = description['interpreter']
        data.versions.python = description['interpreter_version']
        if description['accelerator_version']:
            data.versions.accelerator = description['accelerator_version']
    if why_build:
        data.why_build = why_build
    data.params = params
    return data
def load_setup(jobid):
    """Loads but does not type setup.json from jobid.
    You probably want to use extras.job_params instead.
    """
    d = json_load('setup.json', jobid)
    if d['version'] == 1:
        if '_typing' in d:
            d['_typing'] = {d.method: d['_typing']}
        d.params = {d.method: DotDict({k: d[k] for k in ('options', 'datasets', 'jobids')})}
    else:
        raise Exception("Don't know how to load setup.json version %d" % (d['version'],))
    return d
def read_method_conf(filename):
    """ read and parse the methods.conf file """
    db = {}
    with open(filename) as fh:
        for lineno, line in enumerate(fh, 1):
            data = line.split('#')[0].split()
            if not data:
                continue
            method = data.pop(0)
            try:
                version = data.pop(0)
            except IndexError:
                version = 'DEFAULT'
            if data:
                raise Exception('Trailing garbage on %s:%d: %s' % (filename, lineno, line,))
            db[method] = DotDict(version=version)
    return db
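# Illustrative sketch of the file format accepted by the parser above, inferred
# from its code: one method name per line, optionally followed by a version
# label, with '#' starting a comment. DotDict is replaced by a plain dict and
# the file by an io.StringIO so this runs standalone; contents are made up.
import io

def _methods_conf_sketch():
    sample = io.StringIO(
        "csvimport 2.7  # explicit version label\n"
        "dataset_type   # no version -> DEFAULT\n"
        "\n"
        "# a full-line comment is ignored\n"
    )
    db = {}
    for line in sample:
        data = line.split('#')[0].split()
        if not data:
            continue
        method = data.pop(0)
        version = data.pop(0) if data else 'DEFAULT'
        db[method] = {'version': version}
    assert db == {'csvimport': {'version': '2.7'}, 'dataset_type': {'version': 'DEFAULT'}}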
def run_job(self, jobid, subjob_cookie=None, parent_pid=0, concurrency=None):
    W = self.workspaces[Job(jobid).workdir]
    #
    active_workdirs = {name: ws.path for name, ws in self.workspaces.items()}
    slices = self.workspaces[self.target_workdir].slices

    t0 = time.time()
    setup = update_setup(jobid, starttime=t0)
    prof = setup.get('exectime', DotDict())
    new_prof, files, subjobs = dispatch.launch(W.path, setup, self.config, self.Methods, active_workdirs, slices, concurrency, self.debug, self.server_url, subjob_cookie, parent_pid)
    prefix = join(W.path, jobid) + '/'
    if not self.debug:
        for filename, temp in list(files.items()):
            if temp:
                unlink(join(prefix, filename))
                del files[filename]
    prof.update(new_prof)
    prof.total = 0
    prof.total = sum(v for v in prof.values() if isinstance(v, (float, int)))
    if concurrency:
        prof.concurrency = concurrency
    data = dict(
        starttime=t0,
        endtime=time.time(),
        exectime=prof,
    )
    update_setup(jobid, **data)
    data['files'] = sorted(fn[len(prefix):] if fn.startswith(prefix) else fn for fn in files)
    data['subjobs'] = subjobs
    data['version'] = 1
    json_save(data, jobid.filename('post.json'))
def truncate(self, key, timestamp):
    old = self.db[key]
    new = {}
    ghost = {}
    for ts, data in iteritems(old):
        if ts < timestamp:
            new[ts] = data
        else:
            ghost[ts] = data
    self.log('truncate', DotDict(key=key, timestamp=timestamp))
    self.db[key] = new
    ghost_db = self.ghost_db[key]
    for ts, data in iteritems(ghost):
        ghost_db[ts].append(data)
    if ghost:
        deps = self._update_ghosts()
    else:
        deps = 0
    return {'count': len(ghost), 'deps': deps}
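# Illustrative, standalone sketch of the split performed by truncate above:
# entries strictly older than the given timestamp are kept, everything at or
# after it becomes a ghost. Plain dicts stand in for the db, and the
# timestamps are made-up strings that compare in the intended order.
def _truncate_split_sketch():
    old = {'2021-01-01': 'a', '2021-02-01': 'b', '2021-03-01': 'c'}
    timestamp = '2021-02-01'
    new = {ts: data for ts, data in old.items() if ts < timestamp}
    ghost = {ts: data for ts, data in old.items() if ts >= timestamp}
    assert new == {'2021-01-01': 'a'}
    assert ghost == {'2021-02-01': 'b', '2021-03-01': 'c'}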
def job_data(known, jid):
    if jid in known:
        data = known[jid]
    else:
        data = DotDict(method='???', totaltime=None, current=None)
        try:
            setup = load_setup(jid)
            data.method = setup.method
            if 'exectime' in setup:
                data.totaltime = setup.exectime.total
        except Exception:
            pass
    if isinstance(data.totaltime, (float, int)):
        data.totaltime = fmttime(data.totaltime)
    if data.totaltime is None:
        data.klass = 'unfinished'
    elif data.current:
        data.klass = 'current'
    else:
        data.klass = 'old'
    return data
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    for name, hashlabel in (
        ("unhashed_manual", None),        # manually interleaved
        ("unhashed_split", None),         # split_write interleaved
        ("up_checked", "up"),             # hashed on up using dw.hashcheck
        ("up_split", "up"),               # hashed on up using split_write
        ("down_checked", "down"),         # hashed on down using dw.hashcheck
        ("down_discarded", "down"),       # hashed on down using discarding writes
        ("down_discarded_list", "down"),  # hashed on down using discarding list writes
        ("down_discarded_dict", "down"),  # hashed on down using discarding dict writes
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", "int32")
        dw.add("down", "int32")
        dws[name] = dw
    return dws
def run_job(self, jobid, subjob_cookie=None, parent_pid=0):
    W = self.workspaces[Job(jobid).workdir]
    #
    active_workdirs = {name: ws.path for name, ws in self.workspaces.items()}
    slices = self.workspaces[self.target_workdir].slices

    t0 = time.time()
    setup = update_setup(jobid, starttime=t0)
    prof = setup.get('profile', DotDict())
    new_prof, files, subjobs = dispatch.launch(W.path, setup, self.config, self.Methods, active_workdirs, slices, self.debug, self.daemon_url, subjob_cookie, parent_pid)
    if self.debug:
        delete_from = Temp.TEMP
    else:
        delete_from = Temp.DEBUG
    for filename, temp in list(files.items()):
        if temp >= delete_from:
            unlink(join(W.path, jobid, filename))
            del files[filename]
    prof.update(new_prof)
    prof.total = 0
    prof.total = sum(v for v in prof.values() if isinstance(v, (float, int)))
    data = dict(
        starttime=t0,
        endtime=time.time(),
        profile=prof,
    )
    update_setup(jobid, **data)
    data['files'] = files
    data['subjobs'] = subjobs
    json_save(data, jobid.filename('post.json'))
def load_setup(jobid):
    """Loads but does not type setup.json from jobid.
    You probably want to use extras.job_params instead.
    """
    d = json_load('setup.json', jobid)
    version = d.version
    if version == 1:
        d.jobs = d.pop('jobids')
        version = 2
    if version == 2:
        if 'exectime' not in d and 'profile' in d:
            d.exectime = d.pop('profile')
        d.versions = DotDict()
        python_path = d.pop('python', None)
        if python_path:
            d.versions.python_path = python_path
        version = 3
    if version != 3:
        raise Exception("Don't know how to load setup.json version %d (in %s)" % (d.version, jobid,))
    return d
def test(src_ds, opts, expect_lines):
    opts = DotDict(opts)
    def rename(colname):
        return opts.get('rename', {}).get(colname, colname)
    cols = set(opts.column2type)
    opts.discard_untyped = True
    msg = 'Testing with types %s' % (', '.join(v for k, v in sorted(opts.column2type.items())),)
    expect_hl = None
    if src_ds.hashlabel and opts.column2type.get(src_ds.hashlabel) == 'json':
        # json is not hashable, so we have to override the hashlabel to nothing in this case.
        opts.hashlabel = ''
        msg += ' (clearing hashlabel)'
    elif src_ds.hashlabel:
        expect_hl = rename(src_ds.hashlabel)
        if expect_hl in opts.column2type:
            msg += ' (hashed on %s)' % (opts.column2type[expect_hl],)
        else:
            expect_hl = None
            msg += ' (hashed on <untyped column>)'
    print(msg)
    just_typed = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
    assert just_typed.hashlabel == expect_hl
    assert set(just_typed.columns) == cols
    assert sum(just_typed.lines) == expect_lines
    if rename(src_ds.hashlabel) not in opts.column2type or opts.get('hashlabel') == '':
        assert just_typed.hashlabel is None
    else:
        assert just_typed.hashlabel == rename(src_ds.hashlabel)
    del opts.discard_untyped
    rev_rename = {v: k for k, v in opts.get('rename', {}).items()}
    typeaway = set(src_ds.columns) - set(rev_rename.get(n, n) for n in cols)
    if typeaway:
        # turn columns we don't want to see in the comparison into int32 so they get discarded on rehashing.
        new_src_ds = subjobs.build(
            'dataset_type',
            options=dict(defaults=dict.fromkeys(typeaway, '0'), column2type=dict.fromkeys(typeaway, 'int32_10'), hashlabel=''),
            datasets=dict(source=src_ds),
        ).dataset()
        assert set(new_src_ds.columns) == set(src_ds.columns)  # only types differ
        assert new_src_ds.lines == src_ds.lines
        src_ds = new_src_ds
    for hashlabel in cols:
        if opts.column2type[hashlabel] == 'json':
            # not hashable
            continue
        opts['hashlabel'] = hashlabel
        print('%s rehashed on %s' % (msg, opts.column2type[hashlabel],))
        hashed_by_type = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
        assert hashed_by_type.hashlabel == hashlabel
        assert set(hashed_by_type.columns) == cols
        assert sum(hashed_by_type.lines) == expect_lines
        hashed_after = subjobs.build('dataset_rehash', options=dict(hashlabel=hashlabel), datasets=dict(source=just_typed)).dataset()
        assert hashed_after.hashlabel == hashlabel
        assert set(hashed_after.columns) == cols
        assert sum(hashed_after.lines) == expect_lines
        if src_ds.hashlabel:
            # if src_ds has a hashlabel then just_typed will actually already be hashed, so hashed_after
            # will have been hashed twice and therefore have a different order than hashed_by_type.
            if rename(src_ds.hashlabel) == hashlabel:
                # These should be the same though.
                subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=just_typed))
            hashed_by_type = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_by_type))
            hashed_after = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_after))
        subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=hashed_after))