Example #1
 def finish(self, path, timestamp=None, caption=None):
     path = self._path(path)
     assert self._current, 'Tried to finish %s with nothing running' % (
         path, )
     assert path == self._current, 'Tried to finish %s while running %s' % (
         path,
         self._current,
     )
     user, build = path.split('/')
     self._current = None
     caption = caption or self._current_caption or ''
     timestamp = timestamp or self._current_timestamp
     assert timestamp, 'No timestamp specified in begin or finish for %s' % (
         path, )
     data = DotDict(
         user=user,
         build=build,
         joblist=self.joblist.as_tuples,
         deps=self._deps,
         caption=caption,
         timestamp=timestamp,
     )
     if self._update:
         data.flags = ['update']
     url = self._url + '/add'
     return self._call(url, data)
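All of these examples lean on DotDict, the Accelerator project's dict subclass whose keys can also be read and written as attributes (data.user, data.flags and so on above). The snippet below is a minimal stand-in, not the project's actual implementation, just to make that attribute/item equivalence concrete; the class body and the sample values are assumptions.

# Minimal stand-in for DotDict (the real one ships with the Accelerator
# project): a dict whose keys are also readable and writable as attributes.
class DotDict(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

data = DotDict(user='alice', build='example')
assert data.user == data['user'] == 'alice'
data.flags = ['update']        # attribute assignment stores an ordinary key
assert data['flags'] == ['update']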
Example #2
def read_methods_conf(dirname, autodiscover):
    """ read and parse the methods.conf file """
    db = {}
    if autodiscover:
        methods = glob(os.path.join(dirname, 'a_*.py'))
        for method in methods:
            if method not in db:
                db[os.path.basename(method)[2:-3]] = DotDict(version='DEFAULT')
    filename = os.path.join(dirname, 'methods.conf')
    if autodiscover and not os.path.exists(filename):
        return db
    with open(filename) as fh:
        for lineno, line in enumerate(fh, 1):
            data = line.split('#')[0].split()
            if not data:
                continue
            method = data.pop(0)
            if autodiscover and (method not in db):
                # in auto-discover mode anything goes in methods.conf:
                # entries for methods that were not discovered are simply skipped
                continue
            try:
                version = data.pop(0)
            except IndexError:
                version = 'DEFAULT'
            if data:
                raise AcceleratorError('Trailing garbage on %s:%d: %s' % (
                    filename,
                    lineno,
                    line,
                ))
            db[method] = DotDict(version=version)
    return db
Example #3
def analysis(sliceno, slices, prepare_res):
	if options.numeric_comma:
		try_locales = [
			'da_DK', 'nb_NO', 'nn_NO', 'sv_SE', 'fi_FI',
			'en_ZA', 'es_ES', 'es_MX', 'fr_FR', 'ru_RU',
			'de_DE', 'nl_NL', 'it_IT',
		]
		for localename in try_locales:
			localename = localename.encode('ascii')
			if not cstuff.backend.numeric_comma(localename):
				break
			if not cstuff.backend.numeric_comma(localename + b'.UTF-8'):
				break
		else:
			raise Exception("Failed to enable numeric_comma, please install at least one of the following locales: " + " ".join(try_locales))
	dw, dws, lines, chain, column2type = prepare_res
	if dws:
		dw = dws[sliceno]
		rehashing = True
	else:
		rehashing = False
	vars = DotDict(
		sliceno=sliceno,
		slices=slices,
		known_line_count=0,
		badmap_size=0,
		badmap_fd=-1,
		slicemap_size=0,
		slicemap_fd=-1,
		map_fhs=[],
		res_bad_count={},
		res_default_count={},
		res_minmax={},
		first_lap=True,
		rehashing=rehashing,
		hash_lines=None,
		dw=dw,
		chain=chain,
		lines=lines,
		column2type=column2type,
		rev_rename={v: k for k, v in options.rename.items() if k in datasets.source.columns and v in column2type},
	)
	if options.filter_bad:
		vars.badmap_fd = map_init(vars, 'badmap%d' % (sliceno,))
		bad_count, default_count, minmax = analysis_lap(vars)
		if sum(sum(c) for c in itervalues(bad_count)):
			vars.first_lap = False
			vars.res_bad_count = {}
			final_bad_count, default_count, minmax = analysis_lap(vars)
			final_bad_count = [max(c) for c in zip(*final_bad_count.values())]
		else:
			final_bad_count = [0] * slices
	else:
		bad_count, default_count, minmax = analysis_lap(vars)
		final_bad_count = [0] * slices
	for fh in vars.map_fhs:
		fh.close()
	if rehashing:
		unlink('slicemap%d' % (sliceno,))
	return bad_count, final_bad_count, default_count, minmax, vars.hash_lines
Example #4
def params2defaults(params):
	d = DotDict()
	for key in ('datasets', 'jobs',):
		r = {}
		for v in params[key]:
			if isinstance(v, list):
				r[v[0]] = []
			else:
				r[v] = None
		d[key] = r
	def fixup(item):
		if isinstance(item, dict):
			d = {k: fixup(v) for k, v in iteritems(item)}
			if len(d) == 1 and first_value(d) is None and first_value(item) is not None:
				return {}
			return d
		if isinstance(item, (list, tuple, set,)):
			l = [fixup(v) for v in item]
			if l == [None] and list(item) != [None]:
				l = []
			return type(item)(l)
		if isinstance(item, (type, OptionEnum)):
			return None
		assert isinstance(item, (bytes, unicode, int, float, long, bool, OptionEnum, NoneType, datetime.datetime, datetime.date, datetime.time, datetime.timedelta)), type(item)
		return item
	def fixup0(item):
		if isinstance(item, RequiredOption):
			item = item.value
		if isinstance(item, OptionDefault):
			item = item.default
		return fixup(item)
	d.options = {k: fixup0(v) for k, v in iteritems(params.options)}
	return d
Example #5
def load_setup(jobid):
    """Loads but does not type setup.json from jobid.
	You probably want to use extras.job_params instead.
	"""
    d = json_load('setup.json', jobid)
    version = d.version
    if version == 1:
        d.jobs = d.pop('jobids')
        version = 2
    if version == 2:
        if 'exectime' not in d and 'profile' in d:
            d.exectime = d.pop('profile')
        d.versions = DotDict()
        python_path = d.pop('python', None)
        if python_path:
            d.versions.python_path = python_path
        version = 3
    if version == 3:
        if '_typing' in d:
            d['_typing'] = {d.method: d['_typing']}
        d.params = {
            d.method:
            DotDict({k: d[k]
                     for k in ('options', 'datasets', 'jobs')})
        }
    else:
        raise Exception(
            "Don't know how to load setup.json version %d (in %s)" % (
                d.version,
                jobid,
            ))
    return d
Example #6
def synthesis(prepare_res, params):
    ds = DotDict()
    # Must be finished in order (.previous must be finished when .finish is called.)
    for name, dw in sorted(prepare_res.items()):
        ds[name] = dw.finish()
    last = ds.h
    assert last.chain() == sorted(ds.values())
    ds.last = last
    test_partial_chains(ds)
    test_filters(ds)
Example #7
def test(src_ds, opts, expect_lines):
	opts = DotDict(opts)
	def rename(colname):
		return opts.get('rename', {}).get(colname, colname)
	cols = set(opts.column2type)
	opts.discard_untyped = True
	msg = 'Testing with types %s' % (', '.join(v for k, v in sorted(opts.column2type.items())),)
	expect_hl = None
	if src_ds.hashlabel and opts.column2type.get(src_ds.hashlabel) == 'json':
		# json is not hashable, so we have to override the hashlabel to nothing in this case.
		opts.hashlabel = ''
		msg += ' (clearing hashlabel)'
	elif src_ds.hashlabel:
		expect_hl = rename(src_ds.hashlabel)
		if expect_hl in opts.column2type:
			msg += ' (hashed on %s)' % (opts.column2type[expect_hl],)
		else:
			expect_hl = None
			msg += ' (hashed on <untyped column>)'
	print(msg)
	just_typed = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
	assert just_typed.hashlabel == expect_hl, just_typed
	assert set(just_typed.columns) == cols, just_typed
	assert sum(just_typed.lines) == expect_lines, just_typed
	if rename(src_ds.hashlabel) not in opts.column2type or opts.get('hashlabel') == '':
		assert just_typed.hashlabel is None, just_typed
	else:
		assert just_typed.hashlabel == rename(src_ds.hashlabel), just_typed
	del opts.discard_untyped
	rev_rename = {v: k for k, v in opts.get('rename', {}).items()}
	discard = set(src_ds.columns) - set(rev_rename.get(n, n) for n in cols)
	if discard:
		d = opts.get('rename', {})
		d.update({k: None for k in discard})
		opts.rename = d
	for hashlabel in cols:
		if opts.column2type[hashlabel] == 'json':
			# not hashable
			continue
		opts['hashlabel'] = hashlabel
		print('%s rehashed on %s' % (msg, opts.column2type[hashlabel],))
		hashed_by_type = subjobs.build('dataset_type', options=opts, datasets=dict(source=src_ds)).dataset()
		assert hashed_by_type.hashlabel == hashlabel, hashed_by_type
		assert set(hashed_by_type.columns) == cols, hashed_by_type
		assert sum(hashed_by_type.lines) == expect_lines, hashed_by_type
		hashed_after = subjobs.build('dataset_hashpart', options=dict(hashlabel=hashlabel), datasets=dict(source=just_typed)).dataset()
		assert hashed_after.hashlabel == hashlabel, hashed_after
		assert set(hashed_after.columns) == cols, hashed_after
		assert sum(hashed_after.lines) == expect_lines, hashed_after
		if src_ds.hashlabel:
			# if src_ds has a hashlabel then just_typed will actually already be hashed, so hashed_after
			# will have been hashed twice and therefore have a different order than hashed_by_type.
			if rename(src_ds.hashlabel) == hashlabel:
				# These should be the same though.
				subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=just_typed))
			hashed_by_type = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_by_type))
			hashed_after = subjobs.build('dataset_sort', options=dict(sort_columns=rename('a')), datasets=dict(source=hashed_after))
		subjobs.build('test_compare_datasets', datasets=dict(a=hashed_by_type, b=hashed_after))
Example #8
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    # all the numeric types should hash the same (for values they have in common)
    for name, hashlabel, typ in (
        ("unhashed_manual", None, "int32"),            # manually interleaved
        ("unhashed_split", None, "int64"),             # split_write interleaved
        ("up_checked", "up", "float32"),               # hashed on up using dw.hashcheck
        ("up_split", "up", "float64"),                 # hashed on up using split_write
        ("down_checked", "down", "bits32"),            # hashed on down using dw.hashcheck
        ("down_discarded", "down", "bits64"),          # hashed on down using discarding writes
        ("down_discarded_list", "down", "number"),     # hashed on down using discarding list writes
        ("down_discarded_dict", "down", "complex32"),  # hashed on down using discarding dict writes
        # we have too many types, so we need more datasets
        ("unhashed_complex64", None, "complex64"),
        ("unhashed_bytes", None, "bytes"),
        ("up_ascii", "up", "ascii"),
        ("down_unicode", "down", "unicode"),
        # datetime on 1970-01-01 hashes like time
        ("up_datetime", "up", "datetime"),
        ("down_time", "down", "time"),
        # date doesn't hash the same as anything else, so compare it to itself
        ("up_date", "up", "date"),
        ("down_date", "down", "date"),
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", typ)
        dw.add("down", typ)
        dws[name] = dw
    return dws
Example #9
 def _validate_data(self, data, with_deps=True):
     if with_deps:
         assert set(data) == {
             'timestamp',
             'joblist',
             'caption',
             'user',
             'build',
             'deps',
             'flags',
         }
         assert isinstance(data.user, unicode)
         assert isinstance(data.build, unicode)
         assert isinstance(data.deps, dict)
         for v in itervalues(data.deps):
             assert isinstance(v, dict)
             self._validate_data(DotDict(v), False)
     else:
         assert set(data) == {
             'timestamp',
             'joblist',
             'caption',
         }
     assert joblistlike(data.joblist), data.joblist
     assert data.joblist
     assert isinstance(data.caption, unicode)
     data.timestamp = TimeStamp(data.timestamp)
Example #10
def load_setup(jobid):
    """Loads but does not type setup.json from jobid.
	You probably want to use extras.job_params instead.
	"""
    d = json_load('setup.json', jobid)
    version = d.version
    if version == 1:
        d.jobs = d.pop('jobids')
        version = 2
    if version == 2:
        if 'exectime' not in d and 'profile' in d:
            d.exectime = d.pop('profile')
        d.versions = DotDict()
        python_path = d.pop('python', None)
        if python_path:
            d.versions.python_path = python_path
        version = 3
    if version == 3:
        # no changes here, it's only used to know how to find datasets
        version = 4
    if version != 4:
        raise AcceleratorError(
            "Don't know how to load setup.json version %d (in %s)" % (
                d.version,
                jobid,
            ))
    return d
Example #11
def synthesis(prepare_res, analysis_res):
	separator, _, _, filename, _, labels, dw, bad_dw, skipped_dw, fds, success_fd, _, = prepare_res
	# Analysis may have gotten a perfectly legitimate EOF if something
	# went wrong in the reader process, so we need to check that all
	# went well.
	try:
		reader_res = os.read(success_fd, 1)
	except OSError:
		reader_res = None
	if reader_res != b"\0":
		raise Exception("Reader process failed")
	good_counts = []
	bad_counts = []
	skipped_counts = []
	for sliceno, (good_count, bad_count, skipped_count) in enumerate(analysis_res):
		dw.set_lines(sliceno, good_count)
		if bad_dw:
			bad_dw.set_lines(sliceno, bad_count)
		if skipped_dw:
			skipped_dw.set_lines(sliceno, skipped_count)
		good_counts.append(good_count)
		bad_counts.append(bad_count)
		skipped_counts.append(skipped_count)
	res = DotDict(
		num_lines=sum(good_counts),
		lines_per_slice=good_counts,
		num_broken_lines=sum(bad_counts),
		broken_lines_per_slice=bad_counts,
		num_skipped_lines=sum(skipped_counts),
		skipped_lines_per_slice=skipped_counts,
	)
	blob.save(res, 'import')
	write_report(res, labels)
Example #12
def add():
    body = request.body
    if PY3:
        body = TextIOWrapper(body, encoding='utf-8')
    data = DotDict(json.load(body))
    if data.user != request.auth[0]:
        abort(401, "Error:  user does not match authentication!")
    result = db.add(data)
    return result
Example #13
 def run(jobidv, tlock):
     for jobid in jobidv:
         passed_cookie = None
         # This is not a race - all higher locks are locked too.
         while passed_cookie in job_tracking:
             passed_cookie = gen_cookie()
         concurrency_map = dict(data.concurrency_map)
         concurrency_map.update(setup.get('concurrency_map', ()))
         job_tracking[passed_cookie] = DotDict(
             lock=JLock(),
             last_error=None,
             last_time=0,
             workdir=workdir,
             concurrency_map=concurrency_map,
         )
         try:
             self.ctrl.run_job(
                 jobid,
                 subjob_cookie=passed_cookie,
                 parent_pid=setup.get('parent_pid', 0),
                 concurrency=setup.get('concurrency') or concurrency_map.get(setup.method) or concurrency_map.get('-default-'),
             )
             # update database since a new jobid was just created
             job = self.ctrl.add_single_jobid(jobid)
             with tlock:
                 link2job[jobid]['make'] = 'DONE'
                 link2job[jobid]['total_time'] = job.total
         except JobError as e:
             error.append([e.jobid, e.method, e.status])
             with tlock:
                 link2job[jobid]['make'] = 'FAIL'
             return
         finally:
             del job_tracking[passed_cookie]
     # everything was built ok, update symlink
     try:
         dn = self.ctrl.workspaces[workdir].path
         ln = os.path.join(dn, workdir + "-LATEST_")
         try:
             os.unlink(ln)
         except OSError:
             pass
         os.symlink(jobid, ln)
         os.rename(ln, os.path.join(dn, workdir + "-LATEST"))
     except OSError:
         traceback.print_exc()
Example #14
def synthesis(prepare_res):
    opts = DotDict(
        (k, v) for k, v in options.items() if k in a_csvimport.options)
    lst = prepare_res
    previous = datasets.previous
    for fn, info, dsn in lst:
        opts.filename = fn
        jid = subjobs.build('csvimport',
                            options=opts,
                            datasets=dict(previous=previous),
                            caption="Import of %s from %s" % (
                                info.filename,
                                options.filename,
                            ))
        previous = Dataset(jid).link_to_here(dsn)
        if options.chaining == 'off':
            previous = None
    if (len(lst) == 1 or options.chaining != 'off') and dsn != 'default':
        Dataset(jid).link_to_here('default')
Example #15
def synthesis():
	if options.inner:
		res = DotDict()
		res.datetime = options.datetime + options.timedelta
		res.time = options.time.replace(minute=0)
		res.date = options.date.replace(month=1)
		return res
	else:
		opts = dict(
			datetime=datetime(2019, 11, 6, 17, 37, 2, 987654),
			time=time(17, 37, 2, 987654),
			date=date(2019, 11, 6),
			timedelta=timedelta(microseconds=987654),
			inner=True,
		)
		jid = subjobs.build('test_datetime', options=opts)
		res = jid.load()
		assert res.datetime == datetime(2019, 11, 6, 17, 37, 3, 975308)
		assert res.time == time(17, 0, 2, 987654)
		assert res.date == date(2019, 1, 6)
Example #16
def synthesis(prepare_res, analysis_res):
    all = chain.from_iterable(analysis_res)
    if options.sort:
        all = sorted(all)
    res = md5(b''.join(all)).hexdigest()
    print("%s: %s" % (
        datasets.source,
        res,
    ))
    return DotDict(sum=int(res, 16),
                   sort=options.sort,
                   columns=prepare_res,
                   source=datasets.source)
Example #17
	def _parse_add(self, line):
		key = line[1]
		user, build = key.split('/')
		flags = line[4].split(',') if line[4] else []
		data = DotDict(timestamp=line[0],
			user=user,
			build=build,
			deps=json.loads(line[2]),
			joblist=json.loads(line[3]),
			flags=flags,
			caption=line[5],
		)
		self.add(data)
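_parse_add above indexes an already-split log record by position. The following is a hedged illustration of the shape it expects; the field values and timestamp format are made up, only the positions and the resulting keys come from the code.

# Hypothetical record; only the field positions mirror _parse_add above.
line = [
    '2023-01-01T00:00:00',     # 0: timestamp (passed through unchanged)
    'alice/example',           # 1: user/build key, split on '/'
    '{}',                      # 2: deps, JSON
    '[["method", "jid-0"]]',   # 3: joblist, JSON
    'update',                  # 4: comma-separated flags ('' means none)
    'a caption',               # 5: caption
]
# _parse_add would build roughly:
# DotDict(timestamp='2023-01-01T00:00:00', user='alice', build='example',
#         deps={}, joblist=[['method', 'jid-0']], flags=['update'],
#         caption='a caption')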
Example #18
def _urd_typeify(d):
    if isinstance(d, str):
        d = json.loads(d)
        if not d or isinstance(d, unicode):
            return d
    res = DotDict()
    for k, v in d.items():
        if k == 'joblist':
            v = JobList(Job(e[1], e[0]) for e in v)
        elif isinstance(v, dict):
            v = _urd_typeify(v)
        res[k] = v
    return res
Example #19
def synthesis():
    sum = 0
    jobs = datasets.source.chain(length=options.chain_length,
                                 stop_ds=datasets.stop)
    for src in jobs:
        jid = build('dataset_checksum',
                    options=dict(columns=options.columns, sort=options.sort),
                    datasets=dict(source=src))
        data = blob.load(jobid=jid)
        sum ^= data.sum
    print("Total: %016x" % (sum, ))
    return DotDict(sum=sum,
                   columns=data.columns,
                   sort=options.sort,
                   sources=jobs)
Example #20
def generate(caption,
             method,
             params,
             package=None,
             python=None,
             why_build=False):
    data = DotDict()
    data.caption = caption
    data.method = method
    if package:
        data.package = package
    if python:
        data.python = python
    if why_build:
        data.why_build = why_build
    data.params = params
    return data
Example #21
def generate(caption,
             method,
             params,
             package=None,
             description=None,
             why_build=False):
    data = DotDict()
    data.caption = caption
    data.method = method
    if package:
        data.package = package
    data.versions = DotDict()
    if description:
        data.versions.python_path = description['interpreter']
        data.versions.python = description['interpreter_version']
        if description['accelerator_version']:
            data.versions.accelerator = description['accelerator_version']
    if why_build:
        data.why_build = why_build
    data.params = params
    return data
Example #22
def load_setup(jobid):
    """Loads but does not type setup.json from jobid.
	You probably want to use extras.job_params instead.
	"""
    d = json_load('setup.json', jobid)
    if d['version'] == 1:
        if '_typing' in d:
            d['_typing'] = {d.method: d['_typing']}
        d.params = {
            d.method:
            DotDict({k: d[k]
                     for k in ('options', 'datasets', 'jobids')})
        }
    else:
        raise Exception("Don't know how to load setup.json version %d" %
                        (d['version'], ))
    return d
Example #23
def read_method_conf(filename):
	""" read and parse the methods.conf file """
	db = {}
	with open(filename) as fh:
		for lineno, line in enumerate(fh, 1):
			data = line.split('#')[0].split()
			if not data:
				continue
			method = data.pop(0)
			try:
				version = data.pop(0)
			except IndexError:
				version = 'DEFAULT'
			if data:
				raise Exception('Trailing garbage on %s:%d: %s' % (filename, lineno, line,))
			db[method] = DotDict(version=version)
	return db
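read_method_conf accepts one method per line, an optional version as the second word, and '#' comments; any further tokens on a line are an error. Below is a small, hypothetical methods.conf exercised through the function above (the file contents and method names are assumptions, the parsing rules are the ones in the code).

import os
import tempfile

# Hypothetical methods.conf: one method per line, optional version second,
# '#' starts a comment. Exercise read_method_conf (defined above) on it.
conf_text = (
    "# comments and blank lines are ignored\n"
    "csvimport\n"
    "dataset_type py3   # explicit version\n"
)
with tempfile.NamedTemporaryFile('w', suffix='.conf', delete=False) as fh:
    fh.write(conf_text)
db = read_method_conf(fh.name)
assert db['csvimport'].version == 'DEFAULT'
assert db['dataset_type'].version == 'py3'
os.unlink(fh.name)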
Example #24
    def run_job(self,
                jobid,
                subjob_cookie=None,
                parent_pid=0,
                concurrency=None):
        W = self.workspaces[Job(jobid).workdir]
        #
        active_workdirs = {
            name: ws.path
            for name, ws in self.workspaces.items()
        }
        slices = self.workspaces[self.target_workdir].slices

        t0 = time.time()
        setup = update_setup(jobid, starttime=t0)
        prof = setup.get('exectime', DotDict())
        new_prof, files, subjobs = dispatch.launch(W.path, setup, self.config,
                                                   self.Methods,
                                                   active_workdirs, slices,
                                                   concurrency, self.debug,
                                                   self.server_url,
                                                   subjob_cookie, parent_pid)
        prefix = join(W.path, jobid) + '/'
        if not self.debug:
            for filename, temp in list(files.items()):
                if temp:
                    unlink(join(prefix, filename))
                    del files[filename]
        prof.update(new_prof)
        prof.total = 0
        prof.total = sum(v for v in prof.values()
                         if isinstance(v, (float, int)))
        if concurrency:
            prof.concurrency = concurrency
        data = dict(
            starttime=t0,
            endtime=time.time(),
            exectime=prof,
        )
        update_setup(jobid, **data)
        data['files'] = sorted(
            fn[len(prefix):] if fn.startswith(prefix) else fn for fn in files)
        data['subjobs'] = subjobs
        data['version'] = 1
        json_save(data, jobid.filename('post.json'))
Example #25
 def truncate(self, key, timestamp):
     old = self.db[key]
     new = {}
     ghost = {}
     for ts, data in iteritems(old):
         if ts < timestamp:
             new[ts] = data
         else:
             ghost[ts] = data
     self.log('truncate', DotDict(key=key, timestamp=timestamp))
     self.db[key] = new
     ghost_db = self.ghost_db[key]
     for ts, data in iteritems(ghost):
         ghost_db[ts].append(data)
     if ghost:
         deps = self._update_ghosts()
     else:
         deps = 0
     return {'count': len(ghost), 'deps': deps}
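truncate splits one key's timestamp-keyed entries at a cutoff: entries strictly older than the given timestamp stay, the rest are moved to the ghost db. A plain-dict sketch of that split, with made-up keys and values:

# Entries with ts < cutoff are kept, the rest become ghosts (same rule as above).
old = {'2023-01': 'a', '2023-02': 'b', '2023-03': 'c'}
cutoff = '2023-02'
new = {ts: v for ts, v in old.items() if ts < cutoff}      # kept
ghost = {ts: v for ts, v in old.items() if ts >= cutoff}   # moved to ghost_db
assert new == {'2023-01': 'a'}
assert ghost == {'2023-02': 'b', '2023-03': 'c'}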
Example #26
def job_data(known, jid):
	if jid in known:
		data = known[jid]
	else:
		data = DotDict(method='???', totaltime=None, current=None)
		try:
			setup = load_setup(jid)
			data.method = setup.method
			if 'exectime' in setup:
				data.totaltime = setup.exectime.total
		except Exception:
			pass
	if isinstance(data.totaltime, (float, int)):
		data.totaltime = fmttime(data.totaltime)
	if data.totaltime is None:
		data.klass = 'unfinished'
	elif data.current:
		data.klass = 'current'
	else:
		data.klass = 'old'
	return data
Example #27
def prepare(params):
    assert params.slices >= 2, "Hashing won't do anything with just one slice"
    dws = DotDict()
    for name, hashlabel in (
        ("unhashed_manual", None),        # manually interleaved
        ("unhashed_split", None),         # split_write interleaved
        ("up_checked", "up"),             # hashed on up using dw.hashcheck
        ("up_split", "up"),               # hashed on up using split_write
        ("down_checked", "down"),         # hashed on down using dw.hashcheck
        ("down_discarded", "down"),       # hashed on down using discarding writes
        ("down_discarded_list", "down"),  # hashed on down using discarding list writes
        ("down_discarded_dict", "down"),  # hashed on down using discarding dict writes
    ):
        dw = DatasetWriter(name=name, hashlabel=hashlabel)
        dw.add("up", "int32")
        dw.add("down", "int32")
        dws[name] = dw
    return dws
Example #28
    def run_job(self, jobid, subjob_cookie=None, parent_pid=0):
        W = self.workspaces[Job(jobid).workdir]
        #
        active_workdirs = {
            name: ws.path
            for name, ws in self.workspaces.items()
        }
        slices = self.workspaces[self.target_workdir].slices

        t0 = time.time()
        setup = update_setup(jobid, starttime=t0)
        prof = setup.get('profile', DotDict())
        new_prof, files, subjobs = dispatch.launch(W.path, setup, self.config,
                                                   self.Methods,
                                                   active_workdirs, slices,
                                                   self.debug, self.daemon_url,
                                                   subjob_cookie, parent_pid)
        if self.debug:
            delete_from = Temp.TEMP
        else:
            delete_from = Temp.DEBUG
        for filename, temp in list(files.items()):
            if temp >= delete_from:
                unlink(join(W.path, jobid, filename))
                del files[filename]
        prof.update(new_prof)
        prof.total = 0
        prof.total = sum(v for v in prof.values()
                         if isinstance(v, (float, int)))
        data = dict(
            starttime=t0,
            endtime=time.time(),
            profile=prof,
        )
        update_setup(jobid, **data)
        data['files'] = files
        data['subjobs'] = subjobs
        json_save(data, jobid.filename('post.json'))
Example #29
def load_setup(jobid):
    """Loads but does not type setup.json from jobid.
	You probably want to use extras.job_params instead.
	"""
    d = json_load('setup.json', jobid)
    version = d.version
    if version == 1:
        d.jobs = d.pop('jobids')
        version = 2
    if version == 2:
        if 'exectime' not in d and 'profile' in d:
            d.exectime = d.pop('profile')
        d.versions = DotDict()
        python_path = d.pop('python', None)
        if python_path:
            d.versions.python_path = python_path
        version = 3
    if version != 3:
        raise Exception(
            "Don't know how to load setup.json version %d (in %s)" % (
                d.version,
                jobid,
            ))
    return d
def test(src_ds, opts, expect_lines):
    opts = DotDict(opts)

    def rename(colname):
        return opts.get('rename', {}).get(colname, colname)

    cols = set(opts.column2type)
    opts.discard_untyped = True
    msg = 'Testing with types %s' % (', '.join(
        v for k, v in sorted(opts.column2type.items())), )
    expect_hl = None
    if src_ds.hashlabel and opts.column2type.get(src_ds.hashlabel) == 'json':
        # json is not hashable, so we have to override the hashlabel to nothing in this case.
        opts.hashlabel = ''
        msg += ' (clearing hashlabel)'
    elif src_ds.hashlabel:
        expect_hl = rename(src_ds.hashlabel)
        if expect_hl in opts.column2type:
            msg += ' (hashed on %s)' % (opts.column2type[expect_hl], )
        else:
            expect_hl = None
            msg += ' (hashed on <untyped column>)'
    print(msg)
    just_typed = subjobs.build('dataset_type',
                               options=opts,
                               datasets=dict(source=src_ds)).dataset()
    assert just_typed.hashlabel == expect_hl
    assert set(just_typed.columns) == cols
    assert sum(just_typed.lines) == expect_lines
    if rename(src_ds.hashlabel) not in opts.column2type or opts.get(
            'hashlabel') == '':
        assert just_typed.hashlabel is None
    else:
        assert just_typed.hashlabel == rename(src_ds.hashlabel)
    del opts.discard_untyped
    rev_rename = {v: k for k, v in opts.get('rename', {}).items()}
    typeaway = set(src_ds.columns) - set(rev_rename.get(n, n) for n in cols)
    if typeaway:
        # turn columns we don't want to see in the comparison into int32 so they get discarded on rehashing.
        new_src_ds = subjobs.build(
            'dataset_type',
            options=dict(defaults=dict.fromkeys(typeaway, '0'),
                         column2type=dict.fromkeys(typeaway, 'int32_10'),
                         hashlabel=''),
            datasets=dict(source=src_ds)).dataset()
        assert set(new_src_ds.columns) == set(
            src_ds.columns)  # only types differ
        assert new_src_ds.lines == src_ds.lines
        src_ds = new_src_ds
    for hashlabel in cols:
        if opts.column2type[hashlabel] == 'json':
            # not hashable
            continue
        opts['hashlabel'] = hashlabel
        print('%s rehashed on %s' % (
            msg,
            opts.column2type[hashlabel],
        ))
        hashed_by_type = subjobs.build('dataset_type',
                                       options=opts,
                                       datasets=dict(source=src_ds)).dataset()
        assert hashed_by_type.hashlabel == hashlabel
        assert set(hashed_by_type.columns) == cols
        assert sum(hashed_by_type.lines) == expect_lines
        hashed_after = subjobs.build(
            'dataset_rehash',
            options=dict(hashlabel=hashlabel),
            datasets=dict(source=just_typed)).dataset()
        assert hashed_after.hashlabel == hashlabel
        assert set(hashed_after.columns) == cols
        assert sum(hashed_after.lines) == expect_lines
        if src_ds.hashlabel:
            # if src_ds has a hashlabel then just_typed will actually already be hashed, so hashed_after
            # will have been hashed twice and therefore have a different order than hashed_by_type.
            if rename(src_ds.hashlabel) == hashlabel:
                # These should be the same though.
                subjobs.build('test_compare_datasets',
                              datasets=dict(a=hashed_by_type, b=just_typed))
            hashed_by_type = subjobs.build(
                'dataset_sort',
                options=dict(sort_columns=rename('a')),
                datasets=dict(source=hashed_by_type))
            hashed_after = subjobs.build(
                'dataset_sort',
                options=dict(sort_columns=rename('a')),
                datasets=dict(source=hashed_after))
        subjobs.build('test_compare_datasets',
                      datasets=dict(a=hashed_by_type, b=hashed_after))