Example #1
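A sort helper that determines the row order for the configured sort columns; columns that may contain unsortable values are filtered first, and both the column scan and the final index sort report progress through status(...).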
def sort(columniter):
    with status('Determining sort order'):
        info = datasets.source.columns
        if sum(info[column].type not in nononehandling_types
               for column in options.sort_columns):
            # At least one sort column can have unsortable values
            first = True
            iters = []
            for column in options.sort_columns:
                it = columniter(column, status_reporting=first)
                first = False
                if info[column].type not in nononehandling_types:
                    it = filter_unsortable(column, it)
                iters.append(it)
            if len(iters) == 1:
                # Special case to not make tuples when there is only one column.
                lst = list(iters[0])
            else:
                lst = list(izip(*iters))
        else:
            columns = options.sort_columns
            if len(columns) == 1:
                # Special case to not make tuples when there is only one column.
                columns = columns[0]
            lst = list(columniter(columns))
        reverse = (options.sort_order == 'descending')
        with status('Creating sort list'):
            return sorted(range(len(lst)),
                          key=lst.__getitem__,
                          reverse=reverse)
Example #2
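The __enter__ method of a file-saving context manager: it activates a nested status('Saving ...') and opens the temporary file for exclusive binary writing, returning the file handle.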
def __enter__(self):
    self._status = status('Saving ' + self.filename)
    self._status.__enter__()
    # stupid python3 feels that w and x are exclusive, while python2 requires both.
    fh = getattr(self, '_open', open)(self.tmp_filename,
                                      'xb' if PY3 else 'wbx')
    self.close = fh.close
    return fh
Example #3
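A synthesis step that concatenates the per-slice output files into options.filename, updating status(...) for each slice and unlinking each slice file once it has been copied.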
def synthesis(params):
	if not options.sliced:
		filename = '%d.gz' if options.filename.lower().endswith('.gz') else '%d.csv'
		with open(options.filename, "wb") as outfh:
			for sliceno in range(params.slices):
				with status("Assembling %s (%d/%d)" % (options.filename, sliceno, params.slices)):
					with open(filename % sliceno, "rb") as infh:
						copyfileobj(infh, outfh)
					unlink(filename % sliceno)
Example #4
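pickle_load wraps unpickling in status('Loading ...') and handles the Python 2/3 difference in pickle.load's encoding argument.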
def pickle_load(filename='result.pickle',
                jobid=None,
                sliceno=None,
                encoding='bytes'):
    filename = _fn(filename, jobid, sliceno)
    with status('Loading ' + filename):
        with open(filename, 'rb') as fh:
            if PY3:
                return pickle.load(fh, encoding=encoding)
            else:
                return pickle.load(fh)
Example #5
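A per-slice analysis that rewrites a dataset in sorted order: each column is read into a list and written back following sort_idx, with separate status(...) messages for the read and write phases.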
def analysis(sliceno, params, prepare_res):
    dw, ds_list, sort_idx = prepare_res
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
        sort_idx = sort_idx[sliceno]
    else:
        columniter = partial(Dataset.iterate_list, sliceno, datasets=ds_list)
        sort_idx = sort(columniter)
    for ix, column in enumerate(datasets.source.columns, 1):
        colstat = '%r (%d/%d)' % (
            column,
            ix,
            len(datasets.source.columns),
        )
        with status('Reading ' + colstat):
            lst = list(columniter(column))
        with status('Writing ' + colstat):
            w = dw.writers[column].write
            for idx in sort_idx:
                w(lst[idx])
        # Delete the list before making a new one, so we use less memory.
        del lst
Example #6
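A subjob build helper (its docstring likens it to urd.build); when a name or caption is given, the subjob is built inside a status('Building subjob ...') block.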
def build(method, options={}, datasets={}, jobids={}, name=None, caption=None):
    """Just like urd.build, but for making subjobs"""

    global _a
    assert g.running != 'analysis', "Analysis is not allowed to make subjobs"
    assert g.subjob_cookie, "Can't build subjobs: out of cookies"
    if not _a:
        _a = Automata(g.daemon_url, subjob_cookie=g.subjob_cookie)
        _a.update_method_deps()
        _a.record[None] = _a.jobs = jobs

    def run():
        return _a.call_method(method,
                              options=options,
                              datasets=datasets,
                              jobids=jobids,
                              record_as=name,
                              caption=caption)

    try:
        if name or caption:
            msg = 'Building subjob %s' % (name or method, )
            if caption:
                msg += ' "%s"' % (caption, )
            with status(msg):
                jid = run()
        else:
            jid = run()
    except DaemonError as e:
        raise DaemonError(e.args[0])
    except JobError as e:
        raise JobError(e.jobid, e.method, e.status)
    for d in _a.job_retur.jobs.values():
        if d.link not in _record:
            _record[d.link] = bool(d.make)
    return jid
Example #7
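The Accelerator's job launcher, execute_process: it starts the top-level status tree with status._start(...) and wraps the prepare, analysis, and synthesis phases in status.status(...) blocks.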
def execute_process(workdir, jobid, slices, result_directory, common_directory, source_directory, index=None, workdirs=None, daemon_url=None, subjob_cookie=None, parent_pid=0):
	WORKDIRS.update(workdirs)

	g.job = jobid
	setproctitle('launch')
	path = os.path.join(workdir, jobid)
	try:
		os.chdir(path)
	except Exception:
		print("Cannot cd to workdir", path)
		exit(1)

	g.params = params = job_params()
	method_ref = import_module(params.package+'.a_'+params.method)
	g.sliceno = -1

	g.job = CurrentJob(jobid, params, result_directory, source_directory)
	g.slices = slices

	g.options          = params.options
	g.datasets         = params.datasets
	g.jobids           = params.jobids

	method_ref.options = params.options
	method_ref.datasets= params.datasets
	method_ref.jobids  = params.jobids

	g.daemon_url       = daemon_url
	g.running          = 'launch'
	status._start('%s %s' % (jobid, params.method,), parent_pid)

	def dummy():
		pass

	prepare_func   = getattr(method_ref, 'prepare'  , dummy)
	analysis_func  = getattr(method_ref, 'analysis' , dummy)
	synthesis_func = getattr(method_ref, 'synthesis', dummy)

	synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)

	fd2pid, names, masters, slaves = iowrapper.setup(slices, prepare_func is not dummy, analysis_func is not dummy)
	def switch_output():
		fd = slaves.pop()
		os.dup2(fd, 1)
		os.dup2(fd, 2)
		os.close(fd)
	iowrapper.run_reader(fd2pid, names, masters, slaves)
	for fd in masters:
		os.close(fd)

	# A chain must be finished from the back, so sort on that.
	sortnum_cache = {}
	def dw_sortnum(name):
		if name not in sortnum_cache:
			dw = dataset._datasetwriters[name]
			if dw.previous and dw.previous.startswith(jobid + '/'):
				pname = dw.previous.split('/')[1]
				num = dw_sortnum(pname) + 1
			else:
				num = 0
			sortnum_cache[name] = num
		return sortnum_cache[name]

	prof = {}
	if prepare_func is dummy:
		prof['prepare'] = 0 # truthish!
	else:
		t = time()
		switch_output()
		g.running = 'prepare'
		g.subjob_cookie = subjob_cookie
		setproctitle(g.running)
		with status.status(g.running):
			g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
			to_finish = [dw.name for dw in dataset._datasetwriters.values() if dw._started]
			if to_finish:
				with status.status("Finishing datasets"):
					for name in sorted(to_finish, key=dw_sortnum):
						dataset._datasetwriters[name].finish()
		c_fflush()
		prof['prepare'] = time() - t
	switch_output()
	setproctitle('launch')
	from accelerator.extras import saved_files
	if analysis_func is dummy:
		prof['per_slice'] = []
		prof['analysis'] = 0
	else:
		t = time()
		g.running = 'analysis'
		g.subjob_cookie = None # subjobs are not allowed from analysis
		with status.status('Waiting for all slices to finish analysis') as update:
			g.update_top_status = update
			prof['per_slice'], files, g.analysis_res = fork_analysis(slices, analysis_func, args_for(analysis_func), synthesis_needs_analysis, slaves)
			del g.update_top_status
		prof['analysis'] = time() - t
		saved_files.update(files)
	t = time()
	g.running = 'synthesis'
	g.subjob_cookie = subjob_cookie
	setproctitle(g.running)
	with status.status(g.running):
		synthesis_res = synthesis_func(**args_for(synthesis_func))
		if synthesis_res is not None:
			blob.save(synthesis_res, temp=False)
		if dataset._datasetwriters:
			with status.status("Finishing datasets"):
				for name in sorted(dataset._datasetwriters, key=dw_sortnum):
					dataset._datasetwriters[name].finish()
	if dataset._datasets_written:
		with open('datasets.txt', 'w', encoding='utf-8') as fh:
			for name in dataset._datasets_written:
				fh.write(name)
				fh.write(u'\n')
	c_fflush()
	t = time() - t
	prof['synthesis'] = t

	from accelerator.subjobs import _record
	return None, (prof, saved_files, _record)
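
All of the examples above rely on a status context manager provided by their surrounding codebase (the Accelerator, judging by the imports in examples #6 and #7). To experiment with the same call pattern in isolation, a minimal stand-in could look like the sketch below; this is an illustrative assumption only, not the project's actual implementation.

from contextlib import contextmanager

# Illustrative stand-in only: it mimics the call pattern used above,
# printing the message on entry and a completion note on exit.
@contextmanager
def status(msg):
    print(msg + ' ...')
    try:
        # Yield a callable so "with status(...) as update:" also works,
        # mirroring how example #7 stores the update function.
        yield lambda new_msg: print(new_msg + ' ...')
    finally:
        print(msg + ': done')

# Usage, matching the pattern in the examples:
with status('Loading data'):
    data = list(range(10))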