Example #1
	def __enter__(self):
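		# Start a status line for this save; the matching __exit__ (not shown) is expected to clear it.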
		self._status = status('Saving ' + self.filename)
		self._status.__enter__()
		# Python 3 treats 'w' and 'x' as mutually exclusive modes, while the Python 2 path here requires both.
		fh = getattr(self, '_open', open)(self.tmp_filename, 'xb' if PY3 else 'wbx')
		self.close = fh.close
		return fh
Example #2
def sort(columniter):
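    # Returns (row indices in sorted order, trigger column values or None when no trigger_column is set).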
    with status('Determining sort order'):
        info = datasets.source.columns
        special_handling = set()
        for column in options.sort_columns:
            if info[column].type.startswith('float') or info[column].type == 'number':
                # for NaN
                special_handling.add(column)
            if info[column].none_support:
                special_handling.add(column)
        if special_handling:
            # At least one sort column can have unsortable values
            first = True
            iters = []
            for column in options.sort_columns:
                it = columniter(column, status_reporting=first)
                first = False
                if column in special_handling:
                    it = filter_unsortable(column, it)
                iters.append(it)
            if len(iters) == 1:
                # Special case to not make tuples when there is only one column.
                lst = list(iters[0])
            else:
                lst = list(izip(*iters))
        else:
            columns = options.sort_columns
            if len(columns) == 1:
                # Special case to not make tuples when there is only one column.
                columns = columns[0]
            lst = list(columniter(columns))
        if options.trigger_column:
            if len(options.sort_columns) == 1:
                sort_extra = lst
            else:
                with status('Creating trigger list'):
                    ix = options.sort_columns.index(options.trigger_column)
                    sort_extra = [el[ix] for el in lst]
        else:
            sort_extra = None
        reverse = (options.sort_order == 'descending')
        with status('Creating sort list'):
            return sorted(range(len(lst)),
                          key=lst.__getitem__,
                          reverse=reverse), sort_extra
Example #3
def pickle_load(filename='result.pickle', jobid=None, sliceno=None, encoding='bytes'):
	filename = _fn(filename, jobid, sliceno)
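	# _fn resolves the actual path, taking jobid and sliceno into account.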
	with status('Loading ' + filename):
		with open(filename, 'rb') as fh:
			if PY3:
				return pickle.load(fh, encoding=encoding)
			else:
				return pickle.load(fh)
Example #4
def analysis(sliceno, params, prepare_res):
	dw, ds_list, sort_idx = prepare_res
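	# With sort_across_slices the global order was computed in prepare(); otherwise each slice sorts its own rows here.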
	if options.sort_across_slices:
		sort_idx = sort_idx[sliceno]
		columniter = partial(ds_list.iterate, None, copy_mode=True)
	else:
		sort_idx, _ = sort(partial(ds_list.iterate, sliceno))
		columniter = partial(ds_list.iterate, sliceno, copy_mode=True)
	for ix, column in enumerate(datasets.source.columns, 1):
		colstat = '%r (%d/%d)' % (column, ix, len(datasets.source.columns),)
		with status('Reading ' + colstat):
			lst = list(columniter(column))
		with status('Writing ' + colstat):
			w = dw.writers[column].write
			for idx in sort_idx:
				w(lst[idx])
		# Delete the list before making a new one, so we use less memory.
		del lst
Example #5
def build(method,
          options={},
          datasets={},
          jobs={},
          name=None,
          caption=None,
          **kw):
    """Just like urd.build, but for making subjobs"""

    global _a, _bad_kws
    assert g.running != 'analysis', "Analysis is not allowed to make subjobs"
    assert g.subjob_cookie, "Can't build subjobs: out of cookies"
    if not _a:
        _a = Automata(g.server_url, subjob_cookie=g.subjob_cookie)
        _a.update_method_info()
        _a.record[None] = _a.jobs = globals()['jobs']
        _bad_kws = set(getarglist(_a.call_method))
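    # Reject keyword arguments that would collide with call_method's own parameters.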
    bad_kws = _bad_kws & set(kw)
    if bad_kws:
        raise Exception('subjobs.build does not accept these keywords: %r' % (bad_kws,))

    def run():
        return _a.call_method(method,
                              options=options,
                              datasets=datasets,
                              jobs=jobs,
                              record_as=name,
                              caption=caption,
                              **kw)

    try:
        if name or caption:
            msg = 'Building subjob %s' % (name or method, )
            if caption:
                msg += ' "%s"' % (caption, )
            with status(msg):
                jid = run()
        else:
            jid = run()
    except ServerError as e:
        raise ServerError(e.args[0])
    except JobError as e:
        raise JobError(e.job, e.method, e.status)
    for d in _a.job_retur.jobs.values():
        if d.link not in _record:
            _record[d.link] = bool(d.make)
    return jid
Example #6
def prepare(params):
    if options.trigger_column:
        assert options.sort_across_slices, 'trigger_column is meaningless without sort_across_slices'
        assert options.trigger_column in options.sort_columns, 'can only trigger on a column that is sorted on'
    d = datasets.source
    ds_list = d.chain(stop_ds={datasets.previous: 'source'})
    if options.sort_across_slices:
        columniter = partial(Dataset.iterate_list, None, datasets=ds_list)
        sort_idx, sort_extra = sort(columniter)
        total = len(sort_idx)
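        # Split the rows as evenly as possible across the slices.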
        per_slice = [total // params.slices] * params.slices
        extra = total % params.slices
        if extra:
            # spread the leftover rows over pseudo-randomly selected slices
            # (using the start of sort_idx to select slices).
            # this will always select the first slices if data is already sorted
            # but at least it's deterministic.
            selector = sorted(range(min(params.slices, total)),
                              key=sort_idx.__getitem__)
            for sliceno in selector[:extra]:
                per_slice[sliceno] += 1
        # Switch to tracking what line the slices end at
        slice_end = []
        end = 0
        for cnt in per_slice:
            end += cnt
            slice_end.append(end)
        if options.trigger_column:
            # append a sentinel so the trigger value definitely changes at the end, simplifying the loops below
            sort_extra.append(object())
            sort_idx.append(-1)

            # move slice_end counts around to only switch when trigger_column changes
            def fixup_fwd(cnt):
                trigger_v = sort_extra[sort_idx[cnt - 1]]
                while trigger_v == sort_extra[sort_idx[cnt]]:
                    cnt += 1
                return cnt

            def fixup_bck(cnt, min_cnt):
                trigger_v = sort_extra[sort_idx[cnt - 1]]
                while cnt > min_cnt and trigger_v == sort_extra[sort_idx[cnt]]:
                    cnt -= 1
                return cnt

            with status('Adjusting for trigger_column'):
                prev = 0
                for sliceno, cnt in enumerate(slice_end[:-1]):
                    if cnt:
                        cnt = max(cnt, prev)
                        chosen = fwd = fixup_fwd(cnt)
                        bck = fixup_bck(cnt, prev)
                        # This could be smarter
                        if (cnt - bck) <= (fwd - cnt):
                            chosen = bck
                        prev = slice_end[sliceno] = chosen
        # and now switch sort_idx to be per slice
        sort_idx = [
            sort_idx[start:end]
            for start, end in zip([0] + slice_end, slice_end)
        ]
        assert sum(len(part) for part in sort_idx) == total  # all rows used
        if not options.trigger_column:
            assert len(set(len(part) for part in sort_idx)) < 3  # only 1 or 2 lengths possible
    else:
        sort_idx = None
    if options.sort_across_slices:
        hashlabel = None
    else:
        hashlabel = d.hashlabel
    if len(ds_list) == 1:
        filename = d.filename
    else:
        filename = None
    dw = DatasetWriter(
        columns=d.columns,
        caption=params.caption,
        hashlabel=hashlabel,
        filename=filename,
        previous=datasets.previous,
    )
    return dw, ds_list, sort_idx
Example #7
def execute_process(workdir,
                    jobid,
                    slices,
                    concurrency,
                    result_directory,
                    common_directory,
                    input_directory,
                    index=None,
                    workdirs=None,
                    server_url=None,
                    subjob_cookie=None,
                    parent_pid=0):
    WORKDIRS.update(workdirs)
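    # Make the full set of configured workdirs known to this process.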

    g.job = jobid
    setproctitle('launch')
    path = os.path.join(workdir, jobid)
    try:
        os.chdir(path)
    except Exception:
        print("Cannot cd to workdir", path)
        exit(1)

    g.params = params = job_params()
    method_ref = import_module(params.package + '.a_' + params.method)
    g.sliceno = -1

    g.job = CurrentJob(jobid, params, result_directory, input_directory)
    g.slices = slices

    g.options = params.options
    g.datasets = params.datasets
    g.jobs = params.jobs

    method_ref.options = params.options
    method_ref.datasets = params.datasets
    method_ref.jobs = params.jobs

    g.server_url = server_url
    g.running = 'launch'
    statmsg._start('%s %s' % (jobid, params.method), parent_pid)

    def dummy():
        pass

    prepare_func = getattr(method_ref, 'prepare', dummy)
    analysis_func = getattr(method_ref, 'analysis', dummy)
    synthesis_func = getattr(method_ref, 'synthesis', dummy)

    synthesis_needs_analysis = 'analysis_res' in getarglist(synthesis_func)

    fd2pid, names, masters, slaves = iowrapper.setup(
        slices, prepare_func is not dummy, analysis_func is not dummy)

    def switch_output():
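        # Redirect this process's stdout/stderr to the next iowrapper slave fd.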
        fd = slaves.pop()
        os.dup2(fd, 1)
        os.dup2(fd, 2)
        os.close(fd)

    if analysis_func is dummy:
        q = None
    else:
        q = LockFreeQueue()
    iowrapper.run_reader(fd2pid, names, masters, slaves, q=q)
    for fd in masters:
        os.close(fd)

    # A chain must be finished from the back, so sort on that.
    sortnum_cache = {}

    def dw_sortnum(name):
        if name not in sortnum_cache:
            dw = dataset._datasetwriters.get(name)
            if not dw:  # manually .finish()ed
                num = -1
            elif dw.previous and dw.previous.startswith(jobid + '/'):
                pname = dw.previous.split('/')[1]
                num = dw_sortnum(pname) + 1
            else:
                num = 0
            sortnum_cache[name] = num
        return sortnum_cache[name]

    prof = {}
    if prepare_func is dummy:
        prof['prepare'] = 0  # truthish!
    else:
        t = monotonic()
        switch_output()
        g.running = 'prepare'
        g.subjob_cookie = subjob_cookie
        setproctitle(g.running)
        with statmsg.status(g.running):
            g.prepare_res = method_ref.prepare(**args_for(method_ref.prepare))
            to_finish = [
                dw.name for dw in dataset._datasetwriters.values()
                if dw._started
            ]
            if to_finish:
                with statmsg.status("Finishing datasets"):
                    for name in sorted(to_finish, key=dw_sortnum):
                        dataset._datasetwriters[name].finish()
        c_fflush()
        prof['prepare'] = monotonic() - t
    switch_output()
    setproctitle('launch')
    from accelerator.extras import saved_files
    if analysis_func is dummy:
        prof['per_slice'] = []
        prof['analysis'] = 0
    else:
        t = monotonic()
        g.running = 'analysis'
        g.subjob_cookie = None  # subjobs are not allowed from analysis
        with statmsg.status('Waiting for all slices to finish analysis') as update:
            g.update_top_status = update
            prof['per_slice'], files, g.analysis_res = fork_analysis(
                slices, concurrency, analysis_func, args_for(analysis_func),
                synthesis_needs_analysis, slaves, q)
            del g.update_top_status
        prof['analysis'] = monotonic() - t
        saved_files.update(files)
    t = monotonic()
    g.running = 'synthesis'
    g.subjob_cookie = subjob_cookie
    setproctitle(g.running)
    with statmsg.status(g.running):
        synthesis_res = synthesis_func(**args_for(synthesis_func))
        if synthesis_res is not None:
            blob.save(synthesis_res, temp=False)
        if dataset._datasetwriters:
            with statmsg.status("Finishing datasets"):
                for name in sorted(dataset._datasetwriters, key=dw_sortnum):
                    dataset._datasetwriters[name].finish()
    if dataset._datasets_written:
        blob.save(dataset._datasets_written,
                  'DS/LIST',
                  temp=False,
                  _hidden=True)
    c_fflush()
    t = monotonic() - t
    prof['synthesis'] = t

    from accelerator.subjobs import _record
    return None, (prof, saved_files, _record)