def launch(workdir, setup, config, Methods, active_workdirs, slices, concurrency, debug, server_url, subjob_cookie, parent_pid):
    """Launch one job as a subprocess (via the method-version-specific runner)
    and wait for it to finish.

    Prints start/completion banners, registers the child pid in the global
    `children` set for the duration of the run, and makes sure the child's
    process group is cleaned up no matter how we leave.

    Returns the profiling/result data from the runner.
    Raises JobError if the job reported a failure status.
    """
    starttime = monotonic()
    jobid = setup.jobid
    method = setup.method
    # Subjobs are printed without the leading indent; top-level jobs get one.
    if subjob_cookie:
        print_prefix = ''
    else:
        print_prefix = '    '
    print('%s| %s [%s] |' % (print_prefix, jobid, method,))
    args = dict(
        workdir=workdir,
        slices=slices,
        concurrency=concurrency,
        jobid=jobid,
        result_directory=config.get('result_directory', ''),
        common_directory=config.get('common_directory', ''),
        input_directory=config.get('input_directory', ''),
        workdirs=active_workdirs,
        server_url=server_url,
        subjob_cookie=subjob_cookie,
        parent_pid=parent_pid,
        debuggable=config.debuggable,
    )
    from accelerator.runner import runners
    runner = runners[Methods.db[method].version]
    child, prof_r = runner.launch_start(args)
    # There's a race where if we get interrupted right after fork this is not recorded
    # (the launched job could continue running)
    try:
        children.add(child)
        status, data = runner.launch_finish(child, prof_r, workdir, jobid, method)
        if status:
            os.killpg(child, SIGTERM)  # give it a chance to exit gracefully
            # The dying process won't have sent an end message, so it has
            # the endwait time until we SIGKILL it.
            print('%s| %s [%s] failed! (%5.1fs) |' % (print_prefix, jobid, method, monotonic() - starttime))
            # There is a race where stuff on the status socket has not arrived when
            # the sending process exits. This is basically benign, but let's give
            # it a chance to arrive to cut down on confusing warnings.
            statmsg_endwait(child, 0.1)
    finally:
        try:
            os.killpg(child, SIGKILL)  # this should normally be a no-op, but in case it left anything.
        except Exception:
            pass
        try:
            children.remove(child)
        except Exception:
            pass
        try:
            # won't block long (we just killed it, plus it had probably already exited)
            runner.launch_waitpid(child)
        except Exception:
            pass
    if status:
        raise JobError(jobid, method, status)
    # FIX: this message was one string literal broken across a physical line
    # in the damaged source; reassembled into a single literal.
    print('%s| %s [%s] completed. (%5.1fs) |' % (print_prefix, jobid, method, monotonic() - starttime))
    return data
def build(method, options=None, datasets=None, jobids=None, name=None, caption=None):
    """Just like urd.build, but for making subjobs.

    Only allowed outside analysis and only while we hold a subjob cookie.
    Returns the jobid of the built (or reused) subjob.
    Raises JobError / DaemonError on failure (re-raised with fresh, shorter
    tracebacks that drop the internal frames).
    """
    global _a
    assert g.running != 'analysis', "Analysis is not allowed to make subjobs"
    assert g.subjob_cookie, "Can't build subjobs: out of cookies"
    # FIX: the {} defaults were shared mutable default arguments (one dict
    # per function object, reused across calls). None sentinels give each
    # call a fresh dict; effective behavior is unchanged.
    options = {} if options is None else options
    datasets = {} if datasets is None else datasets
    jobids = {} if jobids is None else jobids
    if not _a:
        # Lazily create one Automata connection, reused for all subjobs.
        # `jobs` is a module-level object (not visible in this chunk).
        _a = Automata(g.daemon_url, subjob_cookie=g.subjob_cookie)
        _a.update_method_deps()
        _a.record[None] = _a.jobs = jobs

    def run():
        return _a.call_method(method, options=options, datasets=datasets, jobids=jobids, record_as=name, caption=caption)

    try:
        if name or caption:
            msg = 'Building subjob %s' % (name or method,)
            if caption:
                msg += ' "%s"' % (caption,)
            with status(msg):
                jid = run()
        else:
            jid = run()
    except DaemonError as e:
        # Deliberate re-raise: truncates the traceback to this frame.
        raise DaemonError(e.args[0])
    except JobError as e:
        raise JobError(e.jobid, e.method, e.status)
    # Record which of the jobs the server reports were actually (re)built.
    for d in _a.job_retur.jobs.values():
        if d.link not in _record:
            _record[d.link] = bool(d.make)
    return jid
def run(jobidv, tlock):
    # Worker closure: build each requested jobid in order.
    # NOTE(review): all non-parameter names (job_tracking, link2job, error,
    # setup, data, workdir, self, gen_cookie, JLock, DotDict, load_setup)
    # come from the enclosing scope, which is not visible in this chunk.
    for jobid in jobidv:
        passed_cookie = None
        # This is not a race - all higher locks are locked too.
        # NOTE(review): starting from None assumes None is already a key in
        # job_tracking (so the loop body always runs and generates a real
        # cookie) -- confirm against where job_tracking is initialized.
        while passed_cookie in job_tracking:
            passed_cookie = gen_cookie()
        # Per-job concurrency: the server-wide map, overridable per setup.
        concurrency_map = dict(data.concurrency_map)
        concurrency_map.update(setup.get('concurrency_map', ()))
        job_tracking[passed_cookie] = DotDict(
            lock=JLock(),
            last_error=None,
            last_time=0,
            workdir=workdir,
            concurrency_map=concurrency_map,
        )
        try:
            # Explicit concurrency (from setup or a per-method entry) wins;
            # otherwise fall back to the '-default-' map entry.
            explicit_concurrency = setup.get('concurrency') or concurrency_map.get(setup.method)
            concurrency = explicit_concurrency or concurrency_map.get('-default-')
            if concurrency and setup.method == 'csvimport':
                # just to be safe, check the package too
                if load_setup(jobid).package == 'accelerator.standard_methods':
                    # ignore default concurrency, error on explicit.
                    if explicit_concurrency:
                        raise JobError(jobid, 'csvimport', {'server': 'csvimport can not run with reduced concurrency'})
                    concurrency = None
            self.ctrl.run_job(jobid, subjob_cookie=passed_cookie, parent_pid=setup.get('parent_pid', 0), concurrency=concurrency)
            # update database since a new jobid was just created
            job = self.ctrl.add_single_jobid(jobid)
            with tlock:
                link2job[jobid]['make'] = 'DONE'
                link2job[jobid]['total_time'] = job.total
        except JobError as e:
            # Record the failure and stop building further jobids.
            error.append([e.job, e.method, e.status])
            with tlock:
                link2job[jobid]['make'] = 'FAIL'
            return
        finally:
            # Always unregister the cookie, on success, failure or return.
            del job_tracking[passed_cookie]
    # everything was built ok, update symlink
    try:
        dn = self.ctrl.workspaces[workdir].path
        # Create the new symlink under a temp name, then rename over the
        # real one so "-LATEST" is replaced atomically.
        ln = os.path.join(dn, workdir + "-LATEST_")
        try:
            os.unlink(ln)
        except OSError:
            pass
        os.symlink(jobid, ln)
        os.rename(ln, os.path.join(dn, workdir + "-LATEST"))
    except OSError:
        # Best effort only; a broken symlink update should not fail the build.
        traceback.print_exc(file=sys.stderr)