def _req(route_name, params, op, raw_response):
    """Make request and parse result

    Args:
        route_name (str): string name of route
        params (dict): parameters to apply to route
        op (func): how to request
        raw_response (bool): when true, return the response from `op` unparsed

    Returns:
        object: parsed JSON result (or the raw response if `raw_response`)
    """
    from sirepo import simulation_db

    uri = None
    resp = None
    try:
        uri = _uri(route_name, params)
        resp = op(uri)
        if raw_response:
            return resp
        return simulation_db.json_load(resp.data)
    except Exception as e:
        # pkdexc() only returns the formatted stack; it must be handed to
        # pkdlog or the stack is lost.
        pkdlog('{}: uri={} resp={} stack={}', e, uri, resp, pkdexc())
        raise
async def _fastcgi_read(self, connection):
    """Relay queued messages over `connection` and forward each reply

    Each message taken from `_fastcgi_msg_q` is written as one JSON line;
    the next line read back is passed to `job_cmd_reply`.

    Args:
        connection (object): passed to `tornado.iostream.IOStream`
    """
    stream = None
    msg = None
    try:
        stream = tornado.iostream.IOStream(
            connection,
            max_buffer_size=job.cfg.max_message_bytes,
        )
        while True:
            msg = await self._fastcgi_msg_q.get()
            # Avoid issues with exceptions. We don't use q.join()
            # so not an issue to call before work is done.
            self._fastcgi_msg_q.task_done()
            await stream.write(pkjson.dump_bytes(msg) + b'\n')
            reply = await stream.read_until(b'\n', job.cfg.max_message_bytes)
            await self.job_cmd_reply(msg, job.OP_ANALYSIS, reply)
    except Exception as e:
        pkdlog('msg={} error={} stack={}', msg, e, pkdexc())
        # If self.fastcgi_cmd is None we initiated the kill so not an error
        if self.fastcgi_cmd:
            await self._fastcgi_handle_error(msg, e, pkdexc())
    finally:
        if stream:
            stream.close()
def default_command(in_file):
    """Reads `in_file` and passes the message to the `_do_<jobCmd>` handler

    Must be called in run_dir

    Writes its output on stdout.

    Args:
        in_file (str): json parsed to msg
    Returns:
        str: json output of command, e.g. status msg (None if the
        handler returned None)
    """
    try:
        job.init()
        path = pkio.py_path(in_file)
        msg = pkjson.load_any(path)
        #TODO(e-carlin): find common place to serialize/deserialize paths
        msg.runDir = pkio.py_path(msg.runDir)
        # the input file is consumed once it has been parsed
        path.remove()
        handler = globals()['_do_' + msg.jobCmd]
        res = handler(
            msg,
            sirepo.template.import_module(msg.simulationType),
        )
        if res is None:
            return
        reply = PKDict(res).pksetdefault(state=job.COMPLETED)
    except Exception as e:
        if isinstance(e, sirepo.util.UserAlert):
            err = e.sr_args.error
        else:
            err = str(e)
        reply = PKDict(state=job.ERROR, error=err, stack=pkdexc())
    return pkjson.dump_pretty(reply, pretty=False)
def _on_do_compute_exit(success_exit, is_parallel, template, run_dir):
    """Build the status reply after a compute job exits

    Args:
        success_exit (bool): true selects the COMPLETED reply
        is_parallel (object): only forwarded to post_execution_processing
        template (module): may define `post_execution_processing`
        run_dir (py.path): where the run log is looked up on failure
    Returns:
        PKDict: state plus `alert` (success) or `error` (failure)
    """
    # locals() must be called before anything else so we only get the function
    # arguments
    kwargs = locals()

    def _post_processing():
        if not hasattr(template, 'post_execution_processing'):
            return None
        return template.post_execution_processing(**kwargs)

    def _success_exit():
        return PKDict(state=job.COMPLETED, alert=_post_processing())

    def _failure_exit():
        msg = _post_processing()
        if not msg:
            log = run_dir.join(template_common.RUN_LOG)
            if log.exists():
                msg = _parse_python_errors(pkio.read_text(log))
        # last resort when neither the template nor the log explains it
        return PKDict(state=job.ERROR, error=msg or 'non-zero exit code')

    try:
        if success_exit:
            return _success_exit()
        return _failure_exit()
    except Exception as e:
        return PKDict(state=sirepo.job.ERROR, error=e, stack=pkdexc())
def api_runCancel():
    """Forward the cancel request; report canceled even if that fails"""
    try:
        reply = _request()
    except Exception as e:
        pkdlog('ignoring exception={} stack={}', e, pkdexc())
        # Always true from the client's perspective
        reply = sirepo.http_reply.gen_json({'state': 'canceled'})
    return reply
def fixup_old_data(data):
    """Upgrade data to latest schema and updates version.

    Args:
        data (dict): to be updated (destructively)

    Returns:
        dict: upgraded `data`
        bool: True if data changed
    """
    # model name whose presence identifies the simulation type
    type_by_model = (
        ('sourceIntensityReport', 'srw'),
        ('fieldAnimation', 'warp'),
        ('bunchSource', 'elegant'),
    )
    try:
        if 'version' in data and data['version'] == SCHEMA_COMMON['version']:
            return data, False
        data['version'] = SCHEMA_COMMON['version']
        if 'simulationType' not in data:
            for model, sim_type in type_by_model:
                if model in data['models']:
                    data['simulationType'] = sim_type
                    break
            else:
                pkdlog('simulationType: not found; data={}', data)
                raise AssertionError('must have simulationType')
        sim = data['models']['simulation']
        if 'simulationSerial' not in sim:
            sim['simulationSerial'] = 0
        sirepo.template.import_module(data['simulationType']).fixup_old_data(data)
        try:
            del data['models']['simulationStatus']
        except KeyError:
            pass
        return data, True
    except Exception:
        pkdlog('{}: error: {}', data, pkdexc())
        raise
def _end_job(self, job):
    """Free the slot associated with the job

    A job that is still queued holds no slot and is only dequeued.
    Otherwise the running slot registered under `job.jid` is released
    if it still refers to this job.

    POSIT: job is locked
    """
    freed = None
    with self.__lock:
        try:
            self.__queued_jobs.remove(job)
        except ValueError:
            pass
        else:
            # No slot, just done
            return
        try:
            running = self.__running_slots[job.jid]
            if running.job == job:
                freed = running
                running.job = None
                del self.__running_slots[job.jid]
        except KeyError as e:
            pkdlog(
                '{}: PROGRAM ERROR: not in running, ignoring job: {}\n{}',
                job.jid,
                e,
                pkdexc(),
            )
        if freed:
            self.__available_slots.append(freed)
            self.__event.set()
async def _run(self, req, op):
    """Apply replies from `op` to `self.db` until an exit state arrives

    Nothing is processed when the request's computeJobHash differs from
    the one in `self.db`. `op` is always destroyed on the way out.
    """
    try:
        if self.db.computeJobHash != req.content.computeJobHash:
            pkdlog(
                'invalid computeJobHash self={} req={}',
                self.db.computeJobHash,
                req.content.computeJobHash,
            )
            return
        try:
            while True:
                reply = await op.reply_ready()
                if reply.state == job.CANCELED:
                    break
                self.db.status = reply.state
                if self.db.status == job.ERROR:
                    self.db.error = reply.get('error', '<unknown error>')
                if 'computeJobStart' in reply:
                    self.db.computeJobStart = reply.computeJobStart
                if 'parallelStatus' not in reply:
                    # sequential jobs don't send this
                    self.db.lastUpdateTime = int(time.time())
                else:
                    self.db.parallelStatus.update(reply.parallelStatus)
                    self.db.lastUpdateTime = reply.parallelStatus.lastUpdateTime
                #TODO(robnagler) will need final frame count
                self.__db_write()
                if reply.state in job.EXIT_STATUSES:
                    break
        except Exception as e:
            pkdlog('error={} stack={}', e, pkdexc())
            self.db.status = job.ERROR
            self.db.error = e
    finally:
        self.destroy_op(op)
async def loop(self):
    """Keep a websocket to the supervisor open and process its messages

    Runs forever: connect, send OP_ALIVE, then alternate between reading
    a message and sending whatever `_op` returns for it. On any error,
    log, sleep `_RETRY_SECS`, and reconnect.
    """
    while True:
        self._websocket = None
        try:
            #TODO(robnagler) connect_timeout, max_message_size, ping_interval, ping_timeout
            request = tornado.httpclient.HTTPRequest(
                url=cfg.supervisor_uri,
                validate_cert=sirepo.job.cfg.verify_tls,
            )
            self._websocket = await tornado.websocket.websocket_connect(request)
            msg = self.format_op(None, job.OP_ALIVE)
            # a failed send ends this connection; an empty msg sends nothing
            while not msg or await self.send(msg):
                msg = await self._websocket.read_message()
                if msg is None:
                    raise ValueError('response from supervisor was None')
                msg = await self._op(msg)
        except Exception as e:
            pkdlog('error={} stack={}', e, pkdexc())
            # TODO(e-carlin): exponential backoff?
            await tornado.gen.sleep(_RETRY_SECS)
        finally:
            if self._websocket:
                self._websocket.close()
async def _agent_start(self, op):
    """Start the agent unless a start is already pending or it is connected

    Registers a timeout callback before awaiting `kill` and
    `_do_agent_start`. On any exception, resources are freed and the
    exception is re-raised.

    Args:
        op (object): operation that triggered the start; in internal test
            channels its msg may carry an `agent_start_delay=N` setting
    """
    if self._agent_starting_timeout:
        return
    async with self._agent_start_lock:
        # POSIT: we do not have to raise Awaited(), because
        # this is the first thing an op waits on.
        if self._agent_starting_timeout or self._websocket_ready.is_set():
            # another op started (or finished starting) while we waited
            return
        try:
            t = self.cfg.agent_starting_secs
            if pkconfig.channel_in_internal_test():
                # test hook: extend the timeout by a delay embedded in the msg
                x = op.msg.pkunchecked_nested_get(
                    'data.models.dog.favoriteTreat')
                if x:
                    x = re.search(r'agent_start_delay=(\d+)', x)
                    if x:
                        self._agent_start_delay = int(x.group(1))
                        t += self._agent_start_delay
                        pkdlog('op={} agent_start_delay={}', op, self._agent_start_delay)
            pkdlog('{} {} await _do_agent_start', self, op)
            # All awaits must be after this. If a call hangs the timeout
            # handler will cancel this task
            self._agent_starting_timeout = tornado.ioloop.IOLoop.current(
            ).call_later(
                t,
                self._agent_starting_timeout_handler,
            )
            # POSIT: Canceled errors aren't smothered by any of the below calls
            await self.kill()
            await self._do_agent_start(op)
        except Exception as e:
            pkdlog('{} error={} stack={}', self, e, pkdexc())
            self.free_resources(internal_error='failure starting agent')
            raise
def _gen_exception_reply_SRException(args):
    """Reply to an SRException with either a JSON state or a redirect

    Args:
        args (object): supplies `routeName` and `params`
    Returns:
        object: result of `gen_json` (POST without reload) or of
        `gen_redirect_for_local_route`
    """
    r = args.routeName
    p = args.params or PKDict()
    try:
        t = sirepo.http_request.sim_type(p.pkdel('sim_type'))
        s = simulation_db.get_schema(sim_type=t)
    except Exception as e:
        pkdc('exception={} stack={}', e, pkdexc())
        # sim_type is bad so don't cascade errors, just
        # try to get the schema without the type
        t = None
        s = simulation_db.get_schema(sim_type=None)
    # If default route or always redirect/reload
    if r:
        assert r in s.localRoutes, \
            'route={} not found in schema for type={}'.format(r, t)
    else:
        # no route given: fall back to the default route and force a reload
        r = sirepo.uri.default_local_route_name(s)
        p = PKDict(reload_js=True)
    if (
        # must be first, to always delete reload_js
        not p.pkdel('reload_js')
        and flask.request.method == 'POST'
        and r not in _RELOAD_JS_ROUTES):
        pkdc('POST response={} route={} params={}', SR_EXCEPTION_STATE, r, p)
        return gen_json(
            PKDict({
                _STATE: SR_EXCEPTION_STATE,
                SR_EXCEPTION_STATE: args,
            }),
        )
    pkdc('redirect to route={} params={} type={}', r, p, t)
    return gen_redirect_for_local_route(t, route=r, params=p)
def restrict_op_to_first_rank(op):
    """If the process has rank FIRST_RANK, call a function. Otherwise do nothing.

    Use this to call a function that will cause conflicts if called by
    multiple processes, such as writing results to a file. When an MPI
    communicator is available the result is broadcast from FIRST_RANK.

    Args:
        op (function): function to call
    Returns:
        object: result of `op` (as broadcast, when MPI is in use)
    """
    comm = None
    rank = FIRST_RANK
    result = None
    try:
        import mpi4py.MPI

        comm = mpi4py.MPI.COMM_WORLD
        if comm.Get_size() > 1:
            rank = comm.Get_rank()
    except Exception:
        # no usable MPI: behave as the single, first rank
        pass
    if rank == FIRST_RANK:
        try:
            result = op()
        except Exception as e:
            pkdlog('op={} exception={} stack={}', op, e, pkdexc())
            if comm:
                comm.Abort(1)
            raise e
    if not comm:
        return result
    return comm.bcast(result, root=FIRST_RANK)
async def purge_free_simulations(cls):
    """Purge run dirs of old simulations of non-paid users, then reschedule

    Does nothing (and does not reschedule) when
    `cfg.purge_non_premium_task_secs` is not set. Errors are logged, not
    raised.
    """
    def _get_uids_and_files():
        # Yields (uid, [db files]) for candidate files, grouped by uid
        r = []
        u = None
        p = sirepo.auth_db.UserRole.uids_of_paid_users()
        for f in pkio.sorted_glob(
            _DB_DIR.join('*{}'.format(
                sirepo.simulation_db.JSON_SUFFIX,
            ))):
            n = sirepo.sim_data.split_jid(jid=f.purebasename).uid
            # skip paid users, recent files, and jids already handled
            if n in p or f.mtime() > _too_old \
                or f.purebasename in cls._purged_jids_cache:
                continue
            if u != n:
                # POSIT: Uid is the first part of each db file. The files are
                # sorted so this should yield all of a user's files
                if r:
                    yield u, r
                u = n
                r = []
            r.append(f)
        if r:
            yield u, r

    def _purge_sim(jid):
        d = cls.__db_load(jid)
        # OPTIMIZATION: We assume the uids_of_paid_users doesn't change very
        # frequently so we don't need to check again. A user could run a sim
        # at anytime so we need to check that they haven't
        if d.lastUpdateTime > _too_old:
            return
        cls._purged_jids_cache.add(jid)
        if d.status == job.JOB_RUN_PURGED:
            return
        p = sirepo.simulation_db.simulation_run_dir(d)
        pkio.unchecked_remove(p)
        n = cls.__db_init_new(d, d)
        n.status = job.JOB_RUN_PURGED
        cls.__db_write_file(n)

    if not cfg.purge_non_premium_task_secs:
        return
    # NOTE(review): `s` is not read again in this function
    s = sirepo.srtime.utc_now()
    u = None
    f = None
    try:
        # read by both nested functions through the closure
        _too_old = (sirepo.srtime.utc_now_as_int()
                    - cfg.purge_non_premium_after_secs)
        with sirepo.auth_db.session():
            for u, v in _get_uids_and_files():
                with sirepo.auth.set_user_outside_of_http_request(u):
                    for f in v:
                        _purge_sim(jid=f.purebasename)
                await tornado.gen.sleep(0)
    except Exception as e:
        pkdlog('u={} f={} error={} stack={}', u, f, e, pkdexc())
    finally:
        tornado.ioloop.IOLoop.current().call_later(
            cfg.purge_non_premium_task_secs,
            cls.purge_free_simulations,
        )
def _do_download_data_file(msg, template):
    """Get a data file from the template and PUT it to `msg.dataFileUri`

    Returns:
        PKDict: empty on success, else state=ERROR with error and stack
    """
    try:
        res = template.get_data_file(
            msg.runDir,
            msg.analysisModel,
            msg.frame,
            options=PKDict(suffix=msg.suffix),
        )
        if not isinstance(res, PKDict):
            # a bare string is a file name relative to the run dir
            if isinstance(res, str):
                res = msg.runDir.join(res, abs=1)
            res = PKDict(filename=res)
        uri = res.get('uri')
        if uri is None:
            uri = res.filename.basename
        content = res.get('content')
        if content is None:
            if uri.endswith(('py', 'txt', 'csv')):
                content = pkcompat.to_bytes(pkio.read_text(res.filename))
            else:
                content = res.filename.read_binary()
        resp = requests.put(
            msg.dataFileUri + uri,
            data=content,
            verify=job.cfg.verify_tls,
        )
        resp.raise_for_status()
        return PKDict()
    except Exception as e:
        return PKDict(state=job.ERROR, error=e, stack=pkdexc())
def test_importer(import_req):
    """Import every `*.dat` file in the data dir and compare with `<name>.txt`

    When the import raises, the error text is what gets compared.

    Args:
        import_req (function): called with each data file; its result is
            passed to `zgoubi.import_file`
    """
    from pykern import pkcollections
    from pykern import pkjson
    from pykern.pkunit import pkeq
    from sirepo.template import zgoubi
    import sirepo.sim_data

    with pkunit.save_chdir_work() as w:
        for fn in pkio.sorted_glob(pkunit.data_dir().join('*.dat')):
            error = None
            try:
                data = zgoubi.import_file(import_req(fn), unit_test_mode=True)
                sirepo.sim_data.get_class('zgoubi').fixup_old_data(data)
                #TODO(pjm): easier way to convert nested dict to pkcollections.Dict?
                data = pkcollections.json_load_any(pkjson.dump_pretty(data))
            except Exception as e:
                pkdlog(pkdexc())
                # Python 3 exceptions have no `message`; fall back to str(e)
                # so the handler itself cannot raise AttributeError
                error = e.message if hasattr(e, 'message') else str(e)
            if error:
                actual = error
            else:
                actual = zgoubi.python_source_for_model(data)
            outfile = fn.basename + '.txt'
            pkio.write_text(outfile, actual)
            e = pkunit.data_dir().join(outfile)
            expect = pkio.read_text(e)
            pkeq(expect, actual, 'diff {} {}', e, w.join(outfile))
def test_import():
    """Parse each Config / flash.par data file and compare with `<name>.out`

    A parser exception is compared by its string value.
    """
    from pykern import pkjson
    from pykern.pkunit import pkeq
    from sirepo.template import flash_parser
    import re

    def _parse_config(fn):
        return flash_parser.ConfigParser().parse(pkio.read_text(fn))

    def _parse_par(fn):
        data_file = fn.basename.replace('-flash.par', '')
        sim_data = pkjson.load_any(
            pkio.read_text(
                pkunit.data_dir().join(f'{data_file}-sirepo-data.json'),
            ),
        )
        return flash_parser.ParameterParser().parse(sim_data, pkio.read_text(fn))

    # first matching pattern selects the parser
    parsers = (
        (r'-Config$', _parse_config),
        (r'flash.par$', _parse_par),
    )
    with pkunit.save_chdir_work():
        for fn in pkio.sorted_glob(pkunit.data_dir().join('*')):
            parser = next(
                (p for pat, p in parsers if re.search(pat, fn.basename)),
                None,
            )
            if parser is None:
                continue
            try:
                actual = pkjson.dump_pretty(parser(fn))
            except Exception as e:
                pkdlog(pkdexc())
                actual = str(e)
            outfile = f'{fn.basename}.out'
            pkio.write_text(outfile, actual)
            expect = pkio.read_text(pkunit.data_dir().join(outfile))
            pkeq(expect, actual)
def api_runCancel():
    """Cancel the job for the posted simulation; always replies canceled

    Any exception is logged and ignored.
    """
    jid = None
    try:
        req = http_request.parse_post(id=True, model=True, check_sim_exists=True)
        jid = req.sim_data.parse_jid(req.req_data)
        # TODO(robnagler) need to have a way of listing jobs
        # Don't bother with cache_hit check. We don't have any way of canceling
        # if the parameters don't match so for now, always kill.
        #TODO(robnagler) mutex required
        if runner.job_is_processing(jid):
            run_dir = simulation_db.simulation_run_dir(req.req_data)
            # Write first, since results are write once, and we want to
            # indicate the cancel instead of the termination error that
            # will happen as a result of the kill.
            try:
                simulation_db.write_result({'state': 'canceled'}, run_dir=run_dir)
            except Exception as write_err:
                # not found means run_dir may have been deleted
                if not pykern.pkio.exception_is_not_found(write_err):
                    raise
            runner.job_kill(jid)
            # TODO(robnagler) should really be inside the template (t.cancel_simulation()?)
            # the last frame file may not be finished, remove it
            template = sirepo.template.import_module(req.req_data)
            if hasattr(template, 'remove_last_frame'):
                template.remove_last_frame(run_dir)
    except Exception as e:
        pkdlog('ignoring exception={} jid={} stack={}', e, jid, pkdexc())
    # Always true from the client's perspective
    return http_reply.gen_json({'state': 'canceled'})
async def loop(self):
    """Keep a websocket to the supervisor open and process its messages

    Runs forever: connect, send OP_ALIVE, then alternate between reading
    a message and sending whatever `_op` returns for it. On any error,
    log, sleep `_RETRY_SECS`, and reconnect.
    """
    while True:
        self._websocket = None
        try:
            #TODO(robnagler) connect_timeout, ping_interval, ping_timeout
            request = tornado.httpclient.HTTPRequest(
                url=cfg.supervisor_uri,
                validate_cert=sirepo.job.cfg.verify_tls,
            )
            self._websocket = await tornado.websocket.websocket_connect(
                request,
                max_message_size=job.cfg.max_message_bytes,
                ping_interval=job.cfg.ping_interval_secs,
                ping_timeout=job.cfg.ping_timeout_secs,
            )
            outgoing = self.format_op(None, job.OP_ALIVE)
            # a failed send ends this connection; an empty message sends nothing
            while not outgoing or await self.send(outgoing):
                incoming = await self._websocket.read_message()
                if incoming is None:
                    pkdlog(
                        'websocket closed in response to len={} send={}',
                        outgoing and len(outgoing),
                        outgoing,
                    )
                    raise tornado.iostream.StreamClosedError()
                outgoing = await self._op(incoming)
        except Exception as e:
            pkdlog('error={} stack={}', e, pkdexc())
            # TODO(e-carlin): exponential backoff?
            await tornado.gen.sleep(_RETRY_SECS)
        finally:
            if self._websocket:
                self._websocket.close()
def api_importFile(simulation_type):
    """Create a new simulation from an uploaded file

    json and zip uploads go through `sirepo.importer`; anything else is
    handed to the template's `import_file`, if it has one.

    Args:
        simulation_type (str): which simulation type
    Params:
        file: file data
        folder: where to import to
    Returns:
        object: reply for the saved simulation, or a JSON reply with `error`
    """
    import sirepo.importer

    error = None
    f = None
    try:
        f = flask.request.files.get('file')
        if not f:
            raise sirepo.util.Error('must supply a file')
        req = http_request.parse_params(
            filename=f.filename,
            folder=flask.request.form.get('folder'),
            id=flask.request.form.get('simulationId'),
            template=True,
            type=simulation_type,
        )
        req.file_stream = f.stream
        req.import_file_arguments = flask.request.form.get('arguments', '')

        def s(data):
            # imported simulations land in the requested folder and are
            # never marked as examples
            data.models.simulation.folder = req.folder
            data.models.simulation.isExample = False
            return _save_new_and_reply(data)

        if pkio.has_file_extension(req.filename, 'json'):
            data = sirepo.importer.read_json(req.file_stream.read(), req.type)
        #TODO(pjm): need a separate URI interface to importer, added exception for rs4pi for now
        # (dicom input is normally a zip file)
        elif pkio.has_file_extension(req.filename, 'zip') and req.type != 'rs4pi':
            data = sirepo.importer.read_zip(req.file_stream, sim_type=req.type)
        else:
            if not hasattr(req.template, 'import_file'):
                raise sirepo.util.Error('Only zip files are supported')
            with simulation_db.tmp_dir() as d:
                data = req.template.import_file(req, tmp_dir=d, reply_op=s)
            if 'error' in data:
                return http_reply.gen_json(data)
        return s(data)
    except werkzeug.exceptions.HTTPException:
        # HTTP errors and Reply exceptions pass through untouched
        raise
    except sirepo.util.Reply:
        raise
    except Exception as e:
        pkdlog('{}: exception: {}', f and f.filename, pkdexc())
        #TODO(robnagler) security issue here. Really don't want to report errors to user
        error = str(e.args) if hasattr(e, 'args') else str(e)
    return http_reply.gen_json({
        'error': error if error else 'An unknown error occurred',
    })
def _catch_and_log_errors(exc_type, msg, *args, **kwargs):
    """Generator body that logs and suppresses `exc_type` raised at the yield

    Presumably wrapped as a context manager by a decorator outside this
    view -- TODO confirm.

    Args:
        exc_type (type or tuple): exceptions to log and swallow
        msg (str): pkdlog format string
        args, kwargs: passed to pkdlog with `msg`
    """
    try:
        yield
    except trio.MultiError:
        # MultiErrors are not unpacked here; fail loudly instead
        raise AssertionError('handle MultiErrors in _catch_and_log_errors')
    except exc_type:
        pkdlog(msg, *args, **kwargs)
        pkdlog(pkdexc())
def _run_shadow():
    """Run shadow program in an isolated namespace

    The script is executed with a fresh dict as both globals and locals.
    Names written into `locals()` by `exec` are not visible to a later
    plain name lookup on Python 3, so `beam` is read back from the dict.

    Returns:
        object: the `beam` defined by the script
    Raises:
        NameError: if the script did not define `beam`
    """
    namespace = {}
    try:
        exec(_script(), namespace, namespace)
    except Exception:
        pkdlog('script={} error={}', _script(), pkdexc())
    if 'beam' not in namespace:
        raise NameError("name 'beam' is not defined")
    return namespace['beam']
def on_close(self):
    """Detach `sr_driver` (if any) and tell it the websocket closed

    Exceptions are logged and not propagated.
    """
    try:
        driver = getattr(self, 'sr_driver', None)
        if not driver:
            return
        # drop the reference before notifying the driver
        del self.sr_driver
        driver.websocket_on_close()
    except Exception as e:
        pkdlog('error={} {}', e, pkdexc())
def start_sbatch():
    """Kill any agent recorded in the pid file, record this one, and start

    The pid file is removed when `start` returns or raises.
    """
    def get_host():
        name = socket.gethostname()
        return name if '.' in name else socket.getfqdn()

    def kill_agent(pid_file):
        if get_host() == pid_file.host:
            os.kill(pid_file.pid, signal.SIGKILL)
            return
        # agent is on another host: kill it over ssh
        cmd = ('ssh', pid_file.host, 'kill', '-KILL', str(pid_file.pid))
        try:
            subprocess.run(
                cmd,
                capture_output=True,
                text=True,
            ).check_returncode()
        except subprocess.CalledProcessError as e:
            gone = '({}) - No such process'.format(pid_file.pid)
            if gone not in e.stderr:
                pkdlog('cmd={cmd} returncode={returncode} stderr={stderr}', **vars(e))

    previous = None
    try:
        previous = pkjson.load_any(pkio.py_path(_PID_FILE))
    except Exception as e:
        if not pkio.exception_is_not_found(e):
            pkdlog('error={} stack={}', e, pkdexc())
    try:
        if previous:
            kill_agent(previous)
    except Exception as e:
        pkdlog('error={} stack={}', e, pkdexc())
    pkjson.dump_pretty(
        PKDict(host=get_host(), pid=os.getpid()),
        _PID_FILE,
    )
    try:
        start()
    finally:
        #TODO(robnagler) https://github.com/radiasoft/sirepo/issues/2195
        pkio.unchecked_remove(_PID_FILE)
def import_python(code, tmp_dir, user_filename=None, arguments=None):
    """Converts script_text into json and stores as new simulation.

    Avoids too much data back to the user in the event of an error.
    This could be a potential security issue, because the script
    could be used to probe the system.

    Args:
        code (str): Python code that runs SRW
        tmp_dir (py.path): directory the script is written to and parsed in
        user_filename (str): uploaded file name for log
        arguments (str): argv to be passed to script

    Returns:
        dict: simulation data

    Raises:
        ValueError: any failure, with a message truncated to 50 characters
    """
    script = None

    # Patch for the mirror profile for the exported .py file from Sirepo:
    code = _patch_mirror_profile(code)

    try:
        with pkio.save_chdir(tmp_dir):
            # This string won't show up anywhere
            script = pkio.write_text(
                'in.py',
                re.sub(r'^main\(', '#', code, flags=re.MULTILINE),
            )
            parser = SRWParser(
                script,
                user_filename=user_filename,
                arguments=arguments,
            )
            return parser.data
    except Exception as e:
        lineno = script and _find_line_in_trace(script)
        if not hasattr(e, 'args'):
            text = str(e)
        elif len(e.args) == 1:
            text = str(e.args[0])
        elif e.args:
            text = str(e.args)
        else:
            text = e.__class__.__name__
        pkdlog(
            'Error: {}; exception={}; script={}; filename={}; stack:\n{}',
            text,
            e.__class__.__name__,
            script,
            user_filename,
            pkdexc(),
        )
        # only a short prefix goes back to the user
        text = text[:50]
        if lineno:
            reason = 'Error on line {}: {}'.format(lineno, text)
        else:
            reason = 'Error: {}'.format(text)
        raise ValueError(reason)
def run(self):
    """Start jobs if slots available else check for available

    Loops forever: waits on the event (or the poll timeout), then pairs
    queued jobs with available slots until one of the two runs out. If no
    job was started in a pass, running jobs are polled.
    """
    pkdlog(
        '{}: {} available={}',
        self.name,
        self.__kind,
        len(self.__available_slots),
    )
    while True:
        self.__event.wait(_SLOT_MANAGER_POLL_SECS)
        got_one = False
        while True:
            with self.__lock:
                self.__event.clear()
                if not (self.__queued_jobs and self.__available_slots):
                    if self.__queued_jobs:
                        # jobs are queued but every slot is busy
                        pkdlog(
                            'waiting: queue={} available={}',
                            [x.jid for x in self.__queued_jobs],
                            [str(x) for x in self.__available_slots],
                        )
                    break
                j = self.__queued_jobs.pop(0)
                s = self.__available_slots.pop(0)
                s.job = j
                self.__running_slots[j.jid] = s
            # have to release slot lock before locking job
            try:
                with j.lock:
                    if j._is_state_ok_to_start():
                        j._slot_start(s)
                        got_one = True
            except Exception as e:
                j._error_during_start(e, pkdexc())
                try:
                    j.kill()
                except Exception as e:
                    pkdlog(
                        '{}: error during cleanup after error: {}\n{}',
                        j.jid,
                        e,
                        pkdexc(),
                    )
        if not got_one:
            self._poll_running_jobs()
def set_job_status(self, status):
    """Generator body: set `status` on computeJob around the yielded block

    The status is reset to None afterwards; on an exception it is reset
    with `exception=` and the exception is re-raised. Presumably used via
    a context-manager decorator outside this view -- TODO confirm.

    Args:
        status (object): value passed to `computeJob.set_status`
    """
    compute_job = self.computeJob
    compute_job.set_status(self, status)
    try:
        yield
        self.computeJob.set_status(self, None)
    except Exception as err:
        pkdlog('{} status={} stack={}', self, status, pkdexc())
        self.computeJob.set_status(self, None, exception=err)
        raise
def _do_get_simulation_frame(msg, template):
    """Dispatch a simulation frame request for `msg.data`

    Returns:
        object: result of `sim_frame_dispatch`, or a PKDict with
        state=ERROR; only a UserAlert's own text is reported as the error
    """
    try:
        frame_args = msg.data.copy().pkupdate(run_dir=msg.runDir)
        return template_common.sim_frame_dispatch(frame_args)
    except Exception as e:
        if isinstance(e, sirepo.util.UserAlert):
            reason = e.sr_args.error
        else:
            reason = 'report not generated'
        return PKDict(state=job.ERROR, error=reason, stack=pkdexc())
def set_job_situation(self, situation):
    """Generator body: set `situation` on computeJob around the yielded block

    The situation is reset to None afterwards; on an exception it is reset
    with `exception=` and the exception is re-raised. Presumably used via
    a context-manager decorator outside this view -- TODO confirm.

    Args:
        situation (object): value passed to `computeJob.set_situation`
    """
    compute_job = self.computeJob
    compute_job.set_situation(self, situation)
    try:
        yield
        self.computeJob.set_situation(self, None)
    except Exception as err:
        pkdlog('{} situation={} stack={}', self, situation, pkdexc())
        self.computeJob.set_situation(self, None, exception=err)
        raise
def call_api(func_or_name, kwargs=None, data=None):
    """Call another API with permission checks.

    Note: also calls `save_to_cookie`.

    Any exception raised by the call is converted to a response with
    `sirepo.http_reply.gen_exception`.

    Args:
        func_or_name (object): api function or name (without `api_` prefix)
        kwargs (dict): to be passed to API [None]
        data (dict): will be returned `http_request.parse_json`
    Returns:
        flask.Response: result
    """
    p = None
    s = None
    try:
        # must be first so exceptions have access to sim_type
        if kwargs:
            # Any (GET) uri will have simulation_type in uri if it is application
            # specific.
            s = sirepo.http_request.set_sim_type(kwargs.get('simulation_type'))
        f = func_or_name if callable(func_or_name) \
            else _api_to_route[func_or_name].func
        sirepo.api_auth.check_api_call(f)
        try:
            if data:
                p = sirepo.http_request.set_post(data)
            r = flask.make_response(f(**kwargs) if kwargs else f())
        finally:
            if data:
                # put back whatever set_post returned above
                sirepo.http_request.set_post(p)
    except Exception as e:
        # Reply/HTTP exceptions only get the pkdc log; all others use pkdlog
        if isinstance(e, (sirepo.util.Reply, werkzeug.exceptions.HTTPException)):
            pkdc('api={} exception={} stack={}', func_or_name, e, pkdexc())
        else:
            pkdlog('api={} exception={} stack={}', func_or_name, e, pkdexc())
        r = sirepo.http_reply.gen_exception(e)
    finally:
        # http_request tries to keep a valid sim_type so
        # this is ok to call (even if s is None)
        sirepo.http_request.set_sim_type(s)
    sirepo.cookie.save_to_cookie(r)
    sirepo.events.emit('end_api_call', PKDict(resp=r))
    return r
async def purge_free_simulations(cls, init=False):
    """Purge run dirs of old simulations of non-paid users

    Errors are logged, not raised; `_purge_free_simulations_set` is always
    called at the end.

    Args:
        init (bool): passed through to `_purge_free_simulations_set`
    """
    def _get_uids_and_files():
        # Yields (uid, [db files]) for candidate files, grouped by uid
        r = []
        u = None
        p = sirepo.auth_db.UserRole.uids_of_paid_users()
        for f in pkio.sorted_glob(_DB_DIR.join('*{}'.format(
                sirepo.simulation_db.JSON_SUFFIX,
        ))):
            n = sirepo.sim_data.uid_from_jid(f.basename)
            # skip paid users and recently modified files
            if n in p or f.mtime() > _too_old:
                continue
            if u != n:
                # POSIT: Uid is the first part of each db file. The files are
                # sorted so this should yield all of a user's files
                if r:
                    yield u, r
                u = n
                r = []
            r.append(f)
        if r:
            yield u, r

    def _purge_sim(db_file):
        d = pkcollections.json_load_any(db_file)
        # OPTIMIZATION: We assume the uids_of_paid_users doesn't change very
        # frequently so we don't need to check again. A user could run a sim
        # at anytime so we need to check that they haven't
        if d.lastUpdateTime > _too_old:
            return
        if d.status == job.FREE_USER_PURGED:
            return
        p = sirepo.simulation_db.simulation_run_dir(d)
        pkio.unchecked_remove(p)
        d.status = job.FREE_USER_PURGED
        cls.__db_write_file(d)
        jids_purged.append(db_file.purebasename)

    s = sirepo.srtime.utc_now()
    u = None
    f = None
    try:
        # cutoff read by both nested functions through the closure
        _too_old = sirepo.srtime.utc_now_as_float() - (
            cfg.purge_free_after_days * 24 * 60 * 60
        )
        jids_purged = []
        for u, v in _get_uids_and_files():
            with sirepo.auth.set_user(u):
                for f in v:
                    _purge_sim(f)
            await tornado.gen.sleep(0)
        pkdlog('jids={}', jids_purged)
    except Exception as e:
        pkdlog('u={} f={} error={} stack={}', u, f, e, pkdexc())
    finally:
        cls._purge_free_simulations_set(s, init)
def _start(self):
    """Detach a process from the controlling terminal and run it in the
    background as a daemon.

    We don't use pksubprocess. This method is not called from the MainThread
    so can't set signals.

    The parent records the child pid and returns. The child closes all
    file descriptors, redirects stdio to the run log, and execs `self.cmd`.

    Raises:
        OSError: if the fork fails
    """
    env = _safe_env()
    env['SIREPO_MPI_CORES'] = str(mpi.cfg.cores)
    try:
        pid = os.fork()
    except OSError as e:
        pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror, e.errno)
        # a bare name `reraise` here was a NameError that hid the OSError
        raise
    if pid != 0:
        pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
        self.__pid = pid
        return
    try:
        os.chdir(str(self.run_dir))
        #Don't os.setsid() so signals propagate properly
        maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
        if maxfd == resource.RLIM_INFINITY:
            maxfd = runner.MAX_OPEN_FILES
        for fd in range(0, maxfd):
            try:
                os.close(fd)
            except OSError:
                # fd was not open
                pass
        # everything is closed, so the run log must land on fd 0; it is
        # then duplicated onto stdout and stderr
        sys.stdin = open(template_common.RUN_LOG, 'a+')
        assert sys.stdin.fileno() == 0
        os.dup2(0, 1)
        sys.stdout = os.fdopen(1, 'a+')
        os.dup2(0, 2)
        sys.stderr = os.fdopen(2, 'a+')
        pkdlog('{}: child will exec: {}', self.jid, self.cmd)
        sys.stderr.flush()
        try:
            simulation_db.write_status('running', self.run_dir)
            os.execvpe(self.cmd[0], self.cmd, env=env)
        except BaseException as e:
            pkdlog(
                '{}: execvp error: {} errno={}',
                self.jid,
                e.strerror if hasattr(e, 'strerror') else '',
                e.errno if hasattr(e, 'errno') else '',
            )
        finally:
            # only reached if the exec did not replace this process
            sys.exit(1)
    except BaseException as e:
        # NOTE: there's no lock here so just append to the log. This
        # really shouldn't happen, but it might (out of memory) so just
        # log to the run log and hope somebody notices
        self._error_during_start(e, pkdexc())
        raise
def read_result(run_dir):
    """Read result data file from simulation

    When the result file is missing, the run log text (if any) is used as
    the error, unless the log matches the cancel pattern.

    Args:
        run_dir (py.path): where to find output

    Returns:
        dict: result or describes error
    """
    fn = json_filename(template_common.OUTPUT_BASE_NAME, run_dir)
    res = None
    err = None
    try:
        res = read_json(fn)
    except Exception as e:
        pkdc('{}: exception={}', fn, e)
        err = pkdexc()
        if not pkio.exception_is_not_found(e):
            pkdlog('{}: error reading output: {}', fn, err)
        else:
            #TODO(robnagler) change POSIT matches _SUBPROCESS_ERROR_RE
            err = 'ERROR: Terminated unexpectedly'
            # Not found so return run.log as err
            rl = run_dir.join(template_common.RUN_LOG)
            try:
                log_text = pkio.read_text(rl)
                if _RUN_LOG_CANCEL_RE.search(log_text):
                    err = None
                elif log_text:
                    err = log_text
            except Exception as log_err:
                if not pkio.exception_is_not_found(log_err):
                    pkdlog('{}: error reading log: {}', rl, pkdexc())
    if err:
        return None, err
    res = res or {}
    if 'state' in res:
        return res, None
    # Old simulation or other error, just say is canceled so restarts
    return {'state': 'canceled'}, None
def _repo(self, repo):
    """Back up one repo: git mirror, issues, wiki mirror, and comments

    Output names are the repo's full name with '/' replaced by '-'.
    Failures are logged and not raised.

    Args:
        repo (object): supplies full_name, has_issues, has_wiki, issues(),
            comments()
    """
    fn = repo.full_name
    bd = fn.replace('/', '-')

    def _clone(suffix):
        # mirror-clone, archive, then drop the working copy
        base = bd + suffix
        _shell([
            'git', 'clone', '--quiet', '--mirror',
            _GITHUB_URI + '/' + fn + suffix,
            base,
        ])
        _shell(['tar', 'cJf', base + '.txz', base])
        pkio.unchecked_remove(base)

    def _json(gen, suffix):
        # write the items as one JSON array, then compress
        base = bd + suffix
        with open(base, 'wt') as f:
            count = 0
            for count, item in enumerate(gen, 1):
                f.write('[' if count == 1 else ',')
                j = item.as_json()
                assert json.loads(j)
                f.write(j)
            if not count:
                # Empty iteration
                f.write('[')
            f.write(']')
        _shell(['xz', base])

    try:
        _clone('.git')
        if repo.has_issues:
            _json(repo.issues(state='all'), '.issues')
        if repo.has_wiki:
            try:
                _clone('.wiki.git')
            except subprocess.CalledProcessError as e:
                if not re.search(_WIKI_ERROR_OK, str(e.output)):
                    raise
        _json(repo.comments(), '.comments')
    except Exception as e:
        pkdlog(
            'ERROR: {} {} {} {} {}',
            fn,
            type(e),
            e,
            getattr(e, 'output', None),
            pkdexc(),
        )
def _from_cookie_header(self, header):
    """Load cookie values from `header`, falling back to beaker_compat

    A decode failure is only logged if the beaker fallback does not
    produce a session either.

    Args:
        header (str): searched for `cfg.http_name`=value
    """
    global _try_beaker_compat

    serialized = None
    err = None
    try:
        match = re.search(r'\b{}=([^;]+)'.format(cfg.http_name), header)
        if match:
            serialized = self._decrypt(match.group(1))
            self.update(self._deserialize(serialized))
            self.incoming_serialized = serialized
            set_log_user(self.get(_COOKIE_USER))
            return
    except Exception as e:
        # cryptography module exceptions serialize to empty string
        # so just report the type.
        err = type(e) if 'crypto' in type(e).__module__ else e
        pkdc(pkdexc())
        # wait for decoding errors until after beaker attempt
    if _try_beaker_compat and not self.get(_COOKIE_SENTINEL):
        try:
            import sirepo.beaker_compat

            res = sirepo.beaker_compat.update_session_from_cookie_header(header)
            if res is not None:
                self.clear()
                self.set_sentinel()
                self.update(res)
                err = None
                set_log_user(self.get(_COOKIE_USER))
        except AssertionError:
            pkdlog('Unconfiguring beaker_compat: {}', pkdexc())
            _try_beaker_compat = False
    if err:
        pkdlog('Cookie decoding failed: {} value={}', err, serialized)
def open_json_file(sim_type, path=None, sid=None, fixup=True):
    """Read a db file and return result

    Args:
        sim_type (str): simulation type (app)
        path (py.path.local): where to read the file
        sid (str): simulation id
        fixup (bool): run `fixup_old_data` on the result [True]

    Returns:
        dict: data

    Raises:
        CopyRedirect: if the simulation is in another user's
    """
    if not path:
        path = sim_data_file(sim_type, sid)
    if not os.path.isfile(str(path)):
        if sid:
            #TODO(robnagler) workflow should be in server.py,
            # because only valid in one case, not e.g. for opening examples
            # which are not found.
            user_copy_sid = _find_user_simulation_copy(sim_type, sid)
            if find_global_simulation(sim_type, sid):
                raise CopyRedirect({
                    'redirect': {
                        'simulationId': sid,
                        'userCopySimulationId': user_copy_sid,
                    },
                })
        util.raise_not_found(
            '{}/{}: global simulation not found',
            sim_type,
            sid,
        )
    data = None
    try:
        with open(str(path)) as f:
            data = json_load(f)
        if sid:
            # ensure the simulationId matches the path
            data['models']['simulation']['simulationId'] = _sid_from_path(path)
    except Exception:
        pkdlog('{}: error: {}', path, pkdexc())
        raise
    if not fixup:
        return data
    return fixup_old_data(data)[0]
def import_python(code, tmp_dir, lib_dir, user_filename=None, arguments=None):
    """Converts script_text into json and stores as new simulation.

    Avoids too much data back to the user in the event of an error.
    This could be a potential security issue, because the script
    could be used to probe the system.

    Args:
        code (str): Python code that runs SRW
        tmp_dir (py.path): directory the script is written to and parsed in
        lib_dir (str): passed to the mirror-profile patch and the parser
        user_filename (str): uploaded file name for log
        arguments (str): argv to be passed to script

    Returns:
        dict: simulation data

    Raises:
        ValueError: any failure, with a message truncated to 50 characters
    """
    script = None

    # Patch for the mirror profile for the exported .py file from Sirepo:
    code = _patch_mirror_profile(code, lib_dir)

    try:
        with pkio.save_chdir(tmp_dir):
            # This string won't show up anywhere
            script = pkio.write_text('in.py', code)
            o = SRWParser(
                script,
                lib_dir=py.path.local(lib_dir),
                user_filename=user_filename,
                arguments=arguments,
            )
            return o.data
    except Exception as e:
        lineno = script and _find_line_in_trace(script)
        # Python 3 exceptions have no `message`; reading it unguarded would
        # raise AttributeError here and hide the real error
        pkdlog(
            'Error: {}; exception={}; script={}; filename={}; stack:\n{}',
            e.message if hasattr(e, 'message') else str(e),
            e,
            script,
            user_filename,
            pkdexc(),
        )
        # Avoid sending too much detail back to the user
        m = str(e)[:50]
        raise ValueError(
            'Error on line {}: {}'.format(lineno, m) if lineno
            else 'Error: {}'.format(m))
def test_importer():
    """Import each `.ele` / `.lte` data file and compare with `<name>.txt`

    Files ending in `ele.lte` are skipped as inputs; they are read as the
    lattice companion of the matching `.ele` file. When an import raises,
    the error text is what gets compared.
    """
    from pykern import pkcollections
    from pykern import pkio
    from pykern.pkunit import pkeq
    from sirepo.template import elegant

    with pkunit.save_chdir_work():
        for fn in pkio.sorted_glob(pkunit.data_dir().join('*')):
            if not pkio.has_file_extension(fn, ('ele', 'lte')) \
                or fn.basename.endswith('ele.lte'):
                continue
            error = None
            try:
                data = elegant.import_file(FlaskRequest(fn))
            except Exception as e:
                pkdlog(pkdexc())
                # Python 3 exceptions have no `message`; fall back to str(e)
                # so the handler itself cannot raise AttributeError
                error = e.message if hasattr(e, 'message') else str(e)
            if error:
                actual = error
            else:
                if pkio.has_file_extension(fn, 'lte'):
                    data['models']['commands'] = []
                    actual = '{}{}'.format(
                        elegant._generate_variables(data),
                        elegant.generate_lattice(
                            data,
                            elegant._build_filename_map(data),
                            elegant._build_beamline_map(data),
                            pkcollections.Dict(),
                        ),
                    )
                else:
                    data2 = elegant.import_file(FlaskRequest('{}.lte'.format(fn)), test_data=data)
                    actual = elegant._generate_commands(
                        data2,
                        elegant._build_filename_map(data2),
                        elegant._build_beamline_map(data2),
                        pkcollections.Dict(),
                    )
            outfile = fn.basename + '.txt'
            pkio.write_text(outfile, actual)
            expect = pkio.read_text(pkunit.data_dir().join(outfile))
            #TODO(pjm): this takes too long if there are a lot of diffs
            #assert expect == actual
            pkeq(expect, actual)
def fixup_old_data(data, force=False):
    """Bring `data` up to the current schema version, modifying it in place.

    Args:
        data (dict): simulation data to upgrade (destructively)
        force (bool): run the fixup even if the version is already current

    Returns:
        dict: the same `data` object, upgraded
        bool: True if the fixup ran, False if `data` was already current
    """
    try:
        if not force:
            if 'version' in data and data.version == SCHEMA_COMMON.version:
                return data, False
        # Remember which version we are upgrading from; it is removed
        # again once the template fixup is done.
        try:
            previous = data.version
        except AttributeError:
            previous = _OLDEST_VERSION
        data.fixup_old_version = previous
        data.version = SCHEMA_COMMON.version
        if 'simulationType' in data:
            # Old application names
            renamed = {'warp': 'warppba', 'fete': 'warpvnd'}
            if data.simulationType in renamed:
                data.simulationType = renamed[data.simulationType]
        else:
            # Infer the type from a model only that application has
            for model_name, sim_type in (
                ('sourceIntensityReport', 'srw'),
                ('fieldAnimation', 'warppba'),
                ('bunchSource', 'elegant'),
            ):
                if model_name in data.models:
                    data.simulationType = sim_type
                    break
            else:
                pkdlog('simulationType: not found; data={}', data)
                raise AssertionError('must have simulationType')
        sim = data.models.simulation
        if 'simulationSerial' not in sim:
            sim.simulationSerial = 0
        template = sirepo.template.import_module(data.simulationType)
        template.fixup_old_data(data)
        for container, key in (
            (data.models, 'simulationStatus'),
            (data, 'fixup_old_version'),
        ):
            pkcollections.unchecked_del(container, key)
        return data, True
    except Exception:
        pkdlog('{}: error: {}', data, pkdexc())
        raise
def _dispatch(path):
    """Route `path` to the matching API function; called by Flask.

    The first path segment selects the route (falling back to the default
    route); the remaining segments are bound, in order, to the route's
    declared parameters.

    Args:
        path (str): what to route

    Returns:
        Flask.response
    """
    cookie.init()
    try:
        if path is None:
            return _dispatch_call(_empty_route.func, {})
        # werkzeug doesn't convert '+' to ' '
        remaining = re.sub(r'\+', ' ', path).split('/')
        try:
            route = _uri_to_route[remaining[0]]
        except KeyError:
            route = _default_route
        else:
            # First segment was the route name, not a parameter
            del remaining[0]
        call_args = pkcollections.Dict()
        for param in route.params:
            if not remaining:
                if param.is_optional:
                    break
                raise NotFound('{}: uri missing parameter ({})', path, param.name)
            if param.is_path_info:
                # path_info swallows everything that is left
                call_args[param.name] = '/'.join(remaining)
                remaining = None
                break
            call_args[param.name] = remaining.pop(0)
        if remaining:
            raise NotFound('{}: unknown parameters in uri ({})', remaining, path)
        return _dispatch_call(route.func, call_args)
    except NotFound as err:
        util.raise_not_found(err.log_fmt, *err.args, **err.kwargs)
    except Exception:
        pkdlog('{}: error: {}', path, pkdexc())
        raise
def _simulation_run_status(data, quiet=False):
    """Look for simulation status and output

    Combines what the job queue reports (`cfg.job_queue.is_processing`)
    with the report info from `simulation_db.report_info` to build the
    status response. Any exception raised along the way is converted into
    the value returned by `_simulation_error`.

    Args:
        data (dict): request
        quiet (bool): don't write errors to log

    Returns:
        dict: status response
    """
    try:
        #TODO(robnagler): Lock
        rep = simulation_db.report_info(data)
        is_processing = cfg.job_queue.is_processing(rep.job_id)
        is_running = rep.job_status in _RUN_STATES
        # Default response; replaced below by the stored result when the
        # job is not processing and the run dir exists
        res = {'state': rep.job_status}
        pkdc(
            '{}: is_processing={} is_running={} state={} cached_data={}',
            rep.job_id,
            is_processing,
            is_running,
            rep.job_status,
            bool(rep.cached_data),
        )
        # Queue and recorded status disagree: ask the queue to reap the
        # job and treat it as not processing from here on
        if is_processing and not is_running:
            cfg.job_queue.race_condition_reap(rep.job_id)
            pkdc('{}: is_processing and not is_running', rep.job_id)
            is_processing = False
        if is_processing:
            # cached_data is needed below to build nextRequest
            if not rep.cached_data:
                return _simulation_error(
                    'input file not found, but job is running',
                    rep.input_file,
                )
        else:
            is_running = False
            if rep.run_dir.exists():
                res, err = simulation_db.read_result(rep.run_dir)
                if err:
                    return _simulation_error(err, 'error in read_result', rep.run_dir)
        if simulation_db.is_parallel(data):
            # Merge the template's progress info into the response,
            # defaulting the progress fields it did not supply
            template = sirepo.template.import_module(data)
            new = template.background_percent_complete(
                rep.model_name,
                rep.run_dir,
                is_running,
                simulation_db.get_schema(data['simulationType']),
            )
            new.setdefault('percentComplete', 0.0)
            new.setdefault('frameCount', 0)
            res.update(new)
        res['parametersChanged'] = rep.parameters_changed
        if res['parametersChanged']:
            pkdlog(
                '{}: parametersChanged=True req_hash={} cached_hash={}',
                rep.job_id,
                rep.req_hash,
                rep.cached_hash,
            )
        #TODO(robnagler) verify serial number to see what's newer
        # Only fill in times the result did not already provide;
        # elapsedTime depends on the two values set just before it
        res.setdefault('startTime', _mtime_or_now(rep.input_file))
        res.setdefault('lastUpdateTime', _mtime_or_now(rep.run_dir))
        res.setdefault('elapsedTime', res['lastUpdateTime'] - res['startTime'])
        if is_processing:
            # NOTE(review): presumably the client uses these to poll
            # again -- confirm against the caller
            res['nextRequestSeconds'] = simulation_db.poll_seconds(rep.cached_data)
            res['nextRequest'] = {
                'report': rep.model_name,
                'reportParametersHash': rep.cached_hash,
                'simulationId': rep.cached_data['simulationId'],
                'simulationType': rep.cached_data['simulationType'],
            }
        pkdc(
            '{}: processing={} state={} cache_hit={} cached_hash={} data_hash={}',
            rep.job_id,
            is_processing,
            res['state'],
            rep.cache_hit,
            rep.cached_hash,
            rep.req_hash,
        )
    except Exception:
        return _simulation_error(pkdexc(), quiet=quiet)
    return res
def check_call_with_signals(cmd, output=None, env=None, msg=None):
    """Run cmd, writing to output.

    stdin is `os.devnull`. Passes SIGTERM and SIGINT on to the
    child process. If `output` is a string, it will be opened in write ('w') mode.

    Must be called from the main thread. The previous signal handlers
    are reinstalled before returning.

    Args:
        cmd (list): passed to subprocess verbatim
        output (file or str): where to write stdout and stderr
        env (dict): environment to use
        msg (callable): optional logger, called as ``msg(fmt, *args)``
            on start, normal exit, exception and termination

    Raises:
        RuntimeError: if `cmd` exits with a non-zero status
    """
    assert _is_main_thread(), \
        'subprocesses which require signals need to be started in main thread'
    p = None
    prev_signal = {sig: signal.getsignal(sig) for sig in _SIGNALS}

    def signal_handler(sig, frame):
        # Forward to the child first, then chain to whatever handler
        # was installed before us (if it is a real callable).
        if p:
            p.send_signal(sig)
        ps = prev_signal[sig]
        if ps in (None, signal.SIG_IGN, signal.SIG_DFL):
            return
        ps(sig, frame)

    pid = None
    stdin = None
    stdout = output
    try:
        if isinstance(output, six.string_types):
            stdout = open(output, 'w')
        stderr = subprocess.STDOUT if stdout else None
        for sig in _SIGNALS:
            signal.signal(sig, signal_handler)
        # Keep a reference so the handle can be closed in `finally`
        stdin = open(os.devnull)
        p = subprocess.Popen(
            cmd,
            stdin=stdin,
            stdout=stdout,
            stderr=stderr,
            env=env,
        )
        pid = p.pid
        if msg:
            msg('{}: started: {}', pid, cmd)
        rc = p.wait()
        # Child has exited: nothing to signal or terminate any more
        p = None
        if rc != 0:
            raise RuntimeError('error exit({})'.format(rc))
        if msg:
            msg('{}: normal exit(0): {}', pid, cmd)
    except Exception:
        if msg:
            msg('{}: exception: {} {}', pid, cmd, pkdexc())
        raise
    finally:
        for sig in _SIGNALS:
            signal.signal(sig, prev_signal[sig])
        if p is not None:
            # Still set only if we left before wait() returned
            if msg:
                msg('{}: terminating: {}', pid, cmd)
            p.terminate()
        if stdin is not None:
            stdin.close()
        # Only close the file we opened ourselves, never the caller's
        if stdout is not output:
            stdout.close()
def tag1234():
    """Call `force_error` and return what `pkdexc` reports for the failure.

    Returns:
        object: result of `pkdexc()` if `force_error` raised, else None
    """
    try:
        force_error()
    except:
        # Deliberately catches everything, like the code under test would
        formatted = pkdexc()
        return formatted