def _validate_field(model, field, model_info):
    if field in ('_id', 'type'):
        return
    assert field in model_info, \
        'unknown model field: {}.{}, value: {}'.format(model['type'], field, model[field])
    field_info = model_info[field]
    field_type = field_info[1]
    if field_type == 'Float':
        model[field] = float(model[field])
        mf = '{}.{}'.format(model['type'], field)
        if mf in _DEGREE_TO_RADIAN_FIELDS:
            model[field] *= math.pi / 180.0
        elif mf in _MRAD_FIELDS:
            model[field] /= 1000.0
        elif field in _CM_FIELDS and model['type'] != 'CAVITE':
            model[field] *= 0.01
    elif model['type'] == 'CHANGREF' and field == 'order' and model['format'] == 'new':
        res = ''
        model['order'] = re.sub(r'\s+$', '', model['order'])
        k = ''
        for v in model['order'].split(' '):
            if re.search(r'^[XYZ]', v):
                k = v
            else:
                if k[1] == 'R':
                    res += '{} {} '.format(k, float(v) * math.pi / 180.0)
                else:
                    res += '{} {} '.format(k, float(v) * 0.01)
        model['order'] = re.sub(r'\s+$', '', res)
    elif field_type in _SCHEMA['enum']:
        for v in _SCHEMA['enum'][field_type]:
            if v[0] == model[field]:
                return
        pkdlog('invalid enum value, {}.{} {}: {}', model['type'], field, field_type, model[field])
        model[field] = field_info[3]
def update_session_from_cookie_header(header):
    """Update the flask session from the beaker file identified by the cookie header
    """
    maps = _init_maps()
    try:
        cookie = SignedCookie(cfg.secret, input=header)
        if not cfg.key in cookie:
            return None
        identifier = cookie[cfg.key].value
        if not identifier:
            return None
        path = beaker.util.encoded_path(
            str(flask.current_app.sirepo_db_dir.join('beaker/container_file')),
            [identifier],
            extension='.cache',
            digest_filenames=False,
        )
        with open(path, 'rb') as fh:
            values = pickle.load(fh)
        res = {}
        if 'session' in values and _ORIG_KEY in values['session']:
            for f in maps['key'].keys():
                v = values['session'].get(f)
                if not v is None:
                    if not isinstance(v, str):
                        # pickle decodes certain strings as unicode in Python 2
                        v = v.encode('ascii')
                    res[maps['key'][f]] = maps['value'].get(v, v)
        pkdlog('retrieved user from beaker cookie: res={}', res)
        return res
    except Exception as e:
        pkdlog('ignoring exception with beaker compat: error={}, header={}', e, header)
        return None
def subprocess_output(cmd):
    """Run cmd and return output or None, logging errors.

    Args:
        cmd (list): what to run

    Returns:
        str: output is None on error else a stripped string
    """
    err = None
    out = None
    try:
        p = subprocess.Popen(
            cmd,
            env=subprocess_env(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = p.communicate()
        if p.wait() != 0:
            raise subprocess.CalledProcessError(returncode=p.returncode, cmd=cmd)
    except subprocess.CalledProcessError as e:
        pkdlog('{}: exit={} err={}', cmd, e.returncode, err)
        return None
    if out is not None and len(out):
        return out.strip()
    return ''
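# Hedged usage sketch (not part of the original source): illustrates the three
# possible returns of subprocess_output() -- None on a non-zero exit, '' when
# the command produced no output, and a stripped string otherwise. The command
# below is only an example.
def _example_subprocess_output():
    out = subprocess_output(['hostname'])
    if out is None:
        print('command failed; see log for exit code and stderr')
    elif not out:
        print('command succeeded but wrote nothing')
    else:
        print('hostname is {}'.format(out))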
def process_sdds_page(filename, page_index, callback, *args, **kwargs):
    try:
        if sdds.sddsdata.InitializeInput(_SDDS_INDEX, filename) != 1:
            pkdlog('{}: cannot access'.format(filename))
            # In normal execution, the file may not yet be available over NFS
            err = _sdds_error('Output file is not yet available.')
        else:
            #TODO(robnagler) SDDS_GotoPage not in sddsdata, why?
            for _ in xrange(page_index + 1):
                if sdds.sddsdata.ReadPage(_SDDS_INDEX) <= 0:
                    #TODO(robnagler) is this an error?
                    break
            try:
                return callback(*args, **kwargs)
            except SystemError as e:
                pkdlog('{}: page not found in {}'.format(page_index, filename))
                err = _sdds_error('Output page {} not found'.format(page_index) if page_index else 'No output was generated for this report.')
    finally:
        try:
            sdds.sddsdata.Terminate(_SDDS_INDEX)
        except Exception:
            pass
    return {
        'err': err,
    }
def fixup_old_data(data):
    """Upgrade data to the latest schema and update the version.

    Args:
        data (dict): to be updated (destructively)

    Returns:
        dict: upgraded `data`
        bool: True if data changed
    """
    try:
        if 'version' in data and data['version'] == SCHEMA_COMMON['version']:
            return data, False
        data['version'] = SCHEMA_COMMON['version']
        if not 'simulationType' in data:
            if 'sourceIntensityReport' in data['models']:
                data['simulationType'] = 'srw'
            elif 'fieldAnimation' in data['models']:
                data['simulationType'] = 'warp'
            elif 'bunchSource' in data['models']:
                data['simulationType'] = 'elegant'
            else:
                pkdlog('simulationType: not found; data={}', data)
                raise AssertionError('must have simulationType')
        if not 'simulationSerial' in data['models']['simulation']:
            data['models']['simulation']['simulationSerial'] = 0
        sirepo.template.import_module(data['simulationType']).fixup_old_data(data)
        try:
            del data['models']['simulationStatus']
        except KeyError:
            pass
        return data, True
    except Exception as e:
        pkdlog('{}: error: {}', data, pkdexc())
        raise
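# Hedged usage sketch (assumed caller, not from the original source): the
# second element of the returned tuple tells the caller whether the upgrade
# changed anything and therefore whether the file needs to be rewritten.
# save_simulation_json is a hypothetical persistence call standing in for
# whatever the real caller uses.
def _example_fixup_and_save(data):
    data, changed = fixup_old_data(data)
    if changed:
        save_simulation_json(data)
    return data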
def __init__(self, data):
    with self._lock:
        self.jid = simulation_db.job_id(data)
        pkdc('{}: created', self.jid)
        if self.jid in self._job:
            pkdlog(
                '{}: Collision tid={} celery_state={}',
                self.jid,
                self.async_result,
                self.async_result and self.async_result.state,
            )
            raise Collision(self.jid)
        self.cmd, self.run_dir = simulation_db.prepare_simulation(data)
        self.data = data
        self._job[self.jid] = self
        self.async_result = self._start_job()
        pkdc(
            '{}: started tid={} dir={} queue={} len_jobs={}',
            self.jid,
            self.async_result.task_id,
            self.run_dir,
            self.celery_queue,
            len(self._job),
        )
def validate_serial(req_data):
    """Verify serial in data validates

    Args:
        req_data (dict): request with serial and possibly models

    Returns:
        object: None if all ok, or json response (bad)
    """
    with _global_lock:
        sim_type = sirepo.template.assert_sim_type(req_data['simulationType'])
        sid = parse_sid(req_data)
        req_ser = req_data['models']['simulation']['simulationSerial']
        curr = read_simulation_json(sim_type, sid=sid)
        curr_ser = curr['models']['simulation']['simulationSerial']
        if not req_ser is None:
            if req_ser == curr_ser:
                return None
            status = 'newer' if req_ser > curr_ser else 'older'
            pkdlog(
                '{}: incoming serial {} than stored serial={} sid={}, resetting client',
                req_ser,
                status,
                curr_ser,
                sid,
            )
        return curr
def import_file(text):
    data = simulation_db.default_data(_SIM_TYPE)
    beamline = []
    data['models']['beamlines'] = [
        {
            'name': 'BL1',
            'id': 1,
            'items': beamline,
        },
    ]
    current_id = 2
    title, elements = zgoubi_parser.parse_file(text, 1)
    data['models']['simulation']['name'] = title if title else 'zgoubi'
    ids_and_elements = [[], []]
    for el in elements:
        _validate_model(el)
        if 'name' in el:
            if el not in ids_and_elements[1]:
                current_id += 1
                ids_and_elements[0].append(current_id)
                ids_and_elements[1].append(el)
            beamline.append(ids_and_elements[0][ids_and_elements[1].index(el)])
        else:
            if el['type'] in data['models']:
                pkdlog('replacing existing {} model', el['type'])
            data['models'][el['type']] = el
    for idx in range(len(ids_and_elements[0])):
        el = ids_and_elements[1][idx]
        el['_id'] = ids_and_elements[0][idx]
        data['models']['elements'].append(el)
    elegant_common.sort_elements_and_beamlines(data)
    return data
def _run_shadow():
    """Run shadow program with isolated locals()
    """
    try:
        exec(_script(), locals(), locals())
    except Exception:
        pkdlog('script={} error={}', _script(), pkdexc())
    return beam
def race_condition_reap(cls, jid):
    """Job terminated, but not still in queue.

    This can happen due to race condition in is_processing. Call
    again to remove the job from the queue.
    """
    pkdlog('{}: sigchld_handler in another thread', jid)
    cls.is_processing(jid)
def _add_command(parser, command, elements):
    command_type = command[0][0]
    method = '_zgoubi_{}'.format(command_type).lower()
    if method not in globals():
        pkdlog('unknown zgoubi element: {}', method)
        return
    el = globals()[method](command)
    if el:
        elements.append(el)
def _variables_to_postfix(rpn_variables):
    res = []
    for v in rpn_variables:
        if 'value' not in v:
            pkdlog('rpn var missing value: {}', v['name'])
            v['value'] = '0'
        res.append({
            'name': v['name'],
            'value': _infix_to_postfix(v['value']),
        })
    return res
def backup():
    """Backs up all github repositories associated with user into pwd

    Creates timestamped directory, and purges directories older than cfg.keep_days
    """
    try:
        _Backup()
    except subprocess.CalledProcessError as e:
        if hasattr(e, 'output'):
            pkdlog('ERROR: Backup {}', e.output)
    pkdlog('DONE')
def _kill(self):
    from celery.exceptions import TimeoutError
    if not self._is_processing():
        return False
    res = self.__async_result
    tid = getattr(res, 'task_id', None)
    pkdlog('{}: kill SIGTERM tid={}', self.jid, tid)
    try:
        res.revoke(terminate=True, wait=True, timeout=runner.KILL_TIMEOUT_SECS, signal='SIGTERM')
    except TimeoutError as e:
        pkdlog('{}: kill SIGKILL tid={}', self.jid, tid)
        res.revoke(terminate=True, signal='SIGKILL')
def _json_input():
    req = flask.request
    if req.mimetype != 'application/json':
        pkdlog('{}: req.mimetype is not application/json', req.mimetype)
        raise werkzeug.exceptions.BadRequest('expecting application/json')
    # Adapted from flask.wrappers.Request.get_json
    # We accept a request charset against the specification as
    # certain clients have been using this in the past. This
    # fits our general approach of being nice in what we accept
    # and strict in what we send out.
    charset = req.mimetype_params.get('charset')
    data = req.get_data(cache=False)
    return simulation_db.json_load(data, encoding=charset)
def _repo(self, repo):
    fn = repo.full_name
    bd = re.sub('/', '-', fn)

    def _clone(suffix):
        base = bd + suffix
        for cmd in [
            ['git', 'clone', '--quiet', '--mirror', _GITHUB_URI + '/' + fn + suffix, base],
            ['tar', 'cJf', base + '.txz', base],
        ]:
            _shell(cmd)
        pkio.unchecked_remove(base)

    def _json(gen, suffix):
        base = bd + suffix
        with open(base, 'wt') as f:
            sep = '['
            for i in gen:
                f.write(sep)
                j = i.as_json()
                assert json.loads(j)
                f.write(j)
                sep = ','
            if sep == '[':
                # Empty iteration
                f.write(sep)
            f.write(']')
        _shell(['xz', base])

    try:
        _clone('.git')
        if repo.has_issues:
            _json(repo.issues(state='all'), '.issues')
        if repo.has_wiki:
            try:
                _clone('.wiki.git')
            except subprocess.CalledProcessError as e:
                if not re.search(_WIKI_ERROR_OK, str(e.output)):
                    raise
        _json(repo.comments(), '.comments')
    except Exception as e:
        pkdlog(
            'ERROR: {} {} {} {} {}',
            fn,
            type(e),
            e,
            getattr(e, 'output', None),
            pkdexc(),
        )
def _field_type_for_field(el, field):
    if re.search(r'\[\d+\]$', field):
        field = re.sub(r'\[\d+\]$', '', field)
    field_type = None
    model_name = _model_name_for_data(el)
    for f in _SCHEMA['model'][model_name]:
        if f == field:
            field_type = _SCHEMA['model'][model_name][f][1]
            break
    if not field_type:
        if not field in _IGNORE_FIELD:
            pkdlog('{}: unknown field type for {}', field, model_name)
        del el[field]
    return field_type
def kill(cls, jid):
    from celery.exceptions import TimeoutError
    with cls._lock:
        self = cls._find_job(jid)
        if not self:
            return
        res = self.async_result
        tid = res.task_id
        pkdlog('{}: killing: tid={}', jid, tid)
    try:
        res.revoke(terminate=True, wait=True, timeout=2, signal='SIGTERM')
    except TimeoutError as e:
        pkdlog('{}: sending a SIGKILL tid={}', jid, tid)
        res.revoke(terminate=True, signal='SIGKILL')
    with cls._lock:
        self = cls._find_job(jid)
        if not self:
            return
        if self.async_result.task_id == tid:
            del self._job[self.jid]
            pkdlog('{}: deleted (killed) job; tid={} celery_state={}', jid, tid, self.async_result.state)
            return
        pkdlog(
            '{}: job reaped by another thread; old_tid={}, new_tid={}',
            jid,
            tid,
            self.async_result,
        )
def iterate_simulation_datafiles(simulation_type, op, search=None):
    res = []
    for path in glob.glob(
        str(simulation_dir(simulation_type).join('*', SIMULATION_DATA_FILE)),
    ):
        path = py.path.local(path)
        try:
            data = open_json_file(simulation_type, path)
            if search and not _search_data(data, search):
                continue
            op(res, path, data)
        except ValueError as e:
            pkdlog('{}: error: {}', path, e)
    return res
def app_run_simulation():
    data = _parse_data_input(validate=True)
    res = _simulation_run_status(data, quiet=True)
    if (
        (
            not res['state'] in _RUN_STATES
            and (res['state'] != 'completed' or data.get('forceRun', False))
        ) or res.get('parametersChanged', True)
    ):
        try:
            _start_simulation(data)
        except runner.Collision:
            pkdlog('{}: runner.Collision, ignoring start', simulation_db.job_id(data))
        res = _simulation_run_status(data)
    return _json_response(res)
def open_json_file(sim_type, path=None, sid=None, fixup=True):
    """Read a db file and return result

    Args:
        sim_type (str): simulation type (app)
        path (py.path.local): where to read the file
        sid (str): simulation id

    Returns:
        dict: data

    Raises:
        CopyRedirect: if the simulation is in another user's
    """
    if not path:
        path = sim_data_file(sim_type, sid)
    if not os.path.isfile(str(path)):
        global_sid = None
        if sid:
            #TODO(robnagler) workflow should be in server.py,
            # because only valid in one case, not e.g. for opening examples
            # which are not found.
            user_copy_sid = _find_user_simulation_copy(sim_type, sid)
            if find_global_simulation(sim_type, sid):
                global_sid = sid
        if global_sid:
            raise CopyRedirect({
                'redirect': {
                    'simulationId': global_sid,
                    'userCopySimulationId': user_copy_sid,
                },
            })
        util.raise_not_found(
            '{}/{}: global simulation not found',
            sim_type,
            sid,
        )
    data = None
    try:
        with open(str(path)) as f:
            data = json_load(f)
            # ensure the simulationId matches the path
            if sid:
                data['models']['simulation']['simulationId'] = _sid_from_path(path)
    except Exception as e:
        pkdlog('{}: error: {}', path, pkdexc())
        raise
    return fixup_old_data(data)[0] if fixup else data
def app_error_logging():
    ip = flask.request.remote_addr
    try:
        pkdlog(
            '{}: javascript error: {}',
            ip,
            simulation_db.generate_json(_json_input(), pretty=True),
        )
    except ValueError as e:
        pkdlog(
            '{}: error parsing javascript app_error: {} input={}',
            ip,
            e,
            flask.request.data.decode('unicode-escape'),
        )
    return _json_response_ok()
def import_python(code, tmp_dir, lib_dir, user_filename=None, arguments=None):
    """Converts script_text into json and stores as new simulation.

    Avoids returning too much data to the user in the event of an error.
    This could be a potential security issue, because the script could be
    used to probe the system.

    Args:
        simulation_type (str): always "srw", but used to find lib dir
        code (str): Python code that runs SRW
        user_filename (str): uploaded file name for log
        arguments (str): argv to be passed to script

    Returns:
        dict: simulation data
    """
    script = None

    # Patch for the mirror profile for the exported .py file from Sirepo:
    code = _patch_mirror_profile(code, lib_dir)

    try:
        with pkio.save_chdir(tmp_dir):
            # This string won't show up anywhere
            script = pkio.write_text('in.py', code)
            o = SRWParser(
                script,
                lib_dir=py.path.local(lib_dir),
                user_filename=user_filename,
                arguments=arguments,
            )
            return o.data
    except Exception as e:
        lineno = script and _find_line_in_trace(script)
        # Avoid
        pkdlog(
            'Error: {}; exception={}; script={}; filename={}; stack:\n{}',
            e.message,
            e,
            script,
            user_filename,
            pkdexc(),
        )
        e = str(e)[:50]
        raise ValueError(
            'Error on line {}: {}'.format(lineno, e) if lineno else 'Error: {}'.format(e))
def race_condition_reap(cls, jid):
    """Race condition due to lack of mutex and reliable results.
    """
    with cls._lock:
        self = cls._find_job(jid)
        if self:
            res = self.async_result
            pkdlog(
                '{}: aborting and deleting job; tid={} celery_state={}',
                jid,
                res,
                res and res.state,
            )
            del self._job[self.jid]
            res.revoke(terminate=True, signal='SIGKILL')
        else:
            pkdlog('{}: job finished finally', jid)
def _pid():
    res = -1
    try:
        with open(lock_pid) as f:
            res = int(f.read())
    except Exception:
        pass
    pkdlog(res)
    if res <= 0:
        return res
    try:
        os.kill(res, 0)
    except Exception as e:
        pkdlog(e)
        if isinstance(e, OSError) and e.errno == errno.ESRCH:
            return res
    return -1
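# Note on the probe above (added comment, not original): os.kill(pid, 0) sends
# no signal; it only checks whether the pid exists and can be signaled. A
# minimal standalone sketch of the same staleness test:
def _example_pid_is_gone(pid):
    import errno
    import os
    try:
        os.kill(pid, 0)
    except OSError as e:
        if e.errno == errno.ESRCH:
            return True    # no such process; a lock file holding this pid is stale
    return False           # process exists (or we lack permission to signal it)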
def move_user_simulations(to_uid):
    """Moves all non-example simulations for the current session into the target user's dir.
    """
    from_uid = cookie.get_user()
    with _global_lock:
        for path in glob.glob(
            str(user_dir_name(from_uid).join('*', '*', SIMULATION_DATA_FILE)),
        ):
            data = read_json(path)
            sim = data['models']['simulation']
            if 'isExample' in sim and sim['isExample']:
                continue
            dir_path = os.path.dirname(path)
            new_dir_path = dir_path.replace(from_uid, to_uid)
            pkdlog('{} -> {}', dir_path, new_dir_path)
            pkio.mkdir_parent(new_dir_path)
            os.rename(dir_path, new_dir_path)
def _start(self):
    """Detach a process from the controlling terminal and run it in the
    background as a daemon.

    We don't use pksubprocess. This method is not called from the MainThread
    so can't set signals.
    """
    env = _safe_env()
    env['SIREPO_MPI_CORES'] = str(mpi.cfg.cores)
    try:
        pid = os.fork()
    except OSError as e:
        pkdlog('{}: fork OSError: {} errno={}', self.jid, e.strerror, e.errno)
        raise
    if pid != 0:
        pkdlog('{}: started: pid={} cmd={}', self.jid, pid, self.cmd)
        self.__pid = pid
        return
    try:
        os.chdir(str(self.run_dir))
        # Don't os.setsid() so signals propagate properly
        maxfd = resource.getrlimit(resource.RLIMIT_NOFILE)[1]
        if maxfd == resource.RLIM_INFINITY:
            maxfd = runner.MAX_OPEN_FILES
        for fd in range(0, maxfd):
            try:
                os.close(fd)
            except OSError:
                pass
        sys.stdin = open(template_common.RUN_LOG, 'a+')
        assert sys.stdin.fileno() == 0
        os.dup2(0, 1)
        sys.stdout = os.fdopen(1, 'a+')
        os.dup2(0, 2)
        sys.stderr = os.fdopen(2, 'a+')
        pkdlog('{}: child will exec: {}', self.jid, self.cmd)
        sys.stderr.flush()
        try:
            simulation_db.write_status('running', self.run_dir)
            os.execvpe(self.cmd[0], self.cmd, env=env)
        except BaseException as e:
            pkdlog(
                '{}: execvp error: {} errno={}',
                self.jid,
                e.strerror if hasattr(e, 'strerror') else '',
                e.errno if hasattr(e, 'errno') else '',
            )
        finally:
            sys.exit(1)
    except BaseException as e:
        # NOTE: there's no lock here so just append to the log. This
        # really shouldn't happen, but it might (out of memory) so just
        # log to the run log and hope somebody notices
        self._error_during_start(e, pkdexc())
        raise
def test_importer():
    from pykern import pkcollections
    from pykern import pkio
    from pykern.pkunit import pkeq
    from sirepo.template import elegant

    with pkunit.save_chdir_work():
        for fn in pkio.sorted_glob(pkunit.data_dir().join('*')):
            if not pkio.has_file_extension(fn, ('ele', 'lte')) \
                or fn.basename.endswith('ele.lte'):
                continue
            error = None
            try:
                data = elegant.import_file(FlaskRequest(fn))
            except Exception as e:
                pkdlog(pkdexc())
                error = e.message
            if error:
                actual = error
            else:
                if pkio.has_file_extension(fn, 'lte'):
                    data['models']['commands'] = []
                    actual = '{}{}'.format(
                        elegant._generate_variables(data),
                        elegant.generate_lattice(
                            data,
                            elegant._build_filename_map(data),
                            elegant._build_beamline_map(data),
                            pkcollections.Dict(),
                        ),
                    )
                else:
                    data2 = elegant.import_file(FlaskRequest('{}.lte'.format(fn)), test_data=data)
                    actual = elegant._generate_commands(
                        data2,
                        elegant._build_filename_map(data2),
                        elegant._build_beamline_map(data2),
                        pkcollections.Dict(),
                    )
            outfile = fn.basename + '.txt'
            pkio.write_text(outfile, actual)
            expect = pkio.read_text(pkunit.data_dir().join(outfile))
            #TODO(pjm): this takes too long if there are a lot of diffs
            #assert expect == actual
            pkeq(expect, actual)
def __init__(self):
    self._date_d = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    with pkio.save_chdir(self._date_d, mkdir=True):
        self._login()
        sleep = 0
        for r in self._github.subscriptions():
            if cfg.test_mode:
                if r.name != 'pykern':
                    continue
            if cfg.exclude_re and cfg.exclude_re.search(r.full_name):
                pkdc('exclude: {}', r.full_name)
                continue
            if sleep:
                time.sleep(sleep)
            else:
                sleep = cfg.api_pause_seconds
            pkdlog('{}: begin', r.full_name)
            self._repo(r)
    self._purge()
def _update_database(user_data, oauth_type):
    with _db_serial_lock:
        user = User.query.filter_by(oauth_id=user_data['id'], oauth_type=oauth_type).first()
        session_uid = cookie.get_user(checked=False)
        if user:
            if session_uid and session_uid != user.uid:
                simulation_db.move_user_simulations(user.uid)
            user.user_name = user_data['login']
            user.display_name = user_data['name']
            cookie.set_user(user.uid)
        else:
            if not session_uid:
                # ensures the user session (uid) is ready if new user logs in from logged-out session
                pkdlog('creating new session for user: {}', user_data['id'])
                simulation_db.simulation_dir('')
            user = User(cookie.get_user(), user_data['login'], user_data['name'], oauth_type, user_data['id'])
            _db.session.add(user)
        _db.session.commit()
        return user
def _auth_hook_from_header(values):
    """Migrate from old cookie values

    Always sets _COOKIE_STATE, which is our sentinel.

    Args:
        values (dict): just parsed values

    Returns:
        dict: unmodified or migrated values
    """
    if values.get(_COOKIE_STATE):
        # normal case: we've seen a cookie at least once
        # check for cfg.methods changes
        m = values.get(_COOKIE_METHOD)
        if m and m not in valid_methods:
            # invalid method (changed config), reset state
            pkdlog(
                'possibly misconfigured server: invalid cookie_method={}, clearing values={}',
                m,
                values,
            )
            pkcollections.unchecked_del(
                values,
                _COOKIE_METHOD,
                _COOKIE_USER,
                _COOKIE_STATE,
            )
        return values
    u = values.get('sru') or values.get('uid')
    if not u:
        # normal case: new visitor, and no user/state; set logged out
        # and return all values
        values[_COOKIE_STATE] = _STATE_LOGGED_OUT
        return values
    # Migrate
    o = values.get('sros') or values.get('oauth_login_state')
    s = _STATE_COMPLETE_REGISTRATION
    if o is None or o in ('anonymous', 'a'):
        m = METHOD_GUEST
    elif o in ('logged_in', 'li', 'logged_out', 'lo'):
        m = 'github'
        if 'i' not in o:
            s = _STATE_LOGGED_OUT
    else:
        pkdlog('unknown cookie values, clearing, not migrating: {}', values)
        return {}
    # Upgrade cookie to current structure. Set the sentinel, too.
    values = {
        _COOKIE_USER: u,
        _COOKIE_METHOD: m,
        _COOKIE_STATE: s,
    }
    cookie.set_sentinel(values)
    pkdlog('migrated cookie={}', values)
    return values
def _remove_old_tmp_dirs():
    pkdlog('scanning for stale tmp dirs')
    count = 0
    cutoff = time.time() - srdb.TMP_DIR_CLEANUP_TIME
    for dirpath, dirnames, filenames in os.walk(srdb.root()):
        if (dirpath.endswith(srdb.TMP_DIR_SUFFIX)
                and os.stat(dirpath).st_mtime < cutoff):
            pkdlog('removing stale tmp dir: {}', dirpath)
            pkio.unchecked_remove(dirpath)
            count += 1
    pkdlog('finished scanning for stale tmp dirs ({} found)', count)
def sr_run_sim(self, data, model, expect_completed=True, timeout=10, **post_args):
    from pykern import pkunit
    from pykern.pkdebug import pkdlog, pkdexc
    import time

    if self.sr_job_run_mode:
        data.models[model].jobRunMode = self.sr_job_run_mode
    cancel = None
    try:
        r = self.sr_post(
            'runSimulation',
            PKDict(
                models=data.models,
                report=model,
                simulationId=data.models.simulation.simulationId,
                simulationType=data.simulationType,
            ).pkupdate(**post_args),
        )
        if r.state == 'completed':
            return r
        cancel = r.get('nextRequest')
        for _ in range(timeout):
            if r.state in ('completed', 'error'):
                pkdlog(r.state)
                cancel = None
                break
            r = self.sr_post('runStatus', r.nextRequest)
            time.sleep(1)
        else:
            pkunit.pkok(not expect_completed, 'did not complete: runStatus={}', r)
        if expect_completed:
            pkunit.pkeq('completed', r.state)
        return r
    finally:
        if cancel:
            pkdlog('runCancel')
            self.sr_post('runCancel', cancel)
        import subprocess
        o = pkcompat.from_bytes(
            subprocess.check_output(['ps', 'axww'], stderr=subprocess.STDOUT),
        )
        o = list(filter(lambda x: 'mpiexec' in x, o.split('\n')))
        if o:
            pkdlog('found "mpiexec" after cancel in ps={}', '\n'.join(o))
            # this exception won't be seen because in finally
            raise AssertionError('cancel failed')
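# Hedged usage sketch (assumed test code, not from the original source): a
# pkunit-style test would run a named report through the test client and
# inspect the returned runStatus dict. The fixture fc, the sr_sim_data helper,
# and the report name 'myReport' are hypothetical placeholders.
def test_example_run(fc):
    d = fc.sr_sim_data()
    r = fc.sr_run_sim(d, 'myReport', timeout=30)
    assert r.state == 'completed'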
def sigchld_handler(cls, signum=None, frame=None):
    try:
        pid, status = os.waitpid(-1, os.WNOHANG)
        pkdlog('{}: waitpid: status={}', pid, status)
        with cls._lock:
            for self in cls._job.values():
                if self.pid == pid:
                    del self._job[self.jid]
                    pkdlog('{}: delete successful', self.jid)
                    return
    except OSError as e:
        if e.errno != errno.ECHILD:
            pkdlog('waitpid: OSError: {} errno={}', e.strerror, e.errno)
def _receive(self, msg):
    c = msg.content
    i = c.get('opId')
    if (
        ('opName' not in c or c.opName == job.OP_ERROR)
        or ('reply' in c and c.reply.get('state') == job.ERROR)
    ):
        pkdlog('agentId={} msg={}', self._agentId, c)
    else:
        pkdlog('{} agentId={} opId={}', c.opName, self._agentId, i)
    if i:
        if 'reply' not in c:
            pkdlog('agentId={} No reply={}', self._agentId, c)
            c.reply = PKDict(state='error', error='no reply')
        if i in self.ops_pending_done:
            self.ops_pending_done[i].reply_put(c.reply)
        else:
            pkdlog('agentId={} not pending opId={} opName={}', self._agentId, i, c.opName)
    else:
        getattr(self, '_receive_' + c.opName)(msg)
async def run_extract_job(self, run_dir, jhash, subcmd, arg):
    pkdc('{} {}: {} {}', run_dir, jhash, subcmd, arg)
    status = self.report_job_status(run_dir, jhash)
    if status is runner_client.JobStatus.MISSING:
        pkdlog('{} {}: report is missing; skipping extract job', run_dir, jhash)
        return {}
    # figure out which backend and any backend-specific info
    runner_info_file = run_dir.join(_RUNNER_INFO_BASENAME)
    if runner_info_file.exists():
        runner_info = pkjson.load_any(runner_info_file)
    else:
        # Legacy run_dir
        runner_info = pkcollections.Dict(
            version=1,
            backend='local',
            backend_info={},
        )
    assert runner_info.version == 1
    # run the job
    cmd = ['sirepo', 'extract', subcmd, arg]
    result = await _BACKENDS[runner_info.backend].run_extract_job(
        run_dir,
        cmd,
        runner_info.backend_info,
    )
    if result.stderr:
        pkdlog(
            'got output on stderr ({} {}):\n{}',
            run_dir,
            jhash,
            result.stderr.decode('utf-8', errors='ignore'),
        )
    if result.returncode != 0:
        pkdlog(
            'failed with return code {} ({} {}), stdout:\n{}',
            result.returncode,
            run_dir,
            subcmd,
            result.stdout.decode('utf-8', errors='ignore'),
        )
        raise AssertionError
    return pkjson.load_any(result.stdout)
def run(self):
    """Start jobs if slots available else check for available"""
    pkdlog(
        '{}: {} available={}',
        self.name,
        self.__kind,
        len(self.__available_slots),
    )
    while True:
        self.__event.wait(_SLOT_MANAGER_POLL_SECS)
        got_one = False
        while True:
            with self.__lock:
                self.__event.clear()
                if not (self.__queued_jobs and self.__available_slots):
                    if self.__queued_jobs:
                        pkdlog(
                            'waiting: queue={} available={}',
                            [x.jid for x in self.__queued_jobs],
                            [str(x) for x in self.__available_slots],
                        )
                    break
                j = self.__queued_jobs.pop(0)
                s = self.__available_slots.pop(0)
                s.job = j
                self.__running_slots[j.jid] = s
            # have to release slot lock before locking job
            try:
                with j.lock:
                    if j._is_state_ok_to_start():
                        j._slot_start(s)
                        got_one = True
            except Exception as e:
                j._error_during_start(e, pkdexc())
                try:
                    j.kill()
                except Exception as e:
                    pkdlog(
                        '{}: error during cleanup after error: {}\n{}',
                        j.jid,
                        e,
                        pkdexc(),
                    )
        if not got_one:
            self._poll_running_jobs()
def sigchld_handler(cls, signum=None, frame=None):
    try:
        with cls._lock:
            if not cls._job:
                # Can't be our job so don't waitpid.
                # Only important at startup, when other modules
                # are doing popens, which does a waitpid.
                # see radiasoft/sirepo#681
                return
            pid, status = os.waitpid(-1, os.WNOHANG)
            pkdlog('{}: waitpid: status={}', pid, status)
            for self in cls._job.values():
                if self.pid == pid:
                    del self._job[self.jid]
                    pkdlog('{}: delete successful', self.jid)
                    return
    except OSError as e:
        if e.errno != errno.ECHILD:
            pkdlog('waitpid: OSError: {} errno={}', e.strerror, e.errno)
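# Hedged sketch (not in the original source): a handler like the one above is
# installed from the main thread with signal.signal, typically at runner init.
# The name _ExampleRunnerClass stands in for whichever class defines the
# classmethod in the real code.
def _example_install_sigchld(_ExampleRunnerClass):
    import signal
    signal.signal(signal.SIGCHLD, _ExampleRunnerClass.sigchld_handler)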
async def _incoming(content, handler):
    try:
        c = content
        if not isinstance(content, dict):
            c = pkjson.load_any(content)
        if c.get('api') != 'api_runStatus':
            pkdc(
                'class={} content={}',
                handler.sr_class,
                c,
            )
        await handler.sr_class(handler=handler, content=c).receive()
    except Exception as e:
        pkdlog('exception={} handler={} content={}', e, handler, content)
        pkdlog(pkdexc())
        try:
            handler.sr_on_exception()
        except Exception as e:
            pkdlog('sr_on_exception: exception={}', e)
def api_authEmailAuthorized(simulation_type, token):
    """Clicked by user in an email

    Token must exist in db and not be expired.
    """
    if http_request.is_spider():
        sirepo.util.raise_forbidden('robots not allowed')
    req = http_request.parse_params(type=simulation_type)
    with auth_db.thread_lock:
        u = AuthEmailUser.search_by(token=token)
        if u and u.expires >= srtime.utc_now():
            n = _verify_confirm(req.type, token, auth.need_complete_registration(u))
            u.query.filter(
                (AuthEmailUser.user_name == u.unverified_email),
                AuthEmailUser.unverified_email != u.unverified_email,
            ).delete()
            u.user_name = u.unverified_email
            u.token = None
            u.expires = None
            u.save()
            auth.login(this_module, sim_type=req.type, model=u, display_name=n)
            raise AssertionError('auth.login returned unexpectedly')
        if not u:
            pkdlog('login with invalid token={}', token)
        else:
            pkdlog(
                'login with expired token={}, email={}',
                token,
                u.unverified_email,
            )
        # if user is already logged in via email, then continue to the app
        if auth.user_if_logged_in(AUTH_METHOD):
            pkdlog(
                'user already logged in. ignoring invalid token: {}, user: {}',
                token,
                auth.logged_in_user(),
            )
            raise sirepo.util.Redirect(sirepo.uri.local_route(req.type))
        auth.login_fail_redirect(req.type, this_module, 'email-token')
def _write_rtdose_file(files, rtdose_path, prefix, filename=_VTI_RTDOSE_ZIP_FILE):
    rtdose = pydicom.dcmread(rtdose_path)
    doseinfo = _extract_dcm_info(files, None, rtdose)
    doseinfo.DoseMax = int(rtdose.pixel_array.max())
    doseinfo.DoseGridScaling = rtdose.DoseGridScaling
    pkdlog('max dose: {}, scaler: {}', doseinfo.DoseMax, doseinfo.DoseGridScaling)
    pkdlog('max dose (scaled): {}', rtdose.pixel_array.max() * rtdose.DoseGridScaling)
    #doseinfo.ImagePositionPatient[2] += (doseinfo.Count - 1) * doseinfo.SliceThickness
    #pkdp('dose pixel array size: {}', len(rtdose.pixel_array))
    pkio.mkdir_parent(_PIXEL_DATA_DIR)
    pkdlog(rtdose.pixel_array.shape)
    # order frame in direction used by ct (assumes HFS)
    with open(_PIXEL_DATA_FILE, 'ab') as f:
        #for di in reversed(range(rtdose.pixel_array.shape[0])):
        for di in range(rtdose.pixel_array.shape[0]):
            for yi in range(rtdose.pixel_array.shape[1]):
                pixels = rtdose.pixel_array[di][yi]
                # pixels = pixels.astype(np.uint16)
                pixels.tofile(f)
    _write_vti_file(filename, doseinfo, prefix)
    return doseinfo
async def _agent_starting_timeout_handler(self):
    pkdlog('{} timeout={}', self, self.cfg.agent_starting_secs)
    await self.kill()
    self.free_resources(internal_error='timeout waiting for agent to start')
def _receive_error(self, msg):
    #TODO(robnagler) what does this mean? Just a way of logging? Document this.
    pkdlog('{} msg={}', self, msg)
def send(self, op):
    pkdlog('{} {} runDir={}', self, op, op.msg.get('runDir'))
    self._websocket.write_message(pkjson.dump_bytes(op.msg))
def websocket_on_close(self):
    pkdlog('{}', self)
    self.free_resources()
def _simulation_run_status_runner_daemon(data, quiet=False):
    """Look for simulation status and output

    Args:
        data (dict): request
        quiet (bool): don't write errors to log

    Returns:
        dict: status response
    """
    try:
        run_dir = simulation_db.simulation_run_dir(data)
        jhash = template_common.report_parameters_hash(data)
        status = runner_client.report_job_status(run_dir, jhash)
        is_running = status is runner_client.JobStatus.RUNNING
        rep = simulation_db.report_info(data)
        res = {'state': status.value}
        if not is_running:
            if status is not runner_client.JobStatus.MISSING:
                res, err = runner_client.run_extract_job(
                    run_dir,
                    jhash,
                    'result',
                    data,
                )
                if err:
                    return _simulation_error(err, 'error in read_result', run_dir)
        if simulation_db.is_parallel(data):
            new = runner_client.run_extract_job(
                run_dir,
                jhash,
                'background_percent_complete',
                is_running,
            )
            new.setdefault('percentComplete', 0.0)
            new.setdefault('frameCount', 0)
            res.update(new)
        res['parametersChanged'] = rep.parameters_changed
        if res['parametersChanged']:
            pkdlog(
                '{}: parametersChanged=True req_hash={} cached_hash={}',
                rep.job_id,
                rep.req_hash,
                rep.cached_hash,
            )
        #TODO(robnagler) verify serial number to see what's newer
        res.setdefault('startTime', _mtime_or_now(rep.input_file))
        res.setdefault('lastUpdateTime', _mtime_or_now(rep.run_dir))
        res.setdefault('elapsedTime', res['lastUpdateTime'] - res['startTime'])
        if is_running:
            res['nextRequestSeconds'] = simulation_db.poll_seconds(rep.cached_data)
            res['nextRequest'] = {
                'report': rep.model_name,
                'reportParametersHash': rep.cached_hash,
                'simulationId': rep.cached_data['simulationId'],
                'simulationType': rep.cached_data['simulationType'],
            }
        pkdc(
            '{}: processing={} state={} cache_hit={} cached_hash={} data_hash={}',
            rep.job_id,
            is_running,
            res['state'],
            rep.cache_hit,
            rep.cached_hash,
            rep.req_hash,
        )
    except Exception:
        return _simulation_error(pkdexc(), quiet=quiet)
    return res
def run_timeout(self):
    if self.do_not_send:
        return
    pkdlog('opId={opId} opName={opName} maxRunSecs={maxRunSecs}', **self)
    self.set_canceled()
async def _terminate():
    try:
        await sirepo.job_supervisor.terminate()
    except Exception as e:
        pkdlog('error={} stack={}', e, pkdexc())
    tornado.ioloop.IOLoop.current().stop()
def __req(self, route_or_uri, params, query, op, raw_response, **kwargs):
    """Make request and parse result

    Args:
        route_or_uri (str): string name of route or uri if contains '/' (http:// or '/foo')
        params (dict): parameters to apply to route
        op (func): how to request

    Returns:
        object: parsed JSON result
    """
    from pykern.pkdebug import pkdlog, pkdexc, pkdc, pkdp
    import pykern.pkjson
    import sirepo.http_reply
    import sirepo.uri
    import sirepo.util

    redirects = kwargs.setdefault('__redirects', 0) + 1
    assert redirects <= 5
    kwargs['__redirects'] = redirects
    u = None
    r = None
    try:
        u = sirepo.uri.server_route(route_or_uri, params, query)
        pkdc('uri={}', u)
        r = op(u)
        pkdc(
            'status={} data={}',
            r.status_code,
            '<snip-file>' if 'download-data-file' in u else r.data,
        )
        # Emulate code in sirepo.js to deal with redirects
        if r.status_code == 200 and r.mimetype == 'text/html':
            m = _JAVASCRIPT_REDIRECT_RE.search(pkcompat.from_bytes(r.data))
            if m:
                if m.group(1).endswith('#/error'):
                    raise sirepo.util.Error(
                        PKDict(error='server error uri={}'.format(m.group(1))),
                    )
                if kwargs.get('redirect', True):
                    # Execute the redirect
                    return self.__req(
                        m.group(1),
                        None,
                        None,
                        self.get,
                        raw_response,
                        __redirects=redirects,
                    )
                return flask.redirect(m.group(1))
        if r.status_code in (301, 302, 303, 305, 307, 308):
            if kwargs.get('redirect', True):
                # Execute the redirect
                return self.__req(
                    r.headers['Location'],
                    None,
                    None,
                    self.get,
                    raw_response,
                    __redirects=redirects,
                )
            return r
        if raw_response:
            return r
        # Treat SRException as a real exception (so we don't ignore them)
        d = pykern.pkjson.load_any(r.data)
        if (
            isinstance(d, dict)
            and d.get('state') == sirepo.http_reply.SR_EXCEPTION_STATE
        ):
            raise sirepo.util.SRException(
                d.srException.routeName,
                d.srException.params,
            )
        return d
    except Exception as e:
        if not isinstance(e, (sirepo.util.Reply)):
            pkdlog(
                'Exception: {}: msg={} uri={} status={} data={} stack={}',
                type(e),
                e,
                u,
                r and r.status_code,
                r and r.data,
                pkdexc(),
            )
        raise
def login(module, uid=None, model=None, sim_type=None, display_name=None, is_mock=False, want_redirect=False):
    """Login the user

    Raises an exception if successful, except in the case of methods

    Args:
        module (module): method module
        uid (str): user to login
        model (auth_db.UserDbBase): user to login (overrides uid)
        sim_type (str): app to redirect to
    """
    _validate_method(module, sim_type=sim_type)
    guest_uid = None
    if model:
        uid = model.uid
        # if previously cookied as a guest, move the non-example simulations into uid below
        m = cookie.unchecked_get_value(_COOKIE_METHOD)
        if m == METHOD_GUEST and module.AUTH_METHOD != METHOD_GUEST:
            guest_uid = _get_user() if _is_logged_in() else None
    if uid:
        _login_user(module, uid)
    if module.AUTH_METHOD in cfg.deprecated_methods:
        pkdlog('deprecated auth method={} uid={}'.format(module.AUTH_METHOD, uid))
        if not uid:
            # No user so clear cookie so this method is removed
            reset_state()
        # We are logged in with a deprecated method, and now the user
        # needs to login with an allowed method.
        login_fail_redirect(sim_type, module, 'deprecated', reload_js=not uid)
    if not uid:
        # No user in the cookie and method didn't provide one so
        # the user might be switching methods (e.g. github to email or guest to email).
        # Not allowed to go to guest from other methods, because there's
        # no authentication for guest.
        # Or, this is just a new user, and we'll create one.
        uid = _get_user() if _is_logged_in() else None
        m = cookie.unchecked_get_value(_COOKIE_METHOD)
        if uid and module.AUTH_METHOD not in (m, METHOD_GUEST):
            # switch this method to this uid (even for methods)
            # except if the same method, then assuming logging in as different user.
            # This handles the case where logging in as guest, creates a user every time
            _login_user(module, uid)
        else:
            uid = simulation_db.user_create(lambda u: _login_user(module, u))
            _create_roles_for_user(uid, module.AUTH_METHOD)
        if model:
            model.uid = uid
            model.save()
    if display_name:
        complete_registration(_parse_display_name(display_name))
    if is_mock:
        return
    if sim_type:
        if guest_uid and guest_uid != uid:
            simulation_db.move_user_simulations(guest_uid, uid)
        login_success_response(sim_type, want_redirect)
    assert not module.AUTH_METHOD_VISIBLE
def _init_hosts_slots_balance():
    """Balance sequential and parallel slot counts"""
    global _hosts_ordered

    def _host_cmp(a, b):
        """This (local) host will get (first) sequential slots.

        Sequential slots are "faster" and don't go over NFS (usually)
        so the interactive jobs will be more responsive (hopefully).

        We don't assign sequential slots randomly, but in fixed order.
        This helps reproduce bugs, because you know the first host is
        the sequential host. Slots are then randomized for execution.
        """
        if a.remote_ip == a.local_ip:
            return -1
        if b.remote_ip == b.local_ip:
            return +1
        return cmp(a.name, b.name)

    def _ratio_not_ok():
        """Minimum sequential job slots should be 40% of total"""
        mp = 0
        ms = 0
        for h in _hosts.values():
            mp += h.num_slots.parallel
            ms += h.num_slots.sequential
        if mp + ms == 1:
            # Edge case where ratio calculation can't work (only dev)
            h = _hosts.values()[0]
            h.num_slots.sequential = 1
            h.num_slots.parallel = 1
            return False
        # Must be at least one parallel slot
        if mp <= 1:
            return False
        #TODO(robnagler) needs to be more complex, because could have many more
        # parallel nodes than sequential, which doesn't need to be so large. This
        # is a good guess for reasonable configurations.
        r = float(ms) / (float(mp) + float(ms))
        return r < 0.4

    _hosts_ordered = sorted(_hosts.values(), cmp=_host_cmp)
    while _ratio_not_ok():
        for h in _hosts_ordered:
            # Balancing consists of making the first host have
            # all the sequential jobs, then the next host. This
            # is a guess at the best way to distribute sequential
            # vs parallel jobs.
            if h.num_slots.parallel > 0:
                # convert a parallel slot on first available host
                h.num_slots.sequential += _parallel_cores
                h.num_slots.parallel -= 1
                break
        else:
            raise AssertionError(
                'should never get here: {}'.format(pkdpretty(_hosts)),
            )
    for h in _hosts_ordered:
        pkdlog(
            '{}: parallel={} sequential={}',
            h.name,
            h.num_slots.parallel,
            h.num_slots.sequential,
        )
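# Worked example of the 40% rule above (illustrative numbers only, not from the
# original source): suppose _parallel_cores == 4 and the hosts together have
# parallel=4, sequential=2. The ratio is 2 / (4 + 2) = 0.33 < 0.4, so one
# parallel slot is converted, giving parallel=3, sequential=6 and a ratio of
# 6 / (3 + 6) = 0.67, which ends the while loop.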
def __init__(self, sr_args, *args, **kwargs):
    super(Reply, self).__init__()
    if args or kwargs:
        kwargs['pkdebug_frame'] = inspect.currentframe().f_back.f_back
        pkdlog(*args, **kwargs)
    self.sr_args = sr_args
def _raise(exc, fmt, *args, **kwargs):
    import werkzeug.exceptions

    kwargs['pkdebug_frame'] = inspect.currentframe().f_back.f_back
    pkdlog(fmt, *args, **kwargs)
    raise getattr(werkzeug.exceptions, exc)()
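# Hedged usage sketch (not from the original source): exc is the name of a
# werkzeug exception class looked up with getattr, so a caller would log and
# raise a 404 roughly like this; the path argument is only an example.
#
#   _raise('NotFound', '{}: simulation not found', path)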
def _simulation_run_status(data, quiet=False):
    """Look for simulation status and output

    Args:
        data (dict): request
        quiet (bool): don't write errors to log

    Returns:
        dict: status response
    """
    try:
        #TODO(robnagler): Lock
        rep = simulation_db.report_info(data)
        is_processing = runner.job_is_processing(rep.job_id)
        is_running = rep.job_status in _RUN_STATES
        res = {'state': rep.job_status}
        pkdc(
            '{}: is_processing={} is_running={} state={} cached_data={}',
            rep.job_id,
            is_processing,
            is_running,
            rep.job_status,
            bool(rep.cached_data),
        )
        if is_processing and not is_running:
            runner.job_race_condition_reap(rep.job_id)
            pkdc('{}: is_processing and not is_running', rep.job_id)
            is_processing = False
        template = sirepo.template.import_module(data)
        if is_processing:
            if not rep.cached_data:
                return _simulation_error(
                    'input file not found, but job is running',
                    rep.input_file,
                )
        else:
            is_running = False
            if rep.run_dir.exists():
                if hasattr(template, 'prepare_output_file') and 'models' in data:
                    template.prepare_output_file(rep.run_dir, data)
                res2, err = simulation_db.read_result(rep.run_dir)
                if err:
                    if simulation_db.is_parallel(data):
                        # allow parallel jobs to use template to parse errors below
                        res['state'] = 'error'
                    else:
                        if hasattr(template, 'parse_error_log'):
                            res = template.parse_error_log(rep.run_dir)
                            if res:
                                return res
                        return _simulation_error(err, 'error in read_result', rep.run_dir)
                else:
                    res = res2
        if simulation_db.is_parallel(data):
            new = template.background_percent_complete(
                rep.model_name,
                rep.run_dir,
                is_running,
            )
            new.setdefault('percentComplete', 0.0)
            new.setdefault('frameCount', 0)
            res.update(new)
        res['parametersChanged'] = rep.parameters_changed
        if res['parametersChanged']:
            pkdlog(
                '{}: parametersChanged=True req_hash={} cached_hash={}',
                rep.job_id,
                rep.req_hash,
                rep.cached_hash,
            )
        #TODO(robnagler) verify serial number to see what's newer
        res.setdefault('startTime', _mtime_or_now(rep.input_file))
        res.setdefault('lastUpdateTime', _mtime_or_now(rep.run_dir))
        res.setdefault('elapsedTime', res['lastUpdateTime'] - res['startTime'])
        if is_processing:
            res['nextRequestSeconds'] = simulation_db.poll_seconds(rep.cached_data)
            res['nextRequest'] = {
                'report': rep.model_name,
                'reportParametersHash': rep.cached_hash,
                'simulationId': rep.cached_data['simulationId'],
                'simulationType': rep.cached_data['simulationType'],
            }
        pkdc(
            '{}: processing={} state={} cache_hit={} cached_hash={} data_hash={}',
            rep.job_id,
            is_processing,
            res['state'],
            rep.cache_hit,
            rep.cached_hash,
            rep.req_hash,
        )
    except Exception:
        return _simulation_error(pkdexc(), quiet=quiet)
    return res
def _init_tables(app):
    if not os.path.exists(_db_filename(app)):
        pkdlog('creating user oauth database')
        _db.create_all()
def open(self):
    pkdlog(
        'uri={} remote_ip={} ',
        self.request.uri,
        self.request.remote_ip,
    )
def raise_not_found(fmt, *args, **kwargs):
    pkdlog(fmt, *args, **kwargs)
    raise werkzeug.exceptions.NotFound()
def destroy_op(self, op):
    pkdlog('destroy_op={}', op.opId)
    self._ops.remove(op)
    op.destroy()
async def _do_agent_start(self, op):
    log_file = 'job_agent.log'
    agent_start_dir = self._srdb_root
    script = f'''#!/bin/bash
{self._agent_start_dev()}
set -e
mkdir -p '{agent_start_dir}'
cd '{self._srdb_root}'
{self._agent_env()}
(/usr/bin/env; setsid {self.cfg.sirepo_cmd} job_agent start_sbatch) >& {log_file} &
disown
'''

    def write_to_log(stdout, stderr, filename):
        p = pkio.py_path(self._local_user_dir).join('agent-sbatch', self.cfg.host)
        pkio.mkdir_parent(p)
        pkjson.dump_pretty(
            PKDict(stdout=stdout, stderr=stderr),
            p.join(
                f'{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}-{filename}.log'
            ),
        )

    async def get_agent_log(connection):
        await tornado.gen.sleep(self.cfg.agent_log_read_sleep)
        async with connection.create_process(
            f'/bin/cat {agent_start_dir}/{log_file}'
        ) as p:
            o, e = await p.communicate()
            write_to_log(o, e, 'remote')

    try:
        async with asyncssh.connect(
            self.cfg.host,
            username=self._creds.username,
            password=self._creds.password + self._creds.otp
                if 'nersc' in self.cfg.host else self._creds.password,
            known_hosts=self._KNOWN_HOSTS,
        ) as c:
            async with c.create_process('/bin/bash --noprofile --norc -l') as p:
                o, e = await p.communicate(input=script)
                if o or e:
                    write_to_log(o, e, 'start')
            self.driver_details.pkupdate(
                host=self.cfg.host,
                username=self._creds.username,
            )
            try:
                await get_agent_log(c)
            except sirepo.util.ASYNC_CANCELED_ERROR:
                raise
            except Exception as e:
                pkdlog(
                    '{} e={} stack={}',
                    self,
                    e,
                    pkdexc(),
                )
    except asyncssh.misc.PermissionDenied:
        pkdlog('{}', pkdexc())
        self._srdb_root = None
        self._raise_sbatch_login_srexception('invalid-creds', op.msg)
    except asyncssh.misc.ProtocolError:
        pkdlog('{}', pkdexc())
        raise sirepo.util.UserAlert(
            f'Unable to connect to {self.cfg.host}. Please try again later.',
        )
    except OSError as e:
        pkdlog('{}', pkdexc())
        if e.errno == errno.EHOSTUNREACH:
            raise sirepo.util.UserAlert(
                f'Host {self.cfg.host} unreachable. Please try again later.',
            )
        raise
    finally:
        self.pkdel('_creds')
def _validate_method(module, sim_type=None):
    if module.AUTH_METHOD in valid_methods:
        return None
    pkdlog('invalid auth method={}'.format(module.AUTH_METHOD))
    login_fail_redirect(sim_type, module, 'invalid-method', reload_js=True)