def submit(self, jobs):
    '''
    Submit one or more jobs as RP tasks.

    The status of every job is reset first.  Then one task description per
    job is derived via `self._job_2_descr`, and all descriptions are handed
    to the task manager as a single bulk.  Finally, each job is registered
    along with its task, is bound to the executor, gets its status updated
    with the task's uid, and is advanced to `SUBMITTED`.

    jobs: a single job, or a list of jobs
    '''

    job_list = ru.as_list(jobs)

    # reset the status of all jobs before any description is derived
    for jb in job_list:
        initial = {'meta_data': {},
                   'exit_code': None,
                   'final'    : False}
        jb.status.update(initial)

    # derive RP task descriptions and submit them as one bulk
    descriptions = list()
    for jb in job_list:
        descriptions.append(self._job_2_descr(jb))

    submitted = self._tmgr.submit_tasks(descriptions)

    with self._lock:

        # TODO: bulk advance
        for pair in zip(submitted, job_list):

            tsk, jb = pair

            self._jobs[jb.uid] = [jb, tsk]
            jb._set_jex(self._executor)

            jb.status.update({'message'  : 'rp task submitted',
                              'time'     : time.time(),
                              'native_id': tsk.uid})

            self._executor._advance(jb, jpsi.status.SUBMITTED)
def _result_cb(self, msg):
    '''
    Handle a result message for a previously issued request.

    The request referenced by `msg['req']` is looked up in `self._requests`
    and is updated with the output, error and return value carried by the
    message.  `self.result_cb` is then called with a one-element list holding
    that request; whatever it returns is interpreted as new item(s) and each
    of them is passed to `self.request`.

    msg: mapping with the keys 'req' (request UID), 'out', 'err' and 'ret'

    Lookup errors for missing message keys or unknown request UIDs are
    propagated.  Exceptions raised by `self.result_cb` or `self.request` are
    logged and not propagated.
    '''

    # self._log.debug('master _result_cb: %s', msg)

    # update result and error information for the corresponding request UID
    uid = msg['req']
    out = msg['out']
    err = msg['err']
    ret = msg['ret']

    req = self._requests[uid]
    req.set_result(out, err, ret)

    try:
        new_items = ru.as_list(self.result_cb([req]))
        for item in new_items:
            self.request(item)

    except Exception:
        # only catch regular errors: KeyboardInterrupt and SystemExit must
        # not be swallowed by a failing callback
        self._log.exception('result callback failed')
def _bulk_cbs(self, units, metrics=None):
    '''
    Invoke the registered callbacks for a bulk of units.

    Callbacks are collected by callback name over all given metrics, so that
    each callback is called once, with the list of units it applies to:
    wildcard (`'*'`) callbacks receive all `units`, unit specific callbacks
    receive those units for whose uid they are registered.  If `cb_data` was
    registered with a callback (and is not empty), it is passed as second
    argument.

    units  : collection of unit objects; each needs a `uid` attribute and
             must be hashable (units are collected in sets)
    metrics: metric or list of metrics, defaults to `rpc.UNIT_STATE`

    A metric without an entry in `self._callbacks` results in a lookup
    error.
    '''

    if not metrics:
        metrics = [rpc.UNIT_STATE]
    else:
        metrics = ru.as_list(metrics)

    cbs = dict()  # bulked callbacks to call

    with self._cb_lock:

        for metric in metrics:

            metric_cbs = self._callbacks[metric]

            # get wildcard callbacks - there may be none registered, thus
            # the empty default
            for cb_name, cb_spec in metric_cbs.get('*', {}).items():
                cbs[cb_name] = {'cb'     : cb_spec['cb'],
                                'cb_data': cb_spec['cb_data'],
                                'units'  : set(units)}

            # add unit specific callbacks if needed
            for unit in units:

                for cb_name, cb_spec in metric_cbs.get(unit.uid, {}).items():

                    if cb_name in cbs:
                        cbs[cb_name]['units'].add(unit)
                    else:
                        cbs[cb_name] = {'cb'     : cb_spec['cb'],
                                        'cb_data': cb_spec['cb_data'],
                                        'units'  : {unit}}

        # NOTE(review): the callbacks are invoked while the callback lock is
        #               still held - confirm that this is intended
        for entry in cbs.values():

            objs = list(entry['units'])

            if entry['cb_data']:
                entry['cb'](objs, entry['cb_data'])
            else:
                entry['cb'](objs)
def work(self, units):
    '''
    Main callback of the component, called for any incoming unit or bulk of
    units.

    Units arriving here are expected to be in `AGENT_SCHEDULING_PENDING`
    state, and are expected to leave in either `AGENT_EXECUTING_PENDING` or
    in a FINAL state (`FAILED` or `CANCELED`).  While handled by this
    component, the units are in `AGENT_SCHEDULING` state.

    This method only performs the initial state change to
    `AGENT_SCHEDULING` (which is published, but not pushed), and then
    forwards the units onto the queue towards the actual scheduling process
    (`self._schedule_units`).
    '''

    # bulks and non-bulks are handled alike
    bulk = ru.as_list(units)

    # advance state and publish the state change
    self.advance(bulk, rps.AGENT_SCHEDULING, publish=True, push=False)

    # hand the units over to the scheduler process
    self._queue_sched.put(bulk)
def unregister_callback(self, cb=None, metrics=None, uid=None):
    '''
    Remove callbacks from the callback registry `self._callbacks`, which is
    indexed as `self._callbacks[metric][uid][cb_name]`.

    cb     : callback to remove, identified by `cb.__name__`; if not given,
             all callbacks registered for the target are removed
    metrics: metric or list of metrics to unregister from; if not given, all
             metrics in `rpc.UMGR_METRICS` are used
    uid    : uid of the unit the callback was registered for; if not given,
             the wildcard target `'*'` is used

    Raises ValueError if
      - `uid` is given but is not known in `self._units`,
      - a metric is not listed in `rpc.UMGR_METRICS`,
      - a metric or the target has no entry in the registry,
      - a callback to remove is not registered for that metric and target.
    '''

    # NOTE(review): `rpc.UMGR_METRICS` is presumably a list of metrics.  It
    #               was wrapped into another list before, so that the default
    #               could never pass the validation below - confirm.
    if not metrics:
        metrics = ru.as_list(rpc.UMGR_METRICS)
    else:
        metrics = ru.as_list(metrics)

    if not uid:
        uid = '*'

    elif uid not in self._units:
        raise ValueError('no such unit %s' % uid)

    # validate all metrics before the registry is touched
    for metric in metrics:
        if metric not in rpc.UMGR_METRICS:
            raise ValueError("invalid umgr metric '%s'" % metric)

    with self._cb_lock:

        for metric in metrics:

            if metric not in self._callbacks:
                raise ValueError("cb metric '%s' invalid" % metric)

            if uid not in self._callbacks[metric]:
                raise ValueError("cb target '%s' invalid" % uid)

            # the registry is indexed by metric first, then by target
            registered = self._callbacks[metric][uid]

            if cb:
                to_delete = [cb.__name__]
            else:
                to_delete = list(registered)

            for cb_name in to_delete:

                if cb_name not in registered:
                    raise ValueError("cb %s not registered" % cb_name)

                del registered[cb_name]
def expand_ln(to_link, src_sbox, tgt_sbox, rid, cycle, task_id=None):
    '''
    Expand link specifications of the form `<src> > <tgt>`.

    Both sides of each specification are stripped and %-expanded with the
    mapping `{'rid': rid, 'cycle': cycle}` (placeholders like `%(rid)s`),
    and are then prefixed with the respective sandbox.

    to_link : a single link specification, or a list of specifications
    src_sbox: source sandbox, `'.'` is used if empty or None
    tgt_sbox: target sandbox, `'.'` is used if empty or None
    rid     : value for the `rid` placeholder
    cycle   : value for the `cycle` placeholder
    task_id : accepted for interface compatibility, currently not applied

    Returns a list of strings `<src_sbox>/<src> > <tgt_sbox>/<tgt>`, one per
    link specification.

    Raises RuntimeError if a specification cannot be expanded, and
    ValueError if a specification does not split into exactly two parts
    at `'>'`.
    '''

    expand = {'rid'  : rid,
              'cycle': cycle}

    if not src_sbox: src_sbox = '.'
    if not tgt_sbox: tgt_sbox = '.'

    ret = list()
    for data in ru.as_list(to_link):

        src, tgt = data.split('>')

        try:
            src = src.strip() % expand
            tgt = tgt.strip() % expand

        except Exception:
            raise RuntimeError('expansion error: %s : %s : %s'
                               % (src, tgt, expand))

        # NOTE(review): the result list was never populated, the function
        #               always returned an empty list.  The disabled variant
        #               which appended `_<task_id>` to the target tested
        #               `task_id is None` (inverted) and remains disabled -
        #               confirm whether the suffix is wanted.
        ret.append('%s/%s > %s/%s' % (src_sbox, src, tgt_sbox, tgt))

    return ret
def run(self):
    '''
    Run the replica exchange pipelines.

    The preparator `self._pre_alg` is called on the workload, and the file
    names it returns (plus the workload's input location, if not already
    included) are set as `shared_data`.  The source code of the exchange
    algorithm is written to `exchange_algorithm.py` in the workload's input
    location, and control is then handed to `re.AppManager.run`.
    '''

    # run the preparator, its result becomes the shared data
    shared = ru.as_list(self._pre_alg(self._workload))

    inputs = self._workload.data.inputs
    if inputs not in shared:
        shared.append(inputs)

    # write exchange algorithm to disk (once), and then stage with every
    # exchange task
    self._ex_alg_file = 'exchange_algorithm.py'
    target = '%s/%s' % (inputs, self._ex_alg_file)

    with open(target, 'w') as fout:
        source = inspect.getsource(self._exc_alg)
        name   = self._exc_alg.__name__
        fout.write(exchange_alg_prefix % (source, name))

    self.shared_data = shared

    re.AppManager.run(self)
def work_cb(self):
    '''
    Poll the registered input queues once, and dispatch whatever arrived to
    the worker methods.

    If no inputs are registered, the method idles for 0.1 seconds.
    Otherwise, for each input, things are fetched with `get_nowait`, are
    sorted into buckets by their state, and each bucket is handed to the
    worker registered for that state in `self._workers`.

    Things whose uid is found in `self._cancel_list` are removed from that
    list and are advanced to `CANCELED` before the worker is called.  If
    handling a bucket raises an exception, that exception is logged and all
    things of the bucket are advanced to `FAILED` - the component itself
    keeps running.

    The method always returns `True` (which, according to the comment at
    the final return, keeps the callback registered).
    '''

    # if no action occurs in this iteration, idle
    if not self._inputs:
        time.sleep(0.1)
        return True

    for name in self._inputs:

        input = self._inputs[name]['queue']
        states = self._inputs[name]['states']

        # FIXME: a simple, 1-thing caching mechanism would likely
        #        remove the req/res overhead completely (for any
        #        non-trivial worker).
        things = input.get_nowait(500)  # in microseconds
        things = ru.as_list(things)

        # NOTE: an empty read ends this invocation - inputs which come later
        #       in the iteration are not polled in this call
        if not things:
            return True

        # the worker target depends on the state of things, so we
        # need to sort the things into buckets by state before
        # pushing them
        buckets = dict()
        for thing in things:
            state = thing['state']
            uid = thing['uid']
            self._prof.prof('get', uid=uid, state=state)

            if state not in buckets:
                buckets[state] = list()
            buckets[state].append(thing)

        # We now can push bulks of things to the workers
        for state, things in buckets.items():

            # the state must be one this input is registered for, and it
            # must have a worker
            assert (state in states), 'inconsistent state'
            assert (state in self._workers), 'no worker for state %s' % state

            try:
                to_cancel = list()
                for thing in things:

                    uid = thing['uid']
                    ttype = thing['type']
                    state = thing['state']

                    # FIXME: this can become expensive over time
                    #        if the cancel list is never cleaned
                    if uid in self._cancel_list:
                        with self._cancel_lock:
                            self._cancel_list.remove(uid)
                        to_cancel.append(thing)

                    self._log.debug('got %s (%s)', ttype, uid)

                if to_cancel:
                    self.advance(to_cancel, rps.CANCELED, publish=True,
                                 push=False)

                # NOTE(review): canceled things are not removed from the
                #               bulk, the worker still receives them -
                #               confirm that this is intended
                with self._cb_lock:
                    self._workers[state](things)

            except Exception:
                # this is not fatal -- only the 'things' fail, not
                # the component
                self._log.exception("work %s failed", self._workers[state])
                self.advance(things, 
                             rps.FAILED, publish=True, push=False)

    # keep work_cb registered
    return True
def advance(self, things, state=None, publish=True, push=False, ts=None,
            prof=True):
    '''
    Things which have been operated upon are pushed down into the queues
    again, only to be picked up by the next component, according to their
    state model.  This method will update the thing state, and push it into
    the output queue registered as target for that state.

    things:  thing or list of things to advance
    state:   new state to set for the things; if not given, the things keep
             the state they carry
    publish: determine if state update notifications should be issued
    push:    determine if things should be pushed to outputs
    ts:      timestamp to use for profile events; the current time is used
             if not given
    prof:    determine if state advance creates a profile event
             (the 'drop', 'lost' and 'put' events of the push stage are
             always profiled)

    'Things' are expected to be dictionaries with the keys 'uid' and
    'state'; the key 'type' is read when a reduced state update is
    published.

    On publish: if a thing contains an '$all' key, or if it is in a final
    state, the complete dict is published; otherwise, only 'uid', 'type' and
    'state' are published, plus all keys which are listed under '$set'.
    The '$all' key is always removed from the things.

    On push: things are pushed in bulks per state.  Things in a final state,
    in a state without registered output, or in a state with an empty output
    are not pushed (and are profiled as 'drop' or 'lost').
    '''

    if not ts:
        ts = time.time()

    things = ru.as_list(things)

    if not things:
        return

    self._log.debug('advance bulk: %s [%s, %s]', len(things), push, publish)

    # assign state, sort things by state
    buckets = dict()
    for thing in things:

        uid = thing['uid']

        # if thing['type'] not in ['unit', 'pilot']:
        #     raise TypeError("thing has unknown type (%s)" % uid)

        if state:
            # state advance done here
            thing['state'] = state
        _state = thing['state']

        if prof:
            self._prof.prof('advance', uid=uid, state=_state, ts=ts)

        if _state not in buckets:
            buckets[_state] = list()
        buckets[_state].append(thing)

    # should we publish state information on the state pubsub?
    if publish:

        to_publish = list()

        # If '$all' is set, we update the complete thing_dict.
        # Things in final state are also published in full.
        # If '$set' is set, we also publish all keys listed in there.
        # In all other cases, we only send 'uid', 'type' and 'state'.
        
        for thing in things:

            if '$all' in thing:
                del (thing['$all'])
                to_publish.append(thing)

            elif thing['state'] in rps.FINAL:
                to_publish.append(thing)

            else:
                tmp = {
                    'uid': thing['uid'],
                    'type': thing['type'],
                    'state': thing['state']
                }
                for key in thing.get('$set', []):
                    tmp[key] = thing[key]
                to_publish.append(tmp)

        # all updates are sent as one bulk message
        self.publish(rpc.STATE_PUBSUB, {
            'cmd': 'update',
            'arg': to_publish
        })

        # ts = time.time()
        # for thing in things:
        #     self._prof.prof('publish', uid=thing['uid'],
        #                     state=thing['state'], ts=ts)

    # never carry $all across component boundaries!
    for thing in things:
        if '$all' in thing:
            del (thing['$all'])

    # should we push things downstream, to the next component
    if push:

        # the push target depends on the state of things, so we need to sort
        # the things into buckets by state before pushing them
        # now we can push the buckets as bulks
        for _state, _things in buckets.items():

            # ts = time.time()
            if _state in rps.FINAL:
                # things in final state are dropped
                for thing in _things:
                    self._log.debug('final %s [%s]', thing['uid'], _state)
                    self._prof.prof('drop', uid=thing['uid'], state=_state,
                                    ts=ts)
                continue

            if _state not in self._outputs:
                # unknown target state -- error
                for thing in _things:
                    import pprint
                    self._log.debug('%s', pprint.pformat(self._outputs))
                    self._log.debug("lost %s [%s]", thing['uid'], _state)
                    self._prof.prof('lost', uid=thing['uid'], state=_state,
                                    ts=ts)
                continue

            if not self._outputs[_state]:
                # empty output -- drop thing
                for thing in _things:
                    self._log.debug('drop %s [%s]', thing['uid'], _state)
                    self._prof.prof('drop', uid=thing['uid'], state=_state,
                                    ts=ts)
                continue

            output = self._outputs[_state]

            # push the thing down the drain
            self._log.debug('put bulk %s: %s', _state, len(_things))
            output.put(_things)

            # 'put' events are stamped with the time after the put; this
            # new timestamp is also used for any bucket handled after this
            # one
            ts = time.time()
            for thing in _things:
                self._prof.prof('put', uid=thing['uid'], state=_state,
                                msg=output.name, ts=ts)