def two_processes_one_sup_test(self):
    """Run 'true' and 'false' under one supervisor and check both exit codes."""
    workdir = tempfile.mkdtemp()
    supd = SupDPidanticFactory(directory=workdir, name="tester")

    ok_proc = supd.get_pidantic(command="true", process_name="true", directory=workdir)
    ok_proc.start()
    bad_proc = supd.get_pidantic(command="false", process_name="false", directory=workdir)
    bad_proc.start()

    # Poll until both children have finished.
    while not (bad_proc.is_done() and ok_proc.is_done()):
        supd.poll()

    # 'false' must exit non-zero, 'true' must exit zero.
    self.assertNotEqual(bad_proc.get_result_code(), 0)
    self.assertEqual(ok_proc.get_result_code(), 0)

    supd.terminate()
def imediately_terminate_facorty_with_running_pgm_test(self):
    """Start a long-lived process and tear the factory down immediately."""
    workdir = tempfile.mkdtemp()
    supd = SupDPidanticFactory(directory=workdir, name="tester")
    cat_proc = supd.get_pidantic(command="/bin/cat", process_name="cat", directory=workdir)
    cat_proc.start()
    # Terminate right away, while the child is still running.
    supd.terminate()
def simple_get_cancel_test(self):
    """A created-but-never-started process is PENDING and can be cancelled."""
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    state = pidantic.get_state()
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(state, PIDanticState.STATE_PENDING)
    pidantic.cancel_request()
def simple_get_state_start_test(self):
    """Immediately after start() the process reports STARTING."""
    # Unique name so repeated runs don't collide in supervisord.
    name = "cat" + str(uuid.uuid4()).split("-")[0]
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/cat", process_name=name, directory=tempdir)
    pidantic.start()
    state = pidantic.get_state()
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(state, PIDanticState.STATE_STARTING)
    factory.terminate()
def simple_api_walk_through_test(self):
    """Walk the basic API: create, start, poll to completion, terminate."""
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    pidantic.start()
    # (removed an unused 'state = pidantic.get_state()' local)
    # Poll until the short-lived child exits.
    while not pidantic.is_done():
        factory.poll()
    factory.terminate()
def simple_get_state_exit_test(self):
    """A completed process reports STATE_EXITED."""
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    pidantic.start()
    while not pidantic.is_done():
        factory.poll()
    state = pidantic.get_state()
    # assertEquals is a deprecated alias; assertEqual is the supported name.
    self.assertEqual(state, PIDanticState.STATE_EXITED)
    factory.terminate()
def simple_return_code_success_test(self):
    """A failing command ('false') must report a non-zero result code."""
    workdir = tempfile.mkdtemp()
    supd = SupDPidanticFactory(directory=workdir, name="tester")
    failer = supd.get_pidantic(command="false", process_name="false", directory=workdir)
    failer.start()
    # Poll until the child has exited.
    while not failer.is_done():
        supd.poll()
    self.assertNotEqual(failer.get_result_code(), 0)
    supd.terminate()
def terminate_done_test(self):
    """Terminating an already-finished process must raise PIDanticStateException."""
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    pidantic.start()
    while not pidantic.is_done():
        factory.poll()
    try:
        pidantic.terminate()
        # self.fail() is the idiomatic unittest way to flag "should not get here"
        # (was: self.assertFalse(True, ...)).
        self.fail("should not get here")
    except PIDanticStateException:
        pass
    factory.terminate()
def simple_terminate_test(self):
    """Terminate a long-running process and verify a non-zero result code."""
    proc_name = str(uuid.uuid4()).split("-")[0]
    workdir = tempfile.mkdtemp()
    supd = SupDPidanticFactory(directory=workdir, name="tester")
    sleeper = supd.get_pidantic(command="/bin/sleep 5000", process_name=proc_name, directory=workdir)
    sleeper.start()
    supd.poll()
    sleeper.terminate()
    # Killed processes finish with a non-zero result.
    while not sleeper.is_done():
        supd.poll()
    self.assertNotEqual(sleeper.get_result_code(), 0)
    supd.terminate()
def simple_double_terminate_kill_test(self):
    """A second terminate() on the same process must raise an error."""
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 5000", process_name="longnap", directory=tempdir)
    pidantic.start()
    factory.poll()
    pidantic.terminate()
    try:
        pidantic.terminate()
        self.fail("The terminate call should raise an error")
    except PIDanticStateException:
        # BUG FIX: the original bare 'except:' also swallowed the AssertionError
        # raised by self.fail(), so the test passed even when terminate() did
        # NOT raise. Catch only the expected exception (as terminate_done_test does).
        pass
    while not pidantic.is_done():
        factory.poll()
    rc = pidantic.get_result_code()
    self.assertNotEqual(rc, 0)
    factory.terminate()
def restart_test(self):
    """Restarting a running process must give it a new, valid pid."""
    from time import sleep
    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/cat", process_name="cat", directory=tempdir)
    pidantic.start()
    # Wait for the first incarnation to reach RUNNING.
    while not pidantic.get_state() == PIDanticState.STATE_RUNNING:
        factory.poll()
        sleep(1)
    original_pid = pidantic._supd.get_all_state()[0]['pid']
    pidantic.restart()
    # Wait for the restarted incarnation to reach RUNNING.
    while not pidantic.get_state() == PIDanticState.STATE_RUNNING:
        factory.poll()
        sleep(1)
    new_pid = pidantic._supd.get_all_state()[0]['pid']
    # unittest assertions give real failure messages (was: bare asserts).
    self.assertNotEqual(int(new_pid), 0)
    self.assertNotEqual(new_pid, original_pid)
    # Clean up the supervisor like the other tests do (was missing).
    factory.terminate()
def state_change_callback_test(self):
    """The registered state-change callback must fire at least once."""
    # Use mutable closure state instead of a module-level global (the original
    # mutated a global 'cb_called' and used a Py2-only print statement).
    cb_called = [False]

    def my_callback(arg):
        cb_called[0] = True

    tempdir = tempfile.mkdtemp()
    factory = SupDPidanticFactory(directory=tempdir, name="tester")
    pidantic = factory.get_pidantic(command="/bin/sleep 1", process_name="sleep", directory=tempdir)
    pidantic.set_state_change_callback(my_callback, None)
    pidantic.start()
    while not pidantic.is_done():
        factory.poll()
    factory.terminate()
    self.assertTrue(cb_called[0])
class BrickWriterDispatcher(object):
    """Dispatches brick-write work to worker processes over ZeroMQ sockets.

    Work flows: put_work() -> prep_queue -> organize_work() ->
    _pending_work / work_queue -> workers -> receiver(), which retires
    successful work or requeues failures.
    """

    def __init__(self, failure_callback, num_workers=1, pidantic_dir=None, working_dir=None):
        self.guid = create_guid()
        self.prep_queue = queue.Queue()  # raw incoming work, drained by organize_work()
        self.work_queue = queue.Queue()  # keys of work ready for a worker
        self._pending_work = {}   # work_key -> (metrics, [work]) not yet handed out
        self._stashed_work = {}   # work_key -> (metrics, [work]) arriving while key is active
        self._active_work = {}    # work_key -> (worker_guid, packed work) in progress
        self._failures = {}       # packed work -> consecutive failure count
        self._do_stop = False
        self._count = -1
        self._shutdown = False
        self._failure_callback = failure_callback

        # REP socket answers worker requests for work ("provisioning").
        self.context = zmq.Context(1)
        self.prov_sock = self.context.socket(zmq.REP)
        self.prov_port = self._get_port(self.prov_sock)
        log.info('Provisioning url: tcp://*:{0}'.format(self.prov_port))

        # SUB socket (subscribed to everything) receives worker responses.
        self.resp_sock = self.context.socket(zmq.SUB)
        self.resp_port = self._get_port(self.resp_sock)
        self.resp_sock.setsockopt(zmq.SUBSCRIBE, '')
        log.info('Response url: tcp://*:{0}'.format(self.resp_port))

        # Guard against non-positive worker counts.
        self.num_workers = num_workers if num_workers > 0 else 1
        self.is_single_worker = self.num_workers == 1
        self.working_dir = working_dir or '.'
        self.pidantic_dir = pidantic_dir or './pid_dir'
        self.workers = []

        self._configure_workers()

    def _get_port(self, socket):
        """Bind socket to the first free port in PORT_RANGE and return it.

        NOTE(review): returns None implicitly if every port is in use.
        """
        for x in xrange(PORT_RANGE[0], PORT_RANGE[1]):
            try:
                socket.bind('tcp://*:{0}'.format(x))
                return x
            except ZMQError:
                continue

    def _configure_workers(self):
        """Start worker processes: in-line for one worker, supervised otherwise."""
        # TODO: if num_workers == 1, simply run one in-line (runs in a greenlet anyhow)
        if self.is_single_worker:
            from brick_worker import run_worker
            worker = run_worker(self.prov_port, self.resp_port)
            self.workers.append(worker)
        else:
            if os.path.exists(self.pidantic_dir):
                bdp = os.path.join(self.pidantic_dir, 'brick_dispatch')
                if os.path.exists(bdp):
                    # Archive leftover worker logs from a previous run, then delete them.
                    import zipfile, zlib
                    with zipfile.ZipFile(os.path.join(bdp, 'archived_worker_logs.zip'), 'a', zipfile.ZIP_DEFLATED) as f:
                        names = f.namelist()
                        for x in [x for x in os.listdir(bdp) if x.startswith('worker_') and x not in names]:
                            fn = os.path.join(bdp, x)
                            f.write(filename=fn, arcname=x)
                            os.remove(fn)
            else:
                os.makedirs(self.pidantic_dir)

            self.factory = SupDPidanticFactory(name='brick_dispatch', directory=self.pidantic_dir)
            # Check for old workers - FOR NOW, TERMINATE THEM TODO: These should be reusable...
            old_workers = self.factory.reload_instances()
            for x in old_workers:
                old_workers[x].cleanup()

            worker_cmd = 'bin/python coverage_model/brick_worker.py {0} {1}'.format(self.prov_port, self.resp_port)
            for x in xrange(self.num_workers):
                w = self.factory.get_pidantic(command=worker_cmd, process_name='worker_{0}'.format(x), directory=os.path.realpath(self.working_dir))
                w.start()
                self.workers.append(w)

            # Wait until every worker reports RUNNING (or fail hard on EXITED/unknown).
            ready = False
            while not ready:
                self.factory.poll()
                for x in self.workers:
                    s = x.get_state()
                    if s is PIDanticState.STATE_STARTING:
                        # NOTE(review): this break falls through to 'ready = True',
                        # so a STARTING worker does not actually keep us waiting -
                        # looks like it should be a for/else; verify against intent.
                        break
                    elif s is PIDanticState.STATE_RUNNING:
                        continue
                    elif s is PIDanticState.STATE_EXITED:
                        self.shutdown()
                        raise SystemError('Error starting worker - cannot continue')
                    else:
                        raise SystemError('Problem starting worker - cannot continue')
                ready = True

    def has_pending_work(self):
        """True if any work is queued but not yet handed to a worker."""
        return len(self._pending_work) > 0

    def has_active_work(self):
        """True if any work is currently held by a worker."""
        return len(self._active_work) > 0

    def has_stashed_work(self):
        """True if any work is stashed behind an active work_key."""
        return len(self._stashed_work) > 0

    def is_dirty(self):
        """True while any pending, stashed, or active work remains."""
        if not self.has_active_work():
            if not self.has_stashed_work():
                if not self.has_pending_work():
                    return False
        return True

    def get_dirty_values_async_result(self):
        """Return an AsyncResult that is set(True) once all outstanding work drains."""
        dirty_async_res = AsyncResult()

        def dirty_check(self, res):
            # Poll is_dirty() in a background greenlet until everything drains.
            while True:
                if self.is_dirty():
                    time.sleep(0.1)
                else:
                    res.set(True)
                    break

        spawn(dirty_check, self, dirty_async_res)
        return dirty_async_res

    def run(self):
        """Spawn the organizer, provisioner, and receiver greenlets."""
        self._do_stop = False
        self._org_g = spawn(self.organize_work)
        # NOTE(review): self.provisioner is not defined in this chunk; presumably
        # defined elsewhere in the file - confirm.
        self._prov_g = spawn(self.provisioner)
        self._rec_g = spawn(self.receiver)

    def shutdown(self, force=False, timeout=None):
        """Stop greenlets and workers, then close sockets and the ZMQ context.

        force=True skips waiting for the greenlets to drain their queues.
        """
        if self._shutdown:
            return
        # CBM TODO: Revisit to ensure this won't strand work or terminate workers before they complete their work...!!
        self._do_stop = True
        try:
            log.debug('Force == %s', force)
            if not force:
                log.debug('Waiting for organizer; timeout == %s', timeout)
                # Wait for the organizer to finish - ensures the prep_queue is empty
                self._org_g.join(timeout=timeout)
                log.debug('Waiting for provisioner; timeout == %s', timeout)
                # Wait for the provisioner to finish - ensures work_queue is empty
                self._prov_g.join(timeout=timeout)
                log.debug('Waiting for receiver; timeout == %s', timeout)
                # Wait for the receiver to finish - allows workers to finish their work
                self._rec_g.join(timeout=timeout)

            log.debug('Killing organizer, provisioner, and receiver greenlets')
            # Terminate the greenlets
            self._org_g.kill()
            self._prov_g.kill()
            self._rec_g.kill()
            log.debug('Greenlets killed')

            log.debug('Shutdown workers')
            # Shutdown workers - work should be completed by now...
            if self.is_single_worker:
                # Current work will be finished
                self.workers[0].stop()
            else:
                self.workers = self.factory.reload_instances()
                # CBM TODO: THIS DOES NOT ALLOW CURRENT WORK TO FINISH!!!
                for x in self.workers:
                    self.workers[x].cleanup()
                self.factory.terminate()
            log.debug('Workers shutdown')
        except:
            raise
        finally:
            log.debug('Closing provisioner and receiver sockets')
            # Close sockets
            self.prov_sock.close()
            self.resp_sock.close()
            log.debug('Sockets closed')
            log.debug('Terminating the context')
            self.context.term()
            log.debug('Context terminated')
            self._shutdown = True

    def organize_work(self):
        """Greenlet body: drain prep_queue, merging work into the pending/stash maps."""
        while True:
            # Exit only once stop was requested AND the queue is drained.
            if self._do_stop and self.prep_queue.empty():
                break
            try:
                # Timeout after 1 second to allow stopage and _stashed_work cleanup
                wd = self.prep_queue.get(timeout=1)
            except queue.Empty:
                # No new work added - see if there's anything on the stash to cleanup...
                for k in self._stashed_work:
                    log.debug('Cleanup _stashed_work...')
                    # Just want to trigger cleanup of the _stashed_work, pass an empty list of 'work', gets discarded
                    self.put_work(k, self._stashed_work[k][0], [])
                continue

            try:
                k, wm, w = wd
                is_list = isinstance(w, list)
                if k not in self._stashed_work and len(w) == 0:
                    log.debug('Discarding empty work')
                    continue

                log.debug('Work: %s', w)

                if k in self._active_work:
                    log.debug('Do Stash')
                    # The work_key is being worked on
                    if k not in self._stashed_work:
                        # Create the stash for this work_key
                        self._stashed_work[k] = (wm, [])
                    # Add the work to the stash
                    if is_list:
                        self._stashed_work[k][1].extend(w[:])
                    else:
                        self._stashed_work[k][1].append(w)
                else:
                    # If there is a stash for this work_key, prepend it to work
                    if k in self._stashed_work:
                        log.debug('Was a stash, prepend: %s, %s', self._stashed_work[k], w)
                        _, sv = self._stashed_work.pop(k)
                        if is_list:
                            sv.extend(w[:])
                        else:
                            sv.append(w)
                        w = sv
                        is_list = True  # Work is a list going forward!!
                    log.debug('Work: %s', w)

                    # The work_key is not yet pending
                    not_in_pend = k not in self._pending_work
                    if not_in_pend:
                        # Create the pending for this work_key
                        log.debug('-> new pointer \'%s\'', k)
                        self._pending_work[k] = (wm, [])
                    # Add the work to the pending
                    log.debug('-> adding work to \'%s\': %s', k, w)
                    if is_list:
                        self._pending_work[k][1].extend(w[:])
                    else:
                        self._pending_work[k][1].append(w)
                    if not_in_pend:
                        # Add the not-yet-pending work to the work_queue
                        self.work_queue.put(k)
            except:
                # Re-raise unchanged; kept only to mark the intended error boundary.
                raise

    def put_work(self, work_key, work_metrics, work):
        """Queue (work_key, work_metrics, work) for the organizer; raises after shutdown."""
        if self._shutdown:
            raise SystemError('This BrickDispatcher has been shutdown and cannot process more work!')
        self.prep_queue.put((work_key, work_metrics, work))

    def _add_failure(self, wp):
        """Count a failure for packed work wp; raise ValueError past WORK_FAILURE_RETRIES."""
        pwp = pack(wp)
        log.warn('Adding to _failures: %s', pwp)
        if pwp in self._failures:
            self._failures[pwp] += 1
        else:
            self._failures[pwp] = 1
        if self._failures[pwp] > WORK_FAILURE_RETRIES:
            raise ValueError('Maximum failure retries exceeded')

    def receiver(self):
        """Greenlet body: consume worker responses, retiring or requeueing work."""
        while True:
            # NOTE(review): this 'try:' has no matching except/finally anywhere in
            # the visible source - its handler appears to have been lost to the
            # formatting mangling; verify against the original file.
            try:
                if self.resp_sock.closed:
                    break
                # Stop once asked AND no work is outstanding.
                if self._do_stop and len(self._active_work) == 0:
                    break
                log.debug('Receive response message (loop)')
                # Non-blocking receive loop so stop requests are honored promptly.
                msg = None
                while msg is None:
                    try:
                        msg = self.resp_sock.recv(zmq.NOBLOCK)
                    except zmq.ZMQError, e:
                        if e.errno == zmq.EAGAIN:
                            if self._do_stop:
                                break
                            else:
                                time.sleep(0.1)
                        else:
                            raise
                if msg is not None:
                    resp_type, worker_guid, work_key, work = unpack(msg)
                    work = list(work) if work is not None else work
                    if resp_type == SUCCESS:
                        log.debug('Worker %s was successful', worker_guid)
                        wguid, pw = self._active_work.pop(work_key)
                        # Success clears any accumulated failure count.
                        if pw in self._failures:
                            self._failures.pop(pw)
                    elif resp_type == FAILURE:
                        log.debug('Failure reported for work on %s by worker %s', work_key, worker_guid)
                        if work_key is None:
                            # Worker failed before it did anything, put all work back on the prep queue to be reorganized by the organizer
                            # Because it failed so miserably, need to find the work_key based on guid
                            for k, v in self._active_work.iteritems():
                                if v[0] == worker_guid:
                                    work_key = k
                                    break
                            if work_key is not None:
                                wguid, pw = self._active_work.pop(work_key)
                                try:
                                    self._add_failure(pw)
                                except ValueError, e:
                                    self._failure_callback(e.message, unpack(pw))
                                    continue
                                self.put_work(*unpack(pw))
                        else:
                            # Normal failure
                            # Pop the work from active work, and queue the work returned by the worker
                            wguid, pw = self._active_work.pop(work_key)
                            try:
                                self._add_failure(pw)
                            except ValueError, e:
                                self._failure_callback(e.message, unpack(pw))
                                continue
                            _, wm, wk = unpack(pw)
                            self.put_work(work_key, wm, work)
class SupDExe(object):
    """Launches and tracks processes through a supervisord-backed SupDPidanticFactory.

    On construction, previously-known process instances are reloaded and each
    is wrapped in a PidWrapper, keyed by process name in self._known_pws.
    """

    def __init__(self, eeagent_cfg, log=logging):
        self.log = log
        self.log.debug("Starting SupDExe")
        self._working_dir = eeagent_cfg.launch_type.supd_directory
        self._eename = eeagent_cfg.name
        supdexe = _set_param_or_default(eeagent_cfg.launch_type, "supdexe", None)
        # Maximum number of concurrently running jobs.
        self._slots = int(eeagent_cfg.slots)
        self._factory = SupDPidanticFactory(directory=self._working_dir,
                                            name=self._eename,
                                            supdexe=supdexe)
        # Re-wrap any instances that survived a previous run.
        pidantic_instances = self._factory.reload_instances()
        self._known_pws = {}
        for name in pidantic_instances:
            pidantic = pidantic_instances[name]
            pw = PidWrapper(self, name)
            pw.set_pidantic(pidantic)
            self._known_pws[name] = pw
        self._state_change_cb = None
        self._state_change_cb_arg = None

    def set_state_change_callback(self, cb, user_arg):
        """Register cb(user_arg) for state changes on all current and future processes."""
        self._state_change_cb = cb
        self._state_change_cb_arg = user_arg
        for name in self._known_pws:
            # BUG FIX: was self._known_pws["name"] (the literal string "name"),
            # which could never look up the iterated process.
            pw = self._known_pws[name]
            pw.set_state_change_callback(self._state_change_cb,
                                         self._state_change_cb_arg)

    def run(self, name, parameters):
        """Create a process from parameters ('exec', 'argv', optional
        'working_directory') and start it if a slot is free; otherwise the
        request is cancelled. Returns the PidWrapper either way."""
        pw = PidWrapper(self, name)
        self._known_pws[name] = pw
        command = parameters["exec"] + " " + " ".join(parameters["argv"])
        run_dir = self._working_dir  # renamed from 'dir' to avoid shadowing the builtin
        if "working_directory" in parameters:
            run_dir = parameters["working_directory"]
        pid = self._factory.get_pidantic(command=command, process_name=name,
                                         directory=run_dir)
        pw.set_pidantic(pid)
        if self._state_change_cb:
            pw.set_state_change_callback(self._state_change_cb,
                                         self._state_change_cb_arg)
        # The new job is already in _known_pws and thus counted; <= keeps the
        # original slot semantics.
        running_jobs = self._get_running()
        if len(running_jobs) <= self._slots:
            pid.start()
        else:
            pid.cancel_request()
        return pw

    def get_known_pws(self):
        """Return the name -> PidWrapper mapping of all known processes."""
        return self._known_pws

    def _remove_proc(self, proc_name):
        # Forget a process entirely.
        del self._known_pws[proc_name]

    def lookup_id(self, process_name, ignore_round=False):
        """Find a PidWrapper by full id, or by upid alone when ignore_round is True.

        Returns None when no match is found.
        """
        if ignore_round:
            process_upid, _process_round = unmake_id(process_name)
            for name, proc in self._known_pws.iteritems():
                # '_round' avoids shadowing the round() builtin.
                upid, _round = unmake_id(name)
                if process_upid == upid:
                    return proc
            return None
        if process_name not in self._known_pws:
            return None
        return self._known_pws[process_name]

    def get_all(self):
        """Alias for get_known_pws()."""
        return self._known_pws

    def _get_running(self):
        # States that consume a slot.
        running_states = [PidWrapper.RUNNING,
                          PidWrapper.TERMINATING,
                          PidWrapper.PENDING]
        wrappers = self.get_all().values()
        # (removed a dead list comprehension whose result was immediately overwritten)
        return [i for i in wrappers if i.get_state() in running_states]

    def poll(self):
        """Poll the underlying factory."""
        return self._factory.poll()

    def terminate(self):
        """Tear down the underlying factory."""
        self._factory.terminate()
class SupDExe(object):
    """Launches and tracks processes through a supervisord-backed SupDPidanticFactory.

    On construction, previously-known process instances are reloaded and each
    is wrapped in a PidWrapper, keyed by process name in self._known_pws.
    """

    def __init__(self, eeagent_cfg):
        self._working_dir = eeagent_cfg.launch_type.supd_directory
        self._eename = eeagent_cfg.name
        supdexe = _set_param_or_default(eeagent_cfg.launch_type, 'supdexe', None)
        # Maximum number of concurrently running jobs.
        self._slots = int(eeagent_cfg.slots)
        self._factory = SupDPidanticFactory(directory=self._working_dir,
                                            name=self._eename,
                                            supdexe=supdexe)
        # Re-wrap any instances that survived a previous run.
        pidantic_instances = self._factory.reload_instances()
        self._known_pws = {}
        for name in pidantic_instances:
            pidantic = pidantic_instances[name]
            pw = PidWrapper(self, name)
            pw.set_pidantic(pidantic)
            self._known_pws[name] = pw

    def run(self, name, parameters):
        """Create a process from parameters ('exec', 'argv', optional
        'working_directory') and start it if a slot is free; otherwise the
        request is cancelled. Returns the PidWrapper either way."""
        pw = PidWrapper(self, name)
        self._known_pws[name] = pw
        command = parameters['exec'] + " " + ' '.join(parameters['argv'])
        run_dir = self._working_dir  # renamed from 'dir' to avoid shadowing the builtin
        if "working_directory" in parameters:
            run_dir = parameters["working_directory"]
        pid = self._factory.get_pidantic(command=command, process_name=name,
                                         directory=run_dir)
        pw.set_pidantic(pid)
        # The new job is already in _known_pws and thus counted; <= keeps the
        # original slot semantics.
        if len(self._get_running()) <= self._slots:
            pid.start()
        else:
            pid.cancel_request()
        return pw

    def get_known_pws(self):
        """Return the name -> PidWrapper mapping of all known processes."""
        return self._known_pws

    def _remove_proc(self, proc_name):
        # Forget a process entirely.
        del self._known_pws[proc_name]

    def lookup_id(self, name):
        """Return the PidWrapper for name, or None if unknown."""
        if name not in self._known_pws:
            return None
        return self._known_pws[name]

    def get_all(self):
        """Alias for get_known_pws()."""
        return self._known_pws

    def _get_running(self):
        # States that consume a slot.
        running_states = [PidWrapper.RUNNING,
                          PidWrapper.TERMINATING,
                          PidWrapper.REQUESTING]
        wrappers = self.get_all().values()
        # (removed a dead list comprehension whose result was immediately overwritten)
        return [i for i in wrappers if i.get_state() in running_states]

    def poll(self):
        """Poll the underlying factory."""
        return self._factory.poll()

    def terminate(self):
        """Tear down the underlying factory."""
        self._factory.terminate()
class BrickWriterDispatcher(object):
    """Dispatches brick-write work to worker processes over ZeroMQ sockets.

    Work flows: put_work() -> prep_queue -> organize_work() ->
    _pending_work / work_queue -> workers -> receiver(), which retires
    successful work or requeues failures.
    """

    def __init__(self, failure_callback, num_workers=1, pidantic_dir=None, working_dir=None):
        self.guid = create_guid()
        self.prep_queue = queue.Queue()  # raw incoming work, drained by organize_work()
        self.work_queue = queue.Queue()  # keys of work ready for a worker
        self._pending_work = {}   # work_key -> (metrics, [work]) not yet handed out
        self._stashed_work = {}   # work_key -> (metrics, [work]) arriving while key is active
        self._active_work = {}    # work_key -> (worker_guid, packed work) in progress
        self._failures = {}       # packed work -> consecutive failure count
        self._do_stop = False
        self._count = -1
        self._shutdown = False
        self._failure_callback = failure_callback

        # REP socket answers worker requests for work ("provisioning").
        self.context = zmq.Context(1)
        self.prov_sock = self.context.socket(zmq.REP)
        self.prov_port = self._get_port(self.prov_sock)
        log.info('Provisioning url: tcp://*:{0}'.format(self.prov_port))

        # SUB socket (subscribed to everything) receives worker responses.
        self.resp_sock = self.context.socket(zmq.SUB)
        self.resp_port = self._get_port(self.resp_sock)
        self.resp_sock.setsockopt(zmq.SUBSCRIBE, '')
        log.info('Response url: tcp://*:{0}'.format(self.resp_port))

        # Guard against non-positive worker counts.
        self.num_workers = num_workers if num_workers > 0 else 1
        self.is_single_worker = self.num_workers == 1
        self.working_dir = working_dir or '.'
        self.pidantic_dir = pidantic_dir or './pid_dir'
        self.workers = []

        self._configure_workers()

    def _get_port(self, socket):
        """Bind socket to the first free port in PORT_RANGE and return it.

        NOTE(review): returns None implicitly if every port is in use.
        """
        for x in xrange(PORT_RANGE[0], PORT_RANGE[1]):
            try:
                socket.bind('tcp://*:{0}'.format(x))
                return x
            except ZMQError:
                continue

    def _configure_workers(self):
        """Start worker processes: in-line for one worker, supervised otherwise."""
        # TODO: if num_workers == 1, simply run one in-line (runs in a greenlet anyhow)
        if self.is_single_worker:
            from brick_worker import run_worker
            worker = run_worker(self.prov_port, self.resp_port)
            self.workers.append(worker)
        else:
            if os.path.exists(self.pidantic_dir):
                bdp = os.path.join(self.pidantic_dir, 'brick_dispatch')
                if os.path.exists(bdp):
                    # Archive leftover worker logs from a previous run, then delete them.
                    import zipfile, zlib
                    with zipfile.ZipFile(os.path.join(bdp, 'archived_worker_logs.zip'), 'a', zipfile.ZIP_DEFLATED) as f:
                        names = f.namelist()
                        for x in [x for x in os.listdir(bdp) if x.startswith('worker_') and x not in names]:
                            fn = os.path.join(bdp, x)
                            f.write(filename=fn, arcname=x)
                            os.remove(fn)
            else:
                os.makedirs(self.pidantic_dir)

            self.factory = SupDPidanticFactory(name='brick_dispatch', directory=self.pidantic_dir)
            # Check for old workers - FOR NOW, TERMINATE THEM TODO: These should be reusable...
            old_workers = self.factory.reload_instances()
            for x in old_workers:
                old_workers[x].cleanup()

            worker_cmd = 'bin/python coverage_model/brick_worker.py {0} {1}'.format(self.prov_port, self.resp_port)
            for x in xrange(self.num_workers):
                w = self.factory.get_pidantic(command=worker_cmd, process_name='worker_{0}'.format(x), directory=os.path.realpath(self.working_dir))
                w.start()
                self.workers.append(w)

            # Wait until every worker reports RUNNING (or fail hard on EXITED/unknown).
            ready = False
            while not ready:
                self.factory.poll()
                for x in self.workers:
                    s = x.get_state()
                    if s is PIDanticState.STATE_STARTING:
                        # NOTE(review): this break falls through to 'ready = True',
                        # so a STARTING worker does not actually keep us waiting -
                        # looks like it should be a for/else; verify against intent.
                        break
                    elif s is PIDanticState.STATE_RUNNING:
                        continue
                    elif s is PIDanticState.STATE_EXITED:
                        self.shutdown()
                        raise SystemError('Error starting worker - cannot continue')
                    else:
                        raise SystemError('Problem starting worker - cannot continue')
                ready = True

    def has_pending_work(self):
        """True if any work is queued but not yet handed to a worker."""
        return len(self._pending_work) > 0

    def has_active_work(self):
        """True if any work is currently held by a worker."""
        return len(self._active_work) > 0

    def has_stashed_work(self):
        """True if any work is stashed behind an active work_key."""
        return len(self._stashed_work) > 0

    def is_dirty(self):
        """True while any pending, stashed, or active work remains."""
        if not self.has_active_work():
            if not self.has_stashed_work():
                if not self.has_pending_work():
                    return False
        return True

    def get_dirty_values_async_result(self):
        """Return an AsyncResult that is set(True) once all outstanding work drains."""
        dirty_async_res = AsyncResult()

        def dirty_check(self, res):
            # Poll is_dirty() in a background greenlet until everything drains.
            while True:
                if self.is_dirty():
                    time.sleep(0.1)
                else:
                    res.set(True)
                    break

        spawn(dirty_check, self, dirty_async_res)
        return dirty_async_res

    def run(self):
        """Spawn the organizer, provisioner, and receiver greenlets."""
        self._do_stop = False
        self._org_g = spawn(self.organize_work)
        # NOTE(review): self.provisioner is not defined in this chunk; presumably
        # defined elsewhere in the file - confirm.
        self._prov_g = spawn(self.provisioner)
        self._rec_g = spawn(self.receiver)

    def shutdown(self, force=False, timeout=None):
        """Stop greenlets and workers, then close sockets and the ZMQ context.

        force=True skips waiting for the greenlets to drain their queues.
        """
        if self._shutdown:
            return
        # CBM TODO: Revisit to ensure this won't strand work or terminate workers before they complete their work...!!
        self._do_stop = True
        try:
            log.debug('Force == %s', force)
            if not force:
                log.debug('Waiting for organizer; timeout == %s', timeout)
                # Wait for the organizer to finish - ensures the prep_queue is empty
                self._org_g.join(timeout=timeout)
                log.debug('Waiting for provisioner; timeout == %s', timeout)
                # Wait for the provisioner to finish - ensures work_queue is empty
                self._prov_g.join(timeout=timeout)
                log.debug('Waiting for receiver; timeout == %s', timeout)
                # Wait for the receiver to finish - allows workers to finish their work
                self._rec_g.join(timeout=timeout)

            log.debug('Killing organizer, provisioner, and receiver greenlets')
            # Terminate the greenlets
            self._org_g.kill()
            self._prov_g.kill()
            self._rec_g.kill()
            log.debug('Greenlets killed')

            log.debug('Shutdown workers')
            # Shutdown workers - work should be completed by now...
            if self.is_single_worker:
                # Current work will be finished
                self.workers[0].stop()
            else:
                self.workers = self.factory.reload_instances()
                # CBM TODO: THIS DOES NOT ALLOW CURRENT WORK TO FINISH!!!
                for x in self.workers:
                    self.workers[x].cleanup()
                self.factory.terminate()
            log.debug('Workers shutdown')
        except:
            raise
        finally:
            log.debug('Closing provisioner and receiver sockets')
            # Close sockets
            self.prov_sock.close()
            self.resp_sock.close()
            log.debug('Sockets closed')
            log.debug('Terminating the context')
            self.context.term()
            log.debug('Context terminated')
            self._shutdown = True

    def organize_work(self):
        """Greenlet body: drain prep_queue, merging work into the pending/stash maps."""
        while True:
            # Exit only once stop was requested AND the queue is drained.
            if self._do_stop and self.prep_queue.empty():
                break
            try:
                # Timeout after 1 second to allow stopage and _stashed_work cleanup
                wd = self.prep_queue.get(timeout=1)
            except queue.Empty:
                # No new work added - see if there's anything on the stash to cleanup...
                for k in self._stashed_work:
                    log.debug('Cleanup _stashed_work...')
                    # Just want to trigger cleanup of the _stashed_work, pass an empty list of 'work', gets discarded
                    self.put_work(k, self._stashed_work[k][0], [])
                continue

            try:
                k, wm, w = wd
                is_list = isinstance(w, list)
                if k not in self._stashed_work and len(w) == 0:
                    log.debug('Discarding empty work')
                    continue

                log.debug('Work: %s', w)

                if k in self._active_work:
                    log.debug('Do Stash')
                    # The work_key is being worked on
                    if k not in self._stashed_work:
                        # Create the stash for this work_key
                        self._stashed_work[k] = (wm, [])
                    # Add the work to the stash
                    if is_list:
                        self._stashed_work[k][1].extend(w[:])
                    else:
                        self._stashed_work[k][1].append(w)
                else:
                    # If there is a stash for this work_key, prepend it to work
                    if k in self._stashed_work:
                        log.debug('Was a stash, prepend: %s, %s', self._stashed_work[k], w)
                        _, sv = self._stashed_work.pop(k)
                        if is_list:
                            sv.extend(w[:])
                        else:
                            sv.append(w)
                        w = sv
                        is_list = True  # Work is a list going forward!!
                    log.debug('Work: %s', w)

                    # The work_key is not yet pending
                    not_in_pend = k not in self._pending_work
                    if not_in_pend:
                        # Create the pending for this work_key
                        log.debug('-> new pointer \'%s\'', k)
                        self._pending_work[k] = (wm, [])
                    # Add the work to the pending
                    log.debug('-> adding work to \'%s\': %s', k, w)
                    if is_list:
                        self._pending_work[k][1].extend(w[:])
                    else:
                        self._pending_work[k][1].append(w)
                    if not_in_pend:
                        # Add the not-yet-pending work to the work_queue
                        self.work_queue.put(k)
            except:
                # Re-raise unchanged; kept only to mark the intended error boundary.
                raise

    def put_work(self, work_key, work_metrics, work):
        """Queue (work_key, work_metrics, work) for the organizer; raises after shutdown."""
        if self._shutdown:
            raise SystemError('This BrickDispatcher has been shutdown and cannot process more work!')
        self.prep_queue.put((work_key, work_metrics, work))

    def _add_failure(self, wp):
        """Count a failure for packed work wp; raise ValueError past WORK_FAILURE_RETRIES."""
        pwp = pack(wp)
        log.warn('Adding to _failures: %s', pwp)
        if pwp in self._failures:
            self._failures[pwp] += 1
        else:
            self._failures[pwp] = 1
        if self._failures[pwp] > WORK_FAILURE_RETRIES:
            raise ValueError('Maximum failure retries exceeded')

    def receiver(self):
        """Greenlet body: consume worker responses, retiring or requeueing work."""
        while True:
            # NOTE(review): this 'try:' has no matching except/finally anywhere in
            # the visible source - its handler appears to have been lost to the
            # formatting mangling; verify against the original file.
            try:
                if self.resp_sock.closed:
                    break
                # Stop once asked AND no work is outstanding.
                if self._do_stop and len(self._active_work) == 0:
                    break
                log.debug('Receive response message (loop)')
                # Non-blocking receive loop so stop requests are honored promptly.
                msg = None
                while msg is None:
                    try:
                        msg = self.resp_sock.recv(zmq.NOBLOCK)
                    except zmq.ZMQError, e:
                        if e.errno == zmq.EAGAIN:
                            if self._do_stop:
                                break
                            else:
                                time.sleep(0.1)
                        else:
                            raise
                if msg is not None:
                    resp_type, worker_guid, work_key, work = unpack(msg)
                    work = list(work) if work is not None else work
                    if resp_type == SUCCESS:
                        log.debug('Worker %s was successful', worker_guid)
                        wguid, pw = self._active_work.pop(work_key)
                        # Success clears any accumulated failure count.
                        if pw in self._failures:
                            self._failures.pop(pw)
                    elif resp_type == FAILURE:
                        log.debug('Failure reported for work on %s by worker %s', work_key, worker_guid)
                        if work_key is None:
                            # Worker failed before it did anything, put all work back on the prep queue to be reorganized by the organizer
                            # Because it failed so miserably, need to find the work_key based on guid
                            for k, v in self._active_work.iteritems():
                                if v[0] == worker_guid:
                                    work_key = k
                                    break
                            if work_key is not None:
                                wguid, pw = self._active_work.pop(work_key)
                                try:
                                    self._add_failure(pw)
                                except ValueError, e:
                                    self._failure_callback(e.message, unpack(pw))
                                    continue
                                self.put_work(*unpack(pw))
                        else:
                            # Normal failure
                            # Pop the work from active work, and queue the work returned by the worker
                            wguid, pw = self._active_work.pop(work_key)
                            try:
                                self._add_failure(pw)
                            except ValueError, e:
                                self._failure_callback(e.message, unpack(pw))
                                continue
                            _, wm, wk = unpack(pw)
                            self.put_work(work_key, wm, work)
class SupDExe(object):
    """Launches and tracks processes through a supervisord-backed SupDPidanticFactory.

    On construction, previously-known process instances are reloaded and each
    is wrapped in a PidWrapper, keyed by process name in self._known_pws.
    """

    def __init__(self, eeagent_cfg, log=logging):
        self.log = log
        self.log.debug("Starting SupDExe")
        self._working_dir = eeagent_cfg.launch_type.supd_directory
        self._eename = eeagent_cfg.name
        supdexe = _set_param_or_default(eeagent_cfg.launch_type, 'supdexe', None)
        # Maximum number of concurrently running jobs.
        self._slots = int(eeagent_cfg.slots)
        self._factory = SupDPidanticFactory(directory=self._working_dir,
                                            name=self._eename,
                                            supdexe=supdexe)
        # Re-wrap any instances that survived a previous run.
        pidantic_instances = self._factory.reload_instances()
        self._known_pws = {}
        for name in pidantic_instances:
            pidantic = pidantic_instances[name]
            pw = PidWrapper(self, name)
            pw.set_pidantic(pidantic)
            self._known_pws[name] = pw
        self._state_change_cb = None
        self._state_change_cb_arg = None

    def set_state_change_callback(self, cb, user_arg):
        """Register cb(user_arg) for state changes on all current and future processes."""
        self._state_change_cb = cb
        self._state_change_cb_arg = user_arg
        for name in self._known_pws:
            # BUG FIX: was self._known_pws['name'] (the literal string 'name'),
            # which could never look up the iterated process.
            pw = self._known_pws[name]
            pw.set_state_change_callback(self._state_change_cb,
                                         self._state_change_cb_arg)

    def run(self, name, parameters):
        """Create a process from parameters ('exec', 'argv', optional
        'working_directory') and start it if a slot is free; otherwise the
        request is cancelled. Returns the PidWrapper either way."""
        pw = PidWrapper(self, name)
        self._known_pws[name] = pw
        command = parameters['exec'] + " " + " ".join(parameters['argv'])
        run_dir = self._working_dir  # renamed from 'dir' to avoid shadowing the builtin
        if "working_directory" in parameters:
            run_dir = parameters["working_directory"]
        pid = self._factory.get_pidantic(command=command, process_name=name,
                                         directory=run_dir)
        pw.set_pidantic(pid)
        if self._state_change_cb:
            pw.set_state_change_callback(self._state_change_cb,
                                         self._state_change_cb_arg)
        # The new job is already in _known_pws and thus counted; <= keeps the
        # original slot semantics.
        running_jobs = self._get_running()
        if len(running_jobs) <= self._slots:
            pid.start()
        else:
            pid.cancel_request()
        return pw

    def get_known_pws(self):
        """Return the name -> PidWrapper mapping of all known processes."""
        return self._known_pws

    def _remove_proc(self, proc_name):
        # Forget a process entirely.
        del self._known_pws[proc_name]

    def lookup_id(self, process_name, ignore_round=False):
        """Find a PidWrapper by full id, or by upid alone when ignore_round is True.

        Returns None when no match is found.
        """
        if ignore_round:
            process_upid, _process_round = unmake_id(process_name)
            for name, proc in self._known_pws.iteritems():
                # '_round' avoids shadowing the round() builtin.
                upid, _round = unmake_id(name)
                if process_upid == upid:
                    return proc
            return None
        if process_name not in self._known_pws:
            return None
        return self._known_pws[process_name]

    def get_all(self):
        """Alias for get_known_pws()."""
        return self._known_pws

    def _get_running(self):
        # States that consume a slot.
        running_states = [PidWrapper.RUNNING,
                          PidWrapper.TERMINATING,
                          PidWrapper.PENDING]
        wrappers = self.get_all().values()
        # (removed a dead list comprehension whose result was immediately overwritten)
        return [i for i in wrappers if i.get_state() in running_states]

    def poll(self):
        """Poll the underlying factory."""
        return self._factory.poll()

    def terminate(self):
        """Tear down the underlying factory."""
        self._factory.terminate()