def _timed(*args, **kw):
    """Call ``func`` with the given arguments and log how long it took.

    The elapsed wall-clock time, in seconds, is sent to the powerhose
    logger at debug level whether ``func`` returns or raises.  The result
    (or the exception) of ``func`` is passed through untouched.
    """
    # the logger is imported at call time
    from powerhose import logger

    began = time.time()
    try:
        result = func(*args, **kw)
    finally:
        elapsed = time.time() - began
        logger.debug('%.4f' % elapsed)
    return result
def run(self):
    """Register, start the pinger and serve incoming messages.

    The loop keeps polling until ``self.running`` is cleared, the pinger
    reports itself unresponsive, or the poll raises a ``zmq.ZMQError``.
    A ``JOB`` message is handed to ``self.target`` and answered with a
    ``JOBRES`` message; anything else is answered with ``ERROR``.
    """
    self.running = True
    self.register()
    self.pinger.start()

    while True:
        if not self.running or self.pinger.unresponsive:
            break

        try:
            ready = dict(self.poller.poll(self.timeout))
        except zmq.ZMQError:
            break

        for socket in ready:
            msg = unserialize(socket.recv())

            if msg[0] != 'JOB':
                socket.send('ERROR')
                continue

            # do the job and send the result
            started = time.time()
            try:
                res = self.target(msg[1:])
            except Exception as e:
                # XXX log the error
                # the error text is sent back in place of the result
                res = str(e)

            logger.debug('%.6f' % (time.time() - started))
            socket.send(serialize("JOBRES", msg[1], res))
def acquire(self, timeout=None):
    """Take a worker out of the queue of available workers.

    Should be used with :func:`release`.

    Options:

    - **timeout**: time in seconds to wait on the queue before raising a
      TimeoutError exception. Defaults to ``self.timeout``.

    Workers whose identity is no longer in ``self._workers`` are passed
    to :func:`delete` and the queue is queried again.
    """
    logger.debug('Trying to acquire a worker')
    wait = self.timeout if timeout is None else timeout

    try:
        while True:
            # wait for up to `wait` seconds on each attempt
            worker = self._available.get(timeout=wait)
            if worker.identity in self._workers:
                break
            # it has been removed
            self.delete(worker.identity)
    except Empty:
        raise TimeoutError("Could not get a worker")

    # backslash escaped explicitly; the logged text is unchanged
    logger.debug('we got one \\o/')
    return worker
def failed(self):
    """React to a failed ping: try to send a REMOVE message, then stop.

    A ``RegisterError`` raised while sending the message is ignored, so
    :func:`stop` is still called in that case.
    """
    logger.debug("ping failed let's die")

    try:
        self._msg('REMOVE', 'REMOVED')
    except RegisterError:
        # best effort only -- we are stopping anyway
        pass

    self.stop()
def run(self):
    """Register, start the pinger and serve incoming messages.

    Polling goes on until ``self.running`` is cleared, the pinger reports
    itself unresponsive, or the poll raises a ``zmq.ZMQError``.  Every
    received message is logged at debug level.  A ``JOB`` message is
    handed to ``self.target`` and answered with a ``JOBRES`` message;
    anything else is answered with ``ERROR``.
    """
    self.running = True
    self.register()
    self.pinger.start()

    while True:
        if not self.running or self.pinger.unresponsive:
            break

        try:
            ready = dict(self.poller.poll(self.timeout))
        except zmq.ZMQError:
            break

        for socket in ready:
            msg = unserialize(socket.recv())
            logger.debug(msg)

            if msg[0] != 'JOB':
                socket.send('ERROR')
                continue

            # do the job and send the result
            started = time.time()
            try:
                res = self.target(msg[1:])
            except Exception as e:
                # XXX log the error
                # the error text is sent back in place of the result
                res = str(e)

            logger.debug('%.6f' % (time.time() - started))
            socket.send(serialize("JOBRES", msg[1], res))
def execute(self, job, timeout=1.):
    """Execute a job and return the result.

    Options:

    - **job**: a :class:`Job` instance.
    - **timeout**: the maximum allowed time in seconds. (default: 1)

    The job is attempted up to ``self.retries`` times.  If every attempt
    fails, the error of the last attempt is raised:

    - :class:`TimeoutError`: timed out.
    - :class:`ExecutionError`: the worker has failed.

    In case of an execution error, the exception usually holds
    more details on the failure.

    If ``self.retries`` is 0 no attempt is made and None is returned.
    """
    from powerhose import logger

    last_error = None
    for attempt in range(self.retries):
        try:
            return self._execute(job, timeout)
        except (TimeoutError, ExecutionError) as e:
            # remember the failure so it can be reported once we give up
            last_error = e
            logger.debug(str(e))
            logger.debug('retrying - %d' % (attempt + 1))

    # all the attempts failed: surface the last error instead of
    # silently returning None
    if last_error is not None:
        raise last_error
def stop(self):
    """Stop ``self.registration`` if it was started.

    Does nothing when ``self.started`` is false; otherwise the flag is
    cleared once the registration has been stopped.
    """
    if self.started:
        logger.debug('Stopping registration at ' + self.endpoint)
        self.registration.stop()
        self.started = False
def start(self):
    """Start ``self.registration`` unless it is already started.

    Does nothing when ``self.started`` is true; otherwise the flag is
    set once the registration has been started.
    """
    if not self.started:
        logger.debug('Starting registration at ' + self.endpoint)
        self.registration.start()
        self.started = True
def stop(self):
    """Clear the ``running`` flag and join the thread.

    Does nothing when the pinger is not running.  A ``RuntimeError``
    raised by ``join()`` is ignored.
    """
    if self.running:
        logger.debug('stopping pinger')
        self.running = False
        try:
            self.join()
        except RuntimeError:
            # the thread could not be joined -- nothing more to do
            pass
def release(self, worker):
    """Return a worker to the queue of available workers.

    Should be used with :func:`acquire`.

    Options:

    - **worker**: the worker to put back.
    """
    logger.debug('releasing the worker')
    available = self._available
    available.put(worker)
def execute(self, job_id, job_data, timeout=1.):
    """Execute a job and return the result, retrying on failure.

    Options:

    - **job_id**: passed as-is to :func:`_execute`.
    - **job_data**: passed as-is to :func:`_execute`.
    - **timeout**: passed as-is to :func:`_execute`. (default: 1)

    The job is attempted up to ``self.retries`` times.  If every attempt
    fails with a TimeoutError or an ExecutionError, the error of the last
    attempt is raised.  If ``self.retries`` is 0 no attempt is made and
    None is returned.
    """
    from powerhose import logger

    last_error = None
    for attempt in range(self.retries):
        try:
            return self._execute(job_id, job_data, timeout)
        except (TimeoutError, ExecutionError) as e:
            # remember the failure so it can be reported once we give up
            last_error = e
            logger.debug(str(e))
            logger.debug('retrying - %d' % (attempt + 1))

    # all the attempts failed: surface the last error instead of
    # silently returning None
    if last_error is not None:
        raise last_error
def __init__(self, identity, socket, locker, fail_callable,
             duration=5., max_fails=10.):
    """Set up the pinger thread; nothing is sent until it is run.

    Options:

    - **identity**: stored as ``self.identity`` and logged at creation.
    - **socket**: stored and registered for ``zmq.POLLIN`` on a poller.
    - **locker**: stored as ``self.locker``.
    - **fail_callable**: stored as ``self.fail_callable``.
    - **duration**: stored as ``self.duration``. (default: 5)
    - **max_fails**: stored as ``self.max_fails``. (default: 10)
    """
    threading.Thread.__init__(self)

    self.duration = duration
    self.identity = identity
    logger.debug('starting pinger from %s' % self.identity)

    # collaborators handed in by the caller
    self.socket = socket
    self.locker = locker
    self.fail_callable = fail_callable
    self.max_fails = max_fails

    # state flags, all off at creation
    self.running = False

    # poller watching the socket for incoming data
    poller = zmq.Poller()
    self.poller = poller
    poller.register(self.socket, zmq.POLLIN)

    self.disabled = False
    self.unresponsive = False
def run(self):
    """Ping over the socket until stopped or until too many failures.

    On each iteration, while holding ``self.locker``, a ``PING`` message
    carrying ``self.identity`` is sent and the poller waits up to
    ``self.duration`` seconds for an answer.

    - a send or poll error counts as a failure;
    - no answer counts as a failure and ``self.fail_callable`` is called;
    - an answer other than ``PONG`` counts as a failure, calls
      ``self.fail_callable`` and ends the loop.

    Once the number of failures reaches ``self.max_fails`` the pinger
    sets ``self.unresponsive`` and exits.  While ``self.disabled`` is
    set, the loop just sleeps one second at a time.
    """
    self.running = True
    num_failed = 0

    while self.running:
        if num_failed >= self.max_fails:
            # too many failures: flag ourselves and leave the loop
            self.unresponsive = True
            self.running = False
            break

        if self.disabled:
            time.sleep(1.)
            continue

        with self.locker:
            try:
                data = serialize('PING', self.identity)
                logger.debug('[pinger] Pinging with ' + data)
                self.socket.send(data, zmq.NOBLOCK)
            except zmq.ZMQError as e:
                num_failed += 1
                logger.debug('[pinger] ' + str(e))
                continue

            try:
                # the poll timeout is in milliseconds
                events = dict(self.poller.poll(self.duration * 1000))
            except zmq.ZMQError as e:
                # the failure counter is the local variable; there is
                # no `self.num_failed` attribute to increment
                num_failed += 1
                logger.debug('[pinger] ' + str(e))
                continue

            if len(events) == 0:
                logger.debug('[pinger] ' + 'no pong!')
                self.fail_callable()
                num_failed += 1
            else:
                for socket in events:
                    res = socket.recv()
                    logger.debug('[pinger] ' + 'got ' + res)
                    if res != 'PONG':
                        # unexpected answer: stop pinging
                        self.running = False
                        self.fail_callable()
                        num_failed += 1
# NOTE(review): the default circus_controller port is 555 while the pubsub
# default is 5556 -- possibly a typo for 5555; left unchanged, confirm.
def __init__(self, endpoint, workers_cmd, num_workers=5, working_dir=None,
             circus_controller='tcp://127.0.0.1:555',
             circus_pubsub_endpoint='tcp://127.0.0.1:5556', env=None):
    """Start (or reuse) the job runner and the workers for an endpoint.

    Options:

    - **endpoint**: the runner endpoint.
    - **workers_cmd**: the command passed to ``CryptoWorkers``.
    - **num_workers**: number of workers. (default: 5)
    - **working_dir**: passed to ``CryptoWorkers``. (default: None)
    - **circus_controller**: passed as ``controller`` to
      ``CryptoWorkers``.
    - **circus_pubsub_endpoint**: passed as ``pubsub_endpoint`` to
      ``CryptoWorkers``.
    - **env**: environment for the workers, either a dict or a string
      of ``KEY=VALUE`` pairs separated by ``;``.  Empty segments in the
      string (e.g. a trailing ``;``) are ignored. (default: None)

    Every ``$PID`` marker found in *endpoint*, *workers_cmd* and the two
    circus endpoints is replaced by the current thread identifier.

    The runner and the workers are created only the first time an
    endpoint is seen; they are kept in the module-level ``_runners`` and
    ``_workers`` mappings.
    """
    # initialisation
    pid = str(thread.get_ident())
    self.endpoint = endpoint.replace('$PID', pid)
    self.workers_cmd = workers_cmd.replace('$PID', pid)
    circus_controller = circus_controller.replace('$PID', pid)
    circus_pubsub_endpoint = circus_pubsub_endpoint.replace('$PID', pid)

    envdict = {}
    if env is not None:
        if isinstance(env, dict):
            envdict = env
        else:
            for pair in env.split(';'):
                # tolerate '' and stray separators such as 'A=1;'
                if not pair.strip():
                    continue
                key, value = pair.split('=', 1)
                envdict[key] = value

    # register the runner and the workers in the global vars.
    if self.endpoint not in _runners:
        _runners[self.endpoint] = JobRunner(self.endpoint)
        _workers[self.endpoint] = CryptoWorkers(
            self.workers_cmd,
            num_workers=num_workers,
            working_dir=working_dir,
            controller=circus_controller,
            pubsub_endpoint=circus_pubsub_endpoint,
            env=envdict)

    self.runner = _runners[self.endpoint]
    logger.debug('Starting powerhose master')

    # start the runner ...
    self.runner.start()
    time.sleep(.5)

    self.workers = _workers[self.endpoint]
    # ... and the workers
    self.workers.start()

    # wait a bit
    time.sleep(1.)
def stop_runners():
    """Stop everything held in ``_workers``, then everything in ``_runners``.

    Progress is reported on the logger at debug level.
    """
    logger.debug("stop_runner starts")

    # the workers go first ...
    for group in _workers.values():
        group.stop()
    logger.debug("workers killed")

    # ... then the runners
    for master in _runners.values():
        logger.debug('Stopping powerhose master')
        master.stop()

    logger.debug("stop_runner ends")
def __init__(self, endpoint, workers_cmd, num_workers=5, working_dir=None,
             env=None):
    """Start (or reuse) the job runner and the workers for an endpoint.

    Options:

    - **endpoint**: the runner endpoint.
    - **workers_cmd**: the command passed to ``CryptoWorkers``.
    - **num_workers**: number of workers. (default: 5)
    - **working_dir**: passed to ``CryptoWorkers``. (default: None)
    - **env**: environment for the workers, either a dict or a string
      of ``KEY=VALUE`` pairs separated by ``;``.  Empty segments in the
      string are ignored.  When omitted, an empty environment is used.
      (default: None)

    The runner and the workers are created only the first time an
    endpoint is seen; they are kept in the module-level ``_runners`` and
    ``_workers`` mappings.
    """
    self.endpoint = endpoint
    self.workers_cmd = workers_cmd

    # always bound, so that env=None no longer ends in a NameError below
    envdict = {}
    if env is not None:
        if isinstance(env, dict):
            envdict = env
        else:
            for pair in env.split(';'):
                # tolerate '' and stray separators such as 'A=1;'
                if not pair.strip():
                    continue
                key, value = pair.split('=', 1)
                envdict[key] = value

    if self.endpoint not in _runners:
        _runners[self.endpoint] = JobRunner(self.endpoint)
        _workers[self.endpoint] = CryptoWorkers(self.workers_cmd,
                                                num_workers=num_workers,
                                                working_dir=working_dir,
                                                env=envdict)

    self.runner = _runners[self.endpoint]
    logger.debug('Starting powerhose master')
    self.runner.start()

    # pause between starting the runner and running the workers
    time.sleep(.5)

    self.workers = _workers[self.endpoint]
    self.workers.run()
def _execute(self, job, timeout=1.): worker = None timeout *= 1000. # timeout is in ms data = serialize("JOB", job.serialize()) logger.debug('Lets run that job') try: logger.debug('getting a worker') with self.workers.get_context() as worker: try: worker.send(data, zmq.NOBLOCK) except zmq.ZMQError, e: raise ExecutionError(str(e)) poller = zmq.Poller() poller.register(worker, zmq.POLLIN) try: events = dict(poller.poll(timeout)) except zmq.ZMQError, e: raise ExecutionError(str(e)) if events == {}: raise TimeoutError() for socket in events: try: msg = unserialize(socket.recv()) except zmq.ZMQError, e: raise ExecutionError(str(e)) if msg[0] == 'JOBRES': # we got a result return msg[-1] else: raise NotImplementedError(str(msg))
def stop(self):
    """Clear the ``alive`` flag and wait for the thread to finish.

    The :func:`join` call blocks until the thread has ended.
    """
    logger.debug('Stopping [workermgr]')

    # flag first, then wait
    self.alive = False
    self.join()
def stop(self):
    """Stop ``self.workers``, then join this thread."""
    logger.debug('Stopping powerhose workers')

    workers = self.workers
    workers.stop()

    self.join()
def run(self):
    """Log the start-up and call ``run()`` on ``self.workers``."""
    logger.debug('Starting powerhose workers')

    workers = self.workers
    workers.run()
def stop(self):
    """Stop ``self.trainer``, logging before and after the call."""
    logger.debug('stopping workers')

    trainer = self.trainer
    trainer.stop()

    logger.debug('stopping workers done')
def run(self):
    """Log the start-up and call ``start()`` on ``self.trainer``."""
    logger.debug('starting workers')

    trainer = self.trainer
    trainer.start()
def stop(self):
    """Stop ``self.arbiter``, logging before and after the call."""
    logger.debug('stopping workers')

    arbiter = self.arbiter
    arbiter.stop()

    logger.debug('stopping workers done')
events = dict(poller.poll(timeout)) except zmq.ZMQError, e: raise ExecutionError(str(e)) if events == {}: raise TimeoutError() for socket in events: try: msg = unserialize(socket.recv()) except zmq.ZMQError, e: raise ExecutionError(str(e)) if msg[0] == 'JOBRES': # we got a result return msg[-1] else: raise NotImplementedError(str(msg)) except Exception, e: logger.debug('something went wrong') if worker is not None: # killing this worker - it can come back on the next ping self.workers.delete(worker.identity) exc_type, exc_value, exc_traceback = sys.exc_info() exc = traceback.format_tb(exc_traceback) exc.insert(0, str(e)) raise ExecutionError('\n'.join(exc))
def stop(self):
    """Clear the ``alive`` flag and join the thread."""
    logger.debug('Stopping [workermgr]')

    # flag first, then wait
    self.alive = False
    self.join()
def run(self):
    """Answer the PING and REMOVE messages sent to ``self.endpoint``.

    A REP socket is bound to ``self.endpoint`` and polled while
    ``self.alive`` is set; the loop also ends if the poll raises a
    ``zmq.ZMQError``.  Each message gets exactly one reply:

    - fewer than two items: ``ERROR``;
    - ``PING``: ``PONG``; the name given in the message is connected
      through a new REQ socket and added to ``self.workers`` when it is
      not known yet;
    - ``REMOVE``: ``REMOVED``; the name is deleted from ``self.workers``
      when it is known;
    - anything else: ``ERROR``.
    """
    self.alive = True

    # channel to communicate with the workers
    logger.debug('Starting [workermgr]')
    client = self.context.socket(zmq.REP)
    client.identity = 'master'
    client.bind(self.endpoint)
    poller = zmq.Poller()
    poller.register(client, zmq.POLLIN)

    # in milliseconds
    poll_timeout = 1000

    while self.alive:
        try:
            events = dict(poller.poll(poll_timeout))
        except zmq.ZMQError as e:
            logger.debug("The poll failed")
            logger.debug(str(e))
            break

        for socket in events:
            msg = unserialize(socket.recv())

            if len(msg) < 2:
                # XXX log
                socket.send('ERROR')
                # the request has been answered; going on would index
                # past the message and try to reply a second time
                continue

            if msg[-2] == 'PING':
                logger.debug("[workermgr] Got a PING")
                if msg[-1] not in self.workers:
                    name = msg[-1]
                    logger.debug("Registered " + name)

                    # keep track of that worker
                    work = self.context.socket(zmq.REQ)
                    work.connect(name)
                    work.identity = name
                    self.workers.add(work)

                # in any case we pong back
                logger.debug("[workermgr] sent a PONG")
                socket.send('PONG')
            elif msg[-2] == 'REMOVE':
                if msg[-1] in self.workers:
                    logger.debug("[workermgr] Removing` " + msg[-1])
                    self.workers.delete(msg[-1])
                socket.send('REMOVED')
            else:
                logger.debug('Error')
                socket.send('ERROR')

        time.sleep(.1)
def start(self):
    """Start ``self.registration`` unless it is already started.

    ``self.started`` is set once the registration has been started.
    """
    if not self.started:
        logger.debug('Starting registration at ' + self.endpoint)
        self.registration.start()
        self.started = True
def stop(self):
    """Stop ``self.registration`` if it was started.

    ``self.started`` is cleared once the registration has been stopped.
    """
    if self.started:
        logger.debug('Stopping registration at ' + self.endpoint)
        self.registration.stop()
        self.started = False
def run(self):
    """Call ``start()`` on ``self.arbiter``, logging before and after."""
    logger.debug('starting workers')

    arbiter = self.arbiter
    arbiter.start()

    # logged once start() has returned
    logger.debug('workers Stopped')