def _execute(self):
    # calling the clients now
    self.test_result.startTestRun()
    detached = self.args.get('detach')

    if not detached:
        cb = ioloop.PeriodicCallback(self.refresh, self.refresh_rate,
                                     self.loop)
        cb.start()

    try:
        self._attach_publisher()
        logger.debug('Calling the broker...')
        res = self.client.run(self.args)
        self.run_id = res['run_id']
        self.agents = res['agents']

        if not detached:
            logger.debug('Waiting for results')
            self.loop.start()
        else:
            logger.info('Detached. run --attach to reattach')
    finally:
        if not detached:
            # end..
            cb.stop()
            self.test_result.stopTestRun()
            self.context.destroy()
            self.flush()

def execute(self, job, timeout=None, log_exceptions=True):
    """Runs the job

    Options:

    - **job**: Job to be performed. Can be a :class:`Job`
      instance or a string. If it's a string a :class:`Job` instance
      will be automatically created out of it.
    - **timeout**: maximum allowed time for a job to run.
      If not provided, uses the one defined in the constructor.

    If the job fails after the timeout, raises a :class:`TimeoutError`.

    This method is thread-safe and uses a lock. If you need to execute a
    lot of jobs simultaneously on a broker, use the :class:`Pool` class.
    """
    if timeout is None:
        timeout = self.timeout_max_overflow

    try:
        duration, res = timed(self.debug)(self._execute)(job, timeout)
    except Exception:
        # logged, connector replaced.
        if log_exceptions:
            logger.exception('Failed to execute the job.')
            logger.debug(str(job))
        raise

    if 'error' in res:
        raise ValueError(res['error'])

    return res['result']

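# Usage sketch (not part of the module): how a caller might drive
# execute(), assuming it belongs to the Client class used elsewhere in
# this listing.  The broker endpoint and the job payload are illustrative;
# TimeoutError is the exception already caught in run() further down.
#
#     client = Client('tcp://127.0.0.1:5553')     # example endpoint
#     try:
#         result = client.execute('{"command": "LIST_RUNS"}', timeout=5.)
#     except TimeoutError:
#         logger.warning('The broker did not answer within the timeout')
#     except ValueError as e:
#         # execute() raises ValueError when the reply carries an 'error' key
#         logger.warning('The job failed: %s' % e)
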
def __init__(self, endpoint=DEFAULT_HEARTBEAT, interval=10.,
             io_loop=None, ctx=None, register=5, onregister=None):
    self.loop = io_loop or ioloop.IOLoop.instance()
    self.daemon = True
    self.kill_context = ctx is None
    self.context = ctx or zmq.Context()
    self.endpoint = endpoint
    self.running = False
    self.interval = interval
    logger.debug('Publishing to ' + self.endpoint)
    self._endpoint = self.context.socket(zmq.PUB)
    self._endpoint.linger = 0
    self._endpoint.hwm = 0
    self._endpoint.bind(self.endpoint)
    self._cb = ioloop.PeriodicCallback(self._ping, interval * 1000,
                                       io_loop=self.loop)
    self.register = register
    self.current_register = 0
    self.onregister = onregister

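# Usage sketch: wiring the heartbeat publisher into an application.  The
# class name (Heartbeat) and its start()/stop() methods are assumptions
# based on the rest of this listing; the endpoint is illustrative.
#
#     def on_register():
#         logger.info('Starting a new REGISTER/BEAT cycle')
#
#     beat = Heartbeat('tcp://0.0.0.0:5556', interval=1., register=5,
#                      onregister=on_register)
#     beat.start()   # with register=5: one REGISTER, then 4 BEATs, repeated
#     ...
#     beat.stop()
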
def _deploy_python_deps(self, deps=None):
    # XXX pip hack to avoid uninstall
    # deploy python deps if asked
    deps = deps or self.args.get('python_dep', [])
    if deps == []:
        return

    nil = "lambda *args, **kw: None"
    code = ["from pip.req import InstallRequirement",
            "InstallRequirement.uninstall = %s" % nil,
            "InstallRequirement.commit_uninstall = %s" % nil,
            "import pip",
            "pip.main()"]

    cmd = [sys.executable, '-c', '"%s"' % ';'.join(code),
           'install', '-t', 'deps', '-I']

    for dep in deps:
        logger.debug('Deploying %r in %r' % (dep, os.getcwd()))
        process = subprocess.Popen(' '.join(cmd + [dep]), shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        if process.returncode != 0:
            raise Exception(stderr)

    sys.path.insert(0, 'deps')

def _handle_recv_back(self, msg):
    # do the message and send the result
    if self.debug:
        logger.debug('Message received')
        target = timed()(self.target)
    else:
        target = self.target

    duration = -1

    # results are sent with a PID:OK: or a PID:ERROR prefix
    try:
        with self.timer.run_message():
            res = target(Message.load_from_string(msg[0]))

        # did we time out?
        if self.timer.timed_out:
            # let's dump the last
            for line in self.timer.last_dump:
                logger.error(line)

        if self.debug:
            duration, res = res

        # we're working with strings
        if isinstance(res, unicode):
            res = res.encode('utf8')

        res = '%d:OK:%s' % (self.pid, res)
    except Exception, e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        exc = traceback.format_tb(exc_traceback)
        exc.insert(0, str(e))
        res = '%d:ERROR:%s' % (self.pid, '\n'.join(exc))
        logger.error(res)

def _remove_worker(self, worker_id):
    logger.debug('%r removed' % worker_id)
    self._workers.remove(worker_id)

    if worker_id in self._worker_times:
        del self._worker_times[worker_id]

    if worker_id in self._runs:
        del self._runs[worker_id]

def _prepare_filesystem(self):
    test_dir = self.args.get('test_dir')

    # in standalone mode we take care of creating
    # the files
    if test_dir is not None:
        if not self.slave:
            test_dir = test_dir + '-%d' % os.getpid()

        if not os.path.exists(test_dir):
            os.makedirs(test_dir)

        # grab the files, if any; each 'include_file' entry is
        # expanded as a glob pattern before copying
        includes = self.args.get('include_file', [])

        for pattern in includes:
            for file_ in glob(pattern):
                logger.debug('Copying %r' % file_)
                target = os.path.join(test_dir, file_)
                if os.path.isdir(file_):
                    if os.path.exists(target):
                        shutil.rmtree(target)
                    shutil.copytree(file_, target)
                else:
                    shutil.copyfile(file_, target)

        # change to execution directory if asked
        logger.debug('chdir %r' % test_dir)
        os.chdir(test_dir)

def __init__(self, target, backend=DEFAULT_BACKEND,
             heartbeat=DEFAULT_HEARTBEAT, register=DEFAULT_REG,
             ping_delay=10., ping_retries=3,
             params=None, timeout=DEFAULT_TIMEOUT_MOVF,
             max_age=DEFAULT_MAX_AGE, max_age_delta=DEFAULT_MAX_AGE_DELTA):
    logger.debug('Initializing the worker.')
    self.ctx = zmq.Context()
    self.backend = backend
    self._reg = self.ctx.socket(zmq.PUSH)
    self._reg.connect(register)
    self._backend = self.ctx.socket(zmq.REP)
    self._backend.identity = str(os.getpid())
    self._backend.connect(self.backend)
    self.target = target
    self.running = False
    self.loop = ioloop.IOLoop()
    self._backstream = zmqstream.ZMQStream(self._backend, self.loop)
    self._backstream.on_recv(self._handle_recv_back)
    self.ping = Stethoscope(heartbeat, onbeatlost=self.lost,
                            delay=ping_delay, retries=ping_retries,
                            ctx=self.ctx)
    self.debug = logger.isEnabledFor(logging.DEBUG)
    self.params = params
    self.pid = os.getpid()
    self.timeout = timeout
    self.timer = ExecutionTimer(timeout=timeout)
    self.max_age = max_age
    self.max_age_delta = max_age_delta
    self.delayed_exit = None
    self.lock = threading.RLock()

def run(args):
    is_slave = args.get('slave', False)
    has_agents = args.get('agents', None)
    attach = args.get('attach', False)

    if not attach and (is_slave or not has_agents):
        if args.get('test_runner', None) is not None:
            runner = ExternalRunner
        else:
            runner = LocalRunner
        try:
            return runner(args).execute()
        except Exception:
            print traceback.format_exc()
            raise
    else:
        if attach:
            # find out what's running
            client = Client(args['broker'])
            try:
                runs = client.list_runs()
            except TimeoutError:
                logger.info("Can't reach the broker at %r" % args['broker'])
                client.close()
                return 1

            if len(runs) == 0:
                logger.info("Nothing seems to be running on that broker.")
                client.close()
                return 1
            elif len(runs) == 1:
                run_id, run_data = runs.items()[0]
                __, started = run_data[-1]
            else:
                # we need to pick one
                raise NotImplementedError()

            counts = client.get_counts(run_id)
            events = [event for event, hits in counts]
            if 'stopTestRun' in events:
                logger.info("This test has just stopped.")
                client.close()
                return 1

            metadata = client.get_metadata(run_id)
            logger.debug('Reattaching run %r' % run_id)
            started = datetime.utcfromtimestamp(started)
            runner = DistributedRunner(args)
            try:
                return runner.attach(run_id, started, counts, metadata)
            except KeyboardInterrupt:
                _detach_question(runner)
        else:
            logger.debug('Summoning %d agents' % args['agents'])
            runner = DistributedRunner(args)
            try:
                return runner.execute()
            except KeyboardInterrupt:
                _detach_question(runner)

def _handle_reg(self, msg):
    if msg[0] == 'REGISTER':
        if msg[1] not in self._workers:
            logger.debug('%r registered' % msg[1])
            self._workers.append(msg[1])
    elif msg[0] == 'UNREGISTER':
        if msg[1] in self._workers:
            self._remove_worker(msg[1])

def __timed(*args, **kw):
    start = timer()
    try:
        res = func(*args, **kw)
    finally:
        duration = timer() - start
        if debug:
            logger.debug('%.4f' % duration)
    return duration, res

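# For context: __timed above is the innermost closure of a timing
# decorator.  A minimal sketch of the enclosing factory, consistent with
# calls such as ``timed(self.debug)(self._execute)(job, timeout)`` found
# elsewhere in this listing (the intermediate names and the timer import
# are assumptions):
#
#     from time import time as timer
#
#     def timed(debug=False):
#         def _timed(func):
#             def __timed(*args, **kw):
#                 start = timer()
#                 try:
#                     res = func(*args, **kw)
#                 finally:
#                     duration = timer() - start
#                     if debug:
#                         logger.debug('%.4f' % duration)
#                 return duration, res
#             return __timed
#         return _timed
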
def start(self):
    """Starts the loop"""
    logger.debug('Starting the loop')
    if self.running:
        return

    self.running = True
    self._initialize()
    time.sleep(self.warmup_delay)
    self._timer.start()

def _handle_recv(self, msg):
    self.tries = 0
    msg = msg[0]
    if msg == 'BEAT' and self.onbeat is not None:
        self.onbeat()
    elif self.onregister is not None:
        self.onregister()
    logger.debug(msg)

def __init__(self, broker=DEFAULT_FRONTEND, ping_delay=10., ping_retries=3,
             params=None, timeout=DEFAULT_TIMEOUT_MOVF,
             max_age=DEFAULT_MAX_AGE, max_age_delta=DEFAULT_MAX_AGE_DELTA):
    logger.debug('Initializing the agent.')
    self.debug = logger.isEnabledFor(logging.DEBUG)
    self.params = params
    self.pid = os.getpid()
    self.agent_id = '%s-%s' % (get_hostname(), self.pid)
    self.timeout = timeout
    self.max_age = max_age
    self.max_age_delta = max_age_delta
    self.env = os.environ.copy()
    self.running = False
    self._workers = {}
    self._max_id = defaultdict(int)

    # Let's ask the broker its options
    self.broker = broker
    client = Client(self.broker)

    # this will timeout in case the broker is unreachable
    result = client.ping()
    self.endpoints = result['endpoints']

    # Setup the zmq sockets
    self.loop = ioloop.IOLoop()
    self.ctx = zmq.Context()

    # backend socket - used to receive work from the broker
    self._backend = self.ctx.socket(zmq.ROUTER)
    self._backend.identity = self.agent_id
    self._backend.connect(self.endpoints['backend'])

    # register socket - used to register into the broker
    self._reg = self.ctx.socket(zmq.PUSH)
    self._reg.connect(self.endpoints['register'])

    # heartbeat socket - used to check if the broker is alive
    heartbeat = self.endpoints.get('heartbeat')
    if heartbeat is not None:
        logger.info("Heartbeat activated")
        self.ping = Stethoscope(heartbeat, onbeatlost=self.lost,
                                delay=ping_delay, retries=ping_retries,
                                ctx=self.ctx, io_loop=self.loop,
                                onregister=self.register)
    else:
        self.ping = None

    # Setup the zmq streams.
    self._backstream = zmqstream.ZMQStream(self._backend, self.loop)
    self._backstream.on_recv(self._handle_recv_back)

    self._check = ioloop.PeriodicCallback(self._check_proc,
                                          ping_delay * 1000,
                                          io_loop=self.loop)

def _run_python_tests(self):
    # resolve the name now
    logger.debug('Resolving the test fqn')
    self._resolve_name()

    logger.debug('Ready to spawn greenlets for testing.')
    agent_id = self.args.get('agent_id')
    exception = None

    try:
        if not self.args.get('no_patching', False):
            logger.debug('Gevent monkey patches the stdlib')
            from gevent import monkey
            monkey.patch_all()

        if not hasattr(self.test, 'im_class'):
            raise ValueError("The FQN of the test doesn't point to a test "
                             "class (%s)." % self.test)

        gevent.spawn(self._grefresh)

        if not self.args.get('externally_managed'):
            self.test_result.startTestRun(agent_id)

        for user in self.users:
            if self.stop:
                break

            group = []
            for i in range(user):
                group.append(gevent.spawn(self._run, i, user))
                gevent.sleep(0)

            gevent.joinall(group)
            gevent.sleep(0)

        if not self.args.get('externally_managed'):
            self.test_result.stopTestRun(agent_id)
    except KeyboardInterrupt:
        pass
    except Exception as e:
        exception = e
    finally:
        logger.debug('Test over - cleaning up')
        # be sure we flush the outputs that need it,
        # but do it only if we are in "normal" mode
        try:
            if not self.slave:
                self.flush()
            else:
                # in slave mode, be sure to close the zmq relay.
                self.test_result.close()
        finally:
            if exception:
                logger.debug('We had an exception, re-raising it')
                raise exception

def _kill_worker(self, proc):
    pid = proc.pid
    logger.debug('%d final termination' % proc.pid)

    if proc.poll() is None:
        logger.debug('Calling kill on %d' % proc.pid)
        try:
            proc.kill()
        except OSError:
            logger.exception('Cannot kill %d' % pid)

def run(args):
    if args.get('agents') is None or args.get('slave'):
        try:
            return Runner(args).execute()
        except Exception:
            print traceback.format_exc()
            raise
    else:
        logger.debug('Summoning %d agents' % args['agents'])
        return DistributedRunner(args).execute()

def stop(self):
    """Stops the Pinger"""
    logger.debug('Stopping the Pinger')
    self.running = False
    try:
        self._stream.flush()
    except zmq.ZMQError:
        pass
    self._timer.stop()

def _remove_agent(self, agent_id):
    logger.debug('%r removed' % agent_id)
    if agent_id in self._agents:
        self._agents.remove(agent_id)

    if agent_id in self._agent_times:
        del self._agent_times[agent_id]

    if agent_id in self._runs:
        del self._runs[agent_id]

def _handle_commands(self, message):
    # we get the messages from the broker here
    data = message.data
    command = data['command']

    logger.debug('Received command %s' % command)

    if command == 'RUN':
        # XXX should be done in _run or at least asynchronously
        filedata = data.get('filedata')
        if filedata:
            test_dir = data['args'].get('test_dir')
            if test_dir is None:
                test_dir = '.'
            if not os.path.exists(test_dir):
                os.makedirs(test_dir)
            unpack_include_files(filedata, test_dir)

        args = data['args']
        run_id = data.get('run_id')
        pid = self._run(args, run_id)
        return {'result': {'pids': [pid],
                           'agent_id': str(self.pid),
                           'command': command}}

    elif command in ('STATUS', '_STATUS'):
        status = {}
        run_id = data.get('run_id')
        if run_id is not None:
            status['run_id'] = run_id

        for pid, (proc, _run_id) in self._workers.items():
            if run_id is not None and run_id != _run_id:
                continue

            if proc.poll() is None:
                status[pid] = 'running'
            else:
                status[pid] = 'terminated'

        res = {'result': {'status': status, 'command': command}}
        return res

    elif command == 'STOP':
        return self._stop_runs(command)

    elif command == 'QUIT':
        try:
            return self._stop_runs(command)
        finally:
            sys.exit(0)

    raise NotImplementedError(command)

def _remove_agent(self, agent_id, reason='unspecified'):
    logger.debug('%r removed. %s' % (agent_id, reason))

    if agent_id in self._agents:
        del self._agents[agent_id]

    if agent_id in self._agent_times:
        del self._agent_times[agent_id]

    if agent_id in self._runs:
        del self._runs[agent_id]

def _initialize(self):
    logger.debug('Subscribing to ' + self.endpoint)
    self._endpoint = self.context.socket(zmq.SUB)
    self._endpoint.setsockopt(zmq.SUBSCRIBE, '')
    self._endpoint.linger = 0
    #self._endpoint.identity = str(os.getpid())
    self._endpoint.connect(self.endpoint)
    self._stream = zmqstream.ZMQStream(self._endpoint, self.loop)
    self._stream.on_recv(self._handle_recv)
    self._timer = ioloop.PeriodicCallback(self._delayed, self.delay * 1000,
                                          io_loop=self.loop)

def _call():
    try:
        res = getattr(self, cmd)(msg, data)
        res = {'result': res}
        self.broker.send_json(target, res)
    except Exception, e:
        logger.debug('Failed')
        exc_type, exc_value, exc_traceback = sys.exc_info()
        exc = traceback.format_tb(exc_traceback)
        exc.insert(0, str(e))
        self.broker.send_json(target, {'error': exc})

def _stop_runs(self, command):
    status = {}

    for pid, (proc, run_id) in self._workers.items():
        logger.debug('terminating proc for run %s' % str(run_id))
        if proc.poll() is None:
            proc.terminate()
        del self._workers[pid]
        status[pid] = {'status': 'terminated', 'run_id': run_id}

    self._sync_hb()
    return {'result': {'status': status, 'command': command}}

def stop(self):
    """Stops the Pinger"""
    logger.debug('Stopping the Pinger')
    self.running = False
    try:
        self._stream.flush()
    except zmq.ZMQError:
        pass
    self.tries = 0
    self._stream.stop_on_recv()
    self._timer.stop()
    self._endpoint.disconnect(self.endpoint)

def _ping(self):
    logger.debug('*beat*')
    if self.current_register == 0:
        if self.onregister is not None:
            self.onregister()
        self._endpoint.send('REGISTER')
    else:
        self._endpoint.send('BEAT')

    self.current_register += 1
    if self.current_register == self.register:
        self.current_register = 0

def recv(socket, max_retries=3, retry_sleep=0.1):
    retries = 0
    while retries < max_retries:
        try:
            return socket.recv(zmq.NOBLOCK)
        except zmq.ZMQError, e:
            logger.debug('Failed on recv()')
            logger.debug(str(e))
            if e.errno in (zmq.EFSM, zmq.EAGAIN):
                retries += 1
                time.sleep(retry_sleep)
            else:
                raise

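# Usage sketch: non-blocking receive with retries.  The socket type and
# endpoint are illustrative.  Note that, as written above, recv() returns
# None once max_retries is exhausted, so callers should check for that.
#
#     ctx = zmq.Context()
#     sock = ctx.socket(zmq.PULL)
#     sock.connect('tcp://127.0.0.1:5557')
#
#     data = recv(sock, max_retries=5, retry_sleep=0.2)
#     if data is None:
#         logger.warning('No message available after retrying')
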
def __init__(self, timeout=DEFAULT_TIMEOUT_MOVF, interval=.1):
    logger.debug('Initializing the execution timer. timeout is %.2f'
                 % timeout)
    threading.Thread.__init__(self)
    self.armed = self.running = False
    self.timeout = timeout
    self.daemon = True

    # creating a queue for I/O with the worker
    self.queue = Queue.Queue()
    self.interval = interval
    self.timed_out = self.working = False
    self.last_dump = None

def stop(self):
    """Stops the broker.
    """
    if not self.started:
        return

    self._backstream.flush()
    logger.debug('Stopping the heartbeat')
    self.pong.stop()
    logger.debug('Stopping the loop')
    self.loop.stop()
    self.started = False
    self.context.destroy(0)

def _stop(self):
    logger.debug('Stopping the worker')
    self.running = False
    try:
        self._backstream.flush()
    except zmq.core.error.ZMQError:
        pass
    self.loop.stop()
    self.ping.stop()
    self.timer.stop()
    time.sleep(.1)
    self.ctx.destroy(0)
    logger.debug('Worker is stopped')

def test_ended(self, run_id):
    # first of all, we want to mark it done in the DB
    logger.debug('test %s ended - marking the metadata' % run_id)
    self.update_metadata(run_id, stopped=True, active=False,
                         ended=time.time())

    # we want to ping all observers that things are done
    # for a given test.
    # get the list of observers
    args = self._db.get_metadata(run_id)
    observers = _compute_observers(args.get('observer'))

    if observers == []:
        self._db.summarize_run(run_id)
        return

    logger.debug('test %s ended - calling the observers' % run_id)

    # if we are using the web dashboard - we're just providing a link
    if self.broker.web_root is not None:
        test_result = '%s/run/%s' % (self.broker.web_root, run_id)
    else:
        # rebuild the test result instance
        test_result = RemoteTestResult(args=args)
        test_result.args = args

        if 'started' in args:
            started = args['started']
            started = datetime.datetime.utcfromtimestamp(started)
            test_result.startTestRun(when=started)

        test_result.set_counts(self._db.get_counts(run_id))

    # for each observer we call it with the test results
    for observer in observers:
        # get the options
        options = {}
        prefix = 'observer_%s_' % observer.name
        for name, value in args.items():
            if name.startswith(prefix):
                options[name[len(prefix):]] = value

        try:
            observer(args=args, **options)(test_result)
        except Exception:
            # the observer code failed. We want to log it
            logger.error('%r failed' % observer)

    self._db.summarize_run(run_id)

def _stop(self):
    logger.debug('Stopping the agent')
    self.running = False
    try:
        self._backstream.flush()
    except zmq.core.error.ZMQError:
        pass
    self.loop.stop()
    if self.ping is not None:
        self.ping.stop()
    self._check.stop()
    time.sleep(.1)
    self.ctx.destroy(0)
    logger.debug('Agent is stopped')

def _handle_recv_front(self, msg, tentative=0):
    """front => back

    All commands starting with CTRL_ are sent to the controller.
    """
    data = json.loads(msg[2])
    target = msg[:-1]
    cmd = data['command']

    # a command handled by the controller
    if cmd.startswith('CTRL_'):
        cmd = cmd[len('CTRL_'):]
        logger.debug('calling %s' % cmd)
        try:
            res = self.ctrl.run_command(cmd, msg, data)
        except Exception, e:
            logger.debug('Failed')
            exc_type, exc_value, exc_traceback = sys.exc_info()
            exc = traceback.format_tb(exc_traceback)
            exc.insert(0, str(e))
            self.send_json(target, {'error': exc})
        else:
            # sending back a synchronous result if needed.
            if res is not None:
                logger.debug('sync success %s' % str(res))
                self.send_json(target, res)
            else:
                logger.debug('async success')

def _handle_commands(self, message):
    # we get the messages from the broker here
    data = message.data
    command = data['command']

    logger.debug('Received command %s' % command)

    if command == 'RUN':
        # XXX should be done in _run or at least asynchronously
        if 'files' in data:
            self._copy_files(data)

        args = data['args']
        run_id = data.get('run_id')
        pid = self._run(args, run_id)
        return {'result': {'pids': [pid],
                           'agent_id': str(self.pid),
                           'command': command}}

    elif command in ('STATUS', '_STATUS'):
        status = {}
        run_id = data.get('run_id')

        for pid, (proc, _run_id) in self._workers.items():
            if run_id is not None and run_id != _run_id:
                continue

            if proc.poll() is None:
                status[pid] = 'running'
            else:
                status[pid] = 'terminated'

        res = {'result': {'status': status, 'command': command}}
        logger.debug('Status: %s' % str(res))
        return res

    elif command == 'STOP':
        return self._stop_runs(command)

    elif command == 'QUIT':
        try:
            return self._stop_runs(command)
        finally:
            sys.exit(0)

    raise NotImplementedError(command)

def stop(self):
    """Stops the agent.
    """
    if not self.running:
        return

    # telling the broker we are stopping
    try:
        self._reg.send_multipart(['UNREGISTER', str(self.pid)])
    except zmq.ZMQError:
        logger.debug('Could not unregister')

    # give it a chance to finish a message
    logger.debug('Starting the graceful period')
    delay = time.time() + self.timeout
    self.loop.add_timeout(delay, self._stop)

def send_to_agent(self, agent_id, msg):
    msg = list(msg)

    # now we can send to the right guy
    msg.insert(0, agent_id)
    try:
        self.broker._backstream.send_multipart(msg)
    except Exception, e:
        logger.debug('Failed to send %s' % str(msg))
        # we don't want to die on error. we just log it
        exc_type, exc_value, exc_traceback = sys.exc_info()
        exc = traceback.format_tb(exc_traceback)
        exc.insert(0, str(e))
        logger.error('\n'.join(exc))
        logger.debug('Removing agent')
        self._remove_agent(agent_id)

def stop(self):
    """Stops the agent.
    """
    if not self.running:
        return

    # telling the broker we are stopping
    try:
        self._reg.send_multipart(['UNREGISTER', str(self.pid)])
    except zmq.ZMQError:
        logger.debug('Could not unregister')

    # give it a chance to finish a message
    logger.debug('Starting the graceful period')
    self.graceful_delay = ioloop.DelayedCallback(self._stop,
                                                 self.timeout * 1000,
                                                 io_loop=self.loop)
    self.graceful_delay.start()

def _check_agent(self, agent_id):
    # XXX we'll want agents to register themselves
    # again after each heartbeat
    #
    # The broker will remove idling agents
    # just before sending a heartbeat.
    #
    # That will let us make sure a dead agent on
    # a distant box is removed
    if agent_id in self._agent_times:
        last_contact = self._agent_times.get(agent_id)
        if last_contact is not None:
            duration = time.time() - last_contact
            if duration > self.agent_timeout:
                logger.debug('The agent %r is slow (%.2f)'
                             % (agent_id, duration))
                return False
    return True