def connect(self, raise_on_any_error=False):
    """
    Connect to hosts in hosts list. Returns status of connect as a dict.

    :param raise_on_any_error: Optional. Raise an exception if connecting to any one of
                               the hosts fails, instead of only when all connections fail.
    :type raise_on_any_error: ``boolean``

    :rtype: ``dict`` of ``str`` to ``dict``
    """
    results = {}

    for host in self._hosts:
        while not concurrency_lib.is_green_pool_free(self._pool):
            concurrency_lib.sleep(self._scan_interval)

        self._pool.spawn(self._connect, host=host, results=results,
                         raise_on_any_error=raise_on_any_error)

    concurrency_lib.green_pool_wait_all(self._pool)

    if self._successful_connects < 1:
        # We definitely have to raise an exception in this case.
        LOG.error('Unable to connect to any of the hosts.',
                  extra={'connect_results': results})
        msg = ('Unable to connect to any one of the hosts: %s.\n\n connect_errors=%s' %
               (self._hosts, json.dumps(results, indent=2)))
        raise NoHostsConnectedToException(msg)

    return results
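# A hedged usage sketch for connect() above. The class name and constructor arguments
# are assumptions for illustration, not a confirmed API.

client = ParallelSSHClient(hosts=['host1.example.com', 'host2.example.com'],
                           user='stanley', pkey_file='/home/stanley/.ssh/id_rsa')

# Returns a dict mapping each host to its per-host connect result; raises
# NoHostsConnectedToException only when not a single host could be reached.
results = client.connect(raise_on_any_error=False)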
def process_task(self, body, message):
    LOG.debug("process_task")
    LOG.debug(" body: %s", body)
    LOG.debug(" message.properties: %s", message.properties)
    LOG.debug(" message.delivery_info: %s", message.delivery_info)

    routing_key = message.delivery_info.get("routing_key", "")
    handler = self._handlers.get(routing_key, None)

    try:
        if not handler:
            LOG.debug("Skipping message %s as no handler was found.", message)
            return

        trigger_type = getattr(body, "type", None)
        if self._trigger_types and trigger_type not in self._trigger_types:
            LOG.debug(
                "Skipping message %s since trigger_type doesn't match (type=%s)",
                message,
                trigger_type,
            )
            return

        try:
            handler(body)
        except Exception as e:
            LOG.exception(
                "Handling failed. Message body: %s. Exception: %s",
                body,
                six.text_type(e),
            )
    finally:
        message.ack()

    concurrency.sleep(self.sleep_interval)
def test_sensor_watch_queue_gets_deleted_on_stop(self):
    def create_handler(sensor_db):
        pass

    def update_handler(sensor_db):
        pass

    def delete_handler(sensor_db):
        pass

    sensor_watcher = SensorWatcher(create_handler, update_handler, delete_handler,
                                   queue_suffix='covfefe')
    sensor_watcher.start()

    sw_queues = self._get_sensor_watcher_amqp_queues(
        queue_name='st2.sensor.watch.covfefe')

    start = monotonic()
    done = False
    while not done:
        concurrency.sleep(0.01)
        sw_queues = self._get_sensor_watcher_amqp_queues(
            queue_name='st2.sensor.watch.covfefe')
        # Done once the watch queue shows up or the 5 second timeout is exceeded
        done = len(sw_queues) > 0 or ((monotonic() - start) > 5)

    sensor_watcher.stop()

    sw_queues = self._get_sensor_watcher_amqp_queues(
        queue_name='st2.sensor.watch.covfefe')
    self.assertTrue(len(sw_queues) == 0)
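# The wait loop above is a poll-until-true-or-timeout pattern. A reusable sketch of
# that pattern (names are illustrative and not part of this codebase):

import time


def wait_for(predicate, timeout=5.0, interval=0.01):
    """Poll predicate() until it returns True or `timeout` seconds elapse.

    Returns True if the predicate succeeded, False if the timeout was hit first.
    """
    start = time.monotonic()
    while (time.monotonic() - start) < timeout:
        if predicate():
            return True
        time.sleep(interval)
    return False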
def process_task(self, body, message):
    LOG.debug('process_task')
    LOG.debug(' body: %s', body)
    LOG.debug(' message.properties: %s', message.properties)
    LOG.debug(' message.delivery_info: %s', message.delivery_info)

    routing_key = message.delivery_info.get('routing_key', '')
    handler = self._handlers.get(routing_key, None)

    try:
        if not handler:
            LOG.debug('Skipping message %s as no handler was found.', message)
            return

        trigger_type = getattr(body, 'type', None)
        if self._trigger_types and trigger_type not in self._trigger_types:
            LOG.debug('Skipping message %s since trigger_type doesn\'t match (type=%s)',
                      message, trigger_type)
            return

        try:
            handler(body)
        except Exception as e:
            LOG.exception('Handling failed. Message body: %s. Exception: %s',
                          body, six.text_type(e))
    finally:
        message.ack()

    concurrency.sleep(self.sleep_interval)
def test_child_processes_are_killed_on_sigint(self):
    process = self._start_sensor_container()

    # Give it some time to start up
    concurrency.sleep(7)

    # Assert process has started and is running
    self.assertProcessIsRunning(process=process)

    # Verify container process and children sensor / wrapper processes are running
    pp = psutil.Process(process.pid)
    children_pp = pp.children()

    self.assertEqual(pp.cmdline()[1:], DEFAULT_CMD[1:])
    self.assertEqual(len(children_pp), 1)

    # Send SIGINT
    process.send_signal(signal.SIGINT)

    # SIGINT causes graceful shutdown so give it some time to gracefully shut down the
    # sensor child processes
    concurrency.sleep(PROCESS_EXIT_TIMEOUT + 1)

    # Verify parent and children processes have exited
    self.assertProcessExited(proc=pp)
    self.assertProcessExited(proc=children_pp[0])

    self.remove_process(process=process)
def run(self):
    self._run_all_sensors()

    success_exception_cls = concurrency.get_greenlet_exit_exception_class()

    try:
        while not self._stopped:
            # Poll for all running processes
            sensor_ids = list(self._sensors.keys())

            if len(sensor_ids) >= 1:
                LOG.debug('%d active sensor(s)' % (len(sensor_ids)))
                self._poll_sensors_for_results(sensor_ids)
            else:
                LOG.debug('No active sensors')

            concurrency.sleep(self._poll_interval)
    except success_exception_cls:
        # This exception is thrown when the sensor container manager kills the thread
        # which runs the process container. Not sure if this is the best thing to do.
        self._stopped = True
        return SUCCESS_EXIT_CODE
    except Exception:
        LOG.exception('Container failed to run sensors.')
        self._stopped = True
        return FAILURE_EXIT_CODE

    self._stopped = True
    LOG.error('Process container stopped.')

    exit_code = self._exit_code or SUCCESS_EXIT_CODE
    return exit_code
def _perform_garbage_collection(self):
    LOG.info("Performing garbage collection...")

    proc_message = "Performing garbage collection for %s."
    skip_message = "Skipping garbage collection for %s since it's not configured."

    # Note: We sleep for a bit between garbage collection of each object type to
    # prevent busy waiting
    obj_type = "action executions"
    if (
        self._action_executions_ttl
        and self._action_executions_ttl >= MINIMUM_TTL_DAYS
    ):
        LOG.info(proc_message, obj_type)
        self._purge_action_executions()
        concurrency.sleep(self._sleep_delay)
    else:
        LOG.debug(skip_message, obj_type)

    obj_type = "action executions output"
    if (
        self._action_executions_output_ttl
        and self._action_executions_output_ttl >= MINIMUM_TTL_DAYS_EXECUTION_OUTPUT
    ):
        LOG.info(proc_message, obj_type)
        self._purge_action_executions_output()
        concurrency.sleep(self._sleep_delay)
    else:
        LOG.debug(skip_message, obj_type)

    obj_type = "trigger instances"
    if (
        self._trigger_instances_ttl
        and self._trigger_instances_ttl >= MINIMUM_TTL_DAYS
    ):
        LOG.info(proc_message, obj_type)
        self._purge_trigger_instances()
        concurrency.sleep(self._sleep_delay)
    else:
        LOG.debug(skip_message, obj_type)

    obj_type = "inquiries"
    if self._purge_inquiries:
        LOG.info(proc_message, obj_type)
        self._timeout_inquiries()
        concurrency.sleep(self._sleep_delay)
    else:
        LOG.debug(skip_message, obj_type)

    obj_type = "orphaned workflow executions"
    if self._workflow_execution_max_idle > 0:
        LOG.info(proc_message, obj_type)
        self._purge_orphaned_workflow_executions()
        concurrency.sleep(self._sleep_delay)
    else:
        LOG.debug(skip_message, obj_type)
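# Each purge above deletes objects whose timestamp is older than the configured TTL in
# days. An illustrative sketch of how such a cutoff is derived (not the actual purge
# code, which lives behind the _purge_* methods):

import datetime


def ttl_cutoff(ttl_days):
    # Objects with a timestamp before this cutoff are eligible for deletion.
    now = datetime.datetime.now(datetime.timezone.utc)
    return now - datetime.timedelta(days=ttl_days)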
def test_no_sensors_dont_quit(self):
    process_container = ProcessSensorContainer(None, poll_interval=0.1)
    process_container_thread = concurrency.spawn(process_container.run)
    concurrency.sleep(0.5)

    self.assertEqual(process_container.running(), 0)
    self.assertEqual(process_container.stopped(), False)

    process_container.shutdown()
    process_container_thread.kill()
def _main_loop(self):
    while self._running:
        self._perform_garbage_collection()

        LOG.info("Sleeping for %s seconds before next garbage collection..." %
                 (self._collection_interval))
        concurrency.sleep(self._collection_interval)
def _execute_in_pool(self, execute_method, **kwargs):
    results = {}

    for host in self._bad_hosts.keys():
        results[host] = self._bad_hosts[host]

    for host in self._hosts_client.keys():
        while not concurrency_lib.is_green_pool_free(self._pool):
            concurrency_lib.sleep(self._scan_interval)

        self._pool.spawn(execute_method, host=host, results=results, **kwargs)

    concurrency_lib.green_pool_wait_all(self._pool)

    return results
def _respawn_sensor(self, sensor_id, sensor, exit_code):
    """
    Method for respawning a sensor which died with a non-zero exit code.
    """
    extra = {"sensor_id": sensor_id, "sensor": sensor}

    if self._single_sensor_mode:
        # In single sensor mode we want to exit immediately on failure
        LOG.info(
            "Not respawning a sensor since running in single sensor mode",
            extra=extra,
        )

        self._stopped = True
        self._exit_code = exit_code
        return

    if self._stopped:
        LOG.debug("Stopped, not respawning a dead sensor", extra=extra)
        return

    should_respawn = self._should_respawn_sensor(sensor_id=sensor_id, sensor=sensor,
                                                 exit_code=exit_code)

    if not should_respawn:
        LOG.debug("Not respawning a dead sensor", extra=extra)
        return

    LOG.debug("Respawning dead sensor", extra=extra)

    self._sensor_respawn_counts[sensor_id] += 1
    sleep_delay = SENSOR_RESPAWN_DELAY * self._sensor_respawn_counts[sensor_id]
    concurrency.sleep(sleep_delay)

    try:
        self._spawn_sensor_process(sensor=sensor)
    except Exception as e:
        LOG.warning(six.text_type(e), exc_info=True)

        # Disable sensor which we are unable to start
        del self._sensors[sensor_id]
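# The respawn delay above grows linearly with the number of respawns for a given
# sensor. For example, assuming a base delay of 2.5 seconds (the actual value of
# SENSOR_RESPAWN_DELAY is defined elsewhere in the codebase):

SENSOR_RESPAWN_DELAY = 2.5  # assumed value, for illustration only

# Delays before the 1st, 2nd and 3rd respawn attempts
delays = [SENSOR_RESPAWN_DELAY * count for count in (1, 2, 3)]
# -> [2.5, 5.0, 7.5]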
def test_inquiry_garbage_collection(self):
    now = date_utils.get_datetime_utc_now()

    # Insert some mock Inquiries with start_timestamp > TTL
    old_inquiry_count = 15
    timestamp = now - datetime.timedelta(minutes=3)
    for index in range(0, old_inquiry_count):
        self._create_inquiry(ttl=2, timestamp=timestamp)

    # Insert some mock Inquiries with TTL set to a "disabled" value
    disabled_inquiry_count = 3
    timestamp = now - datetime.timedelta(minutes=3)
    for index in range(0, disabled_inquiry_count):
        self._create_inquiry(ttl=0, timestamp=timestamp)

    # Insert some mock Inquiries with start_timestamp < TTL
    new_inquiry_count = 5
    timestamp = now - datetime.timedelta(minutes=3)
    for index in range(0, new_inquiry_count):
        self._create_inquiry(ttl=15, timestamp=timestamp)

    filters = {"status": action_constants.LIVEACTION_STATUS_PENDING}
    inquiries = list(ActionExecution.query(**filters))
    self.assertEqual(
        len(inquiries),
        (old_inquiry_count + new_inquiry_count + disabled_inquiry_count),
    )

    # Start garbage collector
    process = self._start_garbage_collector()

    # Give it some time to perform garbage collection and kill it
    concurrency.sleep(15)
    process.send_signal(signal.SIGKILL)
    self.remove_process(process=process)

    # Expired Inquiries should have been garbage collected
    inquiries = list(ActionExecution.query(**filters))
    self.assertEqual(len(inquiries), new_inquiry_count + disabled_inquiry_count)
def run(self):
    self._running = True

    self._register_signal_handlers()

    # Wait a couple of seconds before performing initial collection to prevent
    # thundering herd effect when restarting multiple services at the same time
    jitter_seconds = random.uniform(0, 3)
    concurrency.sleep(jitter_seconds)

    success_exception_cls = concurrency.get_greenlet_exit_exception_class()

    try:
        self._main_loop()
    except success_exception_cls:
        self._running = False
        return SUCCESS_EXIT_CODE
    except Exception as e:
        LOG.exception("Exception in the garbage collector: %s" % (six.text_type(e)))
        self._running = False
        return FAILURE_EXIT_CODE

    return SUCCESS_EXIT_CODE
def test_child_processes_are_killed_on_sigkill(self):
    process = self._start_sensor_container()

    # Give it some time to start up
    concurrency.sleep(5)

    # Verify container process and children sensor / wrapper processes are running
    pp = psutil.Process(process.pid)
    children_pp = pp.children()

    self.assertEqual(pp.cmdline()[1:], DEFAULT_CMD[1:])
    self.assertEqual(len(children_pp), 1)

    # Send SIGKILL
    process.send_signal(signal.SIGKILL)

    # Note: On SIGKILL processes should be killed instantly
    concurrency.sleep(1)

    # Verify parent and children processes have exited
    self.assertProcessExited(proc=pp)
    self.assertProcessExited(proc=children_pp[0])

    self.remove_process(process=process)
def test_single_sensor_mode(self):
    # 1. --sensor-ref not provided
    cmd = [
        PYTHON_BINARY,
        BINARY,
        "--config-file",
        ST2_CONFIG_PATH,
        "--single-sensor-mode",
    ]

    process = self._start_sensor_container(cmd=cmd)
    pp = psutil.Process(process.pid)

    # Give it some time to start up
    concurrency.sleep(5)

    stdout = process.stdout.read()
    self.assertTrue(
        (
            b"--sensor-ref argument must be provided when running in single sensor "
            b"mode"
        )
        in stdout
    )
    self.assertProcessExited(proc=pp)
    self.remove_process(process=process)

    # 2. sensor ref provided
    cmd = [
        PYTHON_BINARY,
        BINARY,
        "--config-file",
        ST2_CONFIG_PATH,
        "--single-sensor-mode",
        "--sensor-ref=examples.SampleSensorExit",
    ]

    process = self._start_sensor_container(cmd=cmd)
    pp = psutil.Process(process.pid)

    # Give it some time to start up
    concurrency.sleep(1)

    # Container should exit and not respawn a sensor in single sensor mode
    stdout = process.stdout.read()

    self.assertTrue(
        b"Process for sensor examples.SampleSensorExit has exited with code 110"
        in stdout
    )
    self.assertTrue(
        b"Not respawning a sensor since running in single sensor mode" in stdout
    )
    self.assertTrue(b"Process container quit with exit_code 110." in stdout)

    concurrency.sleep(2)
    self.assertProcessExited(proc=pp)
    self.remove_process(process=process)
def run(self, cmd, timeout=None, quote=False, call_line_handler_func=False):
    """
    Note: This function is based on paramiko's exec_command() method.

    :param timeout: How long to wait (in seconds) for the command to finish (optional).
    :type timeout: ``float``

    :param call_line_handler_func: True to call handle_stdout_line_func function for
                                   each line of received stdout and
                                   handle_stderr_line_func for each line of stderr.
    :type call_line_handler_func: ``bool``
    """
    if quote:
        cmd = quote_unix(cmd)

    extra = {'_cmd': cmd}
    self.logger.info('Executing command', extra=extra)

    # Use the system default buffer size
    bufsize = -1

    transport = self.client.get_transport()
    chan = transport.open_session()

    start_time = time.time()

    if cmd.startswith('sudo'):
        # Note that fabric does this as well. If you set pty, stdout and stderr
        # streams will be combined into one.
        # NOTE: If pty is used, every new line character \n will be converted to \r\n
        # which isn't desired. Because of that we sanitize the output and replace \r\n
        # with \n at the bottom of this method
        uses_pty = True
        chan.get_pty()
    else:
        uses_pty = False

    chan.exec_command(cmd)

    stdout = StringIO()
    stderr = StringIO()

    # Create a stdin file and immediately close it to prevent any
    # interactive script from hanging the process.
    stdin = chan.makefile('wb', bufsize)
    stdin.close()

    # Receive all the output
    # Note #1: This is used instead of chan.makefile approach to prevent
    # buffering issues and hanging if the executed command produces a lot
    # of output.
    #
    # Note #2: If you are going to remove "ready" checks inside the loop
    # you are going to have a bad time. Trying to consume from a channel
    # which is not ready will block indefinitely.
    exit_status_ready = chan.exit_status_ready()

    if exit_status_ready:
        stdout_data = self._consume_stdout(
            chan=chan, call_line_handler_func=call_line_handler_func)
        stdout_data = stdout_data.getvalue()

        stderr_data = self._consume_stderr(
            chan=chan, call_line_handler_func=call_line_handler_func)
        stderr_data = stderr_data.getvalue()

        stdout.write(stdout_data)
        stderr.write(stderr_data)

    while not exit_status_ready:
        current_time = time.time()
        elapsed_time = (current_time - start_time)

        if timeout and (elapsed_time > timeout):
            # TODO: Is this the right way to clean up?
            chan.close()

            stdout = sanitize_output(stdout.getvalue(), uses_pty=uses_pty)
            stderr = sanitize_output(stderr.getvalue(), uses_pty=uses_pty)

            raise SSHCommandTimeoutError(cmd=cmd, timeout=timeout, stdout=stdout,
                                         stderr=stderr)

        stdout_data = self._consume_stdout(
            chan=chan, call_line_handler_func=call_line_handler_func)
        stdout_data = stdout_data.getvalue()

        stderr_data = self._consume_stderr(
            chan=chan, call_line_handler_func=call_line_handler_func)
        stderr_data = stderr_data.getvalue()

        stdout.write(stdout_data)
        stderr.write(stderr_data)

        # We need to check the exit status here, because the command could
        # print some output and exit during this sleep below.
        exit_status_ready = chan.exit_status_ready()

        if exit_status_ready:
            break

        # Short sleep to prevent busy waiting
        concurrency.sleep(self.SLEEP_DELAY)

    # Receive the exit status code of the command we ran.
    status = chan.recv_exit_status()

    stdout = sanitize_output(stdout.getvalue(), uses_pty=uses_pty)
    stderr = sanitize_output(stderr.getvalue(), uses_pty=uses_pty)

    extra = {'_status': status, '_stdout': stdout, '_stderr': stderr}
    self.logger.debug('Command finished', extra=extra)

    return [stdout, stderr, status]
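# A hedged usage sketch for run() above. The client class name and its constructor
# arguments are assumptions for illustration; SSHCommandTimeoutError is the timeout
# exception raised by the method itself, and the run() method above passes the
# captured output to its constructor.

client = ParamikoSSHClient(hostname='host1.example.com', username='stanley',
                           key_files='/home/stanley/.ssh/id_rsa')
client.connect()

try:
    stdout, stderr, status = client.run('uname -a', timeout=30)
except SSHCommandTimeoutError as e:
    # Output captured before the timeout is attached to the exception.
    stdout, stderr = e.stdout, e.stderr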
def test_garbage_collection(self):
    now = date_utils.get_datetime_utc_now()
    status = action_constants.LIVEACTION_STATUS_SUCCEEDED

    # Insert some mock ActionExecutionDB objects with start_timestamp < TTL defined in
    # the config
    old_executions_count = 15
    ttl_days = 30  # > 20
    timestamp = (now - datetime.timedelta(days=ttl_days))
    for index in range(0, old_executions_count):
        action_execution_db = ActionExecutionDB(start_timestamp=timestamp,
                                                end_timestamp=timestamp,
                                                status=status,
                                                action={'ref': 'core.local'},
                                                runner={'name': 'local-shell-cmd'},
                                                liveaction={'ref': 'foo'})
        ActionExecution.add_or_update(action_execution_db)

        stdout_db = ActionExecutionOutputDB(execution_id=str(action_execution_db.id),
                                            action_ref='core.local',
                                            runner_ref='dummy',
                                            timestamp=timestamp,
                                            output_type='stdout',
                                            data='stdout')
        ActionExecutionOutput.add_or_update(stdout_db)

        stderr_db = ActionExecutionOutputDB(execution_id=str(action_execution_db.id),
                                            action_ref='core.local',
                                            runner_ref='dummy',
                                            timestamp=timestamp,
                                            output_type='stderr',
                                            data='stderr')
        ActionExecutionOutput.add_or_update(stderr_db)

    # Insert some mock ActionExecutionDB objects with start_timestamp > TTL defined in
    # the config
    new_executions_count = 5
    ttl_days = 2  # < 20
    timestamp = (now - datetime.timedelta(days=ttl_days))
    for index in range(0, new_executions_count):
        action_execution_db = ActionExecutionDB(start_timestamp=timestamp,
                                                end_timestamp=timestamp,
                                                status=status,
                                                action={'ref': 'core.local'},
                                                runner={'name': 'local-shell-cmd'},
                                                liveaction={'ref': 'foo'})
        ActionExecution.add_or_update(action_execution_db)

        stdout_db = ActionExecutionOutputDB(execution_id=str(action_execution_db.id),
                                            action_ref='core.local',
                                            runner_ref='dummy',
                                            timestamp=timestamp,
                                            output_type='stdout',
                                            data='stdout')
        ActionExecutionOutput.add_or_update(stdout_db)

        stderr_db = ActionExecutionOutputDB(execution_id=str(action_execution_db.id),
                                            action_ref='core.local',
                                            runner_ref='dummy',
                                            timestamp=timestamp,
                                            output_type='stderr',
                                            data='stderr')
        ActionExecutionOutput.add_or_update(stderr_db)

    # Insert some mock output objects where start_timestamp > action_executions_output_ttl
    new_output_count = 5
    ttl_days = 15  # > 10 and < 20
    timestamp = (now - datetime.timedelta(days=ttl_days))
    for index in range(0, new_output_count):
        action_execution_db = ActionExecutionDB(start_timestamp=timestamp,
                                                end_timestamp=timestamp,
                                                status=status,
                                                action={'ref': 'core.local'},
                                                runner={'name': 'local-shell-cmd'},
                                                liveaction={'ref': 'foo'})
        ActionExecution.add_or_update(action_execution_db)

        stdout_db = ActionExecutionOutputDB(execution_id=str(action_execution_db.id),
                                            action_ref='core.local',
                                            runner_ref='dummy',
                                            timestamp=timestamp,
                                            output_type='stdout',
                                            data='stdout')
        ActionExecutionOutput.add_or_update(stdout_db)

        stderr_db = ActionExecutionOutputDB(execution_id=str(action_execution_db.id),
                                            action_ref='core.local',
                                            runner_ref='dummy',
                                            timestamp=timestamp,
                                            output_type='stderr',
                                            data='stderr')
        ActionExecutionOutput.add_or_update(stderr_db)

    execs = ActionExecution.get_all()
    self.assertEqual(len(execs),
                     (old_executions_count + new_executions_count + new_output_count))

    stdout_dbs = ActionExecutionOutput.query(output_type='stdout')
    self.assertEqual(len(stdout_dbs),
                     (old_executions_count + new_executions_count + new_output_count))

    stderr_dbs = ActionExecutionOutput.query(output_type='stderr')
    self.assertEqual(len(stderr_dbs),
                     (old_executions_count + new_executions_count + new_output_count))

    # Start garbage collector
    process = self._start_garbage_collector()

    # Give it some time to perform garbage collection and kill it
    concurrency.sleep(15)
    process.send_signal(signal.SIGKILL)
    self.remove_process(process=process)

    # Old executions and corresponding objects should have been garbage collected
    execs = ActionExecution.get_all()
    self.assertEqual(len(execs), (new_executions_count + new_output_count))

    # Collection for output objects older than 10 days is also enabled, so those
    # objects should be deleted as well
    stdout_dbs = ActionExecutionOutput.query(output_type='stdout')
    self.assertEqual(len(stdout_dbs), (new_executions_count))

    stderr_dbs = ActionExecutionOutput.query(output_type='stderr')
    self.assertEqual(len(stderr_dbs), (new_executions_count))
def on_iteration(self):
    super(TriggerWatcher, self).on_iteration()
    concurrency.sleep(seconds=self.sleep_interval)
def on_consume_end(self, connection, channel):
    super(TriggerWatcher, self).on_consume_end(connection=connection, channel=channel)
    concurrency.sleep(seconds=self.sleep_interval)
def run(self):
    while True:
        self.poll()
        concurrency.sleep(self._poll_interval)
def run(self, connection, wrapped_callback):
    """
    Run the wrapped_callback in a protective covering of retries and error handling.

    :param connection: Connection to messaging service
    :type connection: kombu.connection.Connection

    :param wrapped_callback: Callback that will be wrapped by all the fine handling
                             in this method. Expected signature of callback -
                             ``def func(connection, channel)``
    """
    should_stop = False
    channel = None

    while not should_stop:
        try:
            channel = connection.channel()
            wrapped_callback(connection=connection, channel=channel)
            should_stop = True
        except connection.connection_errors + connection.channel_errors as e:
            should_stop, wait = self._retry_context.test_should_stop(e)

            # Reset channel to None to avoid any channel closing errors. At this point
            # in case of an exception there should be no channel, but it is better to
            # guarantee it.
            channel = None

            # All attempts to re-establish connections have failed. This error needs
            # to be notified so raise.
            if should_stop:
                raise

            # -1, 0 and 1+ are handled properly by eventlet.sleep
            self._logger.debug('Received RabbitMQ server error, sleeping for %s seconds '
                               'before retrying: %s' % (wait, six.text_type(e)))
            concurrency.sleep(wait)

            connection.close()

            # ensure_connection will automatically switch to an alternate. Other
            # connections in the pool will be fixed independently. It would be nice to
            # cut-over the entire ConnectionPool simultaneously but that would require
            # writing our own ConnectionPool. If a server recovers it could happen that
            # the same process ends up talking to separate nodes in a cluster.
            def log_error_on_conn_failure(exc, interval):
                self._logger.debug('Failed to re-establish connection to RabbitMQ '
                                   'server, retrying in %s seconds: %s' %
                                   (interval, six.text_type(exc)))

            try:
                # NOTE: This function blocks and tries to re-establish a connection
                # indefinitely if the "max_retries" argument is not specified
                connection.ensure_connection(max_retries=self._ensure_max_retries,
                                             errback=log_error_on_conn_failure)
            except Exception:
                self._logger.exception('Connections to RabbitMQ cannot be '
                                       're-established: %s', six.text_type(e))
                raise
        except Exception as e:
            self._logger.exception('Connections to RabbitMQ cannot be re-established: %s',
                                   six.text_type(e))
            # Not being able to publish a message could be a significant issue for an app.
            raise
        finally:
            if should_stop and channel:
                try:
                    channel.close()
                except Exception:
                    self._logger.warning('Error closing channel.', exc_info=True)
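# A sketch of how a caller might drive this retry wrapper with a kombu connection.
# The wrapper class name and its constructor arguments are assumptions for
# illustration; the callback signature matches the docstring above, and LOG stands in
# for the module logger.

from kombu import Connection

connection = Connection('amqp://guest:guest@localhost:5672//')
wrapper = ConnectionRetryWrapper(cluster_size=1, logger=LOG)


def do_publish(connection, channel):
    # Runs with a freshly opened channel and is retried on connection/channel errors.
    producer = connection.Producer(channel=channel)
    producer.publish({'hello': 'world'}, exchange='', routing_key='test')


wrapper.run(connection=connection, wrapped_callback=do_publish)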