def publish(self, payload, exchange, routing_key=''):
    """
    Publish ``payload`` to ``exchange`` using a pooled AMQP connection,
    retrying on connection failures via ``ConnectionRetryWrapper``.

    :param payload: Message body (pickle-serialized on the wire).
    :param exchange: kombu Exchange object to publish to.
    :param routing_key: AMQP routing key (defaults to '').
    """
    with Timer(key='amqp.pool_publisher.publish_with_retries.' + exchange.name):
        # Block until a connection becomes available in the pool.
        with self.pool.acquire(block=True) as connection:
            retry_wrapper = ConnectionRetryWrapper(
                cluster_size=self.cluster_size, logger=LOG)

            def do_publish(connection, channel):
                # ProducerPool ends up creating its own ConnectionPool which ends up
                # completely invalidating this ConnectionPool. Also, a ConnectionPool for
                # producer does not really solve any problems for us so better to create a
                # Producer for each publish.
                producer = Producer(channel)
                kwargs = {
                    'body': payload,
                    'exchange': exchange,
                    'routing_key': routing_key,
                    # NOTE(review): pickle serialization is presumably safe here only
                    # because producers and consumers are all trusted st2 services.
                    'serializer': 'pickle',
                    'content_encoding': 'utf-8'
                }
                # ensured() re-runs producer.publish after re-establishing the
                # connection on recoverable channel/connection errors.
                retry_wrapper.ensured(connection=connection,
                                      obj=producer,
                                      to_ensure_func=producer.publish,
                                      **kwargs)

            retry_wrapper.run(connection=connection, wrapped_callback=do_publish)
def register_runners():
    """Register runner types with the system.

    Honors the ``register.fail_on_failure`` config flag: when set, a
    registration error is re-raised; otherwise it is logged with a traceback
    and registration continues.
    """
    fail_on_failure = cfg.CONF.register.fail_on_failure
    runner_count = 0

    try:
        LOG.info('=========================================================')
        LOG.info('############## Registering runners ######################')
        LOG.info('=========================================================')
        with Timer(key='st2.register.runners'):
            runner_count = runners_registrar.register_runners(
                fail_on_failure=fail_on_failure, experimental=False)
    except Exception as error:
        # TODO: Narrow exception window
        # Include the traceback only when we are not about to re-raise.
        LOG.warning('Failed to register runners: %s', error,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise error

    LOG.info('Registered %s runners.', runner_count)
def register_actions():
    """Register actions from packs.

    Runner types must already be present in the system because actions
    reference them, so this runs after runner registration.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    action_count = 0

    try:
        LOG.info('=========================================================')
        LOG.info('############## Registering actions ######################')
        LOG.info('=========================================================')
        with Timer(key='st2.register.actions'):
            action_count = actions_registrar.register_actions(
                pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        LOG.warning('Failed to register actions: %s', e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info('Registered %s actions.' % (action_count))
def register_rules():
    """Register rule types and then pack rules.

    A rule-type registration failure aborts rule registration entirely;
    rule registration failures honor ``register.no_fail_on_failure``.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    rule_count, override_count = 0, 0

    try:
        LOG.info("=========================================================")
        LOG.info("############## Registering rules ########################")
        LOG.info("=========================================================")
        rule_types_registrar.register_rule_types()
    except Exception as e:
        LOG.warning("Failed to register rule types: %s", e, exc_info=True)
        return

    try:
        with Timer(key="st2.register.rules"):
            rule_count, override_count = rules_registrar.register_rules(
                pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        LOG.warning("Failed to register rules: %s", e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info("Registered %s rules.", rule_count)
    LOG.info("%s rules had their metadata overridden." % (override_count))
def update_execution(liveaction_db, publish=True, set_result_size=False):
    """
    Sync the ActionExecution object with the state of the given liveaction.

    :param liveaction_db: LiveActionDB whose decomposed fields are copied onto
                          the corresponding ActionExecutionDB.
    :param publish: True to publish the update on the message bus.
    :param set_result_size: True to calculate size of the serialized result field value and set
                            it on the "result_size" database field.
    :return: The updated ActionExecutionDB object.
    """
    execution = ActionExecution.get(liveaction__id=str(liveaction_db.id))
    decomposed = _decompose_liveaction(liveaction_db)

    # Build a mongoengine-style atomic update document ("set__<field>").
    kw = {}
    for k, v in six.iteritems(decomposed):
        kw["set__" + k] = v

    if liveaction_db.status != execution.status:
        # Note: If the status changes we store this transition in the "log" attribute of action
        # execution
        kw["push__log"] = _create_execution_log_entry(liveaction_db.status)

    if set_result_size:
        # Sadly with the current ORM abstraction there is no better way to achieve updating
        # result_size and we need to serialize the value again - luckily that operation is fast.
        # To put things into perspective - on 4 MB result dictionary it only takes 7 ms which is
        # negligible compared to other DB operations duration (and for smaller results it takes
        # in sub ms range).
        with Timer(key="action.executions.calculate_result_size"):
            result_size = len(
                ActionExecutionDB.result._serialize_field_value(liveaction_db.result)
            )
        kw["set__result_size"] = result_size

    execution = ActionExecution.update(execution, publish=publish, **kw)
    return execution
def register_rules():
    """Register rule types and then pack rules.

    A rule-type registration failure aborts rule registration entirely;
    rule registration failures honor ``register.no_fail_on_failure``.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    rule_count = 0

    try:
        LOG.info('=========================================================')
        LOG.info('############## Registering rules ########################')
        LOG.info('=========================================================')
        rule_types_registrar.register_rule_types()
    except Exception as e:
        LOG.warning('Failed to register rule types: %s', e, exc_info=True)
        return

    try:
        with Timer(key='st2.register.rules'):
            rule_count = rules_registrar.register_rules(
                pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        LOG.warning('Failed to register rules: %s', e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info('Registered %s rules.', rule_count)
def register_policies():
    """Register policy types (shipped with st2common) followed by pack policies.

    Policy-type failures are logged but never fatal; policy failures honor
    ``register.no_fail_on_failure``.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure

    type_count = 0
    try:
        LOG.info("=========================================================")
        LOG.info("############## Registering policy types #################")
        LOG.info("=========================================================")
        with Timer(key="st2.register.policies"):
            type_count = policies_registrar.register_policy_types(st2common)
    except Exception:
        LOG.warning("Failed to register policy types.", exc_info=True)

    LOG.info("Registered %s policy types.", type_count)

    policy_count = 0
    try:
        LOG.info("=========================================================")
        LOG.info("############## Registering policies #####################")
        LOG.info("=========================================================")
        policy_count = policies_registrar.register_policies(
            pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        LOG.warning("Failed to register policies: %s", e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info("Registered %s policies.", policy_count)
def publish(self, payload, exchange, routing_key="", compression=None):
    """
    Publish ``payload`` to ``exchange`` using a pooled AMQP connection,
    retrying on connection failures via ``ConnectionRetryWrapper``.

    :param payload: Message body (pickle-serialized on the wire).
    :param exchange: kombu Exchange object to publish to.
    :param routing_key: AMQP routing key (defaults to "").
    :param compression: Optional compression method; falls back to the
                        ``messaging.compression`` config option when not set.
    """
    compression = compression or cfg.CONF.messaging.compression

    with Timer(key="amqp.pool_publisher.publish_with_retries." + exchange.name):
        # Block until a connection becomes available in the pool.
        with self.pool.acquire(block=True) as connection:
            retry_wrapper = ConnectionRetryWrapper(
                cluster_size=self.cluster_size, logger=LOG)

            def do_publish(connection, channel):
                # ProducerPool ends up creating it own ConnectionPool which ends up
                # completely invalidating this ConnectionPool. Also, a ConnectionPool for
                # producer does not really solve any problems for us so better to create a
                # Producer for each publish.
                producer = Producer(channel)
                kwargs = {
                    "body": payload,
                    "exchange": exchange,
                    "routing_key": routing_key,
                    # NOTE(review): pickle serialization is presumably safe here only
                    # because producers and consumers are all trusted st2 services.
                    "serializer": "pickle",
                    "compression": compression,
                    "content_encoding": "utf-8",
                }
                # ensured() re-runs producer.publish after re-establishing the
                # connection on recoverable channel/connection errors.
                retry_wrapper.ensured(
                    connection=connection,
                    obj=producer,
                    to_ensure_func=producer.publish,
                    **kwargs,
                )

            retry_wrapper.run(connection=connection, wrapped_callback=do_publish)
def _update_live_action_db(self, liveaction_id, status, result, context):
    """
    Update LiveActionDB object for the provided liveaction id.

    :return: ``(liveaction_db, state_changed)`` - ``state_changed`` is True
             only when the new status differs from the stored one and the
             action was not already in a completed state.
    """
    liveaction_db = get_liveaction_by_id(liveaction_id)

    current_status = liveaction_db.status
    state_changed = (
        current_status != status
        and current_status not in action_constants.LIVEACTION_COMPLETED_STATES
    )

    # Only stamp an end time once the action reaches a terminal state.
    end_timestamp = None
    if status in action_constants.LIVEACTION_COMPLETED_STATES:
        end_timestamp = date_utils.get_datetime_utc_now()

    with Timer(key="action.executions.update_liveaction_db"):
        # Keep the stored status when the transition is not valid
        # (e.g. the action already completed).
        new_status = status if state_changed else liveaction_db.status
        liveaction_db = update_liveaction_status(
            status=new_status,
            result=result,
            context=context,
            end_timestamp=end_timestamp,
            liveaction_db=liveaction_db,
        )

    return (liveaction_db, state_changed)
def register_actions():
    """Register actions from packs.

    Runner types must already be present in the system because actions
    reference them; ``use_runners_cache=True`` avoids re-querying runner
    types for every action.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    action_count, override_count = 0, 0

    try:
        LOG.info("=========================================================")
        LOG.info("############## Registering actions ######################")
        LOG.info("=========================================================")
        with Timer(key="st2.register.actions"):
            action_count, override_count = actions_registrar.register_actions(
                pack_dir=pack_dir,
                fail_on_failure=fail_on_failure,
                use_runners_cache=True,
            )
    except Exception as e:
        LOG.warning("Failed to register actions: %s", e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info("Registered %s actions." % (action_count))
    LOG.info("%s actions had their metadata overridden." % (override_count))
def process(self, pre_ack_response):
    """
    Process a trigger instance delivered from the message bus: attach/update
    the trace, mark the instance as processing, run it through the rules
    engine and record the final (processed / failed) status.

    :param pre_ack_response: Raw pre-ack message; decomposed into a trigger
                             instance and its accompanying message dict.
    :raises ValueError: if no trigger instance could be decomposed.
    """
    trigger_instance, message = self._decompose_pre_ack_process_response(
        pre_ack_response)
    if not trigger_instance:
        raise ValueError("No trigger_instance provided for processing.")

    get_driver().inc_counter("trigger.%s.processed" % (trigger_instance.trigger))

    try:
        # Use trace_context from the message and if not found create a new context
        # and use the trigger_instance.id as trace_tag.
        trace_context = message.get(TRACE_CONTEXT, None)
        if not trace_context:
            trace_context = {
                TRACE_ID: "trigger_instance-%s" % str(trigger_instance.id)
            }
        # add a trace or update an existing trace with trigger_instance
        trace_service.add_or_update_given_trace_context(
            trace_context=trace_context,
            trigger_instances=[
                trace_service.get_trace_component_for_trigger_instance(
                    trigger_instance)
            ],
        )

        container_utils.update_trigger_instance_status(
            trigger_instance, trigger_constants.TRIGGER_INSTANCE_PROCESSING)

        with CounterWithTimer(key="rule.processed"):
            with Timer(key="trigger.%s.processed" % (trigger_instance.trigger)):
                self.rules_engine.handle_trigger_instance(trigger_instance)

        container_utils.update_trigger_instance_status(
            trigger_instance, trigger_constants.TRIGGER_INSTANCE_PROCESSED)
    except Exception:
        # Fix: was a bare "except:" which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        # TODO : Capture the reason for failure.
        container_utils.update_trigger_instance_status(
            trigger_instance,
            trigger_constants.TRIGGER_INSTANCE_PROCESSING_FAILED)
        # This could be a large message but at least in case of an exception
        # we get to see more context.
        # Beyond this point code cannot really handle the exception anyway so
        # eating up the exception.
        LOG.exception("Failed to handle trigger_instance %s.", trigger_instance)
        return
def register_aliases():
    """Register action aliases from packs.

    Note: unlike the other registrars, the exception is re-raised *before*
    the warning is logged when ``fail_on_failure`` applies.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    alias_count = 0

    try:
        LOG.info("=========================================================")
        LOG.info("############## Registering aliases ######################")
        LOG.info("=========================================================")
        with Timer(key="st2.register.aliases"):
            alias_count = aliases_registrar.register_aliases(
                pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        if fail_on_failure:
            raise e
        LOG.warning("Failed to register aliases.", exc_info=True)

    LOG.info("Registered %s aliases.", alias_count)
def register_aliases():
    """Register action aliases from packs.

    Note: unlike the other registrars, the exception is re-raised *before*
    the warning is logged when ``fail_on_failure`` applies.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    alias_count = 0

    try:
        LOG.info('=========================================================')
        LOG.info('############## Registering aliases ######################')
        LOG.info('=========================================================')
        with Timer(key='st2.register.aliases'):
            alias_count = aliases_registrar.register_aliases(
                pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        if fail_on_failure:
            raise e
        LOG.warning('Failed to register aliases.', exc_info=True)

    LOG.info('Registered %s aliases.', alias_count)
def update_execution(liveaction_db, publish=True, set_result_size=False):
    """
    Sync the ActionExecution object with the state of the given liveaction,
    serialized under a per-liveaction distributed lock.

    :param liveaction_db: LiveActionDB whose decomposed fields are copied onto
                          the corresponding ActionExecutionDB.
    :param publish: True to publish the update on the message bus.
    :param set_result_size: True to calculate size of the serialized result field value and set
                            it on the "result_size" database field.
    :return: The updated ActionExecutionDB, or the unchanged one when the
             update is skipped because the execution already completed.
    """
    execution = ActionExecution.get(liveaction__id=str(liveaction_db.id))

    # Distributed lock keyed on the liveaction id serializes concurrent
    # updaters so the completed-state check below cannot race another writer.
    with coordination.get_coordinator().get_lock(
            str(liveaction_db.id).encode()):
        # Skip execution object update when action is already in completed state.
        if execution.status in action_constants.LIVEACTION_COMPLETED_STATES:
            LOG.debug(
                "[%s] Action is already in completed state: %s. Skipping execution update to state: %s."
                % (execution.id, execution.status, liveaction_db.status))
            return execution

        decomposed = _decompose_liveaction(liveaction_db)

        # Build a mongoengine-style atomic update document ("set__<field>").
        kw = {}
        for k, v in six.iteritems(decomposed):
            kw["set__" + k] = v

        if liveaction_db.status != execution.status:
            # Note: If the status changes we store this transition in the "log" attribute of action
            # execution
            kw["push__log"] = _create_execution_log_entry(liveaction_db.status)

        if set_result_size:
            # Sadly with the current ORM abstraction there is no better way to achieve updating
            # result_size and we need to serialize the value again - luckily that operation is fast.
            # To put things into perspective - on 4 MB result dictionary it only takes 7 ms which is
            # negligible compared to other DB operations duration (and for smaller results it takes
            # in sub ms range).
            with Timer(key="action.executions.calculate_result_size"):
                result_size = len(
                    ActionExecutionDB.result._serialize_field_value(
                        liveaction_db.result))
            kw["set__result_size"] = result_size

        execution = ActionExecution.update(execution, publish=publish, **kw)
        return execution
def register_sensors():
    """Register sensors from packs, honoring ``register.no_fail_on_failure``."""
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    sensor_count = 0

    try:
        LOG.info('=========================================================')
        LOG.info('############## Registering sensors ######################')
        LOG.info('=========================================================')
        with Timer(key='st2.register.sensors'):
            sensor_count = sensors_registrar.register_sensors(
                pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        LOG.warning('Failed to register sensors: %s', e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info('Registered %s sensors.' % (sensor_count))
def register_triggers():
    """Register triggers from packs, honoring ``register.no_fail_on_failure``.

    :raises Exception: re-raises the registrar error when fail_on_failure
                       is in effect.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    registered_count = 0
    try:
        LOG.info("=========================================================")
        LOG.info("############## Registering triggers #####################")
        LOG.info("=========================================================")
        with Timer(key="st2.register.triggers"):
            registered_count = triggers_registrar.register_triggers(
                pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        exc_info = not fail_on_failure
        # Bug fix: this message previously said "Failed to register sensors"
        # (copy/paste from register_sensors), which made trigger registration
        # failures misleading in the logs.
        LOG.warning("Failed to register triggers: %s", e, exc_info=exc_info)
        if fail_on_failure:
            raise e
    LOG.info("Registered %s triggers." % (registered_count))
def register_configs():
    """Register and validate pack config files, honoring ``register.no_fail_on_failure``."""
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    config_count = 0

    try:
        LOG.info('=========================================================')
        LOG.info('############## Registering configs ######################')
        LOG.info('=========================================================')
        with Timer(key='st2.register.configs'):
            config_count = configs_registrar.register_configs(
                pack_dir=pack_dir,
                fail_on_failure=fail_on_failure,
                validate_configs=True)
    except Exception as e:
        LOG.warning('Failed to register configs: %s', e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info('Registered %s configs.' % (config_count))
def register_sensors():
    """Register sensors from packs, honoring ``register.no_fail_on_failure``.

    Also reports how many sensors had their metadata overridden.
    """
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    sensor_count, override_count = 0, 0

    try:
        LOG.info("=========================================================")
        LOG.info("############## Registering sensors ######################")
        LOG.info("=========================================================")
        with Timer(key="st2.register.sensors"):
            sensor_count, override_count = sensors_registrar.register_sensors(
                pack_dir=pack_dir, fail_on_failure=fail_on_failure)
    except Exception as e:
        LOG.warning("Failed to register sensors: %s", e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info("Registered %s sensors." % (sensor_count))
    LOG.info("%s sensors had their metadata overridden." % (override_count))
def register_configs():
    """Register and validate pack config files, honoring ``register.no_fail_on_failure``."""
    pack_dir = cfg.CONF.register.pack
    fail_on_failure = not cfg.CONF.register.no_fail_on_failure
    config_count = 0

    try:
        LOG.info("=========================================================")
        LOG.info("############## Registering configs ######################")
        LOG.info("=========================================================")
        with Timer(key="st2.register.configs"):
            config_count = configs_registrar.register_configs(
                pack_dir=pack_dir,
                fail_on_failure=fail_on_failure,
                validate_configs=True,
            )
    except Exception as e:
        LOG.warning("Failed to register configs: %s", e,
                    exc_info=not fail_on_failure)
        if fail_on_failure:
            raise e

    LOG.info("Registered %s configs." % (config_count))
def publish_state(self, payload, state):
    """Publish ``payload`` to the state exchange, routed by ``state``.

    :raises Exception: when ``state`` is empty/unassigned.
    """
    if not state:
        raise Exception('Unable to publish unassigned state.')

    timer = Timer(key='amqp.publish.state')
    with timer:
        self._state_publisher.publish(payload, self._state_exchange, state)
def publish_delete(self, payload):
    """Publish ``payload`` on the exchange with the delete routing key, timing the call."""
    timer = Timer(key='amqp.publish.delete')
    with timer:
        self._publisher.publish(payload, self._exchange, DELETE_RK)
def publish_update(self, payload):
    """Publish ``payload`` on the exchange with the update routing key, timing the call."""
    timer = Timer(key='amqp.publish.update')
    with timer:
        self._publisher.publish(payload, self._exchange, UPDATE_RK)
def publish_create(self, payload):
    """Publish ``payload`` on the exchange with the create routing key, timing the call."""
    timer = Timer(key='amqp.publish.create')
    with timer:
        self._publisher.publish(payload, self._exchange, CREATE_RK)
def _update_status(self, liveaction_id, status, result, context):
    """
    Persist the final status/result/context onto the liveaction and mirror
    them onto the corresponding action execution object.

    :return: The updated LiveActionDB.
    :raises Exception: re-raises any DB update failure after logging it.
    """
    with Timer(key="action.executions.update_liveaction_db"):
        try:
            # NOTE: The next two operations take a very long time in master with large executions
            # (long standing issue), but because start_timestamp and end_timestamp measure how long
            # it took for the runner to run the action, it doesn't include the time it took to
            # actually write results / persist execution into the database - that's a problem
            # because we have no good direct visibility into that.
            #
            # The UX user would experience is - they would run an action which produces large
            # result, CLI / API would show execution as running for a long time (until it's
            # persisted in the database), but when it will finally be written to the database,
            # duration will be shown as a short time, because it's measured based on start and
            # end timestamp.
            #
            # This mean we can have, for example, Python runner action which returns a lot of data
            # and takes only 0.5 second to finish, but next two database operations can easily take
            # 10 seconds each.
            #
            # To work around that and provide some additional visibility into that to the operators
            # and users, we update "end_timestamp" on each object again after both of them have
            # already been written. That atomic single field update is very fast and adds no
            # additional overhead.
            LOG.debug("Setting status: %s for liveaction: %s", status, liveaction_id)
            liveaction_db, state_changed = self._update_live_action_db(
                liveaction_id, status, result, context)
        except Exception as e:
            LOG.exception("Cannot update liveaction "
                          "(id: %s, status: %s, result: %s)."
                          % (liveaction_id, status, result))
            raise e

    # live_action_written_to_db_dt = date_utils.get_datetime_utc_now()

    with Timer(key="action.executions.update_execution_db"):
        try:
            # Publish on the bus only when the status actually transitioned.
            executions.update_execution(
                liveaction_db,
                publish=state_changed,
                set_result_size=True,
            )
            extra = {"liveaction_db": liveaction_db}
            LOG.debug("Updated liveaction after run", extra=extra)
        except Exception as e:
            LOG.exception("Cannot update action execution for liveaction "
                          "(id: %s, status: %s, result: %s)."
                          % (liveaction_id, status, result))
            raise e

    # execution_written_to_db = date_utils.get_datetime_utc_now()

    # Those two operations are fast since they operate on two field in atomic fashion
    # with very small data
    # NOTE: Right now this is commented out since with the new DB performance optimizations we
    # don't really need this anymore (actual DB save times now rarely take more than couple of
    # fields on live action, but that would only save us ~300ms at most for large executions..
    # This way we save two queries and up to 500ms for very large executions.
    """
    operations = {
        "set__end_timestamp": live_action_written_to_db_dt,
        "set__result_size": result_size,
    }
    LiveAction.update(liveaction_db, publish=False, **operations)

    operations = {
        "set__end_timestamp": execution_written_to_db,
        "set__result_size": result_size,
    }
    ActionExecution.update(execution_db, publish=False, **operations)
    """

    return liveaction_db
def _post_notify_subsection_triggers(self, liveaction_db=None, execution_db=None,
                                     notify_subsection=None,
                                     default_message_suffix=None):
    """
    Render the notify subsection's message/data with Jinja and dispatch one
    notify trigger per configured route (or deprecated channel).

    :raises Exception: if dispatch failed for one or more routes.
    """
    # "channels" is the deprecated predecessor of "routes"; support both.
    # Fix: the original getattr(notify_subsection, 'routes') had no default
    # and raised AttributeError for subsections without a "routes" attribute.
    routes = (getattr(notify_subsection, 'routes', None) or
              getattr(notify_subsection, 'channels', [])) or []

    execution_id = str(execution_db.id)

    if not routes:
        return

    payload = {}
    message = notify_subsection.message or (
        'Action ' + liveaction_db.action + ' ' + default_message_suffix)
    data = notify_subsection.data or {}

    jinja_context = self._build_jinja_context(
        liveaction_db=liveaction_db, execution_db=execution_db)

    # Rendering is best-effort: on failure we log and fall back to the raw
    # value. Fix: narrowed the original bare "except:" clauses to Exception
    # so SystemExit / KeyboardInterrupt are not swallowed.
    try:
        with Timer(key='notifier.transform_message'):
            message = self._transform_message(message=message,
                                              context=jinja_context)
    except Exception:
        LOG.exception('Failed (Jinja) transforming `message`.')

    try:
        with Timer(key='notifier.transform_data'):
            data = self._transform_data(data=data, context=jinja_context)
    except Exception:
        LOG.exception('Failed (Jinja) transforming `data`.')

    # At this point convert result to a string. This restricts the rulesengines
    # ability to introspect the result. On the other handle atleast a json usable
    # result is sent as part of the notification. If jinja is required to convert
    # to a string representation it uses str(...) which make it impossible to
    # parse the result as json any longer.
    # TODO: Use to_serializable_dict
    data['result'] = json.dumps(liveaction_db.result)

    payload['message'] = message
    payload['data'] = data
    payload['execution_id'] = execution_id
    payload['status'] = liveaction_db.status
    payload['start_timestamp'] = isotime.format(liveaction_db.start_timestamp)

    try:
        payload['end_timestamp'] = isotime.format(liveaction_db.end_timestamp)
    except AttributeError:
        # This can be raised if liveaction.end_timestamp is None, which is caused
        # when policy cancels a request due to concurrency
        # In this case, use datetime.now() instead
        payload['end_timestamp'] = isotime.format(datetime.utcnow())

    payload['action_ref'] = liveaction_db.action
    payload['runner_ref'] = self._get_runner_ref(liveaction_db.action)

    trace_context = self._get_trace_context(execution_id=execution_id)

    failed_routes = []
    for route in routes:
        try:
            payload['route'] = route
            # Deprecated. Only for backward compatibility reasons.
            payload['channel'] = route
            LOG.debug('POSTing %s for %s. Payload - %s.',
                      NOTIFY_TRIGGER_TYPE['name'], liveaction_db.id, payload)
            with CounterWithTimer(key='notifier.notify_trigger.dispatch'):
                self._trigger_dispatcher.dispatch(self._notify_trigger,
                                                  payload=payload,
                                                  trace_context=trace_context)
        except Exception:
            # Fix: narrowed from a bare "except:". Per-route failures are
            # collected and reported in aggregate below.
            failed_routes.append(route)

    if failed_routes:
        raise Exception('Failed notifications to routes: %s' %
                        ', '.join(failed_routes))
def _do_run(self, runner):
    """
    Render final parameters, execute the runner, and persist the final
    liveaction state. Always cleans up the temporary auth token; post_run
    runs after the finally block completes.

    :param runner: Instantiated runner with action/liveaction attached.
    :return: The updated LiveActionDB object.
    """
    # Create a temporary auth token which will be available
    # for the duration of the action execution.
    runner.auth_token = self._create_auth_token(
        context=runner.context,
        action_db=runner.action,
        liveaction_db=runner.liveaction,
    )

    try:
        # Finalized parameters are resolved and then rendered. This process could
        # fail. Handle the exception and report the error correctly.
        try:
            runner_params, action_params = param_utils.render_final_params(
                runner.runner_type.runner_parameters,
                runner.action.parameters,
                runner.liveaction.parameters,
                runner.liveaction.context,
            )
            runner.runner_parameters = runner_params
        except ParamException as e:
            raise actionrunner.ActionRunnerException(six.text_type(e))

        LOG.debug("Performing pre-run for runner: %s", runner.runner_id)
        runner.pre_run()

        # Mask secret parameters in the log context
        resolved_action_params = ResolvedActionParameters(
            action_db=runner.action,
            runner_type_db=runner.runner_type,
            runner_parameters=runner_params,
            action_parameters=action_params,
        )
        extra = {"runner": runner, "parameters": resolved_action_params}
        LOG.debug("Performing run for runner: %s" % (runner.runner_id), extra=extra)

        with CounterWithTimer(key="action.executions"):
            with CounterWithTimer(key="action.%s.executions" % (runner.action.ref)):
                (status, result, context) = runner.run(action_params)
                result = jsonify.try_loads(result)

        action_completed = status in action_constants.LIVEACTION_COMPLETED_STATES

        # Async runners that poll keep a query registered until the action
        # reaches a completed state.
        if (isinstance(runner, PollingAsyncActionRunner)
                and runner.is_polling_enabled()
                and not action_completed):
            queries.setup_query(runner.liveaction.id, runner.runner_type, context)
    except:
        # NOTE(review): intentionally bare except (not "except Exception") -
        # presumably to also catch BaseException-derived errors (e.g. eventlet
        # timeouts) so the execution is still marked failed; confirm before
        # narrowing.
        LOG.exception("Failed to run action.")
        _, ex, tb = sys.exc_info()
        # mark execution as failed.
        status = action_constants.LIVEACTION_STATUS_FAILED
        # include the error message and traceback to try and provide some hints.
        result = {
            "error": str(ex),
            "traceback": "".join(traceback.format_tb(tb, 20)),
        }
        context = None
    finally:
        # Log action completion
        extra = {"result": result, "status": status}
        LOG.debug('Action "%s" completed.' % (runner.action.name), extra=extra)

        # Update the final status of liveaction and corresponding action execution.
        with Timer(key="action.executions.update_status"):
            runner.liveaction = self._update_status(
                runner.liveaction.id, status, result, context)

        # Always clean-up the auth_token
        # This method should be called in the finally block to ensure post_run is not impacted.
        self._clean_up_auth_token(runner=runner, status=status)

    LOG.debug("Performing post_run for runner: %s", runner.runner_id)
    runner.post_run(status=status, result=result)

    LOG.debug("Runner do_run result", extra={"result": runner.liveaction.result})
    LOG.audit("Liveaction completed", extra={"liveaction_db": runner.liveaction})

    return runner.liveaction