def __init__(self, state_engine, event_dispatcher, config):
    """
    :param state_engine: StateEngine whose stores and metrics are exposed by this REST API
    :param event_dispatcher: The EventDispatcher instance used by this API
    :param config: Configuration dictionary
    :type config: dict
    """
    self.logger = init_logging(log_name="asl_workflow_engine")
    self.logger.info("Creating {}.RestAPI, using {} JSON parser".format(
        __name__, json.__name__))

    rest_api_config = config.get("rest_api")
    if rest_api_config:
        self.host = rest_api_config.get("host", "0.0.0.0")
        self.port = rest_api_config.get("port", 4584)
        self.region = rest_api_config.get("region", "local")

    self.asl_store = state_engine.asl_store
    self.executions = state_engine.executions
    self.execution_history = state_engine.execution_history
    self.execution_metrics = state_engine.execution_metrics
    self.task_metrics = state_engine.task_dispatcher.task_metrics
    self.event_dispatcher = event_dispatcher

    self.system_metrics = {}
    metrics_config = config.get("metrics", {})
    if metrics_config.get("implementation", "") == "Prometheus":
        self.system_metrics = SystemMetrics(
            metrics_config.get("namespace", ""))

def __init__(self, url="amqp://localhost:5672"):
    """
    Creates a connection. A newly created connection must be opened with
    the Connection.open() method before it can be used.

    For RabbitMQ AMQP Connection URL documentation see:
    https://pika.readthedocs.io/en/stable/modules/parameters.html
    https://pika.readthedocs.io/en/stable/examples/using_urlparameters.html
    """
    self.logger = init_logging(log_name="amqp_0_9_1_messaging_async")
    self.logger.info("Creating Connection with url: {}".format(url))
    self.parameters = pika.URLParameters(url)

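# Hedged usage sketch, mirroring the producer example later in this section:
# construct a Connection from an AMQP URL, open it, then obtain a session and
# a producer for the "asl_workflow_events" queue. The Message construction,
# send() and close() calls are assumptions about the rest of this messaging
# API, shown only for illustration.
connection = Connection(
    "amqp://localhost:5672?connection_attempts=20&retry_delay=10&heartbeat=0"
)
try:
    connection.open()
    session = connection.session()
    producer = session.producer("asl_workflow_events")
    message = Message('{"data": {}}', content_type="application/json")
    producer.send(message)
finally:
    connection.close()
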
def setUp(self):
    # Initialise logger
    logger = init_logging(log_name="test_payload_template")

    config = {
        "state_engine": {
            "store_url": "ASL_store.json",
            "execution_ttl": 500
        }
    }

    state_engine = StateEngine(config)
    # Stub out the real TaskDispatcher execute_task
    state_engine.task_dispatcher.execute_task = execute_task_stub
    self.event_dispatcher = EventDispatcherStub(state_engine, config)

def __init__(self, state_engine, event_dispatcher, config):
    """
    Create the RestAPI, wiring it to the StateEngine stores and the
    EventDispatcher.
    """
    self.logger = init_logging(log_name="asl_workflow_engine")
    self.logger.info("Creating {}.RestAPI, using {} JSON parser".format(
        __name__, json.__name__))

    config = config.get("rest_api")
    if config:
        self.host = config.get("host", "0.0.0.0")
        self.port = config.get("port", 4584)
        self.region = config.get("region", "local")

    self.asl_store = state_engine.asl_store
    self.executions = state_engine.executions
    self.execution_history = state_engine.execution_history
    self.event_dispatcher = event_dispatcher

def __init__(self, json_store_name, *args, **kwargs):
    self.json_store = json_store_name
    self.logger = init_logging(log_name="asl_workflow_engine")
    self.logger.info("Creating JSONStore: {}".format(self.json_store))

    try:
        with open(self.json_store, "r") as fp:
            self.store = json.load(fp)
        self.logger.info("JSONStore loading: {}".format(self.json_store))
    except IOError as e:
        self.store = {}
    except ValueError as e:
        self.logger.warning(
            "JSONStore {} does not contain valid JSON".format(
                self.json_store))
        self.store = {}

    self.update(*args, **kwargs)  # use the free update to set keys

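# Hedged usage sketch: the call to self.update() above suggests JSONStore
# behaves like a mapping seeded from the loaded JSON file. The file name and
# keys below are illustrative assumptions, not values from the project.
store = JSONStore("ASL_store.json", {"initial_key": "initial_value"})
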
def __init__(self, url, key="", cache_size=128, daemon=False):
    """
    key is used as a prefix/namespace in the Redis keyspace, so if we
    create multiple instances of a RedisStore then using different prefix
    keys for each instance will prevent odd results if we happen to need
    to store items with the same index keys in different stores.

    cache_size is the maximum number of items to store in the LRU cache
    used by get_cached_view(). Setting cache_size to 0 will disable the
    cache, so calls to get_cached_view() will behave exactly like get()
    or [].

    daemon sets whether the thread listening for cache invalidation
    messages from the Redis server runs as a daemon thread or not.
    """
    self.logger = init_logging(log_name="asl_workflow_engine")
    self.logger.info("Creating {} {}: {}".format(
        key, self.__class__.__name__, url))

    self.redis = None  # In case get_connection() fails, which will __del__ on exit.
    self.redis = RedisStore.get_connection(url, self.logger)

    # Get Redis version as an int, so 6.0.0 would be 600 to aid version tests.
    redis_version = self.redis.info("server").get("redis_version", "0")
    self.redis_version = int(redis_version.replace(".", ""))
    if self.redis_version < 600:
        self.logger.warning(
            "Server assisted client side caching is not "
            "supported in Redis version {}, read performance "
            "might be reduced.".format(redis_version))

    self.key = key
    # Enabling the cache is deferred until first call to get_cached_view().
    self.cache = None
    self.cache_size = cache_size
    self.daemon = daemon
    self.tracker_id = None

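# Hedged usage sketch of the prefix/namespace and cache behaviour described
# in the docstring above. The URL, prefix strings, item key and the exact
# get()/get_cached_view() signatures are illustrative assumptions.
executions = RedisStore("redis://localhost:6379", key="executions:")
history = RedisStore("redis://localhost:6379", key="history:", cache_size=0)

# Same index key, different stores: the key prefixes keep them separate.
item = executions.get("arn:aws:states:local:0123456789:execution:sm:1")
# With cache_size=0, get_cached_view() behaves exactly like get() or [].
same = history.get_cached_view("arn:aws:states:local:0123456789:execution:sm:1")
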
"Comment": "Trivial Child Step Function", "StartAt": "StartState", "States": { "StartState": { "Type": "Pass", "End": true } } }""" items = ['[{"category": "reference", "author": "Nigel Rees", "title": "Sayings of the Century", "price": 8.95}, {"category": "fiction", "author": "Evelyn Waugh", "title": "Sword of Honour", "price": 12.99}, {"category": "fiction", "author": "Herman Melville", "title": "Moby Dick", "isbn": "0-553-21311-3", "price": 8.99}, {"category": "fiction", "author": "J. R. R. Tolkien", "title": "The Lord of the Rings", "isbn": "0-395-19395-8", "price": 22.99}]'] if __name__ == '__main__': # Initialise logger logger = init_logging(log_name='iterate1') # Initialising OpenTracing. It's important to do this before the boto3.client # call as create_tracer "patches" boto3 to add the OpenTracing hooks. create_tracer("iterate1", {"implementation": "Jaeger"}) # Initialise the boto3 client setting the endpoint_url to our local # ASL Workflow Engine # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session.client sfn = boto3.client("stepfunctions", endpoint_url="http://localhost:4584") iterate1_state_machine_arn = "arn:aws:states:local:0123456789:stateMachine:iterate1_state_machine" child_state_machine_arn = "arn:aws:states:local:0123456789:stateMachine:child_state_machine" def create_state_machines(): # Create state machines using a dummy roleArn. If it already exists an # exception will be thrown, we ignore that but raise other exceptions.
} }""" items = [ '{"lambda":"Success"}', '{"lambda":"InternalErrorNotHandled"}', '{"lambda":"InternalErrorHandled"}', '{"lambda":"Timeout"}' ] items = ['{"lambda":"Success"}'] #items = ['{"lambda":"InternalErrorNotHandled"}'] #items = ['{"lambda":"InternalErrorHandled"}'] #items = ['{"lambda":"Timeout"}'] if __name__ == '__main__': # Initialise logger logger = init_logging(log_name='step_by_step') # Initialising OpenTracing. It's important to do this before the boto3.client # call as create_tracer "patches" boto3 to add the OpenTracing hooks. create_tracer("step_by_step", {"implementation": "Jaeger"}) # Initialise the boto3 client setting the endpoint_url to our local # ASL Workflow Engine # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session.client sfn = boto3.client("stepfunctions", endpoint_url="http://localhost:4584") caller_state_machine_arn = "arn:aws:states:local:0123456789:stateMachine:caller_state_machine" state_machine_arn = "arn:aws:states:local:0123456789:stateMachine:simple_state_machine" def create_state_machines(): # Create state machine using a dummy roleArn. If it already exists an
def __init__(self, name):
    super().__init__(name=name)  # Call Thread constructor
    # Initialise logger
    self.logger = init_logging(log_name=name)

The most important paths for state traversal are:
$$.State.Name = the current state
$$.StateMachine.Definition = (optional) contains the complete ASL state machine
$$.StateMachine.Id = a unique reference to an ASL state machine
"""

context = '{"StateMachine": {"Id": "arn:aws:states:local:0123456789:stateMachine:id_then_unzip", "Definition": ' + ASL + '}}'

items = [
    '{"data": {"zipfile": "s3://37199-dev/CFX/input-data/akismet.2.5.3.zip", "destination": "s3://37199-dev/CFX/processed-data"}, "context": ' + context + '}'
]

if __name__ == '__main__':
    # Initialise logger
    logger = init_logging(log_name='id_then_unzip')

    # Connect to event queue and send items.
    connection = Connection(
        "amqp://localhost:5672?connection_attempts=20&retry_delay=10&heartbeat=0"
    )
    try:
        connection.open()
        session = connection.session()
        producer = session.producer("asl_workflow_events")  # event queue

        for item in items:
            """
            Setting content_type isn't necessary for correct operation,
            however it is the correct thing to do:
            https://www.ietf.org/rfc/rfc4627.txt.
            """

                                    }
                                }
                            ]
                        }
                    }
                }
            ]
        }
    }
}"""

items = ['{"lambda":"Success"}']

if __name__ == '__main__':
    # Initialise logger
    logger = init_logging(log_name="parallel2")

    # Initialising OpenTracing. It's important to do this before the boto3.client
    # call as create_tracer "patches" boto3 to add the OpenTracing hooks.
    create_tracer("parallel2", {"implementation": "Jaeger"})

    # Initialise the boto3 client setting the endpoint_url to our local
    # ASL Workflow Engine
    # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session.client
    sfn = boto3.client("stepfunctions", endpoint_url="http://localhost:4584")

    state_machine_arn = "arn:aws:states:local:0123456789:stateMachine:parallel2"

    def create_state_machines():
        # Create state machine using a dummy roleArn. If it already exists an
        # exception will be thrown, we ignore that but raise other exceptions.
        try:

def create_tracer(service_name, config, use_asyncio=False):
    create_tracer.logger = init_logging(service_name)
    create_tracer.logger.info("Creating OpenTracing Tracer")

    # Store the default tracer in case creating the concrete implementation fails.
    tracer = opentracing.tracer

    if config.get("implementation") == "Jaeger":
        try:
            # Import deferred until Jaeger is selected in config.
            import jaeger_client
            import tornado.ioloop

            """
            If implementation is Jaeger, get the Jaeger config from the
            config dict if available; if not present create a sane default
            config.
            """
            jaeger_config = config.get("config")
            if not jaeger_config:
                jaeger_config = {
                    "sampler": {
                        "type": "const",
                        "param": 1
                    },
                    "logging": False
                }

            jaeger = jaeger_client.Config(
                service_name=config.get("service_name", service_name),
                config=jaeger_config,
            )

            """
            The init_logging(log_name="tornado") is important, though a bit
            obtuse. Without it all subsequent log messages generated will be
            duplicated. The issue is that "under the covers" Jaeger uses the
            tornado https://www.tornadoweb.org async networking library.
            Tornado's IOLoop creates a log handler if necessary when it is
            started, because if there were no handler configured you'd never
            see any of its event loop exception messages. The default handler
            is created for the root logger and ends up resulting in duplicate
            messages for other logs. By explicitly adding a handler for the
            tornado logger, as the following line does, the logging should be
            correctly handled. See:
            https://stackoverflow.com/questions/30373620/why-does-ioloop-in-tornado-seem-to-add-root-logger-handler
            """
            init_logging(log_name="tornado")

            """
            If we are using asyncio we want the tracer to use the main asyncio
            event loop rather than create a new ThreadLoop (which is the
            default behaviour unless a tornado IOLoop is passed). In recent
            versions of Tornado that delegates to the asyncio event loop, so
            getting the current tornado IOLoop and passing that to
            initialize_tracer will cause the tracer to use the main event loop.
            """
            if use_asyncio:
                jaeger.initialize_tracer(
                    io_loop=tornado.ioloop.IOLoop.current())
            else:
                jaeger.initialize_tracer()

            create_tracer.logger.info("Jaeger Tracer initialised")
        except Exception as e:
            create_tracer.logger.warning(
                "Failed to initialise Jaeger Tracer : {}".format(e))
            opentracing.tracer = tracer  # Fall back to the default tracer.

    patch_boto3()

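# Usage note, as the example scripts in this section show: create_tracer()
# must be called before boto3.client() because patch_boto3() adds the
# OpenTracing hooks to boto3. The service name below is illustrative.
create_tracer("my_example", {"implementation": "Jaeger"})
sfn = boto3.client("stepfunctions", endpoint_url="http://localhost:4584")
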
def __init__(self, state_machines):
    # Initialise logger
    self.logger = init_logging(log_name="subscribe_to_notifications")
    self.state_machines = state_machines

def __init__(self, configuration_file):
    """
    :param configuration_file: Path to coordinator configuration file
    :type configuration_file: str
    :raises IOError: If configuration file does not exist, or is not readable
    :raises ValueError: If configuration file does not contain valid JSON
    :raises AssertionError: If configuration file does not contain the
                            required fields
    """
    # Initialise logger
    self.logger = init_logging(log_name="asl_workflow_engine")

    # Load the configuration file.
    try:
        with open(configuration_file, "r") as fp:
            config = json.load(fp)
        self.logger.info("Creating WorkflowEngine")
    except IOError as e:
        self.logger.error("Unable to read configuration file: {}".format(
            configuration_file))
        raise
    except ValueError as e:
        self.logger.error("Configuration file does not contain valid JSON")
        raise

    # Provide defaults for any unset config key
    config["event_queue"] = config.get("event_queue", {})
    config["notifier"] = config.get("notifier", {})
    config["state_engine"] = config.get("state_engine", {})
    config["rest_api"] = config.get("rest_api", {})
    config["tracer"] = config.get("tracer", {})
    config["metrics"] = config.get("metrics", {})

    """
    Override config values if a field is set as an environment variable.
    There is also a USE_STRUCTURED_LOGGING environment variable used by the
    logger to select between automation friendly structured logging or more
    human readable "traditional" logs.
    """
    eq = config["event_queue"]
    eq["queue_name"] = os.environ.get("EVENT_QUEUE_QUEUE_NAME",
                                      eq.get("queue_name"))
    eq["instance_id"] = os.environ.get("EVENT_QUEUE_INSTANCE_ID",
                                       eq.get("instance_id"))
    eq["queue_type"] = os.environ.get("EVENT_QUEUE_QUEUE_TYPE",
                                      eq.get("queue_type"))
    eq["connection_url"] = os.environ.get("EVENT_QUEUE_CONNECTION_URL",
                                          eq.get("connection_url"))
    eq["connection_options"] = os.environ.get(
        "EVENT_QUEUE_CONNECTION_OPTIONS", eq.get("connection_options"))
    eq["shared_event_consumer_capacity"] = os.environ.get(
        "EVENT_QUEUE_SHARED_EVENT_CONSUMER_CAPACITY",
        eq.get("shared_event_consumer_capacity"))
    eq["instance_event_consumer_capacity"] = os.environ.get(
        "EVENT_QUEUE_INSTANCE_EVENT_CONSUMER_CAPACITY",
        eq.get("instance_event_consumer_capacity"))
    eq["reply_to_consumer_capacity"] = os.environ.get(
        "EVENT_QUEUE_REPLY_TO_CONSUMER_CAPACITY",
        eq.get("reply_to_consumer_capacity"))

    no = config["notifier"]
    no["topic"] = os.environ.get("NOTIFIER_TOPIC", no.get("topic"))

    se = config["state_engine"]
    se["store_url"] = os.environ.get("STATE_ENGINE_STORE_URL",
                                     se.get("store_url"))
    se["execution_ttl"] = os.environ.get("STATE_ENGINE_EXECUTION_TTL",
                                         se.get("execution_ttl", 86400))

    ra = config["rest_api"]
    ra["host"] = os.environ.get("REST_API_HOST", ra.get("host"))
    ra["port"] = int(os.environ.get("REST_API_PORT", ra.get("port")))
    ra["region"] = os.environ.get("REST_API_REGION", ra.get("region"))

    tr = config["tracer"]
    tr["implementation"] = os.environ.get("TRACER_IMPLEMENTATION",
                                          tr.get("implementation", "None"))
    # The Jaeger specific env vars are derived from this document:
    # https://www.jaegertracing.io/docs/1.22/client-features/
    sampler = tr["config"]["sampler"]
    sampler["type"] = os.environ.get("JAEGER_SAMPLER_TYPE",
                                     sampler.get("type"))
    sampler["param"] = os.environ.get("JAEGER_SAMPLER_PARAM",
                                      sampler.get("param"))

    metrics = config["metrics"]
    metrics["implementation"] = os.environ.get(
        "METRICS_IMPLEMENTATION", metrics.get("implementation", "None"))
    metrics["namespace"] = os.environ.get("METRICS_NAMESPACE",
                                          metrics.get("namespace", ""))

    """
    Initialise opentracing.tracer before creating the StateEngine,
    EventDispatcher and RestAPI instances. Call asyncio.get_event_loop()
    here, because if we are using asyncio we want the tracer to use the
    main asyncio event loop rather than create a new ThreadLoop, which is
    the default behaviour unless a tornado IOLoop is passed. In recent
    versions of Tornado that delegates to the asyncio event loop.
    """
    if eq["queue_type"].endswith("-asyncio"):
        # Attempt to use uvloop libuv based event loop if available
        # https://github.com/MagicStack/uvloop
        try:
            import uvloop
            uvloop.install()
            self.logger.info("Using uvloop asyncio event loop")
        except:
            # Fall back to standard library asyncio epoll event loop
            self.logger.info("Using standard library asyncio event loop")

        loop = asyncio.get_event_loop()
        create_tracer("asl_workflow_engine", config["tracer"],
                      use_asyncio=True)
    else:
        create_tracer("asl_workflow_engine", config["tracer"])

    self.state_engine = StateEngine(config)
    self.event_dispatcher = EventDispatcher(self.state_engine, config)
    self.config = config

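# Hedged sketch of the shape of configuration file this constructor expects,
# inferred from the keys read above. The values shown (URLs, names, topic)
# are illustrative assumptions, not the project's shipped defaults.
example_config = {
    "event_queue": {
        "queue_name": "asl_workflow_events",
        "queue_type": "AMQP-0.9.1",
        "connection_url": "amqp://localhost:5672"
    },
    "notifier": {"topic": "asl_workflow_notifications"},
    "state_engine": {"store_url": "ASL_store.json", "execution_ttl": 86400},
    "rest_api": {"host": "0.0.0.0", "port": 4584, "region": "local"},
    "tracer": {
        "implementation": "Jaeger",
        "config": {"sampler": {"type": "const", "param": 1}}
    },
    "metrics": {"implementation": "Prometheus", "namespace": ""}
}
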
} }, "End": true } } }""" items = [""" { "items": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] } """] if __name__ == '__main__': # Initialise logger logger = init_logging(log_name="map2") # Initialising OpenTracing. It's important to do this before the boto3.client # call as create_tracer "patches" boto3 to add the OpenTracing hooks. create_tracer("map2", {"implementation": "Jaeger"}) # Initialise the boto3 client setting the endpoint_url to our local # ASL Workflow Engine # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session.client sfn = boto3.client("stepfunctions", endpoint_url="http://localhost:4584") state_machine_arn = "arn:aws:states:local:0123456789:stateMachine:map2" def create_state_machines(): # Create state machine using a dummy roleArn. If it already exists an # exception will be thrown, we ignore that but raise other exceptions. try:
def __init__(self, state_engine, config):
    """
    :param state_engine: The StateEngine to which this EventDispatcher
                         delivers events
    :type state_engine: StateEngine
    :param config: Configuration dictionary
    :type config: dict
    """
    self.logger = init_logging(log_name="asl_workflow_engine")
    self.logger.info("Creating EventDispatcher, using {} JSON parser".format(
        json.__name__))

    self.queue_config = config["event_queue"]  # TODO Handle missing config
    self.notifier_config = config["notifier"]  # TODO Handle missing config
    # TODO validate that config contains the keys we need.

    self.queue_name = self.queue_config.get("queue_name",
                                            "asl_workflow_events")
    instance_id = self.queue_config.get("instance_id", "")
    self.instance_queue_name = self.queue_name + "-" + instance_id

    # Get consumer capacities from config as numbers or numeric strings
    capacity = self.queue_config.get("shared_event_consumer_capacity", 1000)
    self.shared_event_consumer_capacity = int(float(capacity))
    capacity = self.queue_config.get("instance_event_consumer_capacity", 1000)
    self.instance_event_consumer_capacity = int(float(capacity))

    """
    Create an association with the state engine and give that a reference
    back to this event dispatcher so that it can publish events and make
    use of the set_timeout time scheduler.
    """
    self.state_engine = state_engine
    self.state_engine.event_dispatcher = self

    """
    Incoming messages should only be acknowledged when they have been fully
    processed by the StateEngine, to allow the messaging fabric to redeliver
    a message upon failure. Because processing might take some time, due to
    waiting for Task responses and the Wait and Parallel states, we must
    retain any unacknowledged messages. The message count is a simple one-up
    number used as a key.
    """
    self.unacknowledged_messages = {}
    self.shared_event_consumer_unack_count = 0
    self.instance_event_consumer_unack_count = 0
    self.message_count = 0

    self.heartbeat_count = 0

    """
    Connection Factory for the event queue. The idea is to eventually allow
    the ASL workflow engine to connect to alternative event queue
    implementations in order to allow maximum flexibility.
    """
    name = (
        self.queue_config.get("queue_type", "AMQP-0.9.1")
        .lower()
        .replace("-", "_")
        .replace(".", "_")
    )
    if name.endswith("_asyncio"):
        self.name = "asl_workflow_engine." + name[:-8] + "_messaging_asyncio"
    else:
        self.name = "asl_workflow_engine." + name + "_messaging"
    self.logger.info("Loading messaging module {}".format(self.name))

    # Load the module whose name is derived from the specified queue_type.
    try:
        messaging = importlib.import_module(self.name)
        globals()["Connection"] = messaging.Connection
        globals()["Message"] = messaging.Message
    except ImportError as e:
        self.logger.error(e)
        sys.exit(1)

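# Worked example of the queue_type to module-name mapping used above. It
# reproduces the string transformation only; whether a given module actually
# exists in the package is not asserted here.
def messaging_module_name(queue_type="AMQP-0.9.1"):
    name = queue_type.lower().replace("-", "_").replace(".", "_")
    if name.endswith("_asyncio"):
        return "asl_workflow_engine." + name[:-8] + "_messaging_asyncio"
    return "asl_workflow_engine." + name + "_messaging"

assert messaging_module_name("AMQP-0.9.1") == "asl_workflow_engine.amqp_0_9_1_messaging"
assert messaging_module_name("AMQP-0.9.1-asyncio") == (
    "asl_workflow_engine.amqp_0_9_1_messaging_asyncio")
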
} }""" items = [ '{"lambda":"Success"}', '{"lambda":"InternalErrorNotHandled"}', '{"lambda":"InternalErrorHandled"}', '{"lambda":"Timeout"}' ] #items = ['{"lambda":"Success"}'] #items = ['{"lambda":"InternalErrorNotHandled"}'] #items = ['{"lambda":"InternalErrorHandled"}'] #items = ['{"lambda":"Timeout"}'] if __name__ == '__main__': # Initialise logger logger = init_logging(log_name='simple_state_machine2') # Initialising OpenTracing. It's important to do this before the boto3.client # call as create_tracer "patches" boto3 to add the OpenTracing hooks. create_tracer("simple_state_machine2", {"implementation": "Jaeger"}) # Initialise the boto3 client setting the endpoint_url to our local # ASL Workflow Engine # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session.client sfn = boto3.client("stepfunctions", endpoint_url="http://localhost:4584") state_machine_arn = "arn:aws:states:local:0123456789:stateMachine:simple_state_machine" def create_state_machines(): # Create state machine using a dummy roleArn. If it already exists an # exception will be thrown, we ignore that but raise other exceptions. try:
def __init__(self, name):
    self.name = name
    # Initialise logger
    self.logger = init_logging(log_name=name)
    self.count = 0

"Type": "Pass", "End": true }, "WaitState": { "Type": "Wait", "Seconds":10, "Next": "EndState" } } }""" items = ['{"lambda":"Success"}'] if __name__ == '__main__': # Initialise logger logger = init_logging(log_name='error_handling1') # Initialising OpenTracing. It's important to do this before the boto3.client # call as create_tracer "patches" boto3 to add the OpenTracing hooks. create_tracer("error_handling1", {"implementation": "Jaeger"}) # Initialise the boto3 client setting the endpoint_url to our local # ASL Workflow Engine # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session.client sfn = boto3.client("stepfunctions", endpoint_url="http://localhost:4584") state_machine_arn = "arn:aws:states:local:0123456789:stateMachine:error_handling_state_machine" def create_state_machines(): # Create state machine using a dummy roleArn. If it already exists an # exception will be thrown, we ignore that but raise other exceptions. try:
def __init__(self, state_engine, config):
    """
    :param state_engine: The StateEngine calling this TaskDispatcher
    :type state_engine: StateEngine
    :param config: Configuration dictionary
    :type config: dict
    """
    self.logger = init_logging(log_name="asl_workflow_engine")
    self.logger.info("Creating TaskDispatcher, using {} JSON parser".format(
        json.__name__))

    """
    Get the messaging peer.address, e.g. the Broker address, for use by
    OpenTracing later.
    """
    queue_config = config.get("event_queue", {})
    peer_address_string = queue_config.get("connection_url",
                                           "amqp://localhost:5672")
    try:
        parsed_peer_address = urllib.parse.urlparse(peer_address_string)
    except Exception as e:
        self.logger.error(
            "Invalid peer address found: {}".format(peer_address_string),
            exc_info=e
        )
        raise e

    self.peer_address = parsed_peer_address.hostname
    if parsed_peer_address.port:
        self.peer_address = self.peer_address + ":" + str(parsed_peer_address.port)
    if parsed_peer_address.scheme:
        self.peer_address = parsed_peer_address.scheme + "://" + self.peer_address

    # Get the channel capacity/prefetch value for the reply_to consumer
    capacity = queue_config.get("reply_to_consumer_capacity", 100)
    self.reply_to_capacity = int(float(capacity))  # Handles numbers or strings

    self.state_engine = state_engine

    """
    Some services that we integrate Task States with might be request/
    response in the ASL sense, as in we would want to not move to the next
    state until the response has been received, but they might be async in
    implementation terms, such as the case for rpcmessage. This is
    conceptually RPC like, so logically behaves like a Lambda call, but is
    implemented over a messaging fabric with queues. In order to deal with
    this we need to be able to associate requests with their subsequent
    responses, so this pending_requests dictionary maps requests to their
    callbacks using correlation IDs.
    """
    self.pending_requests = {}

    """
    Prometheus metrics intended to emulate Stepfunction CloudWatch metrics.
    https://docs.aws.amazon.com/step-functions/latest/dg/procedure-cw-metrics.html
    """
    self.task_metrics = {}
    metrics_config = config.get("metrics", {})
    if metrics_config.get("implementation", "") == "Prometheus":
        ns = metrics_config.get("namespace", "")
        ns = ns + "_" if ns else ""
        self.task_metrics = {
            "LambdaFunctionTime": Summary(
                ns + "LambdaFunctionTime",
                "The interval, in milliseconds, between the time the " +
                "Lambda function is scheduled and the time it closes."
            ),
            "LambdaFunctionsFailed": Counter(
                ns + "LambdaFunctionsFailed",
                "The number of failed Lambda functions."
            ),
            "LambdaFunctionsScheduled": Counter(
                ns + "LambdaFunctionsScheduled",
                "The number of scheduled Lambda functions."
            ),
            "LambdaFunctionsSucceeded": Counter(
                ns + "LambdaFunctionsSucceeded",
                "The number of successfully completed Lambda functions."
            ),
            "LambdaFunctionsTimedOut": Counter(
                ns + "LambdaFunctionsTimedOut",
                "The number of Lambda functions that time out on close."
            ),
            #"ServiceIntegrationTime": Summary(
            #    ns + "ServiceIntegrationTime",
            #    "The interval, in milliseconds, between the time the " +
            #    "Service Task is scheduled and the time it closes."
            #),
            #"ServiceIntegrationsFailed": Counter(
            #    ns + "ServiceIntegrationsFailed",
            #    "The number of failed Service Tasks."
            #),
            "ServiceIntegrationsScheduled": Counter(
                ns + "ServiceIntegrationsScheduled",
                "The number of scheduled Service Tasks."
            ),
            "ServiceIntegrationsSucceeded": Counter(
                ns + "ServiceIntegrationsSucceeded",
                "The number of successfully completed Service Tasks."
), #"ServiceIntegrationsTimedOut": Counter( # ns + "ServiceIntegrationsTimedOut", # "The number of Service Tasks that time out on close." #) }