def test_workflow_events(self):
    wf = Workflow(self.gbdx)
    workflow_id = '4347109104758907277'
    events = wf.events(workflow_id)
    assert isinstance(events, list)
    assert len(events) > 0
    for event in events:
        assert 'task' in event
        assert 'state' in event
        assert 'event' in event
        assert 'timestamp' in event
        assert 'when' in event
        assert 'note' in event
        assert event['state'] in ['pending', 'running', 'complete']
        assert event['event'] in ['submitted', 'scheduled', 'rescheduling',
                                  'started', 'succeeded', 'failed', 'timedout']
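# For reference, the assertions above imply that each workflow event is a
# dict with at least the keys checked in the test. The values in this example
# are illustrative only (invented for this sketch), not actual API output.
example_event = {
    'task': 'Task1',          # task name (hypothetical)
    'state': 'running',       # 'pending', 'running', or 'complete'
    'event': 'started',       # 'submitted', 'scheduled', 'rescheduling',
                              # 'started', 'succeeded', 'failed', or 'timedout'
    'timestamp': '2016-08-25T17:31:27+0000',  # format assumed, not verified
    'when': 'a few seconds ago',              # human-readable offset (assumed)
    'note': '',                               # free-form note; may be empty
}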
import uuid

# The flattened source references WF (the low-level GBDX workflow API client)
# and WorkflowError without defining them; in gbdxtools they come from the
# surrounding module. They are supplied here so the snippet is self-contained.
from gbdxtools.workflow import Workflow as WF


class WorkflowError(Exception):
    pass


class Workflow(object):
    def __init__(self, tasks, **kwargs):
        self.workflow = WF()
        self.name = kwargs.get('name', str(uuid.uuid4())[:8])
        self.id = None
        self.callback = kwargs.get('callback', None)
        self.definition = None
        self.tasks = tasks

        batch_values = []
        for task in self.tasks:
            if task.batch_values:
                batch_values.extend(task.batch_values)
        self.batch_values = batch_values if batch_values else None

    def savedata(self, output, location=None):
        '''
        Save output data from any task in this workflow to S3.

        Args:
            output: Reference to a task output (e.g. task.outputs.output1).
            location (optional): Subfolder under which the output will be saved.
                It will be placed under the account directory in the
                gbd-customer-data bucket:
                s3://gbd-customer-data/{account_id}/{location}
                Leave blank to save to:
                workflow_output/{workflow_id}/{task_name}/{port_name}

        Returns:
            None
        '''
        output.persist = True
        if location:
            output.persist_location = location

    def workflow_skeleton(self):
        '''Return the minimal workflow definition dict.'''
        return {
            "tasks": [],
            "name": self.name
        }

    def list_workflow_outputs(self):
        '''
        Get a list of outputs from the workflow that are saved to S3.
        To get resolved locations, call workflow status.

        Args:
            None

        Returns:
            list
        '''
        workflow_outputs = []
        for task in self.tasks:
            for output_port_name in task.outputs._portnames:
                if task.outputs.__getattribute__(output_port_name).persist:
                    workflow_outputs.append(task.name + ':' + output_port_name)
        return workflow_outputs

    def generate_workflow_description(self):
        '''
        Generate the workflow definition for launching the workflow against
        the GBDX API.

        Args:
            None

        Returns:
            The workflow definition as a dict, ready to be serialized to JSON.
        '''
        if not self.tasks:
            raise WorkflowError('Workflow contains no tasks, and cannot be executed.')

        self.definition = self.workflow_skeleton()

        if self.batch_values:
            self.definition["batch_values"] = self.batch_values

        all_input_port_values = [
            t.inputs.__getattribute__(input_port_name).value
            for t in self.tasks
            for input_port_name in t.inputs._portnames
        ]

        for task in self.tasks:
            # Only include multiplex output ports in this task if other tasks
            # refer to them in their inputs:
            #   1. find the multiplex output port names in this task
            #   2. see if they are referred to in any other task's inputs
            #   3. if not, exclude them from the workflow definition
            output_multiplex_ports_to_exclude = []
            multiplex_output_port_names = [
                portname for portname in task.outputs._portnames
                if task.outputs.__getattribute__(portname).is_multiplex
            ]
            for p in multiplex_output_port_names:
                output_port_reference = 'source:' + task.name + ':' + p
                if output_port_reference not in all_input_port_values:
                    output_multiplex_ports_to_exclude.append(p)

            task_def = task.generate_task_workflow_json(
                output_multiplex_ports_to_exclude=output_multiplex_ports_to_exclude)
            self.definition['tasks'].append(task_def)

        if self.callback:
            self.definition['callback'] = self.callback

        return self.definition

    def execute(self):
        '''
        Execute the workflow.

        Args:
            None

        Returns:
            Workflow ID
        '''
        self.generate_workflow_description()

        # Hit the batch workflow endpoint if there are batch values;
        # otherwise use the regular workflow endpoint.
        if self.batch_values:
            self.id = self.workflow.launch_batch_workflow(self.definition)
        else:
            self.id = self.workflow.launch(self.definition)

        return self.id

    @property
    def task_ids(self):
        '''
        Get the task IDs of a running workflow.

        Args:
            None

        Returns:
            List of task IDs
        '''
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot get task IDs.')
        if self.batch_values:
            raise NotImplementedError("Query each workflow ID within the batch workflow for task IDs.")

        wf = self.workflow.get(self.id)
        return [task['id'] for task in wf['tasks']]

    @task_ids.setter
    def task_ids(self, value):
        raise NotImplementedError("Cannot set workflow task_ids, readonly.")

    def cancel(self):
        '''
        Cancel a running workflow.

        Args:
            None

        Returns:
            None
        '''
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot cancel.')

        if self.batch_values:
            self.workflow.batch_workflow_cancel(self.id)
        else:
            self.workflow.cancel(self.id)

    @property
    def status(self):
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot check status.')

        if self.batch_values:
            status = self.workflow.batch_workflow_status(self.id)
        else:
            status = self.workflow.status(self.id)
        return status

    @status.setter
    def status(self, value):
        raise NotImplementedError("Cannot set workflow status, readonly.")

    @property
    def events(self):
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot check events.')
        if self.batch_values:
            raise NotImplementedError("Query each workflow ID within the batch workflow for events.")
        return self.workflow.events(self.id)

    @events.setter
    def events(self, value):
        raise NotImplementedError("Cannot set workflow events, readonly.")

    @property
    def complete(self):
        if not self.id:
            return False

        # For a batch workflow, check whether all sub-workflows are done,
        # failed, or timed out.
        if self.batch_values:
            return all(workflow.get("state") in ["succeeded", "failed", "timedout"]
                       for workflow in self.status['workflows'])
        else:
            return self.status['state'] == 'complete'

    @complete.setter
    def complete(self, value):
        raise NotImplementedError("Cannot set workflow complete, readonly.")

    @property
    def failed(self):
        if not self.id:
            return False
        if self.batch_values:
            raise NotImplementedError("Query each workflow ID within the batch workflow for current state.")
        status = self.status
        return status['state'] == 'complete' and status['event'] == 'failed'

    @failed.setter
    def failed(self, value):
        raise NotImplementedError("Cannot set workflow failed, readonly.")

    @property
    def canceled(self):
        if not self.id:
            return False
        if self.batch_values:
            raise NotImplementedError("Query each workflow ID within the batch workflow for current state.")
        status = self.status
        return status['state'] == 'complete' and status['event'] == 'canceled'

    @canceled.setter
    def canceled(self, value):
        raise NotImplementedError("Cannot set workflow canceled, readonly.")

    @property
    def succeeded(self):
        if not self.id:
            return False

        # For a batch workflow, check whether all sub-workflows succeeded.
        if self.batch_values:
            return all(workflow.get("state") == "succeeded"
                       for workflow in self.status['workflows'])

        status = self.status
        return status['state'] == 'complete' and status['event'] == 'succeeded'

    @succeeded.setter
    def succeeded(self, value):
        raise NotImplementedError("Cannot set workflow succeeded, readonly.")

    @property
    def running(self):
        if not self.id:
            return False

        # For a batch workflow, check whether any sub-workflows are still running.
        if self.batch_values:
            return any(workflow.get("state") not in ["succeeded", "failed", "timedout"]
                       for workflow in self.status['workflows'])

        status = self.status
        return status['state'] == 'running' and status['event'] == 'started'

    @running.setter
    def running(self, value):
        raise NotImplementedError("Cannot set workflow running, readonly.")

    @property
    def timedout(self):
        if not self.id:
            return False
        if self.batch_values:
            raise NotImplementedError("Query each workflow ID within the batch workflow for current state.")
        status = self.status
        return status['state'] == 'complete' and status['event'] == 'timedout'

    @timedout.setter
    def timedout(self, value):
        raise NotImplementedError("Cannot set workflow timedout, readonly.")

    @property
    def stdout(self):
        '''
        Get stdout from all the tasks of a workflow.

        Args:
            None

        Returns:
            List of tasks with their stdout, formatted like this:
            [
                {
                    "id": "4488895771403082552",
                    "taskType": "AOP_Strip_Processor",
                    "name": "Task1",
                    "stdout": "............"
                }
            ]
        '''
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot get stdout.')
        if self.batch_values:
            raise NotImplementedError("Query each workflow ID within the batch workflow for stdout.")

        wf = self.workflow.get(self.id)

        stdout_list = []
        for task in wf['tasks']:
            stdout_list.append({
                'id': task['id'],
                'taskType': task['taskType'],
                'name': task['name'],
                'stdout': self.workflow.get_stdout(self.id, task['id'])
            })
        return stdout_list

    @stdout.setter
    def stdout(self, value):
        raise NotImplementedError("Cannot set workflow stdout, readonly.")

    @property
    def stderr(self):
        '''
        Get stderr from all the tasks of a workflow.

        Args:
            None

        Returns:
            List of tasks with their stderr, formatted like this:
            [
                {
                    "id": "4488895771403082552",
                    "taskType": "AOP_Strip_Processor",
                    "name": "Task1",
                    "stderr": "............"
                }
            ]
        '''
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot get stderr.')
        if self.batch_values:
            raise NotImplementedError("Query each workflow ID within the batch workflow for stderr.")

        wf = self.workflow.get(self.id)

        stderr_list = []
        for task in wf['tasks']:
            stderr_list.append({
                'id': task['id'],
                'taskType': task['taskType'],
                'name': task['name'],
                'stderr': self.workflow.get_stderr(self.id, task['id'])
            })
        return stderr_list

    @stderr.setter
    def stderr(self, value):
        raise NotImplementedError("Cannot set workflow stderr, readonly.")