def test_workflow_search(self):
    """test gbdx.workflow.search(lookback_h=<hours>, state=<state>, owner=<owner>)"""
    wf = Workflow()
    output = wf.search(lookback_h=12, state='all')
    # Bug fix: the original `self.assertTrue(len(output), 0)` passed 0 as the
    # *msg* argument, so the assertion always passed (any non-empty first arg
    # is truthy and len() is never compared to anything). Assert that the
    # search actually returned results.
    self.assertTrue(len(output) > 0)
def test_describe_tasks():
    """Listing tasks and describing the first one yields a populated dict."""
    wf = Workflow(gbdx)
    task_listing = wf.list_tasks()
    assert len(task_listing) > 0
    description = wf.describe_task(task_listing['tasks'][0])
    assert isinstance(description, dict)
    assert len(description['description']) > 0
def test_batch_workflows(self):
    """
    tests all 3 endpoints for batch workflows, create, fetch, and cancel
    :return:
    """
    wf = Workflow()
    json_path = os.path.join(self.data_path, "batch_workflow.json")
    with open(json_path) as json_file:
        self.batch_workflow_json = json.loads(json_file.read())

    # Create, then verify the returned id round-trips through the status endpoint.
    batch_workflow_id = wf.launch_batch_workflow(self.batch_workflow_json)
    batch_workflow_status = wf.batch_workflow_status(batch_workflow_id)
    self.assertEqual(batch_workflow_id, batch_workflow_status.get("batch_workflow_id"))

    # Cancel, then confirm every sub-workflow is canceled or on its way there.
    batch_workflow_status = wf.batch_workflow_cancel(batch_workflow_id)
    for workflow in batch_workflow_status.get('workflows'):
        self.assertIn(workflow.get('state'), ["canceling", "canceled"])
def test_describe_tasks(self):
    """Listing tasks and describing the first one yields a populated dict."""
    wf = Workflow(self.gbdx)
    task_listing = wf.list_tasks()
    self.assertTrue(len(task_listing) > 0)
    description = wf.describe_task(task_listing['tasks'][0])
    self.assertIsInstance(description, dict)
    self.assertTrue(len(description['description']) > 0)
def test_task_get_stdout(self):
    """test gbdx.workflows.get_stdout(<workflow_id>,<task_id>)"""
    workflow_id, task_id = '4488969848362445219', '4488969848354891944'
    wf = Workflow(self.gbdx)
    stdout = wf.get_stdout(workflow_id, task_id)
    self.assertTrue(len(stdout) > 0)
def test_task_get_stderr(self):
    """test gbdx.workflows.get_stdout(<workflow_id>,<task_id>)"""
    wf = Workflow()
    output = wf.get_stderr('4488969848362445219', '4488969848354891944')
    # Fix: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual, consistent with the sibling test below.
    self.assertEqual('<empty>', output)
def test_task_get_stdout(self):
    """test gbdx.workflows.get_stdout(<workflow_id>,<task_id>)"""
    workflow_id, task_id = '4488969848362445219', '4488969848354891944'
    wf = Workflow()
    stdout = wf.get_stdout(workflow_id, task_id)
    self.assertTrue(len(stdout) > 0)
def test_task_get_stderr(self):
    """test gbdx.workflows.get_stdout(<workflow_id>,<task_id>)"""
    workflow_id, task_id = '4488969848362445219', '4488969848354891944'
    wf = Workflow()
    stderr = wf.get_stderr(workflow_id, task_id)
    self.assertEqual('<empty>', stderr)
def test_workflow_get(self):
    """test gbdx.workflows.get(<workflow_id>)"""
    wf = Workflow(self.gbdx)
    output = wf.get('4488969848362445219')
    # Every workflow record must expose these top-level fields.
    for key in ('id', 'owner', 'submitted_time', 'state', 'callback', 'tasks'):
        self.assertIn(key, output)
def test_workflow_get(self):
    """test gbdx.workflows.get(<workflow_id>)"""
    wf = Workflow()
    output = wf.get('4488969848362445219')
    # Every workflow record must expose these top-level fields.
    for key in ('id', 'owner', 'submitted_time', 'state', 'callback', 'tasks'):
        self.assertIn(key, output)
def test_workflow_events():
    """Each event carries the expected fields and valid state/event values."""
    wf = Workflow(gbdx)
    events = wf.events('4347109104758907277')
    assert isinstance(events, list)
    assert len(events) > 0
    required_fields = ('task', 'state', 'event', 'timestamp', 'when', 'note')
    valid_states = ['pending', 'running', 'complete']
    valid_events = ['submitted', 'scheduled', 'rescheduling', 'started',
                    'succeeded', 'failed', 'timedout']
    for event in events:
        for field in required_fields:
            assert field in event
        assert event['state'] in valid_states
        assert event['event'] in valid_events
def test_workflow_events(self):
    """Each event carries the expected fields and valid state/event values."""
    wf = Workflow(self.gbdx)
    events = wf.events('4347109104758907277')
    assert isinstance(events, list)
    assert len(events) > 0
    required_fields = ('task', 'state', 'event', 'timestamp', 'when', 'note')
    valid_states = ['pending', 'running', 'complete']
    valid_events = ['submitted', 'scheduled', 'rescheduling', 'started',
                    'succeeded', 'failed', 'timedout']
    for event in events:
        for field in required_fields:
            assert field in event
        assert event['state'] in valid_states
        assert event['event'] in valid_events
def test_workflow_callback_is_retrieved_in_workflow_status(self):
    """Verify we can set task timeouts, it appears in the json, and launching a workflow works"""
    callback_url = 'http://requestb.in/qg8wzqqg'
    aoptask = self.gbdx.Task("AOP_Strip_Processor", data='testing')

    # launch a workflow and verify it launches:
    w = self.gbdx.Workflow([aoptask], callback=callback_url)
    w.execute()

    # The callback must survive the round trip through the workflow API.
    wf_api = WorkflowAPI(self.gbdx)
    assert wf_api.get(w.id)['callback'] == callback_url
def test_workflow_callback_is_retrieved_in_workflow_status(self):
    """Verify we can set task timeouts, it appears in the json, and launching a workflow works"""
    callback_url = 'http://requestb.in/qg8wzqqg'
    aoptask = self.gbdx.Task("AOP_Strip_Processor", data='testing')

    # launch a workflow and verify it launches:
    w = self.gbdx.Workflow([aoptask], callback=callback_url)
    w.execute()

    # The callback must survive the round trip through the workflow API.
    wf_api = WorkflowAPI()
    assert wf_api.get(w.id)['callback'] == callback_url
def __init__(self, **kwargs):
    """Authenticate and assemble the GBDX service clients on this interface."""
    interface = Auth(**kwargs)
    self.gbdx_connection = interface.gbdx_connection
    self.root_url = interface.root_url
    self.logger = interface.logger

    # Instantiated service clients.
    self.s3 = S3()                        # GBDX s3 client
    self.ordering = Ordering()            # GBDX Ordering client
    self.catalog = Catalog()              # GBDX Catalog client
    self.workflow = Workflow()            # GBDX Workflow client
    self.idaho = Idaho()                  # Idaho client
    self.vectors = Vectors()
    self.task_registry = TaskRegistry()

    # Image classes exposed as attributes (not instantiated here).
    self.catalog_image = CatalogImage
    self.idaho_image = IdahoImage
def __init__(self, **kwargs):
    """Authenticate and assemble the GBDX service clients on this interface."""
    interface = Auth(**kwargs)
    self.gbdx_connection = interface.gbdx_connection
    self.root_url = interface.root_url
    self.logger = interface.logger

    # Instantiated service clients.
    self.s3 = S3()                        # GBDX s3 client
    self.ordering = Ordering()            # GBDX Ordering client
    self.catalog = Catalog()              # GBDX Catalog client
    self.workflow = Workflow()            # GBDX Workflow client
    self.idaho = Idaho()                  # Idaho client
    self.vectors = Vectors()
    self.task_registry = TaskRegistry()

    # Image classes exposed as attributes (not instantiated here).
    self.catalog_image = CatalogImage
    self.idaho_image = IdahoImage
    self.landsat_image = LandsatImage
    self.sentinel2 = Sentinel2
    self.tms_image = TmsImage
    self.dem_image = DemImage
    self.wv03_vnir = WV03_VNIR
    self.wv02 = WV02
    self.ge01 = GE01
    self.s3_image = S3Image
def test_batch_workflows(self):
    """
    tests all 3 endpoints for batch workflows, create, fetch, and cancel
    :return:
    """
    wf = Workflow(self.gbdx)
    json_path = os.path.join(self.data_path, "batch_workflow.json")
    with open(json_path) as json_file:
        self.batch_workflow_json = json.loads(json_file.read())

    # Create, then verify the returned id round-trips through the status endpoint.
    batch_workflow_id = wf.launch_batch_workflow(self.batch_workflow_json)
    batch_workflow_status = wf.batch_workflow_status(batch_workflow_id)
    self.assertEqual(batch_workflow_id, batch_workflow_status.get("batch_workflow_id"))

    # Cancel, then confirm every sub-workflow is canceled or on its way there.
    batch_workflow_status = wf.batch_workflow_cancel(batch_workflow_id)
    for workflow in batch_workflow_status.get('workflows'):
        self.assertIn(workflow.get('state'), ["canceling", "canceled"])
def __init__(self, **kwargs):
    """Build an authenticated GBDX session and attach the service clients.

    Kwargs:
        host (str): API host; defaults to 'geobigdata.io'.
        username, password, client_id, client_secret (str): explicit
            credentials; all four must be present to be used.
        gbdx_connection: a pre-built connection object (testing hook).
        config_file (str): path to a gbdx .ini config file; used as the
            fallback when no other credentials are supplied.
    """
    host = kwargs.get('host') if kwargs.get('host') else 'geobigdata.io'
    self.root_url = 'https://%s' % host

    if (kwargs.get('username') and kwargs.get('password') and
            kwargs.get('client_id') and kwargs.get('client_secret')):
        self.gbdx_connection = gbdx_auth.session_from_kwargs(**kwargs)
    elif kwargs.get('gbdx_connection'):
        # Pass in a custom gbdx connection object, for testing purposes
        self.gbdx_connection = kwargs.get('gbdx_connection')
    else:
        # This will throw an exception if your .ini file is not set properly
        self.gbdx_connection = gbdx_auth.get_session(kwargs.get('config_file'))

    # create a logger
    # for now, just log to the console. We'll replace all the 'print' statements
    # with at least logger.info or logger.debug statements
    # later, we can log to a service, file, or some other aggregator
    self.logger = logging.getLogger('gbdxtools')
    self.logger.setLevel(logging.ERROR)
    # Bug fix: logging.getLogger('gbdxtools') returns a process-wide
    # singleton, so unconditionally adding a StreamHandler here duplicated
    # every log line each time an Interface was constructed. Only attach a
    # handler if the logger does not already have one.
    if not self.logger.handlers:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.ERROR)
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        console_handler.setFormatter(formatter)
        self.logger.addHandler(console_handler)
    self.logger.info('Logger initialized')

    # create and store an instance of the GBDX s3 client
    self.s3 = S3(self)
    # create and store an instance of the GBDX Ordering Client
    self.ordering = Ordering(self)
    # create and store an instance of the GBDX Catalog Client
    self.catalog = Catalog(self)
    # create and store an instance of the GBDX Workflow Client
    self.workflow = Workflow(self)
    # create and store an instance of the Idaho Client
    self.idaho = Idaho(self)
    self.vectors = Vectors(self)
    self.task_registry = TaskRegistry(self)
def __init__(self, tasks, **kwargs):
    """Wrap a list of tasks as a client-side workflow."""
    self.workflow = WF()
    self.name = kwargs.get('name', str(uuid.uuid4())[:8])
    self.id = None
    self.callback = kwargs.get('callback', None)
    self.definition = None
    self.tasks = tasks

    # Gather batch values declared on any task; None signals a
    # non-batch workflow.
    collected = [value
                 for task in self.tasks if task.batch_values
                 for value in task.batch_values]
    self.batch_values = collected or None
def test_init(self):
    """Constructing the client wires up s3 and the gbdx connection."""
    wf = Workflow(self.gbdx)
    self.assertIsInstance(wf, Workflow)
    self.assertIsNotNone(wf.s3)
    self.assertIsNotNone(wf.gbdx_connection)
def test_list_tasks():
    """The task registry listing includes the HelloGBDX sample task."""
    wf = Workflow(gbdx)
    task_listing = wf.list_tasks()
    assert task_listing is not None
    assert 'HelloGBDX' in task_listing['tasks']
def test_init():
    """Constructing the client wires up s3 and the gbdx connection."""
    workflow_client = Workflow(gbdx)
    assert isinstance(workflow_client, Workflow)
    for attribute in ('s3', 'gbdx_connection'):
        assert getattr(workflow_client, attribute) is not None
class Workflow(object):
    """Client-side representation of a GBDX workflow.

    Wraps a list of task objects, generates the workflow JSON definition,
    and launches/monitors/cancels it through the low-level workflow API
    client stored in ``self.workflow``. Batch workflows (any task declaring
    ``batch_values``) are routed to the batch endpoints; several read-only
    properties raise NotImplementedError for batch workflows because their
    state lives on the individual sub-workflows.
    """

    def __init__(self, tasks, **kwargs):
        # Low-level workflow API client (WF is imported elsewhere in this module).
        self.workflow = WF()
        # Default name: first 8 characters of a random UUID.
        self.name = kwargs.get('name', str(uuid.uuid4())[:8])
        self.id = None
        self.callback = kwargs.get('callback', None)
        self.definition = None
        self.tasks = tasks

        batch_values = []

        for task in self.tasks:
            if task.batch_values:
                batch_values.extend(task.batch_values)

        # None (not an empty list) is the sentinel for "not a batch workflow"
        # throughout the rest of this class.
        if batch_values:
            self.batch_values = batch_values
        else:
            self.batch_values = None

    def savedata(self, output, location=None):
        '''
        Save output data from any task in this workflow to S3

        Args:
            output: Reference task output (e.g. task.inputs.output1).
            location (optional): Subfolder under which the output will be saved.
                It will be placed under the account directory in gbd-customer-data bucket:
                s3://gbd-customer-data/{account_id}/{location}
                Leave blank to save to: workflow_output/{workflow_id}/{task_name}/{port_name}

        Returns:
            None
        '''
        output.persist = True
        if location:
            output.persist_location = location

    def workflow_skeleton(self):
        # Minimal workflow definition; tasks are appended later by
        # generate_workflow_description().
        return {
            "tasks": [],
            "name": self.name
        }

    def list_workflow_outputs(self):
        '''
        Get a list of outputs from the workflow that are saved to S3.
        To get resolved locations call workflow status.

        Args:
            None

        Returns:
            list of 'task_name:port_name' strings for persisted output ports
        '''
        workflow_outputs = []
        for task in self.tasks:
            for output_port_name in task.outputs._portnames:
                if task.outputs.__getattribute__(output_port_name).persist:
                    workflow_outputs.append(task.name + ':' + output_port_name)
        return workflow_outputs

    def generate_workflow_description(self):
        '''
        Generate workflow json for launching the workflow against the gbdx api

        Args:
            None

        Returns:
            json string (the workflow definition dict, also stored on self.definition)

        Raises:
            WorkflowError: if the workflow contains no tasks.
        '''
        if not self.tasks:
            raise WorkflowError('Workflow contains no tasks, and cannot be executed.')
        self.definition = self.workflow_skeleton()
        if self.batch_values:
            self.definition["batch_values"] = self.batch_values
        # Every input port value across all tasks, used below to detect which
        # multiplex output ports are actually consumed.
        all_input_port_values = [t.inputs.__getattribute__(input_port_name).value
                                 for t in self.tasks
                                 for input_port_name in t.inputs._portnames]
        for task in self.tasks:
            # only include multiplex output ports in this task if other tasks refer to them in their inputs.
            # 1. find the multplex output port_names in this task
            # 2. see if they are referred to in any other tasks inputs
            # 3. If not, exclude them from the workflow_def
            output_multiplex_ports_to_exclude = []
            multiplex_output_port_names = [portname
                                           for portname in task.outputs._portnames
                                           if task.outputs.__getattribute__(portname).is_multiplex]
            for p in multiplex_output_port_names:
                output_port_reference = 'source:' + task.name + ':' + p
                if output_port_reference not in all_input_port_values:
                    output_multiplex_ports_to_exclude.append(p)
            task_def = task.generate_task_workflow_json(
                output_multiplex_ports_to_exclude=output_multiplex_ports_to_exclude)
            self.definition['tasks'].append(task_def)
        if self.callback:
            self.definition['callback'] = self.callback
        return self.definition

    def execute(self):
        '''
        Execute the workflow.

        Args:
            None

        Returns:
            Workflow_id
        '''
        # if not self.tasks:
        #     raise WorkflowError('Workflow contains no tasks, and cannot be executed.')
        # for task in self.tasks:
        #     self.definition['tasks'].append( task.generate_task_workflow_json() )
        self.generate_workflow_description()
        # hit batch workflow endpoint if batch values
        if self.batch_values:
            self.id = self.workflow.launch_batch_workflow(self.definition)
        # use regular workflow endpoint if no batch values
        else:
            self.id = self.workflow.launch(self.definition)
        return self.id

    @property
    def task_ids(self):
        '''
        Get the task IDs of a running workflow

        Args:
            None

        Returns:
            List of task IDs
        '''
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot get task IDs.')
        if self.batch_values:
            raise NotImplementedError("Query Each Workflow Id within the Batch Workflow for task IDs.")
        wf = self.workflow.get(self.id)
        return [task['id'] for task in wf['tasks']]

    @task_ids.setter
    def task_ids(self, value):
        raise NotImplementedError("Cannot set workflow task_ids, readonly.")

    def cancel(self):
        '''
        Cancel a running workflow.

        Args:
            None

        Returns:
            None
        '''
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot cancel.')
        if self.batch_values:
            self.workflow.batch_workflow_cancel(self.id)
        else:
            self.workflow.cancel(self.id)

    @property
    def status(self):
        # Raw status payload from the API; batch workflows query their own
        # endpoint and return a dict with a 'workflows' list.
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot check status.')
        if self.batch_values:
            status = self.workflow.batch_workflow_status(self.id)
        else:
            status = self.workflow.status(self.id)
        return status

    @status.setter
    def status(self, value):
        raise NotImplementedError("Cannot set workflow status, readonly.")

    @property
    def events(self):
        # Raw event list for a (non-batch) running workflow.
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot check status.')
        if self.batch_values:
            raise NotImplementedError("Query Each Workflow Id within the Batch Workflow for Events")
        return self.workflow.events(self.id)

    @events.setter
    def events(self, value):
        raise NotImplementedError("Cannot set workflow events, readonly.")

    @property
    def complete(self):
        # A never-launched workflow is not complete.
        if not self.id:
            return False
        # check if all sub workflows are either done, failed, or timedout
        if self.batch_values:
            return all(workflow.get("state") in ["succeeded", "failed", "timedout"]
                       for workflow in self.status['workflows'])
        else:
            return self.status['state'] == 'complete'

    @complete.setter
    def complete(self, value):
        raise NotImplementedError("Cannot set workflow complete, readonly.")

    @property
    def failed(self):
        if not self.id:
            return False
        if self.batch_values:
            raise NotImplementedError("Query Each Workflow Id within the Batch Workflow for Current State")
        status = self.status
        return status['state'] == 'complete' and status['event'] == 'failed'

    @failed.setter
    def failed(self, value):
        raise NotImplementedError("Cannot set workflow failed, readonly.")

    @property
    def canceled(self):
        if not self.id:
            return False
        if self.batch_values:
            raise NotImplementedError("Query Each Workflow Id within the Batch Workflow for Current State")
        status = self.status
        return status['state'] == 'complete' and status['event'] == 'canceled'

    @canceled.setter
    def canceled(self, value):
        raise NotImplementedError("Cannot set workflow canceled, readonly.")

    @property
    def succeeded(self):
        if not self.id:
            return False
        # check if all sub workflows are succeeded
        if self.batch_values:
            return all(workflow.get("state") == "succeeded"
                       for workflow in self.status['workflows'])
        status = self.status
        return status['state'] == 'complete' and status['event'] == 'succeeded'

    @succeeded.setter
    def succeeded(self, value):
        raise NotImplementedError("Cannot set workflow succeeded, readonly.")

    @property
    def running(self):
        if not self.id:
            return False
        if self.batch_values:
            # check if any sub workflows are running
            return any(workflow.get("state") not in ["succeeded", "failed", "timedout"]
                       for workflow in self.status['workflows'])
        status = self.status
        return status['state'] == 'running' and status['event'] == 'started'

    @running.setter
    def running(self, value):
        raise NotImplementedError("Cannot set workflow running, readonly.")

    @property
    def timedout(self):
        if not self.id:
            return False
        if self.batch_values:
            raise NotImplementedError("Query Each Workflow Id within the Batch Workflow for Current State")
        status = self.status
        return status['state'] == 'complete' and status['event'] == 'timedout'

    @timedout.setter
    def timedout(self, value):
        raise NotImplementedError("Cannot set workflow timedout, readonly.")

    @property
    def stdout(self):
        '''
        Get stdout from all the tasks of a workflow.

        Args:
            None

        Returns:
            List of tasks with their stdout, formatted like this:
                [
                    {
                        "id": "4488895771403082552",
                        "taskType": "AOP_Strip_Processor",
                        "name": "Task1",
                        "stdout": "............"
                    }
                ]
        '''
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot get stdout.')
        if self.batch_values:
            raise NotImplementedError("Query Each Workflow Id within the Batch Workflow for stdout.")
        wf = self.workflow.get(self.id)
        stdout_list = []
        for task in wf['tasks']:
            stdout_list.append(
                {
                    'id': task['id'],
                    'taskType': task['taskType'],
                    'name': task['name'],
                    'stdout': self.workflow.get_stdout(self.id, task['id'])
                }
            )
        return stdout_list

    @stdout.setter
    def stdout(self, value):
        raise NotImplementedError("Cannot set workflow stdout, readonly.")

    @property
    def stderr(self):
        '''
        Get stderr from all the tasks of a workflow.

        Args:
            None

        Returns:
            List of tasks with their stderr, formatted like this:
                [
                    {
                        "id": "4488895771403082552",
                        "taskType": "AOP_Strip_Processor",
                        "name": "Task1",
                        "stderr": "............"
                    }
                ]
        '''
        if not self.id:
            raise WorkflowError('Workflow is not running. Cannot get stderr.')
        if self.batch_values:
            raise NotImplementedError("Query Each Workflow Id within the Batch Workflow for stderr.")
        wf = self.workflow.get(self.id)
        stderr_list = []
        for task in wf['tasks']:
            stderr_list.append(
                {
                    'id': task['id'],
                    'taskType': task['taskType'],
                    'name': task['name'],
                    'stderr': self.workflow.get_stderr(self.id, task['id'])
                }
            )
        return stderr_list

    @stderr.setter
    def stderr(self, value):
        raise NotImplementedError("Cannot set workflow stderr, readonly.")
def test_list_tasks(self):
    """The task registry listing includes the HelloGBDX sample task."""
    wf = Workflow(self.gbdx)
    task_listing = wf.list_tasks()
    self.assertIsNotNone(task_listing)
    self.assertIn('HelloGBDX', task_listing['tasks'])