def add_task(self, func, params=None, parents=None, stage_name=None, uid=None, drm=None):
    """
    Adds a new Task to the Workflow.  If a Task with the same uid already exists in the target Stage and was
    successful, that Task is returned instead of creating a new one.

    :param callable func: A function which returns a string which will get converted to a shell script to be
        executed.  `func` will not get called until all of its dependencies have completed.
    :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Every
        value is passed through `recursive_resolve_dependency`; the resolved value is stored and any Tasks it
        yields are added to this Task's parents.  The caller's dict is not modified.
    :param parents: A single Task, or any iterable of Tasks (list, tuple, generator, ...), that this Task depends
        on.  The caller's container is not modified.
    :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__` with each
        underscore-separated word capitalised (e.g. ``my_func`` -> ``My_Func``).
    :param str uid: A unique identifier for this Task within its Stage, primarily used for skipping previously
        successful Tasks.  Required.
    :param str drm: The drm to use for this Task (example 'local' or 'ge').  Falls back to
        `self.cosmos_app.default_drm` when falsy.
    :raises AssertionError: if `uid` is missing or not a string, or if an `in_`/`out_` parameter of `func` has
        neither a value in `params` nor a default in the signature.
    :raises ValueError: if an unsuccessful Task with the same uid is already in the Stage.
    :return: The newly created Task, or the existing successful Task with the same uid.
    """
    # Imported locally rather than at module level (presumably to avoid import cycles).
    from cosmos.models.Stage import Stage
    from cosmos import recursive_resolve_dependency

    # parents: always work on a private list so the caller's container is never mutated and any iterable
    # (tuple, set, generator) is accepted.
    if parents is None:
        parents = []
    elif isinstance(parents, Task):
        parents = [parents]
    else:
        parents = list(parents)

    # params: resolve on a shallow copy so Dependency objects in the caller's dict survive for re-use.
    params = {} if params is None else dict(params)
    for name, value in list(params.items()):
        # decompose `Dependency` objects to values and parents
        resolved, parent_tasks = recursive_resolve_dependency(value)
        params[name] = resolved
        parents.extend(parent_tasks - set(parents))

    # uid
    # TODO: assert params are all JSONable
    if uid is None:
        raise AssertionError('uid parameter must be specified')
    if not isinstance(uid, str):
        # Raised explicitly (instead of `assert`) so the check is not stripped under `python -O`.
        raise AssertionError('uid must be a string')

    # stage_name
    if stage_name is None:
        stage_name = str(func.__name__).replace('_', ' ').title().replace(' ', '_')

    # Get the right Stage, creating it on first use
    stage = only_one((s for s in self.stages if s.name == stage_name), None)
    if stage is None:
        stage = Stage(workflow=self, name=stage_name)
        self.session.add(stage)

    # Check if task is already in stage
    task = stage.get_task(uid, None)
    if task is not None:
        if task.successful:
            return task
        # An unsuccessful Task with this uid is treated as a duplicate added during this run; the original
        # author notes that unsuccessful tasks were already removed on workflow load.
        # TODO check for duplicate params here?  would be a lot faster at Workflow.run
        raise ValueError(
            'Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  '
            'Task uids must be unique within the same Stage.' % (stage_name, uid))

    # Create Task
    sig = funcsigs.signature(func)

    def params_or_signature_default_or(name, default):
        """Look `name` up in `params`, then in the defaults of `func`'s signature, else return `default`."""
        if name in params:
            return params[name]
        if name in sig.parameters:
            param_default = sig.parameters[name].default
            if param_default is not funcsigs._empty:
                return param_default
        return default

    # Signature parameters prefixed `in_` / `out_` are collected into the Task's input / output maps.
    input_map = dict()
    output_map = dict()
    for keyword, param in sig.parameters.items():
        if keyword.startswith('in_'):
            io_map = input_map
        elif keyword.startswith('out_'):
            io_map = output_map
        else:
            continue
        value = params.get(keyword, param.default)
        # Identity check against the sentinel: `!=` would invoke the value's own comparison operator.
        if value is funcsigs._empty:
            raise AssertionError('parameter %s for %s is required' % (param, func))
        io_map[keyword] = value

    task = Task(
        stage=stage,
        params=params,
        parents=parents,
        input_map=input_map,
        output_map=output_map,
        uid=uid,
        drm=drm or self.cosmos_app.default_drm,
        core_req=params_or_signature_default_or('core_req', 1),
        must_succeed=params_or_signature_default_or('must_succeed', True),
        mem_req=params_or_signature_default_or('mem_req', None),
        time_req=params_or_signature_default_or('time_req', None))

    task.cmd_fxn = func

    # Add Stage Dependencies
    for p in parents:
        if p.stage not in stage.parents:
            stage.parents.append(p.stage)

    self.dont_garbage_collect.append(task)

    return task
def add_task(
    self,
    func,
    params=None,
    parents=None,
    stage_name=None,
    uid=None,
    drm=None,
    queue=None,
    must_succeed=True,
    time_req=None,
    core_req=None,
    mem_req=None,
    gpu_req=None,
    max_attempts=None,
    noop=False,
    job_class=None,
    drm_options=None,
    environment_variables=None,
    if_duplicate="raise",
):
    """
    Adds a new Task to the Workflow.  If a Task with the same uid already exists in the target Stage and was
    successful, that Task is returned (after re-linking any missing parents) instead of creating a new one.

    :param callable func: A function which returns a string which will get converted to a shell script to be
        executed.  `func` will not get called until all of its dependencies have completed.
    :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Every
        value is passed through `recursive_resolve_dependency`; the resolved value is stored and any Tasks it
        yields are added to this Task's parents.  The caller's dict is not modified.
    :param parents: A single Task or an iterable of Tasks that this Task depends on.
    :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__`.
    :param str uid: A unique identifier for this Task within its Stage, primarily used for skipping previously
        successful Tasks.  Required.
    :param str drm: The drm to use for this Task (example 'local', 'ge' or 'drmaa:lsf').  Defaults to
        `self.cosmos_app.default_drm`.
    :param queue: The name of a queue to submit to; defaults to `self.cosmos_app.default_queue`.
    :param bool must_succeed: Default True.  If False, the Workflow will not fail if this Task does not succeed.
        Dependent Jobs will not be executed.
    :param time_req: The time requirement; sets the Task.time_req attribute.  Defaults to
        `self.cosmos_app.default_time_req`.
    :param int core_req: Number of cpus required for this Task.  Can also be set in the `params` dict or the
        default value of the Task function signature, but this value takes precedence.  Falls back to 1.
    :param int mem_req: Number of MB of RAM required for this Task.  Can also be set in the `params` dict or the
        default value of the Task function signature, but this value takes precedence.
    :param int gpu_req: Number of gpus required for this Task.  Can also be set in the `params` dict or the
        default value of the Task function signature.  Falls back to 0.
    :param int max_attempts: The maximum number of times to retry a failed job.  Defaults to
        `self.cosmos_app.default_max_attempts`.
    :param bool noop: Task is a No-op and will always be marked as successful.
    :param job_class: The name of a job_class to submit to; defaults to `self.cosmos_app.default_job_class`.
    :param dict drm_options: Options for Distributed Resource Management (cluster).  Keys missing here are
        filled in from `self.cosmos_app.default_drm_options`.  The caller's dict is not modified.
    :param dict environment_variables: Environment variables to pass to the DRM (if supported).  Defaults to
        `self.cosmos_app.default_environment_variables`.
    :param str if_duplicate: What to do when an unsuccessful Task with the same uid is already in the Stage.
        If "raise", raise :class:`DuplicateUid`.  If "return", return that Task (its params must be equal).
    :raises AssertionError: if `uid` is missing or not a string.
    :raises DuplicateUid: on a duplicate uid when `if_duplicate` is "raise".
    :raises InvalidParams: on a duplicate uid with different params when `if_duplicate` is "return".
    :raises ValueError: on a duplicate uid when `if_duplicate` is neither "raise" nor "return".
    :rtype: cosmos.api.Task
    """
    # Avoid cyclical import dependencies
    from cosmos.job.drm.DRM_Base import DRM
    from cosmos.models.Stage import Stage
    from cosmos import recursive_resolve_dependency

    # parents: normalise to a private list
    if parents is None:
        parents = []
    elif isinstance(parents, Task):
        parents = [parents]
    else:
        parents = list(parents)

    # params: resolve on a shallow copy so Dependency objects in the caller's dict survive for re-use.
    params = {} if params is None else dict(params)
    for name, value in list(params.items()):
        # decompose `Dependency` objects to values and parents
        resolved, parent_tasks = recursive_resolve_dependency(value)
        params[name] = resolved
        parents.extend(parent_tasks - set(parents))

    # uid
    # TODO: assert params are all JSONable
    if uid is None:
        raise AssertionError("uid parameter must be specified")
    if not isinstance(uid, str):
        # Raised explicitly (instead of `assert`) so the check is not stripped under `python -O`.
        raise AssertionError("uid must be a string")

    if stage_name is None:
        stage_name = str(func.__name__)

    # Get the right Stage, creating it on first use
    stage = only_one((s for s in self.stages if s.name == stage_name), None)
    if stage is None:
        stage = Stage(workflow=self, name=stage_name, status=StageStatus.no_attempt)
        self.session.add(stage)

    # Check if task is already in stage
    task = stage.get_task(uid, None)
    if task is not None:
        if task.successful:
            # If the user manually edited the dag and this is a resume, parents might need to be re-added
            task.parents.extend(set(parents).difference(set(task.parents)))
            for p in parents:
                if p.stage not in stage.parents:
                    stage.parents.append(p.stage)
            return task

        # An unsuccessful Task with this uid is treated as a duplicate added during this run; the original
        # author notes that unsuccessful tasks were already removed on workflow load.
        if if_duplicate == "raise":
            raise DuplicateUid(
                "Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  "
                "Task uids must be unique within the same Stage." % (stage_name, uid))
        if if_duplicate == "return":
            if task.params != params:
                raise InvalidParams("Tried to add a task with the same uid, but different parameters.")
            return task
        raise ValueError(f"{if_duplicate} is not valid")

    # Create Task
    sig = funcsigs.signature(func)

    def params_or_signature_default_or(name, default):
        """Look `name` up in `params`, then in the defaults of `func`'s signature, else return `default`."""
        if name in params:
            return params[name]
        if name in sig.parameters:
            param_default = sig.parameters[name].default
            if param_default is not funcsigs._empty:
                return param_default
        return default

    app = self.cosmos_app
    task = Task(
        stage=stage,
        params=params,
        parents=parents,
        uid=uid,
        drm=drm if drm is not None else app.default_drm,
        job_class=job_class if job_class is not None else app.default_job_class,
        queue=queue if queue is not None else app.default_queue,
        must_succeed=must_succeed,
        core_req=core_req if core_req is not None else params_or_signature_default_or("core_req", 1),
        mem_req=mem_req if mem_req is not None else params_or_signature_default_or("mem_req", None),
        time_req=time_req if time_req is not None else app.default_time_req,
        successful=False,
        max_attempts=max_attempts if max_attempts is not None else app.default_max_attempts,
        attempt=1,
        NOOP=noop,
        gpu_req=gpu_req if gpu_req is not None else params_or_signature_default_or("gpu_req", 0),
        environment_variables=environment_variables
        if environment_variables is not None
        else app.default_environment_variables,
    )

    task.cmd_fxn = func

    # Merge into a copy so the caller's `drm_options` dict is left untouched; explicitly passed keys win and
    # the application defaults fill in any keys not set.
    merged_drm_options = dict(drm_options) if drm_options is not None else {}
    if app.default_drm_options is not None:
        for key, val in app.default_drm_options.items():
            merged_drm_options.setdefault(key, val)
    task.drm_options = merged_drm_options

    DRM.validate_drm_options(task.drm, task.drm_options)

    # Add Stage Dependencies
    for p in parents:
        if p.stage not in stage.parents:
            stage.parents.append(p.stage)

    self._dont_garbage_collect.append(task)

    return task
def add_task(self, func, params=None, parents=None, stage_name=None, uid=None, drm=None, queue=None,
             must_succeed=True, time_req=None, core_req=None, mem_req=None, max_attempts=None, noop=False,
             job_class=None, drm_options=None):
    """
    Adds a new Task to the Workflow.  If a Task with the same uid already exists in the target Stage and was
    successful, that Task is returned (after re-linking any missing parents) instead of creating a new one.

    :param callable func: A function which returns a string which will get converted to a shell script to be
        executed.  `func` will not get called until all of its dependencies have completed.
    :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Every
        value is passed through `recursive_resolve_dependency`; the resolved value is stored and any Tasks it
        yields are added to this Task's parents.  The caller's dict is not modified.
    :param parents: A single Task or an iterable of Tasks that this Task depends on.
    :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__`.
    :param str uid: A unique identifier for this Task within its Stage, primarily used for skipping previously
        successful Tasks.  Required.
    :param str drm: The drm to use for this Task (example 'local', 'ge' or 'drmaa:lsf').  Defaults to
        `self.cosmos_app.default_drm`.
    :param queue: The name of a queue to submit to; defaults to `self.cosmos_app.default_queue`.
    :param bool must_succeed: Default True.  If False, the Workflow will not fail if this Task does not succeed.
        Dependent Jobs will not be executed.
    :param time_req: The time requirement; sets the Task.time_req attribute.  Defaults to
        `self.cosmos_app.default_time_req`.
    :param int core_req: Number of cpus required for this Task.  Can also be set in the `params` dict or the
        default value of the Task function signature, but this value takes precedence.  Falls back to 1.
    :param int mem_req: Number of MB of RAM required for this Task.  Can also be set in the `params` dict or the
        default value of the Task function signature, but this value takes precedence.
    :param int max_attempts: The maximum number of times to retry a failed job.  Defaults to
        `self.cosmos_app.default_max_attempts`.
    :param bool noop: Passed to the Task as its `NOOP` flag.
    :param job_class: The name of a job_class to submit to; defaults to `self.cosmos_app.default_job_class`.
    :param drm_options: Stored on the Task as `drm_options` and checked with `DRM.validate_drm_options`;
        defaults to `self.cosmos_app.default_drm_options`.
    :raises AssertionError: if `uid` is missing or not a string, or if an `in_`/`out_` parameter of `func` has
        neither a value in `params` nor a default in the signature.
    :raises ValueError: if an unsuccessful Task with the same uid is already in the Stage.
    :rtype: cosmos.api.Task
    """
    # Avoid cyclical import dependencies
    from cosmos.job.drm.DRM_Base import DRM
    from cosmos.models.Stage import Stage
    from cosmos import recursive_resolve_dependency

    # parents: normalise to a private list
    if parents is None:
        parents = []
    elif isinstance(parents, Task):
        parents = [parents]
    else:
        parents = list(parents)

    # params: resolve on a shallow copy so Dependency objects in the caller's dict survive for re-use.
    params = {} if params is None else dict(params)
    for name, value in list(params.items()):
        # decompose `Dependency` objects to values and parents
        resolved, parent_tasks = recursive_resolve_dependency(value)
        params[name] = resolved
        parents.extend(parent_tasks - set(parents))

    # uid
    # TODO: assert params are all JSONable
    if uid is None:
        raise AssertionError('uid parameter must be specified')
    if not isinstance(uid, str):
        # Raised explicitly (instead of `assert`) so the check is not stripped under `python -O`.
        raise AssertionError('uid must be a string')

    if stage_name is None:
        stage_name = str(func.__name__)

    # Get the right Stage, creating it on first use
    stage = only_one((s for s in self.stages if s.name == stage_name), None)
    if stage is None:
        stage = Stage(workflow=self, name=stage_name, status=StageStatus.no_attempt)
        self.session.add(stage)

    # Check if task is already in stage
    task = stage.get_task(uid, None)
    if task is not None:
        if task.successful:
            # If the user manually edited the dag and this is a resume, parents might need to be re-added
            task.parents.extend(set(parents).difference(set(task.parents)))
            for p in parents:
                if p.stage not in stage.parents:
                    stage.parents.append(p.stage)
            return task
        # An unsuccessful Task with this uid is treated as a duplicate added during this run; the original
        # author notes that unsuccessful tasks were already removed on workflow load.
        # TODO check for duplicate params here?  would be a lot faster at Workflow.run
        raise ValueError(
            'Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  '
            'Task uids must be unique within the same Stage.' % (stage_name, uid))

    # Create Task
    sig = funcsigs.signature(func)

    def params_or_signature_default_or(name, default):
        """Look `name` up in `params`, then in the defaults of `func`'s signature, else return `default`."""
        if name in params:
            return params[name]
        if name in sig.parameters:
            param_default = sig.parameters[name].default
            if param_default is not funcsigs._empty:
                return param_default
        return default

    # Signature parameters prefixed `in_` / `out_` are collected into the Task's input / output maps.
    input_map = dict()
    output_map = dict()
    for keyword, param in sig.parameters.items():
        if keyword.startswith('in_'):
            io_map = input_map
        elif keyword.startswith('out_'):
            io_map = output_map
        else:
            continue
        value = params.get(keyword, param.default)
        # Identity check against the sentinel: `!=` would invoke the value's own comparison operator.
        if value is funcsigs._empty:
            raise AssertionError('parameter %s for %s is required' % (param, func))
        io_map[keyword] = value

    app = self.cosmos_app
    task = Task(
        stage=stage,
        params=params,
        parents=parents,
        input_map=input_map,
        output_map=output_map,
        uid=uid,
        drm=drm if drm is not None else app.default_drm,
        job_class=job_class if job_class is not None else app.default_job_class,
        queue=queue if queue is not None else app.default_queue,
        must_succeed=must_succeed,
        core_req=core_req if core_req is not None else params_or_signature_default_or('core_req', 1),
        mem_req=mem_req if mem_req is not None else params_or_signature_default_or('mem_req', None),
        time_req=time_req if time_req is not None else app.default_time_req,
        successful=False,
        max_attempts=max_attempts if max_attempts is not None else app.default_max_attempts,
        attempt=1,
        NOOP=noop)

    task.cmd_fxn = func

    task.drm_options = drm_options if drm_options is not None else app.default_drm_options
    DRM.validate_drm_options(task.drm, task.drm_options)

    # Add Stage Dependencies
    for p in parents:
        if p.stage not in stage.parents:
            stage.parents.append(p.stage)

    self.dont_garbage_collect.append(task)

    return task
def add_task(self, func, params=None, parents=None, stage_name=None, uid=None, drm=None, queue=None,
             must_succeed=True, time_req=None, core_req=None, mem_req=None, max_attempts=None, noop=False,
             job_class=None, drm_options=None):
    """
    Adds a new Task to the Workflow.  If a Task with the same uid already exists in the target Stage and was
    successful, that Task is returned (after re-linking any missing parents) instead of creating a new one.

    :param callable func: A function which returns a string which will get converted to a shell script to be
        executed.  `func` will not get called until all of its dependencies have completed.
    :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Every
        value is passed through `recursive_resolve_dependency`; the resolved value is stored and any Tasks it
        yields are added to this Task's parents.  The caller's dict is not modified.
    :param parents: A single Task or an iterable of Tasks that this Task depends on.
    :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__`.
    :param str uid: A unique identifier for this Task within its Stage, primarily used for skipping previously
        successful Tasks.  Required.
    :param str drm: The drm to use for this Task (example 'local', 'ge' or 'drmaa:lsf').  Defaults to
        `self.cosmos_app.default_drm`.
    :param queue: The name of a queue to submit to; defaults to `self.cosmos_app.default_queue`.
    :param bool must_succeed: Default True.  If False, the Workflow will not fail if this Task does not succeed.
        Dependent Jobs will not be executed.
    :param time_req: The time requirement; sets the Task.time_req attribute.  Defaults to
        `self.cosmos_app.default_time_req`.
    :param int core_req: Number of cpus required for this Task.  Can also be set in the `params` dict or the
        default value of the Task function signature, but this value takes precedence.  Falls back to 1.
    :param int mem_req: Number of MB of RAM required for this Task.  Can also be set in the `params` dict or the
        default value of the Task function signature, but this value takes precedence.
    :param int max_attempts: The maximum number of times to retry a failed job.  Defaults to
        `self.cosmos_app.default_max_attempts`.
    :param bool noop: Passed to the Task as its `NOOP` flag.
    :param job_class: The name of a job_class to submit to; defaults to `self.cosmos_app.default_job_class`.
    :param drm_options: Stored on the Task as `drm_options` and checked with `DRM.validate_drm_options`;
        defaults to `self.cosmos_app.default_drm_options`.
    :raises AssertionError: if `uid` is missing or not a string.
    :raises ValueError: if an unsuccessful Task with the same uid is already in the Stage.
    :rtype: cosmos.api.Task
    """
    # Avoid cyclical import dependencies
    from cosmos.job.drm.DRM_Base import DRM
    from cosmos.models.Stage import Stage
    from cosmos import recursive_resolve_dependency

    # parents: normalise to a private list
    if parents is None:
        parents = []
    elif isinstance(parents, Task):
        parents = [parents]
    else:
        parents = list(parents)

    # params: resolve on a shallow copy so Dependency objects in the caller's dict survive for re-use.
    params = {} if params is None else dict(params)
    for name, value in list(params.items()):
        # decompose `Dependency` objects to values and parents
        resolved, parent_tasks = recursive_resolve_dependency(value)
        params[name] = resolved
        parents.extend(parent_tasks - set(parents))

    # uid
    # TODO: assert params are all JSONable
    if uid is None:
        raise AssertionError('uid parameter must be specified')
    if not isinstance(uid, str):
        # Raised explicitly (instead of `assert`) so the check is not stripped under `python -O`.
        raise AssertionError('uid must be a string')

    if stage_name is None:
        stage_name = str(func.__name__)

    # Get the right Stage, creating it on first use
    stage = only_one((s for s in self.stages if s.name == stage_name), None)
    if stage is None:
        stage = Stage(workflow=self, name=stage_name, status=StageStatus.no_attempt)
        self.session.add(stage)

    # Check if task is already in stage
    task = stage.get_task(uid, None)
    if task is not None:
        if task.successful:
            # If the user manually edited the dag and this is a resume, parents might need to be re-added
            task.parents.extend(set(parents).difference(set(task.parents)))
            for p in parents:
                if p.stage not in stage.parents:
                    stage.parents.append(p.stage)
            return task
        # An unsuccessful Task with this uid is treated as a duplicate added during this run; the original
        # author notes that unsuccessful tasks were already removed on workflow load.
        # TODO check for duplicate params here?  would be a lot faster at Workflow.run
        raise ValueError(
            'Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  '
            'Task uids must be unique within the same Stage.' % (stage_name, uid))

    # Create Task
    sig = funcsigs.signature(func)

    def params_or_signature_default_or(name, default):
        """Look `name` up in `params`, then in the defaults of `func`'s signature, else return `default`."""
        if name in params:
            return params[name]
        if name in sig.parameters:
            param_default = sig.parameters[name].default
            if param_default is not funcsigs._empty:
                return param_default
        return default

    app = self.cosmos_app
    task = Task(
        stage=stage,
        params=params,
        parents=parents,
        uid=uid,
        drm=drm if drm is not None else app.default_drm,
        job_class=job_class if job_class is not None else app.default_job_class,
        queue=queue if queue is not None else app.default_queue,
        must_succeed=must_succeed,
        core_req=core_req if core_req is not None else params_or_signature_default_or('core_req', 1),
        mem_req=mem_req if mem_req is not None else params_or_signature_default_or('mem_req', None),
        time_req=time_req if time_req is not None else app.default_time_req,
        successful=False,
        max_attempts=max_attempts if max_attempts is not None else app.default_max_attempts,
        attempt=1,
        NOOP=noop)

    task.cmd_fxn = func

    task.drm_options = drm_options if drm_options is not None else app.default_drm_options
    DRM.validate_drm_options(task.drm, task.drm_options)

    # Add Stage Dependencies
    for p in parents:
        if p.stage not in stage.parents:
            stage.parents.append(p.stage)

    self.dont_garbage_collect.append(task)

    return task