Beispiel #1
0
    def add_task(self,
                 func,
                 params=None,
                 parents=None,
                 stage_name=None,
                 uid=None,
                 drm=None):
        """
        Adds a new Task to the Workflow.  If the Task already exists (and was successful), return the successful Task stored in the database

        :param func func: A function which returns a string which will get converted to a shell script to be executed.  Func will not get called until
          all of its dependencies have completed.
        :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Any Dependency objects will get resolved into
            a string, and the Dependency.task will be added to this Task's parents.
        :param list[Tasks] parents: A list of dependent Tasks.
        :param str uid: A unique identifier for this Task, primarily used for skipping  previously successful Tasks.
            If a Task with this stage_name and uid already exists in the database (and was successful), the
            database version will be returned and a new one will not be created.
        :param str stage_name: The name of the Stage to add this Task to.  Defaults to a title()ed __name__ of `func`.
        :param str drm: The drm to use for this Task (example 'local' or 'ge')
        :return:
        """
        from cosmos.models.Stage import Stage
        from cosmos import recursive_resolve_dependency
        from cosmos.api import Dependency

        # parents
        if isinstance(parents, types.GeneratorType):
            parents = list(parents)
        if parents is None:
            parents = []
        if isinstance(parents, Task):
            parents = [parents]

        # params
        if params is None:
            params = dict()
        for k, v in params.iteritems():
            # decompose `Dependency` objects to values and parents
            new_val, parent_tasks = recursive_resolve_dependency(v)

            params[k] = new_val
            parents.extend(parent_tasks - set(parents))

        # uid
        if uid is None:
            raise AssertionError, 'uid parameter must be specified'
            # Fix me assert params are all JSONable
            # uid = str(params)
        else:
            assert isinstance(uid, basestring), 'uid must be a string'

        # stage_name
        if stage_name is None:
            stage_name = str(func.__name__).replace('_', ' ').title().replace(
                ' ', '_')

        # Get the right Stage
        stage = only_one((s for s in self.stages if s.name == stage_name),
                         None)
        if stage is None:
            stage = Stage(workflow=self, name=stage_name)
            self.session.add(stage)

        # Check if task is already in stage
        task = stage.get_task(uid, None)

        if task is not None:
            # if task is already in stage, but unsuccessful, raise an error (duplicate params) since unsuccessful tasks
            # were already removed on workflow load
            if task.successful:
                return task
            else:
                # TODO check for duplicate params here?  would be a lot faster at Workflow.run
                raise ValueError(
                    'Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  '
                    'Task uids must be unique within the same Stage.' %
                    (stage_name, uid))
        else:
            # Create Task

            # input_map, output_map = io.get_io_map(task_func, task_params, parents, stage.name, out_dir, self.output_dir)
            # input_files = io.unpack_io_map(input_map)
            # output_files = io.unpack_io_map(output_map)
            sig = funcsigs.signature(func)

            # Check required parameters are specified
            # for keyword, parameter in sig.parameters.iteritems():
            # if parameter.default is funcsigs._empty and keyword not in params:
            #         raise AssertionError, 'Parameter %s is required for %s' % (keyword, func)

            def params_or_signature_default_or(name, default):
                if name in params:
                    return params[name]
                if name in sig.parameters:
                    param_default = sig.parameters[name].default
                    if param_default is funcsigs._empty:
                        return default
                    else:
                        return param_default
                return default

            input_map = dict()
            output_map = dict()

            for keyword, param in sig.parameters.iteritems():
                if keyword.startswith('in_'):
                    v = params.get(keyword, param.default)
                    assert v != funcsigs._empty, 'parameter %s for %s is required' % (
                        param, func)
                    input_map[keyword] = v
                elif keyword.startswith('out_'):
                    v = params.get(keyword, param.default)
                    assert v != funcsigs._empty, 'parameter %s for %s is required' % (
                        param, func)
                    output_map[keyword] = v

            task = Task(
                stage=stage,
                params=params,
                parents=parents,
                input_map=input_map,
                output_map=output_map,
                uid=uid,
                drm=drm or self.cosmos_app.default_drm,
                core_req=params_or_signature_default_or('core_req', 1),
                must_succeed=params_or_signature_default_or(
                    'must_succeed', True),
                mem_req=params_or_signature_default_or('mem_req', None),
                time_req=params_or_signature_default_or('time_req', None))

            task.cmd_fxn = func
            # task.input_map = input_map
            # task.output_map = output_map
            # task.call_kwargs = call_kwargs

        # Add Stage Dependencies
        for p in parents:
            if p.stage not in stage.parents:
                stage.parents.append(p.stage)

        self.dont_garbage_collect.append(task)

        return task
Beispiel #2
0
    def add_task(
        self,
        func,
        params=None,
        parents=None,
        stage_name=None,
        uid=None,
        drm=None,
        queue=None,
        must_succeed=True,
        time_req=None,
        core_req=None,
        mem_req=None,
        gpu_req=None,
        max_attempts=None,
        noop=False,
        job_class=None,
        drm_options=None,
        environment_variables=None,
        if_duplicate="raise",
    ):
        """
        Adds a new Task to the Workflow.  If the Task already exists (and was successful), return the successful Task stored in the database

        :param callable func: A function which returns a string which will get converted to a shell script to be executed.  `func` will not get called until
          all of its dependencies have completed.
        :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Any Dependency objects will get resolved into
            a string, and the Dependency.task will be added to this Task's parents.
        :param list[Tasks] parents: A list of dependent Tasks.
        :param str uid: A unique identifier for this Task, primarily used for skipping  previously successful Tasks.
            If a Task with this stage_name and uid already exists in the database (and was successful), the
            database version will be returned and a new one will not be created.
        :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__`.
        :param str drm: The drm to use for this Task (example 'local', 'ge' or 'drmaa:lsf').  Defaults to the `default_drm` parameter of :meth:`Cosmos.start`
        :param job_class: The name of a job_class to submit to; defaults to the `default_job_class` parameter of :meth:`Cosmos.start`
        :param queue: The name of a queue to submit to; defaults to the `default_queue` parameter of :meth:`Cosmos.start`
        :param bool must_succeed: Default True.  If False, the Workflow will not fail if this Task does not succeed.  Dependent Jobs will not be executed.
        :param bool time_req: The time requirement; will set the Task.time_req attribute which is intended to be used by :func:`get_submit_args` to request resources.
        :param int core_req: Number of cpus required for this Task.  Can also be set in the `params` dict or the default value of the Task function signature, but this value takes precedence.
            Warning!  In future versions, this will be the only way to set it.
        :param int mem_req: Number of MB of RAM required for this Task.   Can also be set in the `params` dict or the default value of the Task function signature, but this value takes predence.
            Warning!  In future versions, this will be the only way to set it.
        :param int gpu_req: Number of gpus required for this Task.
        :param int max_attempts: The maximum number of times to retry a failed job.  Defaults to the `default_max_attempts` parameter of :meth:`Cosmos.start`
        :param bool noop: Task is a No-op and will always be marked as successful.
        :param dict drm_options: Options for Distributed Resource Management (cluster).
        :param dict environment_variables: Environment variables to pass to the DRM (if supported).
        :param str if_duplicate: If "raise", raises an error if a Task with the same UID has already been added to this
          Workflow.  If "return", return that Task, allowing for an easy way to avoid duplicate work.
        :rtype: cosmos.api.Task
        """
        # Avoid cyclical import dependencies
        from cosmos.job.drm.DRM_Base import DRM
        from cosmos.models.Stage import Stage
        from cosmos import recursive_resolve_dependency

        # parents
        if parents is None:
            parents = []
        elif isinstance(parents, Task):
            parents = [parents]
        else:
            parents = list(parents)

        # params
        if params is None:
            params = dict()
        for k, v in list(params.items()):
            # decompose `Dependency` objects to values and parents
            new_val, parent_tasks = recursive_resolve_dependency(v)

            params[k] = new_val
            parents.extend(parent_tasks - set(parents))

        # uid
        if uid is None:
            raise AssertionError("uid parameter must be specified")
            # Fix me assert params are all JSONable
            # uid = str(params)
        else:
            assert isinstance(uid, str), "uid must be a string"

        if stage_name is None:
            stage_name = str(func.__name__)

        # Get the right Stage
        stage = only_one((s for s in self.stages if s.name == stage_name),
                         None)
        if stage is None:
            stage = Stage(workflow=self,
                          name=stage_name,
                          status=StageStatus.no_attempt)
            self.session.add(stage)

        # Check if task is already in stage
        task = stage.get_task(uid, None)

        if task is not None:
            # if task is already in stage, but unsuccessful, raise an error (duplicate params) since unsuccessful tasks
            # were already removed on workflow load
            if task.successful:
                # If the user manually edited the dag and this a resume, parents might need to be-readded
                task.parents.extend(set(parents).difference(set(task.parents)))

                for p in parents:
                    if p.stage not in stage.parents:
                        stage.parents.append(p.stage)

                return task
            else:
                if if_duplicate == "raise":
                    raise DuplicateUid(
                        "Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  "
                        "Task uids must be unique within the same Stage." %
                        (stage_name, uid))
                elif if_duplicate == "return":
                    if task.params != params:
                        raise InvalidParams(
                            f"Tried to add a task with the same uid, but different parameters."
                        )
                    return task
                else:
                    raise ValueError(f"{if_duplicate} is not valid")
        else:
            # Create Task
            sig = funcsigs.signature(func)

            def params_or_signature_default_or(name, default):
                if name in params:
                    return params[name]
                if name in sig.parameters:
                    param_default = sig.parameters[name].default
                    if param_default is funcsigs._empty:
                        return default
                    else:
                        return param_default
                return default

            task = Task(
                stage=stage,
                params=params,
                parents=parents,
                uid=uid,
                drm=drm if drm is not None else self.cosmos_app.default_drm,
                job_class=job_class if job_class is not None else
                self.cosmos_app.default_job_class,
                queue=queue
                if queue is not None else self.cosmos_app.default_queue,
                must_succeed=must_succeed,
                core_req=core_req if core_req is not None else
                params_or_signature_default_or("core_req", 1),
                mem_req=mem_req if mem_req is not None else
                params_or_signature_default_or("mem_req", None),
                time_req=time_req
                if time_req is not None else self.cosmos_app.default_time_req,
                successful=False,
                max_attempts=max_attempts if max_attempts is not None else
                self.cosmos_app.default_max_attempts,
                attempt=1,
                NOOP=noop,
                gpu_req=gpu_req if gpu_req is not None else
                params_or_signature_default_or("gpu_req", 0),
                environment_variables=environment_variables
                if environment_variables is not None else
                self.cosmos_app.default_environment_variables,
            )

            task.cmd_fxn = func

            if drm_options is None:
                task.drm_options = {}
            else:
                task.drm_options = drm_options
            # use default for any keys not set
            if self.cosmos_app.default_drm_options is not None:
                for key, val in list(
                        self.cosmos_app.default_drm_options.items()):
                    if key not in task.drm_options:
                        task.drm_options[key] = val

            DRM.validate_drm_options(task.drm, task.drm_options)

        # Add Stage Dependencies
        for p in parents:
            if p.stage not in stage.parents:
                stage.parents.append(p.stage)

        self._dont_garbage_collect.append(task)

        return task
Beispiel #3
0
    def add_task(self,
                 func,
                 params=None,
                 parents=None,
                 stage_name=None,
                 uid=None,
                 drm=None,
                 queue=None,
                 must_succeed=True,
                 time_req=None,
                 core_req=None,
                 mem_req=None,
                 max_attempts=None,
                 noop=False,
                 job_class=None,
                 drm_options=None):
        """
        Adds a new Task to the Workflow.  If the Task already exists (and was successful), return the successful Task stored in the database

        :param callable func: A function which returns a string which will get converted to a shell script to be executed.  `func` will not get called until
          all of its dependencies have completed.
        :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Any Dependency objects will get resolved into
            a string, and the Dependency.task will be added to this Task's parents.
        :param list[Tasks] parents: A list of dependent Tasks.
        :param str uid: A unique identifier for this Task, primarily used for skipping  previously successful Tasks.
            If a Task with this stage_name and uid already exists in the database (and was successful), the
            database version will be returned and a new one will not be created.
        :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__`.
        :param str drm: The drm to use for this Task (example 'local', 'ge' or 'drmaa:lsf').  Defaults to the `default_drm` parameter of :meth:`Cosmos.start`
        :param job_class: The name of a job_class to submit to; defaults to the `default_job_class` parameter of :meth:`Cosmos.start`
        :param queue: The name of a queue to submit to; defaults to the `default_queue` parameter of :meth:`Cosmos.start`
        :param bool must_succeed: Default True.  If False, the Workflow will not fail if this Task does not succeed.  Dependent Jobs will not be executed.
        :param bool time_req: The time requirement; will set the Task.time_req attribute which is intended to be used by :func:`get_submit_args` to request resources.
        :param int cpu_req: Number of cpus required for this Task.  Can also be set in the `params` dict or the default value of the Task function signature, but this value takes precedence.
            Warning!  In future versions, this will be the only way to set it.
        :param int mem_req: Number of MB of RAM required for this Task.   Can also be set in the `params` dict or the default value of the Task function signature, but this value takes predence.
            Warning!  In future versions, this will be the only way to set it.
        :param int max_attempts: The maximum number of times to retry a failed job.  Defaults to the `default_max_attempts` parameter of :meth:`Cosmos.start`
        :rtype: cosmos.api.Task
        """
        # Avoid cyclical import dependencies
        from cosmos.job.drm.DRM_Base import DRM
        from cosmos.models.Stage import Stage
        from cosmos import recursive_resolve_dependency

        # parents
        if parents is None:
            parents = []
        elif isinstance(parents, Task):
            parents = [parents]
        else:
            parents = list(parents)

        # params
        if params is None:
            params = dict()
        for k, v in params.iteritems():
            # decompose `Dependency` objects to values and parents
            new_val, parent_tasks = recursive_resolve_dependency(v)

            params[k] = new_val
            parents.extend(parent_tasks - set(parents))

        # uid
        if uid is None:
            raise AssertionError, 'uid parameter must be specified'
            # Fix me assert params are all JSONable
            # uid = str(params)
        else:
            assert isinstance(uid, basestring), 'uid must be a string'

        if stage_name is None:
            stage_name = str(func.__name__)

        # Get the right Stage
        stage = only_one((s for s in self.stages if s.name == stage_name),
                         None)
        if stage is None:
            stage = Stage(workflow=self,
                          name=stage_name,
                          status=StageStatus.no_attempt)
            self.session.add(stage)

        # Check if task is already in stage
        task = stage.get_task(uid, None)

        if task is not None:
            # if task is already in stage, but unsuccessful, raise an error (duplicate params) since unsuccessful tasks
            # were already removed on workflow load
            if task.successful:
                # If the user manually edited the dag and this a resume, parents might need to be-readded
                task.parents.extend(set(parents).difference(set(task.parents)))

                for p in parents:
                    if p.stage not in stage.parents:
                        stage.parents.append(p.stage)

                return task
            else:
                # TODO check for duplicate params here?  would be a lot faster at Workflow.run
                raise ValueError(
                    'Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  '
                    'Task uids must be unique within the same Stage.' %
                    (stage_name, uid))
        else:
            # Create Task
            sig = funcsigs.signature(func)

            def params_or_signature_default_or(name, default):
                if name in params:
                    return params[name]
                if name in sig.parameters:
                    param_default = sig.parameters[name].default
                    if param_default is funcsigs._empty:
                        return default
                    else:
                        return param_default
                return default

            input_map = dict()
            output_map = dict()

            for keyword, param in sig.parameters.iteritems():
                if keyword.startswith('in_'):
                    v = params.get(keyword, param.default)
                    assert v != funcsigs._empty, 'parameter %s for %s is required' % (
                        param, func)
                    input_map[keyword] = v
                elif keyword.startswith('out_'):
                    v = params.get(keyword, param.default)
                    assert v != funcsigs._empty, 'parameter %s for %s is required' % (
                        param, func)
                    output_map[keyword] = v

            task = Task(
                stage=stage,
                params=params,
                parents=parents,
                input_map=input_map,
                output_map=output_map,
                uid=uid,
                drm=drm if drm is not None else self.cosmos_app.default_drm,
                job_class=job_class if job_class is not None else
                self.cosmos_app.default_job_class,
                queue=queue
                if queue is not None else self.cosmos_app.default_queue,
                must_succeed=must_succeed,
                core_req=core_req if core_req is not None else
                params_or_signature_default_or('core_req', 1),
                mem_req=mem_req if mem_req is not None else
                params_or_signature_default_or('mem_req', None),
                time_req=time_req
                if time_req is not None else self.cosmos_app.default_time_req,
                successful=False,
                max_attempts=max_attempts if max_attempts is not None else
                self.cosmos_app.default_max_attempts,
                attempt=1,
                NOOP=noop)

            task.cmd_fxn = func

            task.drm_options = drm_options if drm_options is not None else self.cosmos_app.default_drm_options
            DRM.validate_drm_options(task.drm, task.drm_options)

        # Add Stage Dependencies
        for p in parents:
            if p.stage not in stage.parents:
                stage.parents.append(p.stage)

        self.dont_garbage_collect.append(task)

        return task
Beispiel #4
0
    def add_task(self, func, params=None, parents=None, stage_name=None, uid=None, drm=None,
                 queue=None, must_succeed=True, time_req=None, core_req=None, mem_req=None,
                 max_attempts=None, noop=False, job_class=None, drm_options=None):
        """
        Adds a new Task to the Workflow.  If the Task already exists (and was successful), return the successful Task stored in the database

        :param callable func: A function which returns a string which will get converted to a shell script to be executed.  `func` will not get called until
          all of its dependencies have completed.
        :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Any Dependency objects will get resolved into
            a string, and the Dependency.task will be added to this Task's parents.
        :param list[Tasks] parents: A list of dependent Tasks.
        :param str uid: A unique identifier for this Task, primarily used for skipping  previously successful Tasks.
            If a Task with this stage_name and uid already exists in the database (and was successful), the
            database version will be returned and a new one will not be created.
        :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__`.
        :param str drm: The drm to use for this Task (example 'local', 'ge' or 'drmaa:lsf').  Defaults to the `default_drm` parameter of :meth:`Cosmos.start`
        :param job_class: The name of a job_class to submit to; defaults to the `default_job_class` parameter of :meth:`Cosmos.start`
        :param queue: The name of a queue to submit to; defaults to the `default_queue` parameter of :meth:`Cosmos.start`
        :param bool must_succeed: Default True.  If False, the Workflow will not fail if this Task does not succeed.  Dependent Jobs will not be executed.
        :param bool time_req: The time requirement; will set the Task.time_req attribute which is intended to be used by :func:`get_submit_args` to request resources.
        :param int cpu_req: Number of cpus required for this Task.  Can also be set in the `params` dict or the default value of the Task function signature, but this value takes precedence.
            Warning!  In future versions, this will be the only way to set it.
        :param int mem_req: Number of MB of RAM required for this Task.   Can also be set in the `params` dict or the default value of the Task function signature, but this value takes predence.
            Warning!  In future versions, this will be the only way to set it.
        :param int max_attempts: The maximum number of times to retry a failed job.  Defaults to the `default_max_attempts` parameter of :meth:`Cosmos.start`
        :rtype: cosmos.api.Task
        """
        # Avoid cyclical import dependencies
        from cosmos.job.drm.DRM_Base import DRM
        from cosmos.models.Stage import Stage
        from cosmos import recursive_resolve_dependency

        # parents
        if parents is None:
            parents = []
        elif isinstance(parents, Task):
            parents = [parents]
        else:
            parents = list(parents)

        # params
        if params is None:
            params = dict()
        for k, v in params.iteritems():
            # decompose `Dependency` objects to values and parents
            new_val, parent_tasks = recursive_resolve_dependency(v)

            params[k] = new_val
            parents.extend(parent_tasks - set(parents))

        # uid
        if uid is None:
            raise AssertionError, 'uid parameter must be specified'
            # Fix me assert params are all JSONable
            # uid = str(params)
        else:
            assert isinstance(uid, basestring), 'uid must be a string'

        if stage_name is None:
            stage_name = str(func.__name__)

        # Get the right Stage
        stage = only_one((s for s in self.stages if s.name == stage_name), None)
        if stage is None:
            stage = Stage(workflow=self, name=stage_name, status=StageStatus.no_attempt)
            self.session.add(stage)

        # Check if task is already in stage
        task = stage.get_task(uid, None)

        if task is not None:
            # if task is already in stage, but unsuccessful, raise an error (duplicate params) since unsuccessful tasks
            # were already removed on workflow load
            if task.successful:
                # If the user manually edited the dag and this a resume, parents might need to be-readded
                task.parents.extend(set(parents).difference(set(task.parents)))

                for p in parents:
                    if p.stage not in stage.parents:
                        stage.parents.append(p.stage)

                return task
            else:
                # TODO check for duplicate params here?  would be a lot faster at Workflow.run
                raise ValueError('Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  '
                                 'Task uids must be unique within the same Stage.' % (stage_name, uid))
        else:
            # Create Task
            sig = funcsigs.signature(func)

            def params_or_signature_default_or(name, default):
                if name in params:
                    return params[name]
                if name in sig.parameters:
                    param_default = sig.parameters[name].default
                    if param_default is funcsigs._empty:
                        return default
                    else:
                        return param_default
                return default


            task = Task(stage=stage,
                        params=params,
                        parents=parents,
                        uid=uid,
                        drm=drm if drm is not None else self.cosmos_app.default_drm,
                        job_class=job_class if job_class is not None else self.cosmos_app.default_job_class,
                        queue=queue if queue is not None else self.cosmos_app.default_queue,
                        must_succeed=must_succeed,
                        core_req=core_req if core_req is not None else params_or_signature_default_or('core_req', 1),
                        mem_req=mem_req if mem_req is not None else params_or_signature_default_or('mem_req', None),
                        time_req=time_req if time_req is not None else self.cosmos_app.default_time_req,
                        successful=False,
                        max_attempts=max_attempts if max_attempts is not None else self.cosmos_app.default_max_attempts,
                        attempt=1,
                        NOOP=noop
                        )

            task.cmd_fxn = func

            task.drm_options = drm_options if drm_options is not None else self.cosmos_app.default_drm_options
            DRM.validate_drm_options(task.drm, task.drm_options)

        # Add Stage Dependencies
        for p in parents:
            if p.stage not in stage.parents:
                stage.parents.append(p.stage)

        self.dont_garbage_collect.append(task)

        return task