Code example #1
class Task(Base):
    """
    A job that gets executed.  Has a unique set of params within its Stage.
    """
    __tablename__ = 'task'
    # FIXME causes a problem with mysql?
    __table_args__ = (UniqueConstraint('stage_id', 'uid', name='_uc1'),)

    id = Column(Integer, primary_key=True)
    uid = Column(String(255), index=True)

    mem_req = Column(Integer)
    core_req = Column(Integer)
    cpu_req = synonym('core_req')
    time_req = Column(Integer)
    NOOP = Column(Boolean, nullable=False)
    params = Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, server_default='{}')
    stage_id = Column(ForeignKey('stage.id', ondelete="CASCADE"), nullable=False, index=True)
    log_dir = Column(String(255))
    # output_dir = Column(String(255))
    _status = Column(Enum34_ColumnType(TaskStatus), default=TaskStatus.no_attempt, nullable=False)
    successful = Column(Boolean, nullable=False)
    started_on = Column(DateTime)  # FIXME this should probably be deleted.  Too hard to determine.
    submitted_on = Column(DateTime)
    finished_on = Column(DateTime)
    attempt = Column(Integer, nullable=False)
    must_succeed = Column(Boolean, nullable=False)
    drm = Column(String(255))
    queue = Column(String(255))
    max_attempts = Column(Integer)
    parents = relationship("Task",
                           secondary=TaskEdge.__table__,
                           primaryjoin=id == TaskEdge.parent_id,
                           secondaryjoin=id == TaskEdge.child_id,
                           backref="children",
                           passive_deletes=True,
                           cascade="save-update, merge, delete",
                           )

    input_map = Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, server_default='{}')
    output_map = Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, server_default='{}')

    @property
    def input_files(self):
        return self.input_map.values()

    @property
    def output_files(self):
        return self.output_map.values()

    # command = Column(Text)

    drm_native_specification = Column(String(255))
    drm_jobID = Column(String(255))

    profile_fields = ['wall_time', 'cpu_time', 'percent_cpu', 'user_time', 'system_time', 'io_read_count',
                      'io_write_count', 'io_read_kb', 'io_write_kb',
                      'ctx_switch_voluntary', 'ctx_switch_involuntary', 'avg_rss_mem_kb', 'max_rss_mem_kb',
                      'avg_vms_mem_kb', 'max_vms_mem_kb', 'avg_num_threads',
                      'max_num_threads',
                      'avg_num_fds', 'max_num_fds', 'exit_status']
    exclude_from_dict = profile_fields + ['command', 'info', 'input_files', 'output_files']

    exit_status = Column(Integer)

    percent_cpu = Column(Integer)
    wall_time = Column(Integer)

    cpu_time = Column(Integer)
    user_time = Column(Integer)
    system_time = Column(Integer)

    avg_rss_mem_kb = Column(Integer)
    max_rss_mem_kb = Column(Integer)
    avg_vms_mem_kb = Column(Integer)
    max_vms_mem_kb = Column(Integer)

    io_read_count = Column(Integer)
    io_write_count = Column(Integer)
    io_wait = Column(Integer)
    io_read_kb = Column(Integer)
    io_write_kb = Column(Integer)

    ctx_switch_voluntary = Column(Integer)
    ctx_switch_involuntary = Column(Integer)

    avg_num_threads = Column(Integer)
    max_num_threads = Column(Integer)

    avg_num_fds = Column(Integer)
    max_num_fds = Column(Integer)

    extra = Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False, server_default='{}')

    @declared_attr
    def status(cls):
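        # Assigning to `status` routes through a property so that
        # signal_task_status_change fires exactly once per actual change;
        # synonym() keeps the attribute usable in class-level queries.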
        def get_status(self):
            return self._status

        def set_status(self, value):
            if self._status != value:
                self._status = value
                signal_task_status_change.send(self)

        return synonym('_status', descriptor=property(get_status, set_status))

    @property
    def workflow(self):
        return self.stage.workflow

    @property
    def log(self):
        return self.workflow.log

    @property
    def finished(self):
        return self.status in {TaskStatus.successful, TaskStatus.killed, TaskStatus.failed}

    _cache_profile = None

    output_profile_path = logplus('profile.json')
    output_command_script_path = logplus('command.bash')
    output_stderr_path = logplus('stderr.txt')
    output_stdout_path = logplus('stdout.txt')

    @property
    def stdout_text(self):
        return readfile(self.output_stdout_path)

    @property
    def stderr_text(self):
        r = readfile(self.output_stderr_path)
        if r == 'file does not exist':
            if self.drm == 'lsf' and self.drm_jobID:
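                # LSF's bpeek displays the output of a still-running job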
                r += '\n\nbpeek %s output:\n\n' % self.drm_jobID
                try:
                    r += codecs.decode(sp.check_output('bpeek %s' % self.drm_jobID, shell=True), 'utf-8')
                except Exception as e:
                    r += str(e)
        return r

    @property
    def command_script_text(self):
        # return self.command
        return readfile(self.output_command_script_path).strip() or self.command

    def descendants(self, include_self=False):
        """
        :return: (list) all tasks that descend from this task in the task_graph
        """
        x = nx.descendants(self.workflow.task_graph(), self)
        if include_self:
            return sorted({self}.union(x), key=lambda task: task.stage.number)
        else:
            return x

    @property
    def label(self):
        """Label used for the taskgraph image"""
        params = '' if len(self.params) == 0 else "\\n {0}".format(
            "\\n".join(["{0}: {1}".format(k, v) for k, v in self.params.items()]))

        return "[%s] %s%s" % (self.id, self.stage.name, params)

    def args_as_query_string(self):
        import urllib

        return urllib.urlencode(self.params)

    def delete(self, descendants=False):
        if descendants:
            tasks_to_delete = self.descendants(include_self=True)
            self.log.debug('Deleting %s and %s of its descendants' % (self, len(tasks_to_delete) - 1))
            for t in tasks_to_delete:
                self.session.delete(t)
        else:
            self.log.debug('Deleting %s' % self)
            self.session.delete(self)

        self.session.commit()

    @property
    def url(self):
        return url_for('cosmos.task', ex_name=self.workflow.name, stage_name=self.stage.name, task_id=self.id)

    @property
    def params_pretty(self):
        return '%s' % ', '.join('%s=%s' % (k, "'%s'" % v if isinstance(v, basestring) else v) for k, v in self.params.items())

    @property
    def params_pformat(self):
        return pprint.pformat(self.params, indent=2, width=1)

    def __repr__(self):
        return "<Task[%s] %s(uid='%s')>" % (self.id or 'id_%s' % id(self),
                                            self.stage.name if self.stage else '',
                                            self.uid
                                            )

    def __str__(self):
        return self.__repr__()
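
The `_status`/`status` pattern above deserves a note: assignment goes through a property so that a signal fires exactly once per actual status change, while synonym() keeps the attribute queryable.  Below is a minimal, self-contained sketch of the same pattern, assuming SQLAlchemy plus a blinker signal; the Job model and the status_changed name are illustrative, not part of COSMOS2.

from blinker import signal
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base, synonym

Base = declarative_base()
status_changed = signal('status_changed')


class Job(Base):
    __tablename__ = 'job'
    id = Column(Integer, primary_key=True)
    _status = Column('status', String(32), default='no_attempt')

    def _get_status(self):
        return self._status

    def _set_status(self, value):
        if self._status != value:  # fire only on a real change
            self._status = value
            status_changed.send(self)

    status = synonym('_status', descriptor=property(_get_status, _set_status))


@status_changed.connect
def log_change(job):
    print('status is now', job.status)


engine = create_engine('sqlite://')
Base.metadata.create_all(engine)
with Session(engine) as session:
    job = Job()
    session.add(job)
    job.status = 'running'  # prints 'status is now running'
    job.status = 'running'  # same value, so no signal
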
Code example #2
File: Workflow.py  Project: p7k/COSMOS2
class Workflow(Base):
    """
    A collection of Stages and Tasks encoded as a DAG
    """
    __tablename__ = 'workflow'

    id = Column(Integer, primary_key=True)
    name = Column(VARCHAR(200), unique=True, nullable=False)
    successful = Column(Boolean, nullable=False)
    created_on = Column(DateTime)
    started_on = Column(DateTime)
    finished_on = Column(DateTime)
    max_cores = Column(Integer)
    primary_log_path = Column(String(255))
    _log = None

    info = Column(MutableDict.as_mutable(JSONEncodedDict))
    _status = Column(Enum_ColumnType(WorkflowStatus, length=255),
                     default=WorkflowStatus.no_attempt)
    stages = relationship("Stage",
                          cascade="all, merge, delete-orphan",
                          order_by="Stage.number",
                          passive_deletes=True,
                          backref='workflow')

    exclude_from_dict = ['info']
    dont_garbage_collect = None
    termination_signal = None

    @declared_attr
    def status(cls):
        def get_status(self):
            return self._status

        def set_status(self, value):
            if self._status != value:
                self._status = value
                signal_workflow_status_change.send(self)

        return synonym('_status', descriptor=property(get_status, set_status))

    @validates('name')
    def validate_name(self, key, name):
        assert re.match(r"^[\w-]+$", name), 'Invalid workflow name, characters are limited to letters, numbers, ' \
                                            'hyphens and underscores'
        return name

    @orm.reconstructor
    def constructor(self):
        self.__init__(manual_instantiation=False)

    def __init__(self, manual_instantiation=True, *args, **kwargs):
        # FIXME provide the cosmos_app instance?

        if manual_instantiation:
            raise TypeError(
                'Do not instantiate a Workflow manually.  Use the Cosmos.start method.'
            )
        super(Workflow, self).__init__(*args, **kwargs)
        # assert self.output_dir is not None, 'output_dir cannot be None'
        if self.info is None:
            # mutable dict column defaults to None
            self.info = dict()
        self.jobmanager = None
        if not self.created_on:
            self.created_on = datetime.datetime.now()
        self.dont_garbage_collect = []

    @property
    def log(self):
        if self._log is None:
            self._log = get_logger('%s' % self, self.primary_log_path)
        return self._log

    def make_output_dirs(self):
        """
        Create directory paths of all output files
        """
        dirs = set()

        for task in self.tasks:
            for out_name, v in task.output_map.iteritems():
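                # out_* values whose key ends in 'dir' are already directories;
                # otherwise take the parent directory of the file path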
                dirname = lambda p: p if out_name.endswith(
                    'dir') or p is None else os.path.dirname(p)

                if isinstance(v, (tuple, list)):
                    dirs.update(map(dirname, v))
                elif isinstance(v, dict):
                    raise NotImplementedError()
                else:
                    dirs.add(dirname(v))

        for d in dirs:
            if d is not None and '://' not in d:
                mkdir(d)

    def add_task(self,
                 func,
                 params=None,
                 parents=None,
                 stage_name=None,
                 uid=None,
                 drm=None,
                 queue=None,
                 must_succeed=True,
                 time_req=None,
                 core_req=None,
                 mem_req=None,
                 max_attempts=None,
                 noop=False,
                 job_class=None,
                 drm_options=None):
        """
        Adds a new Task to the Workflow.  If the Task already exists (and was successful), return the successful Task stored in the database

        :param callable func: A function which returns a string which will get converted to a shell script to be executed.  `func` will not get called until
          all of its dependencies have completed.
        :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Any Dependency objects will get resolved into
            a string, and the Dependency.task will be added to this Task's parents.
        :param list[Tasks] parents: A list of dependent Tasks.
        :param str uid: A unique identifier for this Task, primarily used for skipping  previously successful Tasks.
            If a Task with this stage_name and uid already exists in the database (and was successful), the
            database version will be returned and a new one will not be created.
        :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__`.
        :param str drm: The drm to use for this Task (example 'local', 'ge' or 'drmaa:lsf').  Defaults to the `default_drm` parameter of :meth:`Cosmos.start`
        :param job_class: The name of a job_class to submit to; defaults to the `default_job_class` parameter of :meth:`Cosmos.start`
        :param queue: The name of a queue to submit to; defaults to the `default_queue` parameter of :meth:`Cosmos.start`
        :param bool must_succeed: Default True.  If False, the Workflow will not fail if this Task does not succeed.  Dependent Jobs will not be executed.
        :param int time_req: The time requirement; will set the Task.time_req attribute which is intended to be used by :func:`get_submit_args` to request resources.
        :param int core_req: Number of cpus required for this Task.  Can also be set in the `params` dict or the default value of the Task function signature, but this value takes precedence.
            Warning!  In future versions, this will be the only way to set it.
        :param int mem_req: Number of MB of RAM required for this Task.   Can also be set in the `params` dict or the default value of the Task function signature, but this value takes precedence.
            Warning!  In future versions, this will be the only way to set it.
        :param int max_attempts: The maximum number of times to retry a failed job.  Defaults to the `default_max_attempts` parameter of :meth:`Cosmos.start`
        :rtype: cosmos.api.Task
        """
        # Avoid cyclical import dependencies
        from cosmos.job.drm.DRM_Base import DRM
        from cosmos.models.Stage import Stage
        from cosmos import recursive_resolve_dependency

        # parents
        if parents is None:
            parents = []
        elif isinstance(parents, Task):
            parents = [parents]
        else:
            parents = list(parents)

        # params
        if params is None:
            params = dict()
        for k, v in params.iteritems():
            # decompose `Dependency` objects to values and parents
            new_val, parent_tasks = recursive_resolve_dependency(v)

            params[k] = new_val
            parents.extend(parent_tasks - set(parents))

        # uid
        if uid is None:
            raise AssertionError('uid parameter must be specified')
            # FIXME assert params are all JSONable
            # uid = str(params)
        else:
            assert isinstance(uid, basestring), 'uid must be a string'

        if stage_name is None:
            stage_name = str(func.__name__)

        # Get the right Stage
        stage = only_one((s for s in self.stages if s.name == stage_name),
                         None)
        if stage is None:
            stage = Stage(workflow=self,
                          name=stage_name,
                          status=StageStatus.no_attempt)
            self.session.add(stage)

        # Check if task is already in stage
        task = stage.get_task(uid, None)

        if task is not None:
            # if task is already in stage, but unsuccessful, raise an error (duplicate params) since unsuccessful tasks
            # were already removed on workflow load
            if task.successful:
                # If the user manually edited the dag and this is a resume, parents might need to be re-added
                task.parents.extend(set(parents).difference(set(task.parents)))

                for p in parents:
                    if p.stage not in stage.parents:
                        stage.parents.append(p.stage)

                return task
            else:
                # TODO check for duplicate params here?  would be a lot faster at Workflow.run
                raise ValueError(
                    'Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  '
                    'Task uids must be unique within the same Stage.' %
                    (stage_name, uid))
        else:
            # Create Task
            sig = funcsigs.signature(func)

            def params_or_signature_default_or(name, default):
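                # resolution order: explicit entry in params, then the
                # function signature's default value, then the supplied fallback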
                if name in params:
                    return params[name]
                if name in sig.parameters:
                    param_default = sig.parameters[name].default
                    if param_default is funcsigs._empty:
                        return default
                    else:
                        return param_default
                return default

            input_map = dict()
            output_map = dict()

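            # harvest file arguments by naming convention: in_*/out_* params
            # populate input_map/output_map and must be given in `params` or
            # carry a default in the function signature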
            for keyword, param in sig.parameters.iteritems():
                if keyword.startswith('in_'):
                    v = params.get(keyword, param.default)
                    assert v != funcsigs._empty, 'parameter %s for %s is required' % (
                        param, func)
                    input_map[keyword] = v
                elif keyword.startswith('out_'):
                    v = params.get(keyword, param.default)
                    assert v != funcsigs._empty, 'parameter %s for %s is required' % (
                        param, func)
                    output_map[keyword] = v

            task = Task(
                stage=stage,
                params=params,
                parents=parents,
                input_map=input_map,
                output_map=output_map,
                uid=uid,
                drm=drm if drm is not None else self.cosmos_app.default_drm,
                job_class=job_class if job_class is not None else
                self.cosmos_app.default_job_class,
                queue=queue
                if queue is not None else self.cosmos_app.default_queue,
                must_succeed=must_succeed,
                core_req=core_req if core_req is not None else
                params_or_signature_default_or('core_req', 1),
                mem_req=mem_req if mem_req is not None else
                params_or_signature_default_or('mem_req', None),
                time_req=time_req
                if time_req is not None else self.cosmos_app.default_time_req,
                successful=False,
                max_attempts=max_attempts if max_attempts is not None else
                self.cosmos_app.default_max_attempts,
                attempt=1,
                NOOP=noop)

            task.cmd_fxn = func

            task.drm_options = drm_options if drm_options is not None else self.cosmos_app.default_drm_options
            DRM.validate_drm_options(task.drm, task.drm_options)

        # Add Stage Dependencies
        for p in parents:
            if p.stage not in stage.parents:
                stage.parents.append(p.stage)

        self.dont_garbage_collect.append(task)

        return task

    def run(self,
            max_cores=None,
            dry=False,
            set_successful=True,
            cmd_wrapper=signature.default_cmd_fxn_wrapper,
            log_out_dir_func=default_task_log_output_dir):
        """
        Runs this Workflow's DAG

        :param int max_cores: The maximum number of cores to use at once.  A value of None indicates no maximum.
        :param callable log_out_dir_func: A function that returns a Task's logging directory (must be unique).
             It receives one parameter: the Task instance.
             By default a Task's log output is stored in log/stage_name/task_id.
             See _default_task_log_output_dir for more info.
        :param callable cmd_wrapper: A decorator which will be applied to every Task's cmd_fxn.
        :param bool dry: If True, do not actually run any jobs.
        :param bool set_successful: Sets this workflow as successful if all tasks finish without a failure.  You might set this to False if you intend to add and
            run more tasks in this workflow later.

        Returns True if all tasks in the workflow ran successfully, False otherwise.
        If dry is specified, returns None.
        """
        try:
            assert os.path.exists(os.getcwd(
            )), 'current working dir does not exist! %s' % os.getcwd()

            assert hasattr(
                self, 'cosmos_app'
            ), 'Workflow was not initialized using the Workflow.start method'
            assert hasattr(log_out_dir_func,
                           '__call__'), 'log_out_dir_func must be a function'
            assert self.session, 'Workflow must be part of a sqlalchemy session'

            session = self.session
            self.log.info("Preparing to run %s using DRM `%s`, cwd is `%s`",
                          self, self.cosmos_app.default_drm, os.getcwd())
            try:
                user = getpass.getuser()
            except Exception:
                # fall back to uid if we can't resolve a user name
                user = os.getuid()

            self.log.info('Running as %s@%s, pid %s', user,
                          os.uname()[1], os.getpid())

            self.max_cores = max_cores

            from ..job.JobManager import JobManager

            if self.jobmanager is None:
                self.jobmanager = JobManager(
                    get_submit_args=self.cosmos_app.get_submit_args,
                    cmd_wrapper=cmd_wrapper,
                    log_out_dir_func=log_out_dir_func)

            self.status = WorkflowStatus.running
            self.successful = False

            if self.started_on is None:
                self.started_on = datetime.datetime.now()

            task_graph = self.task_graph()
            stage_graph = self.stage_graph()

            assert len(set(self.stages)) == len(
                self.stages), 'duplicate stage name detected: %s' % (next(
                    duplicates(self.stages)))

            # renumber stages
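            # drop one edge from each cycle so a topological order exists,
            # then assign Stage.number in that order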
            stage_graph_no_cycles = nx.DiGraph()
            stage_graph_no_cycles.add_nodes_from(stage_graph.nodes())
            stage_graph_no_cycles.add_edges_from(stage_graph.edges())
            for cycle in nx.simple_cycles(stage_graph):
                stage_graph_no_cycles.remove_edge(cycle[-1], cycle[0])
            for i, s in enumerate(topological_sort(stage_graph_no_cycles)):
                s.number = i + 1
                if s.status != StageStatus.successful:
                    s.status = StageStatus.no_attempt

            # Make sure everything is in the sqlalchemy session
            session.add(self)
            successful = filter(lambda t: t.successful, task_graph.nodes())

            # print stages
            for s in sorted(self.stages, key=lambda s: s.number):
                self.log.info('%s %s' % (s, s.status))

            # Create Task Queue
            task_queue = _copy_graph(task_graph)
            self.log.info('Skipping %s successful tasks...' % len(successful))
            task_queue.remove_nodes_from(successful)

            handle_exits(self)

            if self.max_cores is not None:
                self.log.info('Ensuring there are enough cores...')
                # make sure we've got enough cores
                for t in task_queue:
                    assert int(
                        t.core_req
                    ) <= self.max_cores, '%s requires more cpus (%s) than `max_cores` (%s)' % (
                        t, t.core_req, self.max_cores)

            # Run this thing!
            self.log.info('Committing to SQL db...')
            session.commit()
            if not dry:
                _run(self, session, task_queue)

                # set status
                if self.status == WorkflowStatus.failed_but_running:
                    self.status = WorkflowStatus.failed
                    # set stage status to failed
                    for s in self.stages:
                        if s.status == StageStatus.running_but_failed:
                            s.status = StageStatus.failed
                    session.commit()
                    return False
                elif self.status == WorkflowStatus.running:
                    if set_successful:
                        self.status = WorkflowStatus.successful
                    session.commit()
                    return True
                else:
                    self.log.warning('%s exited with status "%s"', self,
                                     self.status)
                    session.commit()
                    return False
            else:
                self.log.info('Workflow dry run is complete')
                return None
        except Exception as ex:
            self.log.fatal(ex, exc_info=True)
            raise

    def terminate(self, due_to_failure=True):
        self.log.warning('Terminating %s!' % self)
        if self.jobmanager:
            self.log.info(
                'Processing finished tasks and terminating {num_running_tasks} running tasks'
                .format(num_running_tasks=len(
                    self.jobmanager.running_tasks), ))
            _process_finished_tasks(self.jobmanager)
            self.jobmanager.terminate()

        if due_to_failure:
            self.status = WorkflowStatus.failed
        else:
            self.status = WorkflowStatus.killed

        self.session.commit()

    def cleanup(self):
        if self.jobmanager:
            self.log.info('Cleaning up {num_dead_tasks} dead tasks'.format(
                num_dead_tasks=len(self.jobmanager.dead_tasks), ))
            self.jobmanager.cleanup()

    @property
    def tasks(self):
        return [t for s in self.stages for t in s.tasks]
        # return session.query(Task).join(Stage).filter(Stage.workflow == ex).all()

    def stage_graph(self):
        """
        :return: (networkx.DiGraph) a DAG of the stages
        """
        g = nx.DiGraph()
        g.add_nodes_from(self.stages)
        g.add_edges_from((s, c) for s in self.stages for c in s.children if c)
        return g

    def task_graph(self):
        """
        :return: (networkx.DiGraph) a DAG of the tasks
        """
        g = nx.DiGraph()
        g.add_nodes_from(self.tasks)
        g.add_edges_from([(t, c) for t in self.tasks for c in t.children])
        return g

    def get_stage(self, name_or_id):
        if isinstance(name_or_id, int):
            f = lambda s: s.id == name_or_id
        else:
            f = lambda s: s.name == name_or_id

        for stage in self.stages:
            if f(stage):
                return stage

        raise ValueError('Stage with name %s does not exist' % name_or_id)

    @property
    def url(self):
        return url_for('cosmos.workflow', name=self.name)

    def __repr__(self):
        return '<Workflow[%s] %s>' % (self.id or '', self.name)

    def __unicode__(self):
        return self.__repr__()

    def delete(self, delete_files=False):
        """
        :param delete_files: (bool) If True, delete :attr:`output_dir` directory and all contents on the filesystem
        """
        if hasattr(self, 'log'):
            self.log.info('Deleting %s, delete_files=%s' %
                          (self, delete_files))
            for h in self.log.handlers:
                h.flush()
                h.close()
                self.log.removeHandler(h)

        if delete_files:
            raise NotImplementedError(
                'This should delete all Task.output_files')

        print >> sys.stderr, '%s Deleting from SQL...' % self
        self.session.delete(self)
        self.session.commit()
        print >> sys.stderr, '%s Deleted' % self

    def get_first_failed_task(self, key=lambda t: t.finished_on):
        """
        Return the first failed Task (chronologically).

        If no Task failed, return None.
        """
        for t in sorted([t for t in self.tasks if key(t) is not None],
                        key=key):
            if t.exit_status:
                return t
        return None
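
Taken together, add_task() and run() are meant to be driven through a Cosmos application object; the constructor above explicitly forbids instantiating Workflow directly.  The following usage sketch is inferred from the docstrings above, so the exact Cosmos constructor arguments and the echo task are assumptions rather than a definitive recipe.

from cosmos.api import Cosmos


def echo(out_txt, word='hello'):
    # the returned string is converted to a shell script and executed
    return 'echo {word} > {out_txt}'.format(word=word, out_txt=out_txt)


cosmos = Cosmos('sqlite:///cosmos.db')  # assumed database URL form
cosmos.initdb()
workflow = cosmos.start('example_workflow', skip_confirm=True)

# stage_name defaults to func.__name__; (stage_name, uid) identifies the
# Task across resumes, so a previously successful Task is returned as-is
task = workflow.add_task(func=echo,
                         params=dict(out_txt='out.txt'),
                         uid='hello')

workflow.run(max_cores=1)  # True if every task succeeded; None on a dry run
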
Code example #3
File: Task.py  Project: indraniel/COSMOS2
class Task(Base):
    """
    A job that gets executed.  Has a unique set of params within its Stage.
    """
    __tablename__ = "task"
    # FIXME causes a problem with mysql?
    __table_args__ = (UniqueConstraint("stage_id", "uid", name="_uc1"), )
    drm_options = {}

    id = Column(Integer, primary_key=True)
    uid = Column(String(255), index=True)

    mem_req = Column(Integer)
    core_req = Column(Integer)
    cpu_req = synonym("core_req")
    time_req = Column(Integer)
    gpu_req = Column(Integer)
    NOOP = Column(Boolean, nullable=False)
    params = Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False)
    stage_id = Column(ForeignKey("stage.id", ondelete="CASCADE"),
                      nullable=False,
                      index=True)
    log_dir = Column(String(255))
    # output_dir = Column(String(255))
    _status = Column(
        Enum_ColumnType(TaskStatus, length=255),
        default=TaskStatus.no_attempt,
        nullable=False,
    )
    status_reason = Column(String(255), nullable=True)
    successful = Column(Boolean, nullable=False)
    started_on = Column(
        DateTime
    )  # FIXME this should probably be deleted.  Too hard to determine.
    submitted_on = Column(DateTime)
    finished_on = Column(DateTime)
    attempt = Column(Integer, nullable=False)
    must_succeed = Column(Boolean, nullable=False)
    drm = Column(String(255))
    # FIXME consider making job_class a proper field next time the schema changes
    # job_class = Column(String(255))
    queue = Column(String(255))
    max_attempts = Column(Integer)
    parents = relationship(
        "Task",
        secondary=TaskEdge.__table__,
        primaryjoin=id == TaskEdge.parent_id,
        secondaryjoin=id == TaskEdge.child_id,
        backref="children",
        passive_deletes=True,
        cascade="save-update, merge, delete",
    )
    environment_variables = Column(MutableDict.as_mutable(JSONEncodedDict),
                                   nullable=False)

    # input_map = Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False)
    # output_map = Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False)

    @property
    def input_map(self):
        d = dict()
        for key, val in list(self.params.items()):
            if key.startswith("in_"):
                d[key] = val
        return d

    @property
    def output_map(self):
        d = dict()
        for key, val in list(self.params.items()):
            if key.startswith("out_"):
                d[key] = val
        return d

    @property
    def input_files(self):
        return list(self.input_map.values())

    @property
    def output_files(self):
        return list(self.output_map.values())

    # command = Column(Text)

    drm_native_specification = Column(String(255))
    drm_jobID = Column(String(255))

    profile_fields = [
        "wall_time",
        "cpu_time",
        "percent_cpu",
        "user_time",
        "system_time",
        "io_read_count",
        "io_write_count",
        "io_read_kb",
        "io_write_kb",
        "ctx_switch_voluntary",
        "ctx_switch_involuntary",
        "avg_rss_mem_kb",
        "max_rss_mem_kb",
        "avg_vms_mem_kb",
        "max_vms_mem_kb",
        "avg_num_threads",
        "max_num_threads",
        "avg_num_fds",
        "max_num_fds",
        "exit_status",
    ]

    exclude_from_dict = profile_fields + [
        "command",
        "info",
        "input_files",
        "output_files",
    ]

    exit_status = Column(Integer)

    percent_cpu = Column(Integer)
    # time in seconds
    wall_time = Column(Integer)

    cpu_time = Column(Integer)
    user_time = Column(Integer)
    system_time = Column(Integer)

    avg_rss_mem_kb = Column(Integer)
    max_rss_mem_kb = Column(Integer)
    avg_vms_mem_kb = Column(Integer)
    max_vms_mem_kb = Column(Integer)

    io_read_count = Column(Integer)
    io_write_count = Column(Integer)
    io_wait = Column(Integer)
    io_read_kb = Column(Integer)
    io_write_kb = Column(Integer)

    ctx_switch_voluntary = Column(Integer)
    ctx_switch_involuntary = Column(Integer)

    avg_num_threads = Column(Integer)
    max_num_threads = Column(Integer)

    avg_num_fds = Column(Integer)
    max_num_fds = Column(Integer)

    extra = Column(MutableDict.as_mutable(JSONEncodedDict), nullable=False)

    @declared_attr
    def status(cls):
        def get_status(self):
            return self._status

        def set_status(self, value):
            if self._status != value:
                self._status = value
                signal_task_status_change.send(self)

        return synonym("_status", descriptor=property(get_status, set_status))

    @property
    def workflow(self):
        return self.stage.workflow

    @property
    def log(self):
        return self.workflow.log

    @property
    def finished(self):
        return self.status in {
            TaskStatus.successful,
            TaskStatus.killed,
            TaskStatus.failed,
        }

    _cache_profile = None

    output_profile_path = logplus("profile.json")
    output_command_script_path = logplus("command.bash")
    output_stderr_path = logplus("stderr.txt")
    output_stdout_path = logplus("stdout.txt")

    @property
    def stdout_text(self):
        return readfile(self.output_stdout_path)

    @property
    def stdout_text_brief(self):
        lines = self.stdout_text.split("\n")
        if len(lines) <= 50:
            return "\n".join(lines)
        else:
            return "*** TRUNCATED (showing last 50 lines)... \n" + "\n".join(
                lines[-50:])

    @property
    def stderr_text(self):
        r = readfile(self.output_stderr_path)
        if r == "file does not exist":
            if self.drm == "lsf" and self.drm_jobID:
                r += "\n\nbpeek %s output:\n\n" % self.drm_jobID
                try:
                    r += codecs.decode(
                        sp.check_output("bpeek %s" % self.drm_jobID,
                                        shell=True),
                        "utf-8",
                    )
                except Exception as e:
                    r += str(e)
        return r

    @property
    def stderr_text_brief(self):
        lines = self.stderr_text.split("\n")
        if len(lines) <= 50:
            return "\n".join(lines)
        else:
            return "*** TRUNCATED (showing last 50 lines)... \n" + "\n".join(
                lines[-50:])

    @property
    def command_script_text(self):
        # return self.command
        return readfile(
            self.output_command_script_path).strip() or self.command

    def descendants(self, include_self=False):
        """
        :return: (list) all tasks that descend from this task in the task_graph
        """
        x = nx.descendants(self.workflow.task_graph(), self)
        if include_self:
            return sorted({self}.union(x), key=lambda task: task.stage.number)
        else:
            return x

    def ancestors(self, include_self=False):
        x = nx.ancestors(self.workflow.task_graph(), self)
        if include_self:
            return sorted({self}.union(x), key=lambda task: task.stage.number)
        else:
            return x

    @property
    def label(self):
        """Label used for the taskgraph image"""
        params = ("" if len(self.params) == 0 else "\\n {0}".format("\\n".join(
            ["{0}: {1}".format(k, v) for k, v in list(self.params.items())])))

        return "[%s] %s%s" % (self.id, self.stage.name, params)

    def args_as_query_string(self):
        import urllib.request, urllib.parse, urllib.error

        return urllib.parse.urlencode(self.params)

    def delete(self, descendants=False):
        if descendants:
            tasks_to_delete = self.descendants(include_self=True)
            self.log.debug("Deleting %s and %s of its descendants" %
                           (self, len(tasks_to_delete) - 1))
            for t in tasks_to_delete:
                self.session.delete(t)
        else:
            self.log.debug("Deleting %s" % self)
            self.session.delete(self)

        self.session.commit()

    @property
    def url(self):
        return url_for(
            "cosmos.task",
            ex_name=self.workflow.name,
            stage_name=self.stage.name,
            task_id=self.id,
        )

    @property
    def params_pretty(self):
        return "%s" % ", ".join("%s=%s" %
                                (k, "'%s'" % v if isinstance(v, str) else v)
                                for k, v in list(self.params.items()))

    @property
    def params_pformat(self):
        return pprint.pformat(self.params, indent=2, width=1)

    def __repr__(self):
        return "<Task[%s] %s(uid='%s')>" % (
            self.id or "id_%s" % id(self),
            self.stage.name if self.stage else "",
            self.uid,
        )

    def __str__(self):
        return self.__repr__()

    # FIXME consider making job_class a proper field next time the schema changes
    def __init__(self, **kwargs):
        self.job_class = kwargs.pop("job_class", None)
        _declarative_constructor(self, **kwargs)

    @reconstructor
    def init_on_load(self):
        self.job_class = None

    @property
    def environment_variables_pretty(self):
        return "%s" % ", ".join(
            "%s=%s" % (k, "'%s'" % v if isinstance(v, str) else v)
            for k, v in list(self.environment_variables.items()))
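
Note the schema change relative to code example #1: input_map and output_map are no longer stored columns; they are derived on the fly from `params` via the in_/out_ key-prefix convention.  A tiny standalone illustration of that filtering (plain dicts, no database, hypothetical file names):

params = {
    'in_bam': '/data/sample.bam',
    'out_vcf': '/data/sample.vcf',
    'threads': 4,
}

# the same filtering the input_map/output_map properties perform
input_map = {k: v for k, v in params.items() if k.startswith('in_')}
output_map = {k: v for k, v in params.items() if k.startswith('out_')}

assert input_map == {'in_bam': '/data/sample.bam'}
assert output_map == {'out_vcf': '/data/sample.vcf'}
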
Code example #4
class Workflow(Base):
    """
    A collection of Stages and Tasks encoded as a DAG
    """

    __tablename__ = "workflow"

    id = Column(Integer, primary_key=True)
    name = Column(VARCHAR(200), unique=True, nullable=False)
    successful = Column(Boolean, nullable=False)
    created_on = Column(DateTime)
    started_on = Column(DateTime)
    finished_on = Column(DateTime)
    max_cores = Column(Integer)
    max_gpus = Column(Integer)
    primary_log_path = Column(String(255))
    _log = None

    info = Column(MutableDict.as_mutable(JSONEncodedDict))
    _status = Column(Enum_ColumnType(WorkflowStatus, length=255),
                     default=WorkflowStatus.no_attempt)
    stages = relationship(
        "Stage",
        cascade="all, merge, delete-orphan",
        order_by="Stage.number",
        passive_deletes=True,
        backref="workflow",
    )

    exclude_from_dict = ["info"]
    _dont_garbage_collect = None
    termination_signal = None

    @property
    def wall_time(self):
        if self.started_on is None or self.finished_on is None:
            return None
        else:
            return self.finished_on - self.started_on

    @declared_attr
    def status(cls):
        def get_status(self):
            return self._status

        def set_status(self, value):
            if self._status != value:
                self._status = value
                signal_workflow_status_change.send(self)

        return synonym("_status", descriptor=property(get_status, set_status))

    @validates("name")
    def validate_name(self, key, name):
        assert re.match(r"^[\w-]+$", name), (
            "Invalid workflow name, characters are limited to letters, numbers, "
            "hyphens and underscores")
        return name

    @orm.reconstructor
    def constructor(self):
        self.__init__(manual_instantiation=False)

    def __init__(self, manual_instantiation=True, *args, **kwargs):
        # FIXME provide the cosmos_app instance?

        if manual_instantiation:
            raise TypeError(
                "Do not instantiate a Workflow manually.  Use the Cosmos.start method."
            )
        super(Workflow, self).__init__(*args, **kwargs)
        # assert self.output_dir is not None, 'output_dir cannot be None'
        if self.info is None:
            # mutable dict column defaults to None
            self.info = dict()
        self.jobmanager = None
        if not self.created_on:
            self.created_on = datetime.datetime.now()
        self._dont_garbage_collect = []

    @property
    def log(self):
        if self._log is None:
            self._log = get_logger("%s" % self, self.primary_log_path)
        return self._log

    def make_output_dirs(self):
        """
        Create directory paths of all output files
        """
        dirs = set()

        for task in self.tasks:
            for out_name, v in list(task.output_map.items()):
                dirname = lambda p: p if out_name.endswith(
                    "dir") or p is None else os.path.dirname(p)

                if isinstance(v, (tuple, list)):
                    dirs.update(list(map(dirname, v)))
                elif isinstance(v, dict):
                    raise NotImplementedError()
                else:
                    dirs.add(dirname(v))

        for d in dirs:
            # don't add urls
            if d is not None and "://" not in d:
                mkdir(d)

    def add_task(
        self,
        func,
        params=None,
        parents=None,
        stage_name=None,
        uid=None,
        drm=None,
        queue=None,
        must_succeed=True,
        time_req=None,
        core_req=None,
        mem_req=None,
        gpu_req=None,
        max_attempts=None,
        noop=False,
        job_class=None,
        drm_options=None,
        environment_variables=None,
        if_duplicate="raise",
    ):
        """
        Adds a new Task to the Workflow.  If the Task already exists (and was successful), return the successful Task stored in the database

        :param callable func: A function which returns a string which will get converted to a shell script to be executed.  `func` will not get called until
          all of its dependencies have completed.
        :param dict params: Parameters to `func`.  Must be jsonable so that it can be stored in the database.  Any Dependency objects will get resolved into
            a string, and the Dependency.task will be added to this Task's parents.
        :param list[Tasks] parents: A list of dependent Tasks.
        :param str uid: A unique identifier for this Task, primarily used for skipping  previously successful Tasks.
            If a Task with this stage_name and uid already exists in the database (and was successful), the
            database version will be returned and a new one will not be created.
        :param str stage_name: The name of the Stage to add this Task to.  Defaults to `func.__name__`.
        :param str drm: The drm to use for this Task (example 'local', 'ge' or 'drmaa:lsf').  Defaults to the `default_drm` parameter of :meth:`Cosmos.start`
        :param job_class: The name of a job_class to submit to; defaults to the `default_job_class` parameter of :meth:`Cosmos.start`
        :param queue: The name of a queue to submit to; defaults to the `default_queue` parameter of :meth:`Cosmos.start`
        :param bool must_succeed: Default True.  If False, the Workflow will not fail if this Task does not succeed.  Dependent Jobs will not be executed.
        :param int time_req: The time requirement; will set the Task.time_req attribute which is intended to be used by :func:`get_submit_args` to request resources.
        :param int core_req: Number of cpus required for this Task.  Can also be set in the `params` dict or the default value of the Task function signature, but this value takes precedence.
            Warning!  In future versions, this will be the only way to set it.
        :param int mem_req: Number of MB of RAM required for this Task.   Can also be set in the `params` dict or the default value of the Task function signature, but this value takes precedence.
            Warning!  In future versions, this will be the only way to set it.
        :param int gpu_req: Number of gpus required for this Task.
        :param int max_attempts: The maximum number of times to retry a failed job.  Defaults to the `default_max_attempts` parameter of :meth:`Cosmos.start`
        :param bool noop: Task is a No-op and will always be marked as successful.
        :param dict drm_options: Options for Distributed Resource Management (cluster).
        :param dict environment_variables: Environment variables to pass to the DRM (if supported).
        :param str if_duplicate: If "raise", raises an error if a Task with the same UID has already been added to this
          Workflow.  If "return", return that Task, allowing for an easy way to avoid duplicate work.
        :rtype: cosmos.api.Task
        """
        # Avoid cyclical import dependencies
        from cosmos.job.drm.DRM_Base import DRM
        from cosmos.models.Stage import Stage
        from cosmos import recursive_resolve_dependency

        # parents
        if parents is None:
            parents = []
        elif isinstance(parents, Task):
            parents = [parents]
        else:
            parents = list(parents)

        # params
        if params is None:
            params = dict()
        for k, v in list(params.items()):
            # decompose `Dependency` objects to values and parents
            new_val, parent_tasks = recursive_resolve_dependency(v)

            params[k] = new_val
            parents.extend(parent_tasks - set(parents))

        # uid
        if uid is None:
            raise AssertionError("uid parameter must be specified")
            # FIXME assert params are all JSONable
            # uid = str(params)
        else:
            assert isinstance(uid, str), "uid must be a string"

        if stage_name is None:
            stage_name = str(func.__name__)

        # Get the right Stage
        stage = only_one((s for s in self.stages if s.name == stage_name),
                         None)
        if stage is None:
            stage = Stage(workflow=self,
                          name=stage_name,
                          status=StageStatus.no_attempt)
            self.session.add(stage)

        # Check if task is already in stage
        task = stage.get_task(uid, None)

        if task is not None:
            # if task is already in stage, but unsuccessful, raise an error (duplicate params) since unsuccessful tasks
            # were already removed on workflow load
            if task.successful:
                # If the user manually edited the dag and this is a resume, parents might need to be re-added
                task.parents.extend(set(parents).difference(set(task.parents)))

                for p in parents:
                    if p.stage not in stage.parents:
                        stage.parents.append(p.stage)

                return task
            else:
                if if_duplicate == "raise":
                    raise DuplicateUid(
                        "Duplicate uid, you have added a Task to Stage %s with the uid (unique identifier) `%s` twice.  "
                        "Task uids must be unique within the same Stage." %
                        (stage_name, uid))
                elif if_duplicate == "return":
                    if task.params != params:
                        raise InvalidParams(
                            "Tried to add a task with the same uid, but different parameters."
                        )
                    return task
                else:
                    raise ValueError(f"{if_duplicate} is not valid")
        else:
            # Create Task
            sig = funcsigs.signature(func)

            def params_or_signature_default_or(name, default):
                if name in params:
                    return params[name]
                if name in sig.parameters:
                    param_default = sig.parameters[name].default
                    if param_default is funcsigs._empty:
                        return default
                    else:
                        return param_default
                return default

            task = Task(
                stage=stage,
                params=params,
                parents=parents,
                uid=uid,
                drm=drm if drm is not None else self.cosmos_app.default_drm,
                job_class=job_class if job_class is not None else
                self.cosmos_app.default_job_class,
                queue=queue
                if queue is not None else self.cosmos_app.default_queue,
                must_succeed=must_succeed,
                core_req=core_req if core_req is not None else
                params_or_signature_default_or("core_req", 1),
                mem_req=mem_req if mem_req is not None else
                params_or_signature_default_or("mem_req", None),
                time_req=time_req
                if time_req is not None else self.cosmos_app.default_time_req,
                successful=False,
                max_attempts=max_attempts if max_attempts is not None else
                self.cosmos_app.default_max_attempts,
                attempt=1,
                NOOP=noop,
                gpu_req=gpu_req if gpu_req is not None else
                params_or_signature_default_or("gpu_req", 0),
                environment_variables=environment_variables
                if environment_variables is not None else
                self.cosmos_app.default_environment_variables,
            )

            task.cmd_fxn = func

            if drm_options is None:
                task.drm_options = {}
            else:
                task.drm_options = drm_options
            # use default for any keys not set
            if self.cosmos_app.default_drm_options is not None:
                for key, val in list(
                        self.cosmos_app.default_drm_options.items()):
                    if key not in task.drm_options:
                        task.drm_options[key] = val

            DRM.validate_drm_options(task.drm, task.drm_options)

        # Add Stage Dependencies
        for p in parents:
            if p.stage not in stage.parents:
                stage.parents.append(p.stage)

        self._dont_garbage_collect.append(task)

        return task

    def run(
        self,
        max_cores=None,
        dry=False,
        set_successful=True,
        cmd_wrapper=signature.default_cmd_fxn_wrapper,
        log_out_dir_func=default_task_log_output_dir,
        max_gpus=None,
        do_cleanup_atexit=True,
        lethal_signals=TERMINATION_SIGNALS,
    ):
        """
        Runs this Workflow's DAG

        :param int max_cores: The maximum number of cores to use at once.  A value of None indicates no maximum.
        :param callable log_out_dir_func: A function that returns a Task's logging directory (must be unique).
             It receives one parameter: the Task instance.
             By default a Task's log output is stored in log/stage_name/task_id.
             See _default_task_log_output_dir for more info.
        :param callable cmd_wrapper: A decorator which will be applied to every Task's cmd_fxn.
        :param bool dry: If True, do not actually run any jobs.
        :param bool set_successful: Sets this workflow as successful if all tasks finish without a failure.
            You might set this to False if you intend to add and
            run more tasks in this workflow later.
        :param do_cleanup_atexit: if False, do not attempt to cleanup unhandled exits.
        :param lethal_signals: signals to catch and shutdown

        Returns True if all tasks in the workflow ran successfully, False otherwise.
        If dry is specified, returns None.
        """

        if cmd_wrapper == signature.default_cmd_fxn_wrapper:
            warnings.warn(
                "Having functions return bash strings as the default behavior is deprecated.  While "
                "this behavior will be supported, it is recommended that you set cmd_wrapper to "
                "cosmos.api.py_call, which will be the new default.  "
                "See examples/ex3.py.")

        try:
            try:
                assert os.path.exists(os.getcwd(
                )), "current working dir does not exist! %s" % os.getcwd()

                assert hasattr(
                    self, "cosmos_app"
                ), "Workflow was not initialized using the Workflow.start method"
                assert hasattr(
                    log_out_dir_func,
                    "__call__"), "log_out_dir_func must be a function"
                assert self.session, "Workflow must be part of a sqlalchemy session"

                session = self.session
                self.log.info(
                    "Preparing to run %s using DRM `%s`, cwd is `%s`",
                    self,
                    self.cosmos_app.default_drm,
                    os.getcwd(),
                )
                try:
                    user = getpass.getuser()
                except Exception:
                    # fall back to uid if we can't resolve a user name
                    user = os.getuid()

                self.log.info("Running as %s@%s, pid %s", user,
                              os.uname()[1], os.getpid())

                self.max_cores = max_cores
                self.max_gpus = max_gpus
                #
                # Run some validation checks
                #

                # check GPU env variables are set correctly
                if self.max_gpus is not None and self.cosmos_app.default_drm == "local":
                    if "COSMOS_LOCAL_GPU_DEVICES" not in os.environ:
                        raise EnvironmentError(
                            "COSMOS_LOCAL_GPU_DEVICES environment variable must be set to a "
                            "comma delimited list of gpu devices if using a local DRM to manage "
                            "GPUs")

                # check for duplicate output files
                output_fnames_to_task_and_key = dict()
                for task in self.tasks:
                    for key, fname in list(task.output_map.items()):
                        current_value = output_fnames_to_task_and_key.setdefault(
                            fname, (task, key))
                        if current_value != (task, key):
                            task2, key2 = current_value
                            raise ValueError(
                                "Duplicate output files detected!:  "
                                '{task}.params["{key}"] == {task2}.params["{key2}"] == {fname}'
                                .format(**locals()))
                        output_fnames_to_task_and_key[fname] = (task, key)

                from ..job.JobManager import JobManager

                if self.jobmanager is None:
                    self.jobmanager = JobManager(
                        get_submit_args=self.cosmos_app.get_submit_args,
                        cmd_wrapper=cmd_wrapper,
                        log_out_dir_func=log_out_dir_func,
                        logger=self.log,
                        session=self.session,
                        workflow=self,
                    )

                self.status = WorkflowStatus.running
                self.successful = False

                if self.started_on is None:
                    self.started_on = datetime.datetime.now()

                task_graph = self.task_graph()
                stage_graph = self.stage_graph()

                assert len(set(self.stages)) == len(
                    self.stages), "duplicate stage name detected: %s" % (next(
                        duplicates(self.stages)))

                # renumber stages
                stage_graph_no_cycles = nx.DiGraph()
                stage_graph_no_cycles.add_nodes_from(stage_graph.nodes())
                stage_graph_no_cycles.add_edges_from(stage_graph.edges())
                for cycle in nx.simple_cycles(stage_graph):
                    stage_graph_no_cycles.remove_edge(cycle[-1], cycle[0])
                for i, s in enumerate(topological_sort(stage_graph_no_cycles)):
                    s.number = i + 1
                    if s.status != StageStatus.successful:
                        s.status = StageStatus.no_attempt

                # Make sure everything is in the sqlalchemy session
                session.add(self)
                successful = [t for t in task_graph.nodes() if t.successful]

                # print stages
                for s in sorted(self.stages, key=lambda s: s.number):
                    self.log.info("%s %s" % (s, s.status))

                # Create Task Queue
                task_queue = _copy_graph(task_graph)
                self.log.info("Skipping %s successful tasks..." %
                              len(successful))
                task_queue.remove_nodes_from(successful)

                if do_cleanup_atexit:
                    handle_exits(self)

                if self.max_cores is not None:
                    self.log.info("Ensuring there are enough cores...")
                    # make sure we've got enough cores
                    for t in task_queue:
                        assert int(t.core_req) <= self.max_cores, (
                            "%s requires more cpus (%s) than `max_cores` (%s)"
                            % (
                                t,
                                t.core_req,
                                self.max_cores,
                            ))

                # Run this thing!
                self.log.info("Committing to SQL db...")
                session.commit()
            except KeyboardInterrupt:
                # haven't started submitting yet, just raise the exception
                self.log.fatal("ctrl+c caught")
                self.terminate(due_to_failure=False)
                raise

            if not dry:
                _run(self, session, task_queue, lethal_signals=lethal_signals)

                # set status
                if self.status == WorkflowStatus.failed_but_running:
                    self.status = WorkflowStatus.failed
                    # set stage status to failed
                    for s in self.stages:
                        if s.status == StageStatus.running_but_failed:
                            s.status = StageStatus.failed
                    session.commit()
                    return False
                elif self.status == WorkflowStatus.running:
                    if set_successful:
                        self.status = WorkflowStatus.successful
                    session.commit()
                    return True
                else:
                    self.log.warning('%s exited with status "%s"', self,
                                     self.status)
                    session.commit()
                    return False
            else:
                self.log.info("Workflow dry run is complete")
                return None
        except Exception as ex:
            self.log.fatal("Exception was raised")
            self.log.fatal(ex, exc_info=True)
            self.terminate(due_to_failure=False)
            raise

    def terminate(self, due_to_failure=True):
        self.log.info("Terminating %s, due_to_failure=%s" %
                      (self, due_to_failure))
        if self.jobmanager:
            self.log.info(
                "Processing finished tasks and terminating {num_running_tasks} running tasks"
                .format(num_running_tasks=len(
                    self.jobmanager.running_tasks), ))
            _process_finished_tasks(self.jobmanager)
            self.jobmanager.terminate()

        if due_to_failure:
            self.status = WorkflowStatus.failed
        else:
            self.status = WorkflowStatus.killed

        self.session.commit()

    @property
    def tasks(self):
        return [t for s in self.stages for t in s.tasks]
        # return session.query(Task).join(Stage).filter(Stage.workflow == ex).all()

    def stage_graph(self):
        """
        :return: (networkx.DiGraph) a DAG of the stages
        """
        g = nx.DiGraph()
        g.add_nodes_from(self.stages)
        g.add_edges_from((s, c) for s in self.stages for c in s.children if c)
        return g

    def task_graph(self):
        """
        :return: (networkx.DiGraph) a DAG of the tasks
        """
        g = nx.DiGraph()
        g.add_nodes_from(self.tasks)
        g.add_edges_from([(t, c) for t in self.tasks for c in t.children])
        return g

    def get_stage(self, name_or_id):
        if isinstance(name_or_id, int):
            f = lambda s: s.id == name_or_id
        else:
            f = lambda s: s.name == name_or_id

        for stage in self.stages:
            if f(stage):
                return stage

        raise ValueError("Stage with name %s does not exist" % name_or_id)

    @property
    def url(self):
        return url_for("cosmos.workflow", name=self.name)

    def __repr__(self):
        return "<Workflow[%s] %s>" % (self.id or "", self.name)

    def __unicode__(self):
        return self.__repr__()

    def delete(self, delete_files=False):
        """
        :param delete_files: (bool) If True, delete :attr:`output_dir` directory and all contents on the filesystem
        """
        if hasattr(self, "log"):
            self.log.info("Deleting %s, delete_files=%s" %
                          (self, delete_files))
            for h in self.log.handlers:
                h.flush()
                h.close()
                self.log.removeHandler(h)

        if delete_files:
            raise NotImplementedError(
                "This should delete all Task.output_files")

        print("%s Deleting from SQL..." % self, file=sys.stderr)
        self.session.delete(self)
        self.session.commit()
        print("%s Deleted" % self, file=sys.stderr)

    def get_first_failed_task(self, key=lambda t: t.finished_on):
        """
        Return the first failed Task (chronologically).

        If no Task failed, return None.
        """
        for t in sorted([t for t in self.tasks if key(t) is not None],
                        key=key):
            if t.exit_status:
                return t
        return None
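
The stage-renumbering step inside run() is worth isolating: it copies the stage graph, removes one edge from every simple cycle, and then numbers stages in topological order.  Here is a standalone sketch of that logic using networkx, with strings standing in for Stage objects:

import networkx as nx

stage_graph = nx.DiGraph()
stage_graph.add_edges_from([('align', 'call'),
                            ('call', 'align'),   # a deliberate 2-cycle
                            ('call', 'report')])

# copy the graph, then break each cycle by removing one of its edges
acyclic = nx.DiGraph()
acyclic.add_nodes_from(stage_graph.nodes())
acyclic.add_edges_from(stage_graph.edges())
for cycle in nx.simple_cycles(stage_graph):
    acyclic.remove_edge(cycle[-1], cycle[0])

# number stages in a valid execution order, as run() does with Stage.number
numbers = {s: i + 1 for i, s in enumerate(nx.topological_sort(acyclic))}
print(numbers)  # e.g. {'align': 1, 'call': 2, 'report': 3}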