Example #1
    def run(
        self,
        max_cores=None,
        dry=False,
        set_successful=True,
        cmd_wrapper=signature.default_cmd_fxn_wrapper,
        log_out_dir_func=default_task_log_output_dir,
        max_gpus=None,
        do_cleanup_atexit=True,
        lethal_signals=TERMINATION_SIGNALS,
    ):
        """
        Runs this Workflow's DAG

        :param int max_cores: The maximum number of cores to use at once.  A value of None indicates no maximum.
        :param int max_attempts: The maximum number of times to retry a failed job.
             Can be overridden on a per-Task basis with Workflow.add_task(..., max_attempts=N, ...)
        :param callable log_out_dir_func: A function that returns a Task's logging directory (must be unique).
             It receives one parameter: the Task instance.
             By default a Task's log output is stored in log/stage_name/task_id.
             See _default_task_log_output_dir for more info.
        :param callable cmd_wrapper: A decorator which will be applied to every Task's cmd_fxn.
        :param bool dry: If True, do not actually run any jobs.
        :param bool set_successful: Sets this workflow as successful if all tasks finish without a failure.
            You might set this to False if you intend to add and
            run more tasks in this workflow later.
        :param bool do_cleanup_atexit: If False, do not register handlers to clean up after unhandled exits.
        :param lethal_signals: Signals to catch and shut down on.

        Returns True if all tasks in the workflow ran successfully, False otherwise.
        If dry is specified, returns None.
        """

        if cmd_wrapper == signature.default_cmd_fxn_wrapper:
            warnings.warn(
                f"Having functions return bash strings as the default behavior is deprecated.  While "
                f"this behavior will be supported, it is recommended that you set cmd_wrapper to "
                f"cosmos.api.py_call which will be the new default."
                f"See examples/ex3.py. ")

        try:
            try:
                assert os.path.exists(os.getcwd()), (
                    "current working dir does not exist! %s" % os.getcwd())

                assert hasattr(
                    self, "cosmos_app"
                ), "Workflow was not initialized using the Workflow.start method"
                assert hasattr(
                    log_out_dir_func,
                    "__call__"), "log_out_dir_func must be a function"
                assert self.session, "Workflow must be part of a sqlalchemy session"

                session = self.session
                self.log.info(
                    "Preparing to run %s using DRM `%s`, cwd is `%s`",
                    self,
                    self.cosmos_app.default_drm,
                    os.getcwd(),
                )
                try:
                    user = getpass.getuser()
                except Exception:
                    # fall back to uid if we can't resolve a user name
                    user = os.getuid()

                self.log.info("Running as %s@%s, pid %s", user,
                              os.uname()[1], os.getpid())

                self.max_cores = max_cores
                self.max_gpus = max_gpus
                #
                # Run some validation checks
                #

                # check GPU env variables are set correctly
                if self.max_gpus is not None and self.cosmos_app.default_drm == "local":
                    if "COSMOS_LOCAL_GPU_DEVICES" not in os.environ:
                        raise EnvironmentError(
                            "COSMOS_LOCAL_GPU_DEVICES environment variable must be set to a "
                            "comma delimited list of gpu devices if using a local DRM to manage "
                            "GPUs")

                # check for duplicate output files
                output_fnames_to_task_and_key = dict()
                for task in self.tasks:
                    for key, fname in list(task.output_map.items()):
                        current_value = output_fnames_to_task_and_key.setdefault(
                            fname, (task, key))
                        if current_value != (task, key):
                            task2, key2 = current_value
                            raise ValueError(
                                "Duplicate output files detected!:  "
                                '{task}.params["{key}"] == {task2}.params["{key2}"] == {fname}'
                                .format(**locals()))
                        output_fnames_to_task_and_key[fname] = (task, key)

                from ..job.JobManager import JobManager

                if self.jobmanager is None:
                    self.jobmanager = JobManager(
                        get_submit_args=self.cosmos_app.get_submit_args,
                        cmd_wrapper=cmd_wrapper,
                        log_out_dir_func=log_out_dir_func,
                        logger=self.log,
                        session=self.session,
                        workflow=self,
                    )

                self.status = WorkflowStatus.running
                self.successful = False

                if self.started_on is None:
                    self.started_on = datetime.datetime.now()

                task_graph = self.task_graph()
                stage_graph = self.stage_graph()

                assert len(set(self.stages)) == len(
                    self.stages), "duplicate stage name detected: %s" % (next(
                        duplicates(self.stages)))

                # renumber stages
                stage_graph_no_cycles = nx.DiGraph()
                stage_graph_no_cycles.add_nodes_from(stage_graph.nodes())
                stage_graph_no_cycles.add_edges_from(stage_graph.edges())
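                # topological_sort requires a DAG, so drop one edge from each
                # cycle found in the stage graph before numbering the stages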
                for cycle in nx.simple_cycles(stage_graph):
                    stage_graph_no_cycles.remove_edge(cycle[-1], cycle[0])
                for i, s in enumerate(topological_sort(stage_graph_no_cycles)):
                    s.number = i + 1
                    if s.status != StageStatus.successful:
                        s.status = StageStatus.no_attempt

                # Make sure everything is in the sqlalchemy session
                session.add(self)
                successful = [t for t in task_graph.nodes() if t.successful]

                # print stages
                for s in sorted(self.stages, key=lambda s: s.number):
                    self.log.info("%s %s" % (s, s.status))

                # Create Task Queue
                task_queue = _copy_graph(task_graph)
                self.log.info("Skipping %s successful tasks..." %
                              len(successful))
                task_queue.remove_nodes_from(successful)

                if do_cleanup_atexit:
                    handle_exits(self)

                if self.max_cores is not None:
                    self.log.info("Ensuring there are enough cores...")
                    # make sure we've got enough cores
                    for t in task_queue:
                        assert int(t.core_req) <= self.max_cores, (
                            "%s requires more cpus (%s) than `max_cores` (%s)"
                            % (
                                t,
                                t.core_req,
                                self.max_cores,
                            ))

                # Run this thing!
                self.log.info("Committing to SQL db...")
                session.commit()
            except KeyboardInterrupt:
                # haven't started submitting yet, just raise the exception
                self.log.fatal("ctrl+c caught")
                self.terminate(due_to_failure=False)
                raise

            if not dry:
                _run(self, session, task_queue, lethal_signals=lethal_signals)

                # set status
                if self.status == WorkflowStatus.failed_but_running:
                    self.status = WorkflowStatus.failed
                    # set stage status to failed
                    for s in self.stages:
                        if s.status == StageStatus.running_but_failed:
                            s.status = StageStatus.failed
                    session.commit()
                    return False
                elif self.status == WorkflowStatus.running:
                    if set_successful:
                        self.status = WorkflowStatus.successful
                    session.commit()
                    return True
                else:
                    self.log.warning('%s exited with status "%s"', self,
                                     self.status)
                    session.commit()
                    return False
            else:
                self.log.info("Workflow dry run is complete")
                return None
        except Exception as ex:
            self.log.fatal("Exception was raised")
            self.log.fatal(ex, exc_info=True)
            self.terminate(due_to_failure=False)
            raise
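
The deprecation warning above recommends cosmos.api.py_call as the cmd_wrapper. A minimal usage sketch, assuming the standard Cosmos setup shown in the library's examples (such as the examples/ex3.py referenced in the warning); the database URL, workflow name, and say() task are placeholders:

    # Sketch only: run a tiny workflow with the recommended py_call wrapper.
    # The setup calls follow cosmos.api's documented examples; the DB URL,
    # workflow name, and task function are placeholders.
    from cosmos.api import Cosmos, py_call

    def say(text):
        # a plain Python function; py_call executes it directly
        print(text)

    cosmos = Cosmos("sqlite:///cosmos.sqlite", default_drm="local")
    cosmos.initdb()
    workflow = cosmos.start("example_workflow", skip_confirm=True)
    workflow.add_task(func=say, params=dict(text="hello"), uid="say_hello")

    # run() returns True on success, False on failure, and None for a dry run
    success = workflow.run(cmd_wrapper=py_call, max_cores=1)
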
Example #2
    def run(self,
            max_cores=None,
            dry=False,
            set_successful=True,
            cmd_wrapper=signature.default_cmd_fxn_wrapper,
            log_out_dir_func=default_task_log_output_dir):
        """
        Runs this Workflow's DAG

        :param int max_cores: The maximum number of cores to use at once.  A value of None indicates no maximum.
        :param int max_attempts: The maximum number of times to retry a failed job.
             Can be overridden on a per-Task basis with Workflow.add_task(..., max_attempts=N, ...)
        :param callable log_out_dir_func: A function that returns a Task's logging directory (must be unique).
             It receives one parameter: the Task instance.
             By default a Task's log output is stored in log/stage_name/task_id.
             See _default_task_log_output_dir for more info.
        :param callable cmd_wrapper: A decorator which will be applied to every Task's cmd_fxn.
        :param bool dry: If True, do not actually run any jobs.
        :param bool set_successful: Sets this workflow as successful if all tasks finish without a failure.  You might set this to False if you intend to add and
            run more tasks in this workflow later.

        Returns True if all tasks in the workflow ran successfully, False otherwise.
        If dry is specified, returns None.
        """
        try:
            assert os.path.exists(os.getcwd()), (
                'current working dir does not exist! %s' % os.getcwd())

            assert hasattr(
                self, 'cosmos_app'
            ), 'Workflow was not initialized using the Workflow.start method'
            assert hasattr(log_out_dir_func,
                           '__call__'), 'log_out_dir_func must be a function'
            assert self.session, 'Workflow must be part of a sqlalchemy session'

            session = self.session
            self.log.info("Preparing to run %s using DRM `%s`, cwd is `%s`",
                          self, self.cosmos_app.default_drm, os.getcwd())
            try:
                user = getpass.getuser()
            except Exception:
                # fall back to uid if we can't resolve a user name
                user = os.getuid()

            self.log.info('Running as %s@%s, pid %s', user,
                          os.uname()[1], os.getpid())

            self.max_cores = max_cores

            from ..job.JobManager import JobManager

            if self.jobmanager is None:
                self.jobmanager = JobManager(
                    get_submit_args=self.cosmos_app.get_submit_args,
                    cmd_wrapper=cmd_wrapper,
                    log_out_dir_func=log_out_dir_func)

            self.status = WorkflowStatus.running
            self.successful = False

            if self.started_on is None:
                self.started_on = datetime.datetime.now()

            task_graph = self.task_graph()
            stage_graph = self.stage_graph()

            assert len(set(self.stages)) == len(
                self.stages), 'duplicate stage name detected: %s' % (next(
                    duplicates(self.stages)))

            # renumber stages
            stage_graph_no_cycles = nx.DiGraph()
            stage_graph_no_cycles.add_nodes_from(stage_graph.nodes())
            stage_graph_no_cycles.add_edges_from(stage_graph.edges())
            for cycle in nx.simple_cycles(stage_graph):
                stage_graph_no_cycles.remove_edge(cycle[-1], cycle[0])
            for i, s in enumerate(topological_sort(stage_graph_no_cycles)):
                s.number = i + 1
                if s.status != StageStatus.successful:
                    s.status = StageStatus.no_attempt

            # Make sure everything is in the sqlalchemy session
            session.add(self)
            successful = [t for t in task_graph.nodes() if t.successful]

            # print stages
            for s in sorted(self.stages, key=lambda s: s.number):
                self.log.info('%s %s' % (s, s.status))

            # Create Task Queue
            task_queue = _copy_graph(task_graph)
            self.log.info('Skipping %s successful tasks...' % len(successful))
            task_queue.remove_nodes_from(successful)

            handle_exits(self)

            if self.max_cores is not None:
                self.log.info('Ensuring there are enough cores...')
                # make sure we've got enough cores
                for t in task_queue:
                    assert int(t.core_req) <= self.max_cores, (
                        '%s requires more cpus (%s) than `max_cores` (%s)' %
                        (t, t.core_req, self.max_cores))

            # Run this thing!
            self.log.info('Committing to SQL db...')
            session.commit()
            if not dry:
                _run(self, session, task_queue)

                # set status
                if self.status == WorkflowStatus.failed_but_running:
                    self.status = WorkflowStatus.failed
                    # set stage status to failed
                    for s in self.stages:
                        if s.status == StageStatus.running_but_failed:
                            s.status = StageStatus.failed
                    session.commit()
                    return False
                elif self.status == WorkflowStatus.running:
                    if set_successful:
                        self.status = WorkflowStatus.successful
                    session.commit()
                    return True
                else:
                    self.log.warning('%s exited with status "%s"', self,
                                     self.status)
                    session.commit()
                    return False
            else:
                self.log.info('Workflow dry run is complete')
                return None
        except Exception as ex:
            self.log.fatal(ex, exc_info=True)
            raise
Example #3
    def run(self, max_cores=None, dry=False, set_successful=True,
            cmd_wrapper=signature.default_cmd_fxn_wrapper,
            log_out_dir_func=default_task_log_output_dir):
        """
        Runs this Workflow's DAG

        :param int max_cores: The maximum number of cores to use at once.  A value of None indicates no maximum.
        :param int max_attempts: The maximum number of times to retry a failed job.
             Can be overridden on a per-Task basis with Workflow.add_task(..., max_attempts=N, ...)
        :param callable log_out_dir_func: A function that returns a Task's logging directory (must be unique).
             It receives one parameter: the Task instance.
             By default a Task's log output is stored in log/stage_name/task_id.
             See _default_task_log_output_dir for more info.
        :param callable cmd_wrapper: A decorator which will be applied to every Task's cmd_fxn.
        :param bool dry: If True, do not actually run any jobs.
        :param bool set_successful: Sets this workflow as successful if all tasks finish without a failure.  You might set this to False if you intend to add and
            run more tasks in this workflow later.

        Returns True if all tasks in the workflow ran successfully, False otherwise.
        If dry is specified, returns None.
        """
        try:
            assert os.path.exists(os.getcwd()), 'current working dir does not exist! %s' % os.getcwd()

            assert hasattr(self, 'cosmos_app'), 'Workflow was not initialized using the Workflow.start method'
            assert hasattr(log_out_dir_func, '__call__'), 'log_out_dir_func must be a function'
            assert self.session, 'Workflow must be part of a sqlalchemy session'

            session = self.session
            self.log.info("Preparing to run %s using DRM `%s`, cwd is `%s`",
                self, self.cosmos_app.default_drm, os.getcwd())
            try:
                user = getpass.getuser()
            except Exception:
                # fall back to uid if we can't resolve a user name
                user = os.getuid()

            self.log.info('Running as %s@%s, pid %s',
                          user, os.uname()[1], os.getpid())

            self.max_cores = max_cores

            from ..job.JobManager import JobManager

            if self.jobmanager is None:
                self.jobmanager = JobManager(get_submit_args=self.cosmos_app.get_submit_args,
                                             cmd_wrapper=cmd_wrapper,
                                             log_out_dir_func=log_out_dir_func)

            self.status = WorkflowStatus.running
            self.successful = False

            if self.started_on is None:
                self.started_on = datetime.datetime.now()

            task_graph = self.task_graph()
            stage_graph = self.stage_graph()

            assert len(set(self.stages)) == len(self.stages), 'duplicate stage name detected: %s' % (
                next(duplicates(self.stages)))

            # renumber stages
            stage_graph_no_cycles = nx.DiGraph()
            stage_graph_no_cycles.add_nodes_from(stage_graph.nodes())
            stage_graph_no_cycles.add_edges_from(stage_graph.edges())
            for cycle in nx.simple_cycles(stage_graph):
                stage_graph_no_cycles.remove_edge(cycle[-1], cycle[0])
            for i, s in enumerate(topological_sort(stage_graph_no_cycles)):
                s.number = i + 1
                if s.status != StageStatus.successful:
                    s.status = StageStatus.no_attempt

            # Make sure everything is in the sqlalchemy session
            session.add(self)
            successful = [t for t in task_graph.nodes() if t.successful]

            # print stages
            for s in sorted(self.stages, key=lambda s: s.number):
                self.log.info('%s %s' % (s, s.status))

            # Create Task Queue
            task_queue = _copy_graph(task_graph)
            self.log.info('Skipping %s successful tasks...' % len(successful))
            task_queue.remove_nodes_from(successful)

            handle_exits(self)

            if self.max_cores is not None:
                self.log.info('Ensuring there are enough cores...')
                # make sure we've got enough cores
                for t in task_queue:
                    assert int(t.core_req) <= self.max_cores, '%s requires more cpus (%s) than `max_cores` (%s)' % (t, t.core_req, self.max_cores)

            # Run this thing!
            self.log.info('Committing to SQL db...')
            session.commit()
            if not dry:
                _run(self, session, task_queue)

                # set status
                if self.status == WorkflowStatus.failed_but_running:
                    self.status = WorkflowStatus.failed
                    # set stage status to failed
                    for s in self.stages:
                        if s.status == StageStatus.running_but_failed:
                            s.status = StageStatus.failed
                    session.commit()
                    return False
                elif self.status == WorkflowStatus.running:
                    if set_successful:
                        self.status = WorkflowStatus.successful
                    session.commit()
                    return True
                else:
                    self.log.warning('%s exited with status "%s"', self, self.status)
                    session.commit()
                    return False
            else:
                self.log.info('Workflow dry run is complete')
                return None
        except Exception as ex:
            self.log.fatal(ex, exc_info=True)
            raise
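
In all three versions, run() returns True when every task succeeds, False when any task fails, and None for a dry run. A short sketch of how a caller might map that onto a process exit code (the workflow object is assumed to be already set up as above):

    import sys

    # `workflow` is assumed to be a started Workflow instance (see above)
    result = workflow.run(set_successful=True, dry=False)

    if result is None:
        print("dry run: nothing was executed")
    else:
        # exit 0 if every task succeeded, 1 otherwise
        sys.exit(0 if result else 1)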