def run(
    self,
    max_cores=None,
    dry=False,
    set_successful=True,
    cmd_wrapper=signature.default_cmd_fxn_wrapper,
    log_out_dir_func=default_task_log_output_dir,
    max_gpus=None,
    do_cleanup_atexit=True,
    lethal_signals=TERMINATION_SIGNALS,
):
    """
    Runs this Workflow's DAG

    :param int max_cores: The maximum number of cores to use at once.  A value of None indicates no maximum.
    :param bool dry: If True, do not actually run any jobs.
    :param bool set_successful: Sets this workflow as successful if all tasks finish without a failure.
        You might set this to False if you intend to add and run more tasks in this workflow later.
    :param callable cmd_wrapper: A decorator which will be applied to every Task's cmd_fxn.
    :param callable log_out_dir_func: A function that returns a Task's logging directory (must be unique).
        It receives one parameter: the Task instance.  By default a Task's log output is stored in
        log/stage_name/task_id.  See _default_task_log_output_dir for more info.
    :param max_gpus: Stored on the workflow as ``self.max_gpus`` (presumably the maximum number of GPUs to
        use at once, by analogy with ``max_cores`` -- confirm against the job manager).  When it is not None
        and the default DRM is "local", the COSMOS_LOCAL_GPU_DEVICES environment variable must be set.
    :param do_cleanup_atexit: if False, do not attempt to cleanup unhandled exits.
    :param lethal_signals: signals to catch and shutdown (forwarded to ``_run``).

    :raises EnvironmentError: if GPUs are requested with the local DRM but COSMOS_LOCAL_GPU_DEVICES is unset.
    :raises ValueError: if two task outputs map to the same file name.
    :raises AssertionError: if one of the pre-flight sanity checks fails.

    Returns True if all tasks in the workflow ran successfully, False otherwise.
    If dry is specified, returns None.
    """
    if cmd_wrapper == signature.default_cmd_fxn_wrapper:
        warnings.warn(
            "Having functions return bash strings as the default behavior is deprecated. While "
            "this behavior will be supported, it is recommended that you set cmd_wrapper to "
            "cosmos.api.py_call which will be the new default. "
            "See examples/ex3.py. "
        )

    try:
        try:
            # Pre-flight sanity checks.  These stay as asserts so callers that
            # catch AssertionError keep working.
            assert os.path.exists(os.getcwd()), "current working dir does not exist! %s" % os.getcwd()
            assert hasattr(
                self, "cosmos_app"
            ), "Workflow was not initialized using the Workflow.start method"
            assert callable(log_out_dir_func), "log_out_dir_func must be a function"
            assert self.session, "Workflow must be part of a sqlalchemy session"

            session = self.session
            self.log.info(
                "Preparing to run %s using DRM `%s`, cwd is `%s`",
                self,
                self.cosmos_app.default_drm,
                os.getcwd(),
            )
            try:
                user = getpass.getuser()
            except Exception:
                # fallback to uid if we can't resolve a user name; a bare
                # ``except:`` here would also swallow KeyboardInterrupt
                user = os.getuid()

            self.log.info("Running as %s@%s, pid %s", user, os.uname()[1], os.getpid())

            self.max_cores = max_cores
            self.max_gpus = max_gpus

            #
            # Run some validation checks
            #

            # check GPU env variables are set correctly
            if self.max_gpus is not None and self.cosmos_app.default_drm == "local":
                if "COSMOS_LOCAL_GPU_DEVICES" not in os.environ:
                    raise EnvironmentError(
                        "COSMOS_LOCAL_GPU_DEVICES environment variable must be set to a "
                        "comma delimited list of gpu devices if using a local DRM to manage "
                        "GPUs"
                    )

            # check for duplicate output files: the first (task, key) that
            # claims a file name owns it, any different claimant is an error
            output_fnames_to_task_and_key = dict()
            for task in self.tasks:
                for key, fname in list(task.output_map.items()):
                    current_value = output_fnames_to_task_and_key.setdefault(fname, (task, key))
                    if current_value != (task, key):
                        task2, key2 = current_value
                        raise ValueError(
                            "Duplicate output files detected!:  "
                            '{task}.params["{key}"] == {task2}.params["{key2}"] == {fname}'.format(
                                task=task, key=key, task2=task2, key2=key2, fname=fname
                            )
                        )

            from ..job.JobManager import JobManager

            if self.jobmanager is None:
                self.jobmanager = JobManager(
                    get_submit_args=self.cosmos_app.get_submit_args,
                    cmd_wrapper=cmd_wrapper,
                    log_out_dir_func=log_out_dir_func,
                    logger=self.log,
                    session=self.session,
                    workflow=self,
                )

            self.status = WorkflowStatus.running
            self.successful = False

            if self.started_on is None:
                self.started_on = datetime.datetime.now()

            task_graph = self.task_graph()
            stage_graph = self.stage_graph()

            assert len(set(self.stages)) == len(self.stages), "duplicate stage name detected: %s" % (
                next(duplicates(self.stages))
            )

            # renumber stages
            stage_graph_no_cycles = nx.DiGraph()
            stage_graph_no_cycles.add_nodes_from(stage_graph.nodes())
            stage_graph_no_cycles.add_edges_from(stage_graph.edges())
            for cycle in nx.simple_cycles(stage_graph):
                # Several simple cycles can share the same closing edge;
                # remove_edge raises if the edge is already gone.
                if stage_graph_no_cycles.has_edge(cycle[-1], cycle[0]):
                    stage_graph_no_cycles.remove_edge(cycle[-1], cycle[0])
            for i, s in enumerate(topological_sort(stage_graph_no_cycles)):
                s.number = i + 1
                if s.status != StageStatus.successful:
                    s.status = StageStatus.no_attempt

            # Make sure everything is in the sqlalchemy session
            session.add(self)
            successful = [t for t in task_graph.nodes() if t.successful]

            # print stages
            for s in sorted(self.stages, key=lambda s: s.number):
                self.log.info("%s %s", s, s.status)

            # Create Task Queue
            task_queue = _copy_graph(task_graph)
            self.log.info("Skipping %s successful tasks...", len(successful))
            task_queue.remove_nodes_from(successful)

            if do_cleanup_atexit:
                handle_exits(self)

            if self.max_cores is not None:
                self.log.info("Ensuring there are enough cores...")
                # make sure we've got enough cores
                for t in task_queue:
                    assert int(t.core_req) <= self.max_cores, (
                        "%s requires more cpus (%s) than `max_cores` (%s)"
                        % (t, t.core_req, self.max_cores)
                    )

            # Run this thing!
            self.log.info("Committing to SQL db...")
            session.commit()
        except KeyboardInterrupt:
            # haven't started submitting yet, just raise the exception
            self.log.fatal("ctrl+c caught")
            self.terminate(due_to_failure=False)
            raise

        if not dry:
            _run(self, session, task_queue, lethal_signals=lethal_signals)

            # set status
            if self.status == WorkflowStatus.failed_but_running:
                self.status = WorkflowStatus.failed
                # set stage status to failed
                for s in self.stages:
                    if s.status == StageStatus.running_but_failed:
                        s.status = StageStatus.failed
                session.commit()
                return False
            elif self.status == WorkflowStatus.running:
                if set_successful:
                    self.status = WorkflowStatus.successful
                session.commit()
                return True
            else:
                self.log.warning('%s exited with status "%s"', self, self.status)
                session.commit()
                return False
        else:
            self.log.info("Workflow dry run is complete")
            return None
    except Exception as ex:
        self.log.fatal("Exception was raised")
        self.log.fatal(ex, exc_info=True)
        self.terminate(due_to_failure=False)
        raise
def run(self, max_cores=None, dry=False, set_successful=True,
        cmd_wrapper=signature.default_cmd_fxn_wrapper,
        log_out_dir_func=default_task_log_output_dir):
    """
    Runs this Workflow's DAG

    :param int max_cores: The maximum number of cores to use at once.  A value of None indicates no maximum.
    :param bool dry: If True, do not actually run any jobs.
    :param bool set_successful: Sets this workflow as successful if all tasks finish without a failure.
        You might set this to False if you intend to add and run more tasks in this workflow later.
    :param callable cmd_wrapper: A decorator which will be applied to every Task's cmd_fxn.
    :param callable log_out_dir_func: A function that returns a Task's logging directory (must be unique).
        It receives one parameter: the Task instance.  By default a Task's log output is stored in
        log/stage_name/task_id.  See _default_task_log_output_dir for more info.

    :raises AssertionError: if one of the pre-flight sanity checks fails.

    Returns True if all tasks in the workflow ran successfully, False otherwise.
    If dry is specified, returns None.
    """
    try:
        # Pre-flight sanity checks.  These stay as asserts so callers that
        # catch AssertionError keep working.
        assert os.path.exists(os.getcwd()), 'current working dir does not exist! %s' % os.getcwd()
        assert hasattr(
            self, 'cosmos_app'
        ), 'Workflow was not initialized using the Workflow.start method'
        assert callable(log_out_dir_func), 'log_out_dir_func must be a function'
        assert self.session, 'Workflow must be part of a sqlalchemy session'

        session = self.session
        self.log.info("Preparing to run %s using DRM `%s`, cwd is `%s`",
                      self, self.cosmos_app.default_drm, os.getcwd())
        try:
            user = getpass.getuser()
        except Exception:
            # fallback to uid if we can't resolve a user name; a bare
            # ``except:`` here would also swallow KeyboardInterrupt
            user = os.getuid()

        self.log.info('Running as %s@%s, pid %s', user, os.uname()[1], os.getpid())

        self.max_cores = max_cores

        from ..job.JobManager import JobManager

        if self.jobmanager is None:
            self.jobmanager = JobManager(
                get_submit_args=self.cosmos_app.get_submit_args,
                cmd_wrapper=cmd_wrapper,
                log_out_dir_func=log_out_dir_func)

        self.status = WorkflowStatus.running
        self.successful = False

        if self.started_on is None:
            self.started_on = datetime.datetime.now()

        task_graph = self.task_graph()
        stage_graph = self.stage_graph()

        assert len(set(self.stages)) == len(self.stages), 'duplicate stage name detected: %s' % (
            next(duplicates(self.stages)))

        # renumber stages
        stage_graph_no_cycles = nx.DiGraph()
        stage_graph_no_cycles.add_nodes_from(stage_graph.nodes())
        stage_graph_no_cycles.add_edges_from(stage_graph.edges())
        for cycle in nx.simple_cycles(stage_graph):
            # Several simple cycles can share the same closing edge;
            # remove_edge raises if the edge is already gone.
            if stage_graph_no_cycles.has_edge(cycle[-1], cycle[0]):
                stage_graph_no_cycles.remove_edge(cycle[-1], cycle[0])
        for i, s in enumerate(topological_sort(stage_graph_no_cycles)):
            s.number = i + 1
            if s.status != StageStatus.successful:
                s.status = StageStatus.no_attempt

        # Make sure everything is in the sqlalchemy session
        session.add(self)
        # Materialize as a list: on Python 3 ``filter`` returns a lazy
        # iterator, which has no len() and can only be consumed once.
        successful = [t for t in task_graph.nodes() if t.successful]

        # print stages
        for s in sorted(self.stages, key=lambda s: s.number):
            self.log.info('%s %s', s, s.status)

        # Create Task Queue
        task_queue = _copy_graph(task_graph)
        self.log.info('Skipping %s successful tasks...', len(successful))
        task_queue.remove_nodes_from(successful)

        handle_exits(self)

        if self.max_cores is not None:
            self.log.info('Ensuring there are enough cores...')
            # make sure we've got enough cores
            for t in task_queue:
                assert int(t.core_req) <= self.max_cores, (
                    '%s requires more cpus (%s) than `max_cores` (%s)'
                    % (t, t.core_req, self.max_cores))

        # Run this thing!
        self.log.info('Committing to SQL db...')
        session.commit()

        if not dry:
            _run(self, session, task_queue)

            # set status
            if self.status == WorkflowStatus.failed_but_running:
                self.status = WorkflowStatus.failed
                # set stage status to failed
                for s in self.stages:
                    if s.status == StageStatus.running_but_failed:
                        s.status = StageStatus.failed
                session.commit()
                return False
            elif self.status == WorkflowStatus.running:
                if set_successful:
                    self.status = WorkflowStatus.successful
                session.commit()
                return True
            else:
                self.log.warning('%s exited with status "%s"', self, self.status)
                session.commit()
                return False
        else:
            self.log.info('Workflow dry run is complete')
            return None
    except Exception as ex:
        # Record the failure with its traceback, then let the caller see it.
        self.log.fatal(ex, exc_info=True)
        raise
def run(self, max_cores=None, dry=False, set_successful=True,
        cmd_wrapper=signature.default_cmd_fxn_wrapper,
        log_out_dir_func=default_task_log_output_dir):
    """
    Runs this Workflow's DAG

    :param int max_cores: The maximum number of cores to use at once.  A value of None indicates no maximum.
    :param bool dry: If True, do not actually run any jobs.
    :param bool set_successful: Sets this workflow as successful if all tasks finish without a failure.
        You might set this to False if you intend to add and run more tasks in this workflow later.
    :param callable cmd_wrapper: A decorator which will be applied to every Task's cmd_fxn.
    :param callable log_out_dir_func: A function that returns a Task's logging directory (must be unique).
        It receives one parameter: the Task instance.  By default a Task's log output is stored in
        log/stage_name/task_id.  See _default_task_log_output_dir for more info.

    :raises AssertionError: if one of the pre-flight sanity checks fails.

    Returns True if all tasks in the workflow ran successfully, False otherwise.
    If dry is specified, returns None.
    """
    try:
        # Pre-flight sanity checks.  Kept as asserts so the exception type
        # seen by existing callers (AssertionError) does not change.
        assert os.path.exists(os.getcwd()), 'current working dir does not exist! %s' % os.getcwd()
        assert hasattr(self, 'cosmos_app'), 'Workflow was not initialized using the Workflow.start method'
        assert callable(log_out_dir_func), 'log_out_dir_func must be a function'
        assert self.session, 'Workflow must be part of a sqlalchemy session'

        session = self.session
        self.log.info("Preparing to run %s using DRM `%s`, cwd is `%s`",
                      self, self.cosmos_app.default_drm, os.getcwd())
        try:
            user = getpass.getuser()
        except Exception:
            # fallback to uid if we can't resolve a user name; catching
            # Exception (not everything) lets ctrl+c still propagate
            user = os.getuid()

        self.log.info('Running as %s@%s, pid %s', user, os.uname()[1], os.getpid())

        self.max_cores = max_cores

        from ..job.JobManager import JobManager

        if self.jobmanager is None:
            self.jobmanager = JobManager(get_submit_args=self.cosmos_app.get_submit_args,
                                         cmd_wrapper=cmd_wrapper,
                                         log_out_dir_func=log_out_dir_func)

        self.status = WorkflowStatus.running
        self.successful = False

        if self.started_on is None:
            self.started_on = datetime.datetime.now()

        task_graph = self.task_graph()
        stage_graph = self.stage_graph()

        assert len(set(self.stages)) == len(self.stages), 'duplicate stage name detected: %s' % (
            next(duplicates(self.stages)))

        # renumber stages
        stage_graph_no_cycles = nx.DiGraph()
        stage_graph_no_cycles.add_nodes_from(stage_graph.nodes())
        stage_graph_no_cycles.add_edges_from(stage_graph.edges())
        for cycle in nx.simple_cycles(stage_graph):
            closing_edge = (cycle[-1], cycle[0])
            # Two simple cycles may close on the same edge; only remove it
            # once, since remove_edge raises when the edge is missing.
            if stage_graph_no_cycles.has_edge(*closing_edge):
                stage_graph_no_cycles.remove_edge(*closing_edge)
        for i, s in enumerate(topological_sort(stage_graph_no_cycles)):
            s.number = i + 1
            if s.status != StageStatus.successful:
                s.status = StageStatus.no_attempt

        # Make sure everything is in the sqlalchemy session
        session.add(self)
        # A real list is required below: ``filter`` is a one-shot iterator
        # without len() on Python 3.
        successful = [t for t in task_graph.nodes() if t.successful]

        # print stages
        for s in sorted(self.stages, key=lambda s: s.number):
            self.log.info('%s %s', s, s.status)

        # Create Task Queue
        task_queue = _copy_graph(task_graph)
        self.log.info('Skipping %s successful tasks...', len(successful))
        task_queue.remove_nodes_from(successful)

        handle_exits(self)

        if self.max_cores is not None:
            self.log.info('Ensuring there are enough cores...')
            # make sure we've got enough cores
            for t in task_queue:
                assert int(t.core_req) <= self.max_cores, \
                    '%s requires more cpus (%s) than `max_cores` (%s)' % (t, t.core_req, self.max_cores)

        # Run this thing!
        self.log.info('Committing to SQL db...')
        session.commit()

        if dry:
            self.log.info('Workflow dry run is complete')
            return None

        _run(self, session, task_queue)

        # set status
        if self.status == WorkflowStatus.failed_but_running:
            self.status = WorkflowStatus.failed
            # set stage status to failed
            for s in self.stages:
                if s.status == StageStatus.running_but_failed:
                    s.status = StageStatus.failed
            session.commit()
            return False

        if self.status == WorkflowStatus.running:
            if set_successful:
                self.status = WorkflowStatus.successful
            session.commit()
            return True

        self.log.warning('%s exited with status "%s"', self, self.status)
        session.commit()
        return False
    except Exception as ex:
        # Record the failure with its traceback, then let the caller see it.
        self.log.fatal(ex, exc_info=True)
        raise