def hold(job, clean_job=False, clean_logs=False, hold_children=True):
    """Put the given job on hold and make sure it is no longer on the cluster.

    Only jobs whose state is listed in ``db.STATES_ACTIVE`` are processed.
    The job state is set to ``db.STATE_HOLD`` and the states of all jobs
    returned by ``get_group_jobs`` are written to the database. A job
    without a pipe source is additionally canceled on the default cluster.

    :param job: the job
    :param clean_job: forwarded as ``cleanup`` to ``set_state``
    :param clean_logs: if True, ``clean`` is called for the job
    :param hold_children: if True, all children of the job are put on hold
                          recursively with the same options
    :returns: True if the job was put on hold, False if the job was not in
              an active state
    """
    # guard: nothing to do for jobs that are not active
    if job.state not in db.STATES_ACTIVE:
        return False

    log.info("Holding job: %s-%d", str(job), job.id)
    set_state(job, db.STATE_HOLD, cleanup=clean_job)
    db.update_job_states(get_group_jobs(job))

    # jobs that are fed through a pipe are not canceled on the cluster
    has_pipe_source = len(job.pipe_from) != 0
    if not has_pipe_source:
        default_cluster = jip.cluster.get()
        default_cluster.cancel(job)

    if clean_logs:
        clean(job)

    if hold_children:
        # propagate the hold to all children using the same options
        for descendant in job.children:
            hold(descendant,
                 clean_job=clean_job,
                 clean_logs=clean_logs,
                 hold_children=hold_children)
    return True
def handle_signal(signum, frame):
    """Signal handler that moves the job into the failed state and exits.

    ``job`` and ``save`` are not parameters of this handler; they are
    resolved from a surrounding scope that is not part of this function.

    :param signum: the number of the received signal (only logged)
    :param frame: the interrupted stack frame (unused)
    """
    log.warn("Signal %s received, going to fail state", signum)

    # The job object may have been detached from its session by the time
    # the signal arrives. In that case a new session is created and the
    # job is added to it before its state is changed.
    from sqlalchemy import inspect
    job_state = inspect(job)
    if job_state.detached:
        # keep the session bound to a local name for the rest of the handler
        session = jip.db.create_session()
        session.add(job)

    set_state(job, jip.db.STATE_FAILED, check_state=save)
    if save:
        # persist the job together with the jobs it pipes to
        affected_jobs = [job] + job.pipe_to
        db.update_job_states(affected_jobs)
    sys.exit(1)
def cancel(job, clean_job=False, clean_logs=False, cluster=None, save=False,
           cancel_children=True):
    """Cancel the given job and make sure it is no longer on the cluster.

    Jobs are processed if their state is in ``db.STATES_ACTIVE`` or if they
    are already in ``db.STATE_CANCELED`` (in which case the cancellation is
    applied again). Jobs in any other state are left untouched. Unless
    disabled, the children of the job are canceled recursively.

    :param job: the job
    :type job: `jip.db.Job`
    :param clean_job: forwarded as ``cleanup`` to ``set_state``
    :param clean_logs: if True, ``clean`` is called for the job
    :param cluster: if no cluster is specified and the job has no pipe
                    source, the default cluster is loaded
    :param save: if True, save job in database after state change
    :param cancel_children: set this to False to disable canceling children
                            of a given job
    :returns: True if job was canceled, False if it was skipped
    """
    if (job.state not in db.STATES_ACTIVE
            and job.state != db.STATE_CANCELED):
        return False

    log.info("Canceling job: %s-%d", str(job), job.id)
    set_state(job, db.STATE_CANCELED, cleanup=clean_job)
    if save:
        # update_job_states is handed a list of jobs, never a bare job
        db.update_job_states([job])

    # cancel the job on the cluster if this is a parent job
    if len(job.pipe_from) == 0:
        cluster = jip.cluster.get() if not cluster else cluster
        cluster.cancel(job)

    if clean_logs:
        clean(job)

    # cancel children, reusing the cluster instance resolved above (if any)
    if cancel_children:
        for child in job.children:
            cancel(child,
                   clean_job=clean_job,
                   clean_logs=clean_logs,
                   cluster=cluster,
                   save=save,
                   cancel_children=cancel_children)
    return True
def cancel(job, clean_job=False, clean_logs=False, cluster=None, save=False,
           cancel_children=True):
    """Cancel the given job and make sure it is no longer on the cluster.

    Jobs are processed if their state is in ``db.STATES_ACTIVE`` or if they
    are already in ``db.STATE_CANCELED`` (in which case the cancellation is
    applied again). Jobs in any other state are left untouched. Unless
    disabled, the children of the job are canceled recursively.

    :param job: the job
    :type job: `jip.db.Job`
    :param clean_job: forwarded as ``cleanup`` to ``set_state``
    :param clean_logs: if True, ``clean`` is called for the job
    :param cluster: if no cluster is specified and the job has no pipe
                    source, the default cluster is loaded
    :param save: if True, save job in database after state change
    :param cancel_children: set this to False to disable canceling children
                            of a given job
    :returns: True if job was canceled, False if it was skipped
    """
    if (job.state not in db.STATES_ACTIVE
            and job.state != db.STATE_CANCELED):
        return False

    log.info("Canceling job: %s-%d", str(job), job.id)
    set_state(job, db.STATE_CANCELED, cleanup=clean_job)
    if save:
        # update_job_states is handed a list of jobs, never a bare job
        db.update_job_states([job])

    # cancel the job on the cluster if this is a parent job
    if len(job.pipe_from) == 0:
        cluster = jip.cluster.get() if not cluster else cluster
        cluster.cancel(job)

    if clean_logs:
        clean(job)

    # cancel children, reusing the cluster instance resolved above (if any)
    if cancel_children:
        for child in job.children:
            cancel(child,
                   clean_job=clean_job,
                   clean_logs=clean_logs,
                   cluster=cluster,
                   save=save,
                   cancel_children=cancel_children)
    return True
def run_job(job, save=False, profiler=False, submit_embedded=False,
            closeDB=False):
    """Execute the given job.

    This method returns immediately in case the job has a pipe source.
    Otherwise the job and all its dispatch jobs are executed.

    NOTE that a signal handler is installed through
    ``_setup_signal_handler`` before anything is executed.

    :param job: the job to run. Note the jobs with pipe sources are ignored
    :type job: `jip.db.Job`
    :param save: if True the jobs state changes are persisted in the
                 database
    :param profiler: if set to True, job profiling is enabled
    :param submit_embedded: if True, embedded pipelines will be submitted
                            and not executed directly
    :param closeDB: kept in the signature for existing callers; the value
                    is not read by this function
    :returns: True if the job was executed successfully, False if a
              dispatcher node or an embedded run reported a failure, and
              ``None`` if the job has a pipe source and was skipped
    :rtype: boolean
    """
    if len(job.pipe_from) > 0:
        return

    # setup signal handling
    _setup_signal_handler(job, save=save)

    # create the dispatcher graph
    dispatcher_nodes = jip.executils.create_dispatcher_graph(job)
    log.info("%s | Dispatch graph: %s", job, dispatcher_nodes)

    # load job environment
    env = job.env
    if env is not None:
        # items() is available on both Python 2 and Python 3 dictionaries
        for k, v in env.items():
            log.info("Loading job environment %s:%s", k, v)
            os.environ[k] = str(v)

    # Issue #37
    # make sure working directories exist at submission time
    if not os.path.exists(job.working_directory):
        os.makedirs(job.working_directory)
    for child in job.pipe_to:
        if not os.path.exists(child.working_directory):
            os.makedirs(child.working_directory)

    # Execute the commands
    for dispatcher_node in dispatcher_nodes:
        dispatcher_node.run(profiler=profiler)

    all_jobs = get_group_jobs(job)
    if save:
        # save the update job state
        db.update_job_states(all_jobs)

    success = True

    # Close the DB connection for the execution of the commands,
    # the job object gets the detached state
    session = jip.db.create_session()
    jip.db.commit_session(session)
    session.close()

    # we collect the state of all jobs in the dispatcher first
    # a single failure will cause ALL nodes/jobs in that dispatcher
    # to be marked as failed
    for dispatcher_node in reversed(dispatcher_nodes):
        success &= dispatcher_node.wait()

    # The commands finished their execution, re-attach the job object
    session = jip.db.create_session()
    session.add(job)

    # get the new state and update all jobs
    new_state = db.STATE_DONE if success else db.STATE_FAILED
    for dispatcher_node in reversed(dispatcher_nodes):
        # Use a dedicated loop variable: re-using the name ``job`` here
        # would rebind the function argument and make the embedded
        # pipeline handling below operate on the last iterated source.
        for source_job in dispatcher_node.sources:
            jip.jobs.set_state(source_job, new_state, update_children=False)

    if save:
        # save the update job state at the end of the run
        db.update_job_states(all_jobs)

    # handle embedded pipelines and callables
    if job.on_success and success:
        for element in job.on_success:
            if isinstance(element, jip.pipelines.Pipeline):
                ## run or submit embedded pipeline
                # Create a base profile for the embedded job
                # that is based on the current jobs profile
                profile = jip.profiles.Profile.from_job(job)
                # glob the inputs
                for n in element.nodes():
                    n._tool.options.glob_inputs()
                # TODO: handle the other parameters (i.e. profile, keep)
                # TODO: catch exception and make the job fail
                jobs = create_jobs(element, profile=profile)
                # add dependency to this job
                for j in jobs:
                    j.dependencies.append(job)
                for exe in create_executions(jobs, save=submit_embedded):
                    if not submit_embedded:
                        success &= run_job(exe.job, save=save)
                    else:
                        submit_job(exe.job)
    return success
def submit_job(job, clean=False, force=False, save=True, cluster=None):
    """Submit the given job to the cluster.

    Jobs in state ``db.STATE_DONE`` are not submitted unless `force` is set
    to True, and jobs that have a pipe source are never submitted by this
    function.

    This will NOT submit the child jobs. You have to submit the children
    yourself and ensure you do that in proper order.

    If a job is in active state at submission time, the job is canceled
    first (without canceling its children) to ensure there is only a
    single instance of the job on the cluster.

    A job without an ID is stored through a session obtained from
    :py:meth:`jip.db.create_session` to get one; this requires `save` to
    be True.

    If no cluster is specified, :py:func:`jip.cluster.get` is used to load
    the default cluster. This will raise a
    ``jip.cluster.ClusterImplementationError`` in case no compute cluster
    is configured.

    :param job: the job to be submitted
    :param clean: if True and the job is not in an active state,
                  ``jip.jobs.clean`` is called for the job before
                  submission
    :param force: force job submission
    :param save: if True, job will be saved to the database
    :param cluster: the compute cluster instance. If ``None``, the default
                    cluster will be loaded from the jip configuration
    :returns: True if the job was submitted
    :raises jip.cluster.ClusterImplementationError: if no cluster could be
                                                    loaded
    :raises Exception: if the job has no ID and `save` is False
    """
    log.info("(Re)submitting %s", job)
    if not force and job.state == db.STATE_DONE:
        return False
    if len(job.pipe_from) != 0:
        return False

    cluster = cluster if cluster else jip.cluster.get()

    # cancel or clean the job
    if job.state in db.STATES_ACTIVE:
        cancel(job, clean_logs=True, cluster=cluster, cancel_children=False)
    elif clean:
        # the ``clean`` argument shadows the module level function, so the
        # function is addressed through its module
        jip.jobs.clean(job, cluster=cluster)

    # set the job state
    set_state(job, db.STATE_QUEUED, update_children=True)

    if job.id is None:
        if not save:
            raise Exception("No ID assigned to your job! You have to enable "
                            "database save with save=True to store the "
                            "job and get an ID.")
        session = db.create_session()
        session.add(job)
        # NOTE(review): the result of commit_session is closed below, so it
        # is presumably expected to return a session -- confirm in jip.db
        session = db.commit_session(session)
        session.close()

    # Issue #12
    # we have to make sure that log file folders exist
    # otherwise job submission might succeed but nothing
    # will be executed and the job fails silently without log files
    for log_file in (job.stdout, job.stderr):
        if not log_file:
            continue
        parent = os.path.dirname(log_file)
        if not parent:
            continue
        if not os.path.exists(parent):
            os.makedirs(parent)

    # Issue #37
    # make sure working directories exist at submission time
    if not os.path.exists(job.working_directory):
        os.makedirs(job.working_directory)
    for child in job.pipe_to:
        if not os.path.exists(child.working_directory):
            os.makedirs(child.working_directory)

    # submit the job
    cluster.submit(job)

    all_jobs = [job]

    # update child ids
    def _set_id(child):
        """Copy the submitted job's job_id to child and its pipe targets."""
        all_jobs.append(child)
        child.job_id = job.job_id
        for c in child.pipe_to:
            _set_id(c)

    # Explicit loop instead of map(): map() is lazy on Python 3 and would
    # never call _set_id when its result is discarded.
    for piped_child in job.pipe_to:
        _set_id(piped_child)

    if save:
        # save updates to job_id and dates for all_jobs
        db.update_job_states(all_jobs)
    return True
def handle_signal(signum, frame):
    """Signal handler that moves the job into the failed state and exits.

    ``job`` and ``save`` are not parameters of this handler; they are
    resolved from a surrounding scope that is not part of this function.

    :param signum: the number of the received signal (only logged)
    :param frame: the interrupted stack frame (unused)
    """
    log.warn("Signal %s received, going to fail state", signum)
    set_state(job, jip.db.STATE_FAILED, check_state=save)
    if save:
        # persist the job together with the jobs it pipes to
        affected_jobs = [job] + job.pipe_to
        db.update_job_states(affected_jobs)
    # terminate the process with exit status 1
    sys.exit(1)
def run_job(job, save=False, profiler=False, submit_embedded=False,
            closeDB=False):
    """Execute the given job.

    This method returns immediately in case the job has a pipe source.
    Otherwise the job and all its dispatch jobs are executed.

    NOTE that a signal handler is installed through
    ``_setup_signal_handler`` before anything is executed.

    :param job: the job to run. Note the jobs with pipe sources are ignored
    :type job: `jip.db.Job`
    :param save: if True the jobs state changes are persisted in the
                 database
    :param profiler: if set to True, job profiling is enabled
    :param submit_embedded: if True, embedded pipelines will be submitted
                            and not executed directly
    :param closeDB: kept in the signature for existing callers; the value
                    is not read by this function
    :returns: True if the job was executed successfully, False if a
              dispatcher node or an embedded run reported a failure, and
              ``None`` if the job has a pipe source and was skipped
    :rtype: boolean
    """
    if len(job.pipe_from) > 0:
        return

    # setup signal handling
    _setup_signal_handler(job, save=save)

    # create the dispatcher graph
    dispatcher_nodes = jip.executils.create_dispatcher_graph(job)
    log.info("%s | Dispatch graph: %s", job, dispatcher_nodes)

    # load job environment
    env = job.env
    if env is not None:
        # items() is available on both Python 2 and Python 3 dictionaries
        for k, v in env.items():
            log.info("Loading job environment %s:%s", k, v)
            os.environ[k] = str(v)

    # Issue #37
    # make sure working directories exist at submission time
    if not os.path.exists(job.working_directory):
        os.makedirs(job.working_directory)
    for child in job.pipe_to:
        if not os.path.exists(child.working_directory):
            os.makedirs(child.working_directory)

    # Execute the commands
    for dispatcher_node in dispatcher_nodes:
        dispatcher_node.run(profiler=profiler)

    all_jobs = get_group_jobs(job)
    if save:
        # save the update job state
        db.update_job_states(all_jobs)

    success = True

    # Close the DB connection for the execution of the commands,
    # the job object gets the detached state
    session = jip.db.create_session()
    jip.db.commit_session(session)
    session.close()

    # we collect the state of all jobs in the dispatcher first
    # a single failure will cause ALL nodes/jobs in that dispatcher
    # to be marked as failed
    for dispatcher_node in reversed(dispatcher_nodes):
        success &= dispatcher_node.wait()

    # The commands finished their execution, re-attach the job object
    session = jip.db.create_session()
    session.add(job)

    # get the new state and update all jobs
    new_state = db.STATE_DONE if success else db.STATE_FAILED
    for dispatcher_node in reversed(dispatcher_nodes):
        # Use a dedicated loop variable: re-using the name ``job`` here
        # would rebind the function argument and make the embedded
        # pipeline handling below operate on the last iterated source.
        for source_job in dispatcher_node.sources:
            jip.jobs.set_state(source_job, new_state, update_children=False)

    if save:
        # save the update job state at the end of the run
        db.update_job_states(all_jobs)

    # handle embedded pipelines and callables
    if job.on_success and success:
        for element in job.on_success:
            if isinstance(element, jip.pipelines.Pipeline):
                ## run or submit embedded pipeline
                # Create a base profile for the embedded job
                # that is based on the current jobs profile
                profile = jip.profiles.Profile.from_job(job)
                # glob the inputs
                for n in element.nodes():
                    n._tool.options.glob_inputs()
                # TODO: handle the other parameters (i.e. profile, keep)
                # TODO: catch exception and make the job fail
                jobs = create_jobs(element, profile=profile)
                # add dependency to this job
                for j in jobs:
                    j.dependencies.append(job)
                for exe in create_executions(jobs, save=submit_embedded):
                    if not submit_embedded:
                        success &= run_job(exe.job, save=save)
                    else:
                        submit_job(exe.job)
    return success