def _submit_jobs(jobs, home_address, temp_dir=DEFAULT_TEMP_DIR, white_list=None, quiet=True):
    """
    Submit each job in *jobs* to the cluster via a fresh DRMAA session.

    :param jobs: jobs to be executed
    :type jobs: list of `Job`
    :param home_address: Full address (including IP and port) of JobMonitor
                         on the submitting host. Running jobs communicate
                         with the parent process at that address via ZMQ.
    :type home_address: str
    :param temp_dir: Local temporary directory for storing output for an
                     individual job.
    :type temp_dir: str
    :param white_list: List of acceptable nodes to use for scheduling the
                       job. If None, all are used.
    :type white_list: list of str
    :param quiet: When true, do not output information about the jobs that
                  have been submitted.
    :type quiet: bool

    :returns: Session ID
    """
    with Session() as drmaa_session:
        for current_job in jobs:
            # Tag every job with its scheduling constraints and with the
            # address of the monitor it must report back to.
            current_job.white_list = white_list
            current_job.home_address = home_address
            # Hand the job over to the session for submission.
            _append_job_to_session(drmaa_session, current_job,
                                   temp_dir=temp_dir, quiet=quiet)
        return drmaa_session.contact
def __exit__(self, exc_type, exc_value, exc_tb):
    '''
    Clean up when leaving the ``with`` block: close the socket and, if we
    are exiting because of an exception, terminate every job in the
    session.
    '''
    # The socket is closed unconditionally, even when there is no session.
    self.socket.close()

    # Nothing else to clean up without a valid session.
    if self.session_id is None:
        return

    with Session(self.session_id) as session:
        if exc_type is not None:
            self.logger.info('Encountered %s, so killing all jobs.',
                             exc_type.__name__)
            # Best effort: some jobs may already have finished or died.
            try:
                session.control(JOB_IDS_SESSION_ALL,
                                JobControlAction.TERMINATE)
            except InvalidJobException:
                self.logger.debug("Could not kill all jobs for session.",
                                  exc_info=True)

        # Dispose of job info so the DRM system does not leak memory.
        try:
            session.synchronize([JOB_IDS_SESSION_ALL], TIMEOUT_NO_WAIT,
                                dispose=True)
        except ExitTimeoutException:
            pass
def test_with_session():
    """'with' statement works with Session"""
    with Session() as session:
        # Print the same four session attributes the original smoke test
        # exercised, in the same order.
        for attribute in ('version', 'contact', 'drmsInfo',
                          'drmaaImplementation'):
            print(getattr(session, attribute))
def test_with_jt():
    """'with' statement works with JobTemplate"""
    session = Session()
    session.initialize()
    # The template is disposed automatically when the 'with' block exits.
    with session.createJobTemplate() as template:
        template.remoteCommand = 'sleep'
        template.args = ['10']
        job_id = session.runJob(template)
        print(session.wait(job_id))
    session.exit()
def _resubmit(session_id, job, temp_dir):
    """
    Resubmit a failed job.

    :returns: ID of new job
    """
    log = logging.getLogger(__name__)
    log.info("starting resubmission process")

    # Without DRMAA we are running locally and cannot resubmit anything.
    if not DRMAA_PRESENT:
        log.error("Could not restart job because we're in local mode.")
        return

    # Append the replacement to the existing session.
    with Session(session_id) as session:
        # The old job may linger as a zombie; try to terminate it first,
        # but carry on with resubmission even if the kill fails.
        try:
            session.control(job.id, JobControlAction.TERMINATE)
            log.info("zombie job killed")
        except Exception:
            log.error("Could not kill job with SGE id %s", job.id,
                      exc_info=True)
        # Create the new job in the same session.
        _append_job_to_session(session, job, temp_dir=temp_dir)
        return "sge"
    # XXX: should probably change to GE
    elif "Platform LSF" in drms_info:
        # includes "IBM Platform LSF"
        return "lsf"
    elif drms_info.startswith("SLURM"):
        return "slurm"
    # not sure what PBS and PBS Pro return here.
    elif drms_info.startswith("Torque"):
        return "pbs"
    else:
        msg = ("unsupported distributed resource management system: %s"
               % drms_info)
        raise ValueError(msg)


# non-reentrant code: opening a throwaway session at import time to detect
# the DRM system and load the matching driver module.
with Session() as _session:
    driver_name = get_driver_name(_session)
    # import the sibling driver module named after the detected DRM system
    driver = __import__(driver_name, globals(), locals(), [driver_name], 1)
    # re-export the driver's interface at module level
    JobTemplateFactory = driver.JobTemplateFactory
    make_native_spec = driver.make_native_spec
    get_job_max_query_lifetime = driver.get_job_max_query_lifetime


class RestartableJob(object):
    def __init__(self, session, job_tmpl_factory, global_mem_usage,
                 mem_usage_key):
        # DRMAA session used to (re)submit this job
        self.session = session
        # factory producing job templates for each submission attempt
        self.job_tmpl_factory = job_tmpl_factory

        # last trial index tried
def setUp(self):
    """Create a fresh Session fixture for each test."""
    self.s = Session()