class JobSLURM(object): """Makes launching SLURM jobs easy to write and easy to use. Here are some examples on how to use this class: for command in ['print "hi"', 'print "hello"']: job = JobSLURM(command, time='00:01:00', qos='short') job.run() for path in ['~/data/scafolds1.txt', '~/data/scafolds2.txt', '~/data/scafolds3.txt']: command = ['import sh\n'] command += ['script = sh.Command("analyze.py")\n'] command += ['script(%s)' % path] job = JobSLURM(command, time='00:01:00', qos='short', job_name=path[-25:]) job.run() print "Job %i is running !" % job.id Then you can easily check the status of your job and the standard out that is associated: print job.status print job.log_tail print job.info['time_left'] etc. """ extensions = {'bash': "sh", 'python': "py"} shebang_headers = { 'bash': ["#!/bin/bash -le"], # As a login shell and stop on error 'python': ["#!/usr/bin/env python"] } slurm_headers = OrderedDict(( ('job_name', { 'tag': '#SBATCH -J %s', 'needed': True }), ('change_dir', { 'tag': '#SBATCH -D %s', 'needed': True, 'default': os.path.abspath(os.getcwd()) }), ('out_file', { 'tag': '#SBATCH -o %s', 'needed': True, 'default': '/dev/null' }), ('project', { 'tag': '#SBATCH -A %s', 'needed': False, 'default': 'b2011035' }), ('time', { 'tag': '#SBATCH -t %s', 'needed': True, 'default': '7-00:00:00' }), ('machines', { 'tag': '#SBATCH -N %s', 'needed': True, 'default': '1' }), ('cores', { 'tag': '#SBATCH -n %s', 'needed': True, 'default': num_processors }), ('partition', { 'tag': '#SBATCH -p %s', 'needed': True, 'default': 'node' }), ('email', { 'tag': '#SBATCH --mail-user %s', 'needed': False, 'default': os.environ.get('EMAIL') }), ('email-when', { 'tag': '#SBATCH --mail-type=%s', 'needed': True, 'default': 'END' }), ('qos', { 'tag': '#SBATCH --qos=%s', 'needed': False, 'default': 'short' }), ('dependency', { 'tag': '#SBATCH -d %s', 'needed': False, 'default': 'afterok:1' }), ('constraint', { 'tag': '#SBATCH -C %s', 'needed': False, 'default': 'fat' }), ('cluster', { 'tag': '#SBATCH -M %s', 'needed': False, 'default': 'milou' }), ('alloc', { 'tag': '#SBATCH --reservation=%s', 'needed': False, 'default': 'workstation' }), ('jobid', { 'tag': '#SBATCH --jobid=%i', 'needed': False, 'default': 2173455 }), ('memory', { 'tag': '#SBATCH --mem=%i', 'needed': False, 'default': 120000 }), ('mem_per_cpu', { 'tag': '#SBATCH --mem-per-cpu=%i', 'needed': False, 'default': 512 }), ('threads', { 'tag': '#SBATCH --cpus-per-task=%i', 'needed': False, 'default': num_processors }), )) script_headers = { 'bash': ['echo "SLURM: start at $(date) on $(hostname)"'], 'python': [ 'import dateutil.tz, datetime, platform', 'now = datetime.datetime.now(dateutil.tz.tzlocal())', r'now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")', 'node = platform.node()', 'print "SLURM: start at {0} on {1}".format(now, node)' ] } script_footers = { 'bash': ['echo "SLURM: end at $(date)"'], 'python': [ 'now = datetime.datetime.now(dateutil.tz.tzlocal())', r'now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")', 'print "SLURM: end at {0}".format(now)' ] } def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.name) @property def name(self): return self.kwargs['job_name'] def __init__(self, command=["print 'Hello world'"], language='python', base_dir=None, script_path=None, **kwargs): # Required attributes # self.command = command self.language = language self.kwargs = kwargs # Set the file paths # self.set_paths(base_dir, script_path) # Check command type # if not isinstance(self.command, list): self.command = [self.command] # Get the name # if 'job_name' not in self.kwargs: hashed = hashlib.md5(''.join(self.command)).digest() encoded = base64.urlsafe_b64encode(hashed) self.kwargs['job_name'] = encoded # Check we have a project otherwise choose the one with less hours # if hostname.startswith('milou'): if 'project' not in self.kwargs and 'SBATCH_ACCOUNT' not in os.environ: if projects: self.kwargs['project'] = projects[0]['name'] def set_paths(self, base_dir, script_path): """Set the directory, the script path and the outfile path""" # Make absolute paths # if 'change_dir' in self.kwargs: self.kwargs['change_dir'] = DirectoryPath( os.path.abspath(self.kwargs['change_dir'])) if 'out_file' in self.kwargs: self.kwargs['out_file'] = FilePath( os.path.abspath(self.kwargs['out_file'])) # In case there is a base directory # if base_dir is not None: self.base_dir = DirectoryPath(os.path.abspath(base_dir)) self.script_path = FilePath(base_dir + "run." + self.extensions[self.language]) self.kwargs['change_dir'] = base_dir self.kwargs['out_file'] = FilePath(base_dir + "run.out") # Other cases # if base_dir is None and script_path is None: self.script_path = FilePath(new_temp_path()) if script_path is not None: self.script_path = FilePath(os.path.abspath(script_path)) @property_cached def slurm_params(self): """The list of parameters to give to the `sbatch` command.""" # Main loop # result = OrderedDict() for param, info in self.slurm_headers.items(): if not info['needed'] and not param in self.kwargs: continue if param in self.kwargs: result[param] = self.kwargs.get(param) else: result[param] = info['default'] # Special cases # if result.get('cluster') == 'halvan': result['partition'] = 'halvan' # Return # return result @property def script(self): """The script to be submitted to the SLURM queue.""" self.shebang_header = self.shebang_headers[self.language] self.slurm_header = [ self.slurm_headers[k]['tag'] % v for k, v in self.slurm_params.items() ] self.script_header = self.script_headers[self.language] self.script_footer = self.script_footers[self.language] return '\n'.join( flatter([ self.shebang_header, self.slurm_header, self.script_header, self.command, self.script_footer ])) def make_script(self): """Make the script and return a FilePath object pointing to the script above.""" self.script_path.write(self.script) self.script_path.permissions.make_executable() return self.script_path @property def log(self): """The log as a FilePath object""" return self.slurm_params['out_file'] @property def log_tail(self): """If we have a log file, what is its tail""" if not self.kwargs['out_file'].exists: return False else: return tail(self.slurm_params['out_file']) @property def status(self): """What is the status of the job ?""" # If there is no script it is either ready or a lost duplicate # if not self.script_path.exists: if self.name in jobs.names: return "DUPLICATE" if self.name not in jobs.names: return "READY" # It is submitted already # if self.name in jobs.names: if jobs[self.name]['type'] == 'queued': return "QUEUED" if jobs[self.name]['type'] == 'running': return "RUNNING" # So the script exists for sure but it is not in the queue # if not self.kwargs['out_file'].exists: return "ABORTED" # Let's look in log file # if 'CANCELED' in self.log_tail: return "CANCELLED" if 'slurmstepd: error' in self.log_tail: return "CANCELLED" # It all looks good # if 'SLURM: end at' in self.log_tail: return "FINISHED" # At this point we have no idea # return "INTERUPTED" @property def info(self): """Get the existing job information dictionary""" if self.name not in jobs: return {'status': self.status} else: return jobs[self.name] #-------------------------------------------------------------------------# def run(self): """Will call self.launch() after performing some checks""" # Check already exists # if self.status == "READY": return self.launch() # Check name conflict # if self.status == "DUPLICATE": message = "Job with same name '%s' already in queue, but we lost the script." if self.status == "QUEUED": message = "Job '%s' already in queue." if self.status == "RUNNING": message = "Job '%s' already running." if self.status == "FINISHED": message = "Job '%s' already ended successfully." if self.status == "ABORTED": message = "Job '%s' was killed without any output file (?)." if self.status == "CANCELED": message = "Job '%s' was canceled or killed while running." if self.status == "INTERUPTED": message = "Job '%s' is not running. We don't know why. Look at the log file." print Color.i_red + message % (self.name, ) + Color.end print "Job might have run already (?). Not starting." def launch(self): """Make the script file and return the newly created job id""" # Make script file # self.make_script() # Do it # sbatch_out = sh.sbatch(self.script_path) jobs.expire() # Message # print Color.i_blu + "SLURM:" + Color.end + " " + str(sbatch_out), # Return id # self.id = int( re.findall("Submitted batch job ([0-9]+)", str(sbatch_out))[0]) return self.id def cancel(self): if self.status != "QUEUED" and self.status != "RUNNING": raise Exception("Can't cancel job '%s'" % self.name) sh.scancel(self.info['jobid']) def wait(self): """Wait until the job is finished""" pass #-------------------------------------------------------------------------# def run_locally(self): """A convenience method to run the same result as a SLURM job but locally in a non-blocking way. Useful for testing.""" self.thread = threading.Thread(target=self.execute_locally) self.thread.daemon = True # So that they die when we die self.thread.start() def execute_locally(self): """Runs the equivalent command locally in a blocking way.""" # Make script file # self.make_script() # Do it # with open(self.kwargs['out_file'], 'w') as handle: sh.python(self.script_path, _out=handle, _err=handle) def wait_locally(self): """If you have run the query in a non-blocking way, call this method to pause until the query is finished.""" try: self.thread.join( sys.maxint) # maxint timeout so that we can Ctrl-C them except KeyboardInterrupt: print "Stopped waiting on job '%s'" % self.kwargs['job_name']
class JobSLURM(object): """Makes launching SLURM jobs easy to write and easy to use. Here are some examples on how to use this class: for command in ['print "hi"', 'print "hello"']: job = JobSLURM(command, time='00:01:00', qos='short') job.run() for path in ['~/data/scafolds1.txt', '~/data/scafolds2.txt', '~/data/scafolds3.txt']: command = ['import sh\n'] command += ['script = sh.Command("analyze.py")\n'] command += ['script(%s)' % path] job = JobSLURM(command, time='00:01:00', qos='short', job_name=path[-25:]) job.run() print "Job %i is running !" % job.id Then you can easily check the status of your job and the standard out that is associated: print job.status print job.log_tail print job.info['time_left'] etc. """ extensions = { 'bash': "sh", 'python': "py" } shebang_headers = { 'bash': ["#!/bin/bash -le"], # As a login shell and stop on error 'python': ["#!/usr/bin/env python"] } slurm_headers = OrderedDict(( ('job_name' , {'tag': '#SBATCH -J %s', 'needed': True}), ('change_dir' , {'tag': '#SBATCH -D %s', 'needed': True, 'default': os.path.abspath(os.getcwd())}), ('out_file' , {'tag': '#SBATCH -o %s', 'needed': True, 'default': '/dev/null'}), ('project' , {'tag': '#SBATCH -A %s', 'needed': False, 'default': 'b2011035'}), ('time' , {'tag': '#SBATCH -t %s', 'needed': True, 'default': '7-00:00:00'}), ('machines' , {'tag': '#SBATCH -N %s', 'needed': True, 'default': '1'}), ('cores' , {'tag': '#SBATCH -n %s', 'needed': True, 'default': num_processors}), ('partition' , {'tag': '#SBATCH -p %s', 'needed': True, 'default': 'node'}), ('email' , {'tag': '#SBATCH --mail-user %s', 'needed': False, 'default': os.environ.get('EMAIL')}), ('email-when' , {'tag': '#SBATCH --mail-type=%s', 'needed': True, 'default': 'END'}), ('qos' , {'tag': '#SBATCH --qos=%s', 'needed': False, 'default': 'short'}), ('dependency' , {'tag': '#SBATCH -d %s', 'needed': False, 'default': 'afterok:1'}), ('constraint' , {'tag': '#SBATCH -C %s', 'needed': False, 'default': 'fat'}), ('cluster' , {'tag': '#SBATCH -M %s', 'needed': False, 'default': 'milou'}), ('alloc' , {'tag': '#SBATCH --reservation=%s', 'needed': False, 'default': 'workstation'}), ('jobid' , {'tag': '#SBATCH --jobid=%i', 'needed': False, 'default': 2173455}), ('memory' , {'tag': '#SBATCH --mem=%i', 'needed': False, 'default': 120000}), ('mem_per_cpu', {'tag': '#SBATCH --mem-per-cpu=%i', 'needed': False, 'default': 512}), ('threads' , {'tag': '#SBATCH --cpus-per-task=%i', 'needed': False, 'default': num_processors}), )) script_headers = { 'bash': ['echo "SLURM: start at $(date) on $(hostname)"'], 'python': ['import dateutil.tz, datetime, platform', 'now = datetime.datetime.now(dateutil.tz.tzlocal())', r'now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")', 'node = platform.node()', 'print "SLURM: start at {0} on {1}".format(now, node)']} script_footers = { 'bash': ['echo "SLURM: end at $(date)"'], 'python': ['now = datetime.datetime.now(dateutil.tz.tzlocal())', r'now = now.strftime("%Y-%m-%d %Hh%Mm%Ss %Z%z")', 'print "SLURM: end at {0}".format(now)']} def __repr__(self): return '<%s object "%s">' % (self.__class__.__name__, self.name) @property def name(self): return self.kwargs['job_name'] def __init__(self, command = ["print 'Hello world'"], language = 'python', base_dir = None, script_path = None, **kwargs): # Required attributes # self.command = command self.language = language self.kwargs = kwargs # Set the file paths # self.set_paths(base_dir, script_path) # Check command type # if not isinstance(self.command, list): self.command = [self.command] # Get the name # if 'job_name' not in self.kwargs: hashed = hashlib.md5(''.join(self.command)).digest() encoded = base64.urlsafe_b64encode(hashed) self.kwargs['job_name'] = encoded # Check we have a project otherwise choose the one with less hours # if hostname.startswith('milou'): if 'project' not in self.kwargs and 'SBATCH_ACCOUNT' not in os.environ: if projects: self.kwargs['project'] = projects[0]['name'] def set_paths(self, base_dir, script_path): """Set the directory, the script path and the outfile path""" # Make absolute paths # if 'change_dir' in self.kwargs: self.kwargs['change_dir'] = DirectoryPath(os.path.abspath(self.kwargs['change_dir'])) if 'out_file' in self.kwargs: self.kwargs['out_file'] = FilePath(os.path.abspath(self.kwargs['out_file'])) # In case there is a base directory # if base_dir is not None: self.base_dir = DirectoryPath(os.path.abspath(base_dir)) self.script_path = FilePath(base_dir + "run." + self.extensions[self.language]) self.kwargs['change_dir'] = base_dir self.kwargs['out_file'] = FilePath(base_dir + "run.out") # Other cases # if base_dir is None and script_path is None: self.script_path = FilePath(new_temp_path()) if script_path is not None: self.script_path = FilePath(os.path.abspath(script_path)) @property_cached def slurm_params(self): """The list of parameters to give to the `sbatch` command.""" # Main loop # result = OrderedDict() for param, info in self.slurm_headers.items(): if not info['needed'] and not param in self.kwargs: continue if param in self.kwargs: result[param] = self.kwargs.get(param) else: result[param] = info['default'] # Special cases # if result.get('cluster') == 'halvan': result['partition'] = 'halvan' # Return # return result @property def script(self): """The script to be submitted to the SLURM queue.""" self.shebang_header = self.shebang_headers[self.language] self.slurm_header = [self.slurm_headers[k]['tag'] % v for k,v in self.slurm_params.items()] self.script_header = self.script_headers[self.language] self.script_footer = self.script_footers[self.language] return '\n'.join(flatter([self.shebang_header, self.slurm_header, self.script_header, self.command, self.script_footer])) def make_script(self): """Make the script and return a FilePath object pointing to the script above.""" self.script_path.write(self.script) self.script_path.permissions.make_executable() return self.script_path @property def log(self): """The log as a FilePath object""" return self.slurm_params['out_file'] @property def log_tail(self): """If we have a log file, what is its tail""" if not self.kwargs['out_file'].exists: return False else: return tail(self.slurm_params['out_file']) @property def status(self): """What is the status of the job ?""" # If there is no script it is either ready or a lost duplicate # if not self.script_path.exists: if self.name in jobs.names: return "DUPLICATE" if self.name not in jobs.names: return "READY" # It is submitted already # if self.name in jobs.names: if jobs[self.name]['type'] == 'queued': return "QUEUED" if jobs[self.name]['type'] == 'running': return "RUNNING" # So the script exists for sure but it is not in the queue # if not self.kwargs['out_file'].exists: return "ABORTED" # Let's look in log file # if 'CANCELED' in self.log_tail: return "CANCELLED" if 'slurmstepd: error' in self.log_tail: return "CANCELLED" # It all looks good # if 'SLURM: end at' in self.log_tail: return "FINISHED" # At this point we have no idea # return "INTERUPTED" @property def info(self): """Get the existing job information dictionary""" if self.name not in jobs: return {'status': self.status} else: return jobs[self.name] #-------------------------------------------------------------------------# def run(self): """Will call self.launch() after performing some checks""" # Check already exists # if self.status == "READY": return self.launch() # Check name conflict # if self.status == "DUPLICATE": message = "Job with same name '%s' already in queue, but we lost the script." if self.status == "QUEUED": message = "Job '%s' already in queue." if self.status == "RUNNING": message = "Job '%s' already running." if self.status == "FINISHED": message = "Job '%s' already ended successfully." if self.status == "ABORTED": message = "Job '%s' was killed without any output file (?)." if self.status == "CANCELED": message = "Job '%s' was canceled or killed while running." if self.status == "INTERUPTED": message = "Job '%s' is not running. We don't know why. Look at the log file." print Color.i_red + message % (self.name,) + Color.end print "Job might have run already (?). Not starting." def launch(self): """Make the script file and return the newly created job id""" # Make script file # self.make_script() # Do it # sbatch_out = sh.sbatch(self.script_path) jobs.expire() # Message # print Color.i_blu + "SLURM:" + Color.end + " " + str(sbatch_out), # Return id # self.id = int(re.findall("Submitted batch job ([0-9]+)", str(sbatch_out))[0]) return self.id def cancel(self): if self.status != "QUEUED" and self.status != "RUNNING": raise Exception("Can't cancel job '%s'" % self.name) sh.scancel(self.info['jobid']) def wait(self): """Wait until the job is finished""" pass #-------------------------------------------------------------------------# def run_locally(self): """A convenience method to run the same result as a SLURM job but locally in a non-blocking way. Useful for testing.""" self.thread = threading.Thread(target=self.execute_locally) self.thread.daemon = True # So that they die when we die self.thread.start() def execute_locally(self): """Runs the equivalent command locally in a blocking way.""" # Make script file # self.make_script() # Do it # with open(self.kwargs['out_file'], 'w') as handle: sh.python(self.script_path, _out=handle, _err=handle) def wait_locally(self): """If you have run the query in a non-blocking way, call this method to pause until the query is finished.""" try: self.thread.join(sys.maxint) # maxint timeout so that we can Ctrl-C them except KeyboardInterrupt: print "Stopped waiting on job '%s'" % self.kwargs['job_name']