def Scan(config, ctr_dirs):
    """
    Query the OAR host for all jobs in /[controldir]/processing with
    ``oarstat``. If the job has stopped running, more detailed information
    is fetched with ``scontrol``, and the diagnostics and comments files
    are updated. Finally ``gm-kick`` is executed on all jobs with an exit
    code.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_slurm)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job (squeue)
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7)*len(jobs))

    execute = execute_local if not Config.remote_host else execute_remote
    #args = Config.slurm_bin_path + '/squeue -a -h -o %i:%T -t all -j ' + ','.join(jobs.keys())
    # BUGFIX: the separator must include spaces (' -fj '), otherwise the job
    # ids are glued to the flag text (e.g. 'oarstat -fj 1-fj2') and the
    # command line is malformed. Each job id gets its own ' -fj ' flag.
    args = Config.slurm_bin_path + '/oarstat -fj ' + ' -fj '.join(jobs.keys())
def Submit(config, jobdesc):
    """
    Submits a job to the OAR queue specified in arc.conf. This method executes
    the required RunTimeEnvironment scripts and assembles the bash job script.
    The job script is written to file and submitted with ``oarsub``.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_slurm)
    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    # Run RTE stage0
    debug('----- starting slurmSubmitter.py -----', 'slurm.Submit')
    RTE_stage0(jobdesc, 'SLURM', SBATCH_ACCOUNT = 'OtherAttributes.SBATCH_ACCOUNT')

    set_grid_global_jobid(jobdesc)

    # Create script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)

    debug('Created file %s' % script_file, 'slurm.Submit')
    debug('SLURM jobname: %s' % jobdesc.Identification.JobName, 'slurm.Submit')
    debug('SLURM job script built', 'slurm.Submit')
    debug('----------------- BEGIN job script -----', 'slurm.Submit')
    # Log the job script, compressing runs of blank lines and escaping '%'
    # so the logger's printf-style formatting does not mangle them.
    emptylines = 0
    for line in jobscript.split('\n'):
        if not line:
            emptylines += 1
        else:
            debug(emptylines*'\n' + line.replace("%", "%%"), 'slurm.Submit')
            emptylines = 0
    if emptylines > 1:
        debug((emptylines-1)*'\n', 'slurm.Submit')
    debug('----------------- END job script -----', 'slurm.Submit')

    # Debug/dry-run escape hatch: only generate the script, do not submit.
    if 'ONLY_WRITE_JOBSCRIPT' in os.environ and os.environ['ONLY_WRITE_JOBSCRIPT'] == 'yes':
        return "-1"

    #######################################
    #  Submit the job
    ######################################
    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes['joboption;directory']

    debug('Session directory: %s' % directory, 'slurm.Submit')

    SLURM_TRIES = 0
    handle = None
    # Retry submission up to 10 times when the queue looks full; any other
    # failure aborts immediately.
    while SLURM_TRIES < 10:
        args = '%s/oarsub %s' % (Config.slurm_bin_path, script_file)
        verbose('Executing \'%s\' on %s' %
                (args, Config.remote_host if Config.remote_host else 'localhost'), 'slurm.Submit')
        handle = execute(args)
        if handle.returncode == 0:
            break
        # NOTE(review): 198 is treated as a "queue full" code here — confirm
        # against the batch system's documented exit codes.
        if handle.returncode == 198 or wait_for_queue(handle):
            debug('Waiting for queue to decrease', 'slurm.Submit')
            time.sleep(60)
            SLURM_TRIES += 1
            continue
        break # Other error than full queue

    if handle.returncode == 0:
        # TODO: Test what happens when the jobqueue is full or when the slurm
        # ctld is not responding. SLURM 1.x and 2.2.x outputs the jobid into
        # STDERR and STDOUT respectively. Concat them, and let sed sort it out.
        # From the exit code we know that the job was submitted, so this
        # is safe. Ulf Tigerstedt <*****@*****.**> 1.5.2011
        localid = get_job_id(handle)
        if localid:
            debug('Job submitted successfully!', 'slurm.Submit')
            debug('Local job id: ' + localid, 'slurm.Submit')
            # BUGFIX: exit message said 'submitSubmitter.py'; made consistent
            # with the entry message above.
            debug('----- exiting slurmSubmitter.py -----', 'slurm.Submit')
            return localid

    debug('job *NOT* submitted successfully!', 'slurm.Submit')
    # BUGFIX: message said 'sbatch' but the command actually run is oarsub.
    debug('got error code from oarsub: %d !' % handle.returncode, 'slurm.Submit')
    debug('Output is:\n' + ''.join(handle.stdout), 'slurm.Submit')
    debug('Error output is:\n' + ''.join(handle.stderr), 'slurm.Submit')
    debug('----- exiting slurmSubmitter.py -----', 'slurm.Submit')