def test_bacct_done1(): """Test parsing `bacct -l` output for a not-so-trivial job.""" lsf = LsfLrms( name='test', architecture=gc3libs.Run.Arch.X86_64, max_cores=1, max_cores_per_job=1, max_memory_per_core=1 * GB, max_walltime=1 * hours, auth=None, # ignored if `transport` is `local` frontend='localhost', transport='local', bacct='bacct') acct = lsf._parse_acct_output(""" Accounting information about jobs that are: - submitted by all users. - accounted on all projects. - completed normally or exited - executed on all hosts. - submitted to all queues. - accounted on all service classes. ------------------------------------------------------------------------------ Job <3329618>, User <rmurri>, Project <default>, Status <DONE>, Queue <pub.1h>, Command <md5sum lsf.o3224113 lsf.o3224132>, Share group ch arged </lsf_biol_all/lsf_aeber/rmurri> Mon Oct 8 17:08:54: Submitted from host <brutus4>, CWD <$HOME>, Output File <l sf.o%J>; Mon Oct 8 17:10:01: Dispatched to <a3041>; Mon Oct 8 17:10:07: Completed <done>. Accounting information about this job: Share group charged </lsf_biol_all/lsf_aeber/rmurri> CPU_T WAIT TURNAROUND STATUS HOG_FACTOR MEM SWAP 0.04 67 73 done 0.0005 3M 34M ------------------------------------------------------------------------------ SUMMARY: ( time unit: second ) Total number of done jobs: 1 Total number of exited jobs: 0 Total CPU time consumed: 0.0 Average CPU time consumed: 0.0 Maximum CPU time of a job: 0.0 Minimum CPU time of a job: 0.0 Total wait time in queues: 67.0 Average wait time in queue: 67.0 Maximum wait time in queue: 67.0 Minimum wait time in queue: 67.0 Average turnaround time: 73 (seconds/job) Maximum turnaround time: 73 Minimum turnaround time: 73 Average hog factor of a job: 0.00 ( cpu time / turnaround time ) Maximum hog factor of a job: 0.00 Minimum hog factor of a job: 0.00 """) assert_equal(acct['duration'], Duration('6s')) assert_equal(acct['used_cpu_time'], Duration('0.04s')) assert_equal(acct['max_used_memory'], Memory('37MB')) # timestamps year = datetime.date.today().year assert_equal(acct['lsf_submission_time'], datetime.datetime(year, 10, 8, 17, 8, 54)) assert_equal(acct['lsf_start_time'], datetime.datetime(year, 10, 8, 17, 10, 1)) assert_equal(acct['lsf_completion_time'], datetime.datetime(year, 10, 8, 17, 10, 7))
def jwt_expiration_delta(self):
    '''datetime.timedelta: time interval until JSON web token expires
    (default: ``datetime.timedelta(hours=72)``)
    '''
    t = Duration(self._config.get(
        self._section, 'jwt_expiration_delta'))
    return datetime.timedelta(seconds=t.amount(Duration.second))
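# A minimal sketch (not part of the original source) of the same
# string-to-`timedelta` conversion performed by the property above; it
# assumes `gc3libs.quantity.Duration` parses '72 hours' as in the other
# snippets in this collection.
import datetime
from gc3libs.quantity import Duration

t = Duration('72 hours')   # e.g. the configured `jwt_expiration_delta` value
delta = datetime.timedelta(seconds=t.amount(Duration.second))
assert delta == datetime.timedelta(hours=72)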
def test_bacct_killed(): """Test parsing `bacct -l` output for a canceled job.""" lsf = LsfLrms( name='test', architecture=gc3libs.Run.Arch.X86_64, max_cores=1, max_cores_per_job=1, max_memory_per_core=1 * GB, max_walltime=1 * hours, auth=None, # ignored if `transport` is `local` frontend='localhost', transport='local', bacct='bacct') acct = lsf._parse_acct_output(""" Accounting information about jobs that are: - submitted by all users. - accounted on all projects. - completed normally or exited - executed on all hosts. - submitted to all queues. - accounted on all service classes. ------------------------------------------------------------------------------ Job <3224113>, User <rmurri>, Project <default>, Status <EXIT>, Queue <pub.1h>, Command <sleep 300>, Share group charged </lsf_biol_all/ls f_aeber/rmurri> Fri Oct 5 17:49:35: Submitted from host <brutus4>, CWD <$HOME>, Output File <l sf.o%J>; Fri Oct 5 17:50:35: Dispatched to <a3191>; Fri Oct 5 17:51:30: Completed <exit>; TERM_OWNER: job killed by owner. Accounting information about this job: Share group charged </lsf_biol_all/lsf_aeber/rmurri> CPU_T WAIT TURNAROUND STATUS HOG_FACTOR MEM SWAP 0.04 60 115 exit 0.0003 1M 34M ------------------------------------------------------------------------------ SUMMARY: ( time unit: second ) Total number of done jobs: 0 Total number of exited jobs: 1 Total CPU time consumed: 0.0 Average CPU time consumed: 0.0 Maximum CPU time of a job: 0.0 Minimum CPU time of a job: 0.0 Total wait time in queues: 60.0 Average wait time in queue: 60.0 Maximum wait time in queue: 60.0 Minimum wait time in queue: 60.0 Average turnaround time: 115 (seconds/job) Maximum turnaround time: 115 Minimum turnaround time: 115 Average hog factor of a job: 0.00 ( cpu time / turnaround time ) Maximum hog factor of a job: 0.00 Minimum hog factor of a job: 0.00 """) assert_equal(acct['duration'], Duration('55s')) assert_equal(acct['used_cpu_time'], Duration('0.04s')) assert_equal(acct['max_used_memory'], Memory('35MB')) # timestamps year = datetime.date.today().year assert_equal(acct['lsf_submission_time'], datetime.datetime(year, 10, 5, 17, 49, 35)) assert_equal(acct['lsf_start_time'], datetime.datetime(year, 10, 5, 17, 50, 35)) assert_equal(acct['lsf_completion_time'], datetime.datetime(year, 10, 5, 17, 51, 30))
def test_divide_duration2():
    n = randint(1, 100)
    d1 = Duration(2 * n, unit=Duration.days)
    d2 = d1 / 2
    assert d2 == Duration(n, unit=Duration.days)
    assert 2 * d2 == d1
    assert d2 * 2 == d1
def __init__(self, path, default_walltime='3h'):
    self.path = path
    self.cfg = {}
    self.defaults = {}
    with open(path, 'rU') as fd:
        log.debug("Reading CSV configuration file %s", path)
        cr = csv.reader(fd)
        lineno = 0
        for line in cr:
            lineno += 1
            if len(line) != 9:
                log.warning(
                    "Ignoring line '%d' in csv configuration file %s:"
                    " wrong number of fields (%d != 9)",
                    lineno, path, len(line))
                continue
            if line[0] in self.cfg:
                log.warning(
                    "Overwriting duplicate key in '%s' csv configuration"
                    " file: '%s'", path, line[0])
            try:
                # Skip header lines: on data lines these
                # values should always be integers.
                int(line[1])
                int(line[3])
                int(line[4])
                int(line[5])
            except ValueError:
                log.debug(
                    "Ignoring line '%d' of file %s, some values do not"
                    " convert to integer as expected.", lineno, path)
                continue
            data = {
                'fps': line[1],
                'pixel_to_scale': line[2],
                'difference_lag': line[3],
                'threshold1': line[4],
                'threshold2': line[5],
                'video_is_needed':
                    False if line[7].lower() == 'optional' else True,
                'email_to': line[8],
            }
            try:
                data['requested_walltime'] = Duration(line[6])
            except ValueError as ex:
                log.error(
                    "Unable to parse walltime '%s' for key %s in file"
                    " %s: %s. Using default value of %s",
                    line[6], line[0], path, ex, default_walltime)
                data['requested_walltime'] = Duration(default_walltime)
            key = line[0]
            if key.lower() == "default":
                self.defaults = data
            else:
                self.cfg[key] = data
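# A hedged illustration of the 9-field row layout the reader above expects.
# Field names are taken from the code; the concrete values are invented.
#
#   key, fps, pixel_to_scale, difference_lag, threshold1, threshold2,
#   requested_walltime, video ('optional' or anything else), email_to
example_row = "well_A01,25,0.5,2,10,20,4 hours,optional,user@example.org"
assert len(example_row.split(',')) == 9   # rows with a different field count are skipped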
def update_job_state(self, app): """ Query the running status of the local process whose PID is stored into `app.execution.lrms_jobid`, and map the POSIX process status to GC3Libs `Run.State`. """ self.transport.connect() pid = app.execution.lrms_jobid exit_code, stdout, stderr = self.transport.execute_command( "ps ax | grep -E '^ *%d '" % pid) if exit_code == 0: log.debug("Process with PID %s found." " Checking its running status ...", pid) # Process exists. Check the status status = stdout.split()[2] if status[0] == 'T': # Job stopped app.execution.state = Run.State.STOPPED elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']: # Job is running. Check manpage of ps both on linux # and BSD to know the meaning of these statuses. app.execution.state = Run.State.RUNNING # if `requested_walltime` is set, enforce it as a # running time limit if app.requested_walltime is not None: exit_code2, stdout2, stderr2 = self.transport.execute_command( "ps -p %d -o etimes=" % pid) if exit_code2 != 0: # job terminated already, do cleanup and return self._cleanup_terminating_task(app, pid) return app.execution.state cancel = False elapsed = Duration(stdout2.strip() + 'seconds') if elapsed > self.max_walltime: log.warning("Task %s ran for %s, exceeding max_walltime %s of resource %s: cancelling it.", app, elapsed.to_timedelta(), self.max_walltime, self.name) cancel = True if elapsed > app.requested_walltime: log.warning("Task %s ran for %s, exceeding own `requested_walltime` %s: cancelling it.", app, elapsed.to_timedelta(), app.requested_walltime) cancel = True if cancel: self.cancel_job(app) # set signal to SIGTERM in termination status self._cleanup_terminating_task(app, pid, termstatus=(15, -1)) return app.execution.state else: log.debug( "Process with PID %d not found," " assuming task %s has finished running.", pid, app) self._cleanup_terminating_task(app, pid) self._get_persisted_resource_state() return app.execution.state
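# A minimal sketch (not from the original source) of how the elapsed-time
# check above builds a `Duration` from `ps -o etimes=` output; the sample
# value and the 8-hour limit below are made up.
from gc3libs.quantity import Duration, hours

ps_etimes_output = '32410\n'   # hypothetical output of `ps -p PID -o etimes=`
elapsed = Duration(ps_etimes_output.strip() + 'seconds')
max_walltime = 8 * hours       # made-up resource limit
if elapsed > max_walltime:
    print("task exceeded max_walltime, would be cancelled")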
def test_bjobs_output_for_accounting(): lsf = LsfLrms( name='test', architecture=gc3libs.Run.Arch.X86_64, max_cores=1, max_cores_per_job=1, max_memory_per_core=1 * GB, max_walltime=1 * hours, auth=None, # ignored if `transport` is `local` frontend='localhost', transport='local') bjobs_output = """ Job <131851>, Job Name <ChromaExtractShort>, User <wwolski>, Project <default>, Status <DONE>, Queue <pub.8h>, Job Priority <50>, Command <ChromatogramExtractor -in /cluster/scratch/malars/openswa th/data/AQUA_fixed_water/split_napedro_L120224_001_SW-400A QUA_no_background_2ul_dilution_10/split_napedro_L120224_00 1_SW-400AQUA_no_background_2ul_dilution_10_28.mzML.gz -tr /cluster/scratch/malars/openswath/assays/iRT/DIA_iRT.TraML -out split_napedro_L120224_001_SW-400AQUA_no_background_2 ul_dilution_10_28._rtnorm.chrom.mzML -is_swath -min_upper_ edge_dist 1 -threads 2>, Share group charged </lsf_biol_al l/lsf_biol_other/wwolski> Tue Jul 24 10:03:15: Submitted from host <brutus3>, CWD <$HOME/.gc3pie_jobs/lrm s_job.YNZmU17755/.>, Output File <lsf.o%J>, Requested Reso urces <select[mem<70000 && lustre] order[-ut] rusage[mem=1 000,m=1]>, Login Shell </bin/sh>, Specified Hosts <thin+9> , <single+8>, <smp16+6>, <smp24+5>, <smp48+4>; RUNLIMIT 480.0 min of a6122 Tue Jul 24 10:04:19: Started on <a6122>, Execution Home </cluster/home/biol/wwo lski>, Execution CWD </cluster/home/biol/wwolski/.gc3pie_j obs/lrms_job.YNZmU17755/.>; Tue Jul 24 10:05:45: Done successfully. The CPU time used is 2.1 seconds. MEMORY USAGE: MAX MEM: 41 Mbytes; AVG MEM: 41 Mbytes SCHEDULING PARAMETERS: r15s r1m r15m ut pg io ls it tmp swp mem loadSched - - - - - - - - 1000M - - loadStop - - - - - - - - - - - scratch xs s m l xl sp loadSched 4000.0 - - - - - - loadStop - - - - - - - """ # Also parse the output of jobs to get accounting information acct = lsf._parse_acct_output(bjobs_output) assert_equal(acct['duration'], Duration('86s')) assert_equal(acct['used_cpu_time'], Duration('2.1s')) assert_equal(acct['max_used_memory'], Memory('41MB'))
def __parse_acct_output_w_bjobs(stdout): data = dict() # Try to parse used cputime match = LsfLrms._cpu_time_re.search(stdout) if match: cpu_time = match.group('cputime') data['used_cpu_time'] = Duration(float(cpu_time), unit=seconds) # Parse memory usage match = LsfLrms._mem_used_re.search(stdout) if match: mem_used = match.group('mem_used') # mem_unit should always be Mbytes data['max_used_memory'] = Memory(float(mem_used), unit=MB) # Find submission time and completion time lines = iter(stdout.split('\n')) for line in lines: match = LsfLrms._EVENT_RE.match(line) if match: timestamp = line.split(': ')[0] event = match.group('event') if event == 'Submitted': data['lsf_submission_time'] = \ LsfLrms._parse_timespec(timestamp) elif event in ['Dispatched', 'Started']: data['lsf_start_time'] = \ LsfLrms._parse_timespec(timestamp) elif event in ['Completed', 'Done successfully']: data['lsf_completion_time'] = \ LsfLrms._parse_timespec(timestamp) continue if 'lsf_completion_time' in data and 'lsf_start_time' in data: data['duration'] = Duration(data['lsf_completion_time'] - data['lsf_start_time']) else: # XXX: what should we use for jobs that did not run at all? data['duration'] = Duration(0, unit=seconds) return data
def _legacy_parse_duration(duration_str):
    try:
        # old-style config: integral number of hours
        val = int(duration_str) * hours
        gc3libs.log.warning("'max_walltime' should always have a "
                            "valid unit format (e.g. '24 hours'). Using "
                            "default unit: hours")
        return val
    except ValueError:
        # apply `Duration` parsing rules; if this fails, users will
        # see the error message from the `Duration` parser.
        return Duration(duration_str)
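# A hedged usage sketch for the legacy parser above (assuming it is
# importable from the enclosing module): bare integers are interpreted as
# hours, anything else goes through `Duration`'s own parser.
from gc3libs.quantity import Duration, hours

# old-style value: a plain number of hours (also triggers the warning above)
assert _legacy_parse_duration('24') == 24 * hours
# new-style value: parsed directly by `Duration`
assert _legacy_parse_duration('36 hours') == Duration('36 hours')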
def __parse_acct_output_w_bacct(stdout): acctinfo = {} lines = iter( stdout.split('\n')) # need to lookup next line in the loop for line in lines: match = LsfLrms._EVENT_RE.match(line) if match: timestamp = line.split(': ')[0] event = match.group('event') if event == 'Submitted': acctinfo['lsf_submission_time'] = \ LsfLrms._parse_timespec(timestamp) elif event == 'Dispatched': acctinfo['lsf_start_time'] = \ LsfLrms._parse_timespec(timestamp) elif event == 'Completed': acctinfo['lsf_completion_time'] = \ LsfLrms._parse_timespec(timestamp) continue match = LsfLrms._RESOURCE_USAGE_RE.match(line) if match: # actual resource usage is on next line rusage = next(lines) cpu_t, wait, turnaround, status, hog_factor, mem, swap = \ rusage.split() # common backend attrs (see Issue 78) if 'lsf_completion_time' in acctinfo and 'lsf_start_time' in acctinfo: acctinfo['duration'] = Duration( acctinfo['lsf_completion_time'] - acctinfo['lsf_start_time']) else: # XXX: what should we use for jobs that did not run at all? acctinfo['duration'] = Duration(0, unit=seconds) acctinfo['used_cpu_time'] = Duration(float(cpu_t), unit=seconds) acctinfo['max_used_memory'] = LsfLrms._parse_memspec(mem)\ + LsfLrms._parse_memspec(swap) # the resource usage line is the last interesting line break return acctinfo
def create_debug_run_jobs(self, user_name, job_collection, batches,
                          verbosity, duration, memory, cores):
    '''Creates debug jobs for the parallel "run" phase of the step.

    Parameters
    ----------
    user_name: str
        name of the submitting user
    job_collection: tmlib.workflow.job.RunPhase
        empty collection of *run* jobs that should be populated
    batches: List[dict]
        job descriptions
    verbosity: int
        logging verbosity for jobs
    duration: str
        computational time that should be allocated for a single job;
        in HH:MM:SS format
    memory: int
        amount of memory in Megabyte that should be allocated for a
        single job
    cores: int
        number of CPU cores that should be allocated for a single job

    Returns
    -------
    tmlib.workflow.jobs.RunPhase
        run jobs
    '''
    logger.info('create "debug" run jobs for submission %d',
                job_collection.submission_id)
    logger.debug('allocated time for debug run jobs: %s', duration)
    logger.debug('allocated memory for debug run jobs: %s MB', memory)
    logger.debug('allocated cores for debug run jobs: %d', cores)
    for b in batches:
        job = DebugRunJob(step_name=self.step_name,
                          arguments=self._build_debug_run_command(
                              b['site_id'], verbosity),
                          output_dir=self.log_location,
                          job_id=b['site_id'],
                          submission_id=job_collection.submission_id,
                          parent_id=job_collection.persistent_id,
                          user_name=user_name)
        job.requested_walltime = Duration(duration)
        job.requested_memory = Memory(memory, Memory.MB)
        if not isinstance(cores, int):
            raise TypeError('Argument "cores" must have type int.')
        if not cores > 0:
            raise ValueError('The value of "cores" must be positive.')
        job.requested_cores = cores
        job_collection.add(job)
    return job_collection
def create_init_job(self, user_name, job_collection, batch_args, verbosity, duration='12:00:00'): '''Creates job for the "init" phase of the step. Parameters ---------- user_name: str name of the submitting user job_collection: tmlib.workflow.job.InitPhase empty collection of *init* jobs that should be populated batch_args: tmlib.workflow.args.BatchArguments step-specific implementation of :class:`BatchArguments <tmlib.workflow.args.BatchArguments>` duration: str, optional computational time that should be allocated for the job in HH:MM:SS format (default: ``"12:00:00"``) verbosity: int logging verbosity for job Returns ------- tmlib.workflow.jobs.InitPhase init job ''' logger.info('create "init" job for submission %d', job_collection.submission_id) # FIXME: this should depend on batch and total size and on the # program being run; although 2'500MB seems a lot, it has # shown to be barely enough for `illuminati init` jobs in # large experiments. (But is plenty for most other uses.) memory = 2500 # MB cores = 1 job = InitJob(step_name=self.step_name, arguments=self._build_init_command( batch_args, verbosity), output_dir=self.log_location, submission_id=job_collection.submission_id, user_name=user_name, parent_id=job_collection.persistent_id) job.requested_walltime = Duration(duration) job.requested_memory = Memory(memory, Memory.MB) job.requested_cores = cores job_collection.add(job) return job_collection
def create_init_job(self, user_name, job_collection, batch_args, verbosity, duration='12:00:00'): '''Creates job for the "init" phase of the step. Parameters ---------- user_name: str name of the submitting user job_collection: tmlib.workflow.job.InitPhase empty collection of *init* jobs that should be populated batch_args: tmlib.workflow.args.BatchArguments step-specific implementation of :class:`BatchArguments <tmlib.workflow.args.BatchArguments>` duration: str, optional computational time that should be allocated for the job in HH:MM:SS format (default: ``"12:00:00"``) verbosity: int logging verbosity for job Returns ------- tmlib.workflow.jobs.InitPhase init job ''' logger.info('create "init" job for submission %d', job_collection.submission_id) memory = cfg.resource.max_memory_per_core cores = 1 logger.debug('allocated time for "init" job: %s', duration) logger.debug('allocated memory for "init" job: %s', memory) logger.debug('allocated cores for "init" job: %d', cores) job = InitJob(step_name=self.step_name, arguments=self._build_init_command( batch_args, verbosity), output_dir=self.log_location, submission_id=job_collection.submission_id, user_name=user_name, parent_id=job_collection.persistent_id) job.requested_walltime = Duration(duration) job.requested_memory = Memory(memory, Memory.MB) job.requested_cores = cores job_collection.add(job) return job_collection
def create_collect_job(self, user_name, job_collection, verbosity, duration='06:00:00'): '''Creates job for the "collect" phase of the step. Parameters ---------- user_name: str name of the submitting user job_collection: tmlib.workflow.job.CollectPhase empty collection of *collect* jobs that should be populated verbosity: int logging verbosity for jobs duration: str, optional computational time that should be allocated for a single job; in HH:MM:SS format (default: ``"06:00:00"``) Returns ------- tmlib.workflow.jobs.CollectJob collect job ''' logger.info('create "collect" job for submission %d', job_collection.submission_id) memory = cfg.resource.max_memory_per_core cores = 1 logger.debug('allocated time for "collect" job: %s', duration) logger.debug('allocated memory for "collect" job: %s', memory) logger.debug('allocated cores for "collect" job: %d', cores) job = CollectJob(step_name=self.step_name, arguments=self._build_collect_command(verbosity), output_dir=self.log_location, submission_id=job_collection.submission_id, user_name=user_name, parent_id=job_collection.persistent_id) job.requested_walltime = Duration(duration) job.requested_memory = Memory(memory, Memory.MB) job.requested_cores = cores job_collection.add(job) return job_collection
def create_collect_job(self, user_name, job_collection, verbosity, duration='06:00:00'): '''Creates job for the "collect" phase of the step. Parameters ---------- user_name: str name of the submitting user job_collection: tmlib.workflow.job.CollectPhase empty collection of *collect* jobs that should be populated verbosity: int logging verbosity for jobs duration: str, optional computational time that should be allocated for a single job; in HH:MM:SS format (default: ``"06:00:00"``) Returns ------- tmlib.workflow.jobs.CollectJob collect job ''' logger.info('create "collect" job for submission %d', job_collection.submission_id) # FIXME: See similar comment in `create_init_job` about the # amount of memory to allocate. memory = 2500 # MB cores = 1 job = CollectJob(step_name=self.step_name, arguments=self._build_collect_command(verbosity), output_dir=self.log_location, submission_id=job_collection.submission_id, user_name=user_name, parent_id=job_collection.persistent_id) job.requested_walltime = Duration(duration) job.requested_memory = Memory(memory, Memory.MB) job.requested_cores = cores job_collection.add(job) return job_collection
def _parse_duration(d):
    """
    Parse a SLURM duration expression, in the form ``DD-HH:MM:SS.UUU``.
    The ``DD``, ``HH`` and ``.UUU`` parts are optional.
    """
    total = Duration(0, unit=seconds)
    if '-' in d:
        # DD-HH:MM:SS
        ndays, d = d.split('-')
        total = Duration(int(ndays), unit=days)
    parts = list(reversed(d.split(':')))
    assert len(parts) > 0
    secs = parts[0]
    if '.' in secs:
        # SS.UUU
        total += Duration(float(secs), unit=seconds)
    else:
        total += Duration(int(secs), unit=seconds)
    if len(parts) > 1:
        total += Duration(int(parts[1]), unit=minutes)
    if len(parts) > 2:
        total += Duration(int(parts[2]), unit=hours)
    return total
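# A short, hedged usage sketch for the SLURM duration parser above (in the
# full source this is a static method of `SlurmLrms`; the sample strings
# are made up).
from gc3libs.quantity import Duration, days, hours, minutes, seconds

assert _parse_duration('1-02:03:04') == (Duration(1, unit=days) +
                                         Duration(2, unit=hours) +
                                         Duration(3, unit=minutes) +
                                         Duration(4, unit=seconds))
assert _parse_duration('05:06.25') == (Duration(5, unit=minutes) +
                                       Duration(6.25, unit=seconds))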
def create_job(self, submission_id, user_name, duration='06:00:00',
               memory=(cfg.resource.max_cores_per_job *
                       cfg.resource.max_memory_per_core.amount(Memory.MB)),
               cores=cfg.resource.max_cores_per_job):
    '''Creates a job for asynchronous processing of a client tool request.

    Parameters
    ----------
    submission_id: int
        ID of the corresponding submission
    user_name: str
        name of the submitting user
    duration: str, optional
        computational time that should be allocated for the job
        in HH:MM:SS format (default: ``"06:00:00"``)
    memory: int, optional
        amount of memory in Megabyte that should be allocated for the job
        (defaults to
        :attr:`resource.max_cores_per_job <tmlib.config.LibraryConfig.resource>` x
        :attr:`resource.max_memory_per_core <tmlib.config.LibraryConfig.resource>`)
    cores: int, optional
        number of CPU cores that should be allocated for the job
        (defaults to
        :attr:`resource.max_cores_per_job <tmlib.config.LibraryConfig.resource>`)

    Returns
    -------
    tmlib.tools.jobs.ToolJob
        tool job
    '''
    logger.info('create tool job for submission %d', submission_id)
    if cores > cfg.resource.max_cores_per_job:
        logger.warn('requested cores exceed available cores per node: %s',
                    cfg.resource.max_cores_per_job)
        logger.debug('setting number of cores to %d',
                     cfg.resource.max_cores_per_job)
        cores = cfg.resource.max_cores_per_job
    max_memory_per_node = (
        cfg.resource.max_cores_per_job *
        cfg.resource.max_memory_per_core.amount(Memory.MB))
    max_memory_per_core = cfg.resource.max_memory_per_core.amount(
        Memory.MB)
    if cores == 1:
        if memory > max_memory_per_core:
            # We just warn here, since this may still work.
            logger.warn(
                'requested memory exceeds available memory per core: %d MB',
                max_memory_per_core)
    else:
        if memory > max_memory_per_node:
            logger.warn(
                'requested memory exceeds available memory per node: %d MB',
                max_memory_per_node)
            logger.debug('setting memory to %d MB', max_memory_per_node)
            memory = max_memory_per_node
    logger.debug('allocated time for job: %s', duration)
    logger.debug('allocated memory for job: %d MB', memory)
    logger.debug('allocated cores for job: %d', cores)
    job = ToolJob(tool_name=self.tool_name,
                  arguments=self._build_command(submission_id),
                  output_dir=self._log_location,
                  submission_id=submission_id,
                  user_name=user_name)
    job.requested_walltime = Duration(duration)
    job.requested_memory = Memory(memory, Memory.MB)
    if not isinstance(cores, int):
        raise TypeError('Argument "cores" must have type int.')
    if not cores > 0:
        raise ValueError('The value of "cores" must be positive.')
    job.requested_cores = cores
    return job
def _parse_acct_output(self, stdout, stderr): acct = { 'cores': 0, 'duration': Duration(0, unit=seconds), 'used_cpu_time': Duration(0, unit=seconds), 'max_used_memory': Memory(0, unit=bytes) } exitcode = None signal = None for line in stdout.split('\n'): line = line.strip() if line == '': continue # because of the trailing `|` we have an extra empty field jobid, exit, state, ncpus, elapsed, totalcpu, submit,\ start, end, maxrss, maxvmsize, _ = line.split('|') # In some case the state can contain a specification, # e.g. "CANCELLED by 1000" state = state.split()[0] # SLURM job IDs have the form `jobID[.step]`: only the # lines with the `step` part carry resource usage records, # whereas the total `jobID` line carries the exit codes # and overall duration/timing information. if '.' not in jobid: if state not in [ 'BOOT_FAIL', 'CANCELLED', 'COMPLETED', 'FAILED', 'NODE_FAIL', 'PREEMPTED', 'TIMEOUT', ]: raise gc3libs.exceptions.UnexpectedJobState( "Unexpected SLURM job state '{state}'" " encountered in parsing `sacct` output".format( state=state)) # master job record acct['duration'] = SlurmLrms._parse_duration(elapsed) acct['used_cpu_time'] = SlurmLrms._parse_duration(totalcpu) if state in ['CANCELLED', 'TIMEOUT']: # In this case, the exit code of the master job is # `0:0` or `0:1`, but we want to keep track of the # fact that the job was killed by the system (or # the user). exitcode = os.EX_TEMPFAIL signal = int(Run.Signals.RemoteKill) elif state == 'NODE_FAIL': exitcode = os.EX_TEMPFAIL signal = int(Run.Signals.RemoteError) else: # compute POSIX exit status exitcode_, signal_ = exit.split(':') exitcode = int(exitcode_) signal = int(signal_) # XXX: the master job record seems to report the # *requested* slots, whereas the step records report # the actual usage. In our case these should be the # same, as the job script only runs one single step. # However, in the general case computing the *actual* # CPU usage is a mess, as we would have to check which # steps were executed simultaneously and which ones # were executed one after the other... acct['cores'] = int(ncpus) # provide starting point for resource usage records acct['max_used_memory'] = Memory(0, unit=MB) acct['slurm_max_used_ram'] = Memory(0, unit=MB) # XXX: apparently, Ubuntu's SLURM 2.3 has a bug # wherein `submit` == `end` in the master job record, # and the actual start time must be gathered from the # step records... try to work around submit = SlurmLrms._parse_timestamp(submit) start = SlurmLrms._parse_timestamp(start) end = SlurmLrms._parse_timestamp(end) acct['slurm_submission_time'] = min(submit, start) acct['slurm_start_time'] = end # actually computed below acct['slurm_completion_time'] = max(submit, start, end) else: # common resource usage records (see Issue 78) vmem = SlurmLrms._parse_memspec(maxvmsize) if vmem is not None: acct['max_used_memory'] = max(vmem, acct['max_used_memory']) # SLURM-specific resource usage records mem = SlurmLrms._parse_memspec(maxrss) if mem is not None: acct['slurm_max_used_ram'] = max( mem, acct['slurm_max_used_ram']) # XXX: see above for timestamps submit = SlurmLrms._parse_timestamp(submit) start = SlurmLrms._parse_timestamp(start) acct['slurm_submission_time'] = min( submit, acct['slurm_submission_time']) acct['slurm_start_time'] = min(start, acct['slurm_start_time']) # must compute termination status since it's not provided by `squeue` if signal is not None and exitcode is not None: acct['termstatus'] = (signal & 0x7f) + ((exitcode & 0xff) << 8) return acct
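# A hedged illustration (not from the original source) of the POSIX-style
# termination status computed at the end of `_parse_acct_output` above:
# the low 7 bits carry the signal number, the next 8 bits the exit code.
def termstatus(exitcode, signal):
    # same encoding as used in the snippet above
    return (signal & 0x7f) + ((exitcode & 0xff) << 8)

assert termstatus(1, 0) == 0x0100   # exited with code 1, no signal
assert termstatus(0, 15) == 15      # killed by SIGTERM (signal 15)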
def create_job(
        self, submission_id, user_name, duration='06:00:00',
        # if all cores are used, we should allocate all available memory as well
        memory=cfg.resource.max_memory_per_core.amount(Memory.MB),
        cores=cfg.resource.max_cores_per_job):
    '''Creates a job for asynchronous processing of a client tool request.

    Parameters
    ----------
    submission_id: int
        ID of the corresponding submission
    user_name: str
        name of the submitting user
    duration: str, optional
        computational time that should be allocated for the job
        in HH:MM:SS format (default: ``"06:00:00"``)
    memory: int, optional
        amount of memory in Megabyte that should be allocated for the job
        (defaults to
        :attr:`resource.max_memory_per_core <tmlib.config.LibraryConfig.resource>`)
    cores: int, optional
        number of CPU cores that should be allocated for the job
        (defaults to
        :attr:`resource.max_cores_per_job <tmlib.config.LibraryConfig.resource>`)

    Returns
    -------
    tmlib.tools.jobs.ToolJob
        tool job
    '''
    logger.info('create tool job for submission %d', submission_id)
    try:
        cores = int(cores)
    except (ValueError, TypeError) as err:
        raise TypeError(
            'Argument "cores" cannot be converted to type `int`: {err}'.
            format(err=err))
    if not cores > 0:
        raise ValueError('The value of "cores" must be positive.')
    if cores > cfg.resource.max_cores_per_job:
        logger.warn('requested cores exceed available cores per node: %s',
                    cfg.resource.max_cores_per_job)
        logger.warn('lowering number of cores to %d (max available)',
                    cfg.resource.max_cores_per_job)
        cores = cfg.resource.max_cores_per_job
    # FIXME: this needs to be revisited when GC3Pie issue #624 is fixed;
    # for the moment, see
    # https://github.com/uzh/gc3pie/issues/624#issuecomment-328122862
    # as to why this is the right way to compute max memory
    max_memory_per_node = cfg.resource.max_memory_per_core.amount(
        Memory.MB)
    max_memory_per_core = max_memory_per_node / cfg.resource.max_cores_per_job
    if memory > max_memory_per_node:
        logger.warn(
            'requested memory exceeds available memory per node: %d MB',
            max_memory_per_node)
        logger.warn('lowering memory to %d MB', max_memory_per_node)
        memory = max_memory_per_node
    logger.debug('allocated time for job: %s', duration)
    logger.debug('allocated memory for job: %s MB', memory)
    logger.debug('allocated cores for job: %d', cores)
    job = ToolJob(tool_name=self.tool_name,
                  arguments=self._build_command(submission_id),
                  output_dir=self._log_location,
                  submission_id=submission_id,
                  user_name=user_name)
    job.requested_walltime = Duration(duration)
    job.requested_memory = Memory(memory, Memory.MB)
    job.requested_cores = cores
    return job
def _parse_acct_output(self, stdout): acct = dict(exitcode=0, cores=0, duration=Duration(0, unit=seconds), used_cpu_time=Duration(0, unit=seconds), max_used_memory=Memory(0, unit=bytes)) for line in stdout.split('\n'): line = line.strip() if line == '': continue # because of the trailing `|` we have an extra empty field jobid, exit, state, ncpus, elapsed, totalcpu, submit,\ start, end, maxrss, maxvmsize, _ = line.split('|') # In some case the state can contain a specification, as # "CANCELLED by 1000" state = state.split()[0] # SLURM job IDs have the form `jobID[.step]`: only the # lines with the `step` part carry resource usage records, # whereas the total `jobID` line carries the exit codes # and overall duration/timing information. if '.' not in jobid: assert state in [ 'CANCELLED', 'COMPLETED', 'FAILED', 'NODE_FAIL', 'PREEMPTED', 'TIMEOUT' ] # master job record acct['duration'] = SlurmLrms._parse_duration(elapsed) acct['used_cpu_time'] = SlurmLrms._parse_duration(totalcpu) if state in ['CANCELLED', 'TIMEOUT']: # In this case, the exit code of the master job is # `0:0` or `0:1`, but we want to keep track of the # fact that the job was killed by the system (or # the user). acct['exitcode'] = os.EX_TEMPFAIL acct['signal'] = int(Run.Signals.RemoteKill) elif state == 'NODE_FAIL': acct['exitcode'] = os.EX_TEMPFAIL acct['signal'] = int(Run.Signals.RemoteError) else: # compute POSIX exit status acct['exitcode'], acct['signal'] = exit.split(':') # XXX: the master job record seems to report the # *requested* slots, whereas the step records report # the actual usage. In our case these should be the # same, as the job script only runs one single step. # However, in the general case computing the *actual* # CPU usage is a mess, as we would have to check which # steps were executed simultaneously and which ones # were executed one after the other... acct['cores'] = int(ncpus) # provide starting point for resource usage records acct['max_used_memory'] = Memory(0, unit=MB) acct['slurm_max_used_ram'] = Memory(0, unit=MB) # XXX: apparently, Ubuntu's SLURM 2.3 has a bug # wherein `submit` == `end` in the master job record, # and the actual start time must be gathered from the # step records... try to work around submit = SlurmLrms._parse_timestamp(submit) start = SlurmLrms._parse_timestamp(start) end = SlurmLrms._parse_timestamp(end) acct['slurm_submission_time'] = min(submit, start) acct['slurm_start_time'] = end # will be set when # looping on tasks, # see below acct['slurm_completion_time'] = max(submit, start, end) else: # common resource usage records (see Issue 78) vmem = SlurmLrms._parse_memspec(maxvmsize) acct['max_used_memory'] = max(vmem, acct['max_used_memory']) # SLURM-specific resource usage records mem = SlurmLrms._parse_memspec(maxrss) acct['slurm_max_used_ram'] = max(mem, acct['slurm_max_used_ram']) # XXX: see above for timestamps submit = SlurmLrms._parse_timestamp(submit) start = SlurmLrms._parse_timestamp(start) acct['slurm_submission_time'] = min( submit, acct['slurm_submission_time']) acct['slurm_start_time'] = min(start, acct['slurm_start_time']) return acct
def test_bacct_done0(): """Test parsing accounting information of a <sleep 300> job.""" # gotten with `bacct -l "jobid"` lsf = LsfLrms(name='test', architecture=gc3libs.Run.Arch.X86_64, max_cores=1, max_cores_per_job=1, max_memory_per_core=1 * GB, max_walltime=1 * hours, auth=None, # ignored if `transport` is `local` frontend='localhost', transport='local', bacct='bacct') acct = lsf._parse_acct_output(""" Accounting information about jobs that are: - submitted by all users. - accounted on all projects. - completed normally or exited - executed on all hosts. - submitted to all queues. - accounted on all service classes. ------------------------------------------------------------------------------ Job <3329613>, User <rmurri>, Project <default>, Status <DONE>, Queue <pub.1h>, Command <sleep 60>, Share group charged </lsf_biol_all/lsf _aeber/rmurri> Mon Oct 8 17:07:54: Submitted from host <brutus4>, CWD <$HOME>, Output File <l sf.o%J>; Mon Oct 8 17:08:44: Dispatched to <a3201>; Mon Oct 8 17:09:51: Completed <done>. Accounting information about this job: Share group charged </lsf_biol_all/lsf_aeber/rmurri> CPU_T WAIT TURNAROUND STATUS HOG_FACTOR MEM SWAP 0.08 50 117 done 0.0007 5M 222M ------------------------------------------------------------------------------ SUMMARY: ( time unit: second ) Total number of done jobs: 1 Total number of exited jobs: 0 Total CPU time consumed: 0.1 Average CPU time consumed: 0.1 Maximum CPU time of a job: 0.1 Minimum CPU time of a job: 0.1 Total wait time in queues: 50.0 Average wait time in queue: 50.0 Maximum wait time in queue: 50.0 Minimum wait time in queue: 50.0 Average turnaround time: 117 (seconds/job) Maximum turnaround time: 117 Minimum turnaround time: 117 Average hog factor of a job: 0.00 ( cpu time / turnaround time ) Maximum hog factor of a job: 0.00 Minimum hog factor of a job: 0.00 """, # STDERR '') assert acct['duration'] == Duration('67s') assert acct['used_cpu_time'] == Duration('0.08s') assert acct['max_used_memory'] == Memory('227MB') # timestamps year = datetime.date.today().year assert (acct['lsf_submission_time'] == datetime.datetime(year, 10, 8, 17, 7, 54)) assert (acct['lsf_start_time'] == datetime.datetime(year, 10, 8, 17, 8, 44)) assert (acct['lsf_completion_time'] == datetime.datetime(year, 10, 8, 17, 9, 51))
def __init__(self, simulation_dir, executable=None, **extra_args): # remember for later self.simulation_dir = simulation_dir self.shared_FS = extra_args['shared_FS'] inputs = dict() # execution wrapper needs to be added anyway geotop_wrapper_sh = resource_filename(Requirement.parse("gc3pie"), "gc3libs/etc/geotop_wrap.sh") inputs[geotop_wrapper_sh] = os.path.basename(geotop_wrapper_sh) _command = "./%s " % os.path.basename(geotop_wrapper_sh) # If shared_FS, no inputs are defined # as they are already available on the computational nodes if not self.shared_FS: # compress input folder inputs.update(dict(self._scan_and_tar(simulation_dir))) # set ANY_OUTPUT for output outputs = gc3libs.ANY_OUTPUT # Set executable name and include in input list if executable is not None: # use the specified executable # include executable within input list executable_name = './' + os.path.basename(executable) inputs[executable] = os.path.basename(executable) # use '-l' flag for wrapper script for non-shared FS _command += "input.tgz " else: # sharedFS: everything is local executable_name = os.path.abspath(executable) _command += " %s " % os.path.abspath(self.simulation_dir) outputs = [] _command += "%s" % executable_name # set some execution defaults... extra_args.setdefault('requested_cores', 1) extra_args.setdefault('requested_architecture', Run.Arch.X86_64) extra_args.setdefault('requested_walltime', Duration(8, hours)) # ...and remove excess ones extra_args.pop('output_dir', None) Application.__init__( self, # GEOtop requires only one argument: the simulation directory # In our case, since all input files are staged to the # execution directory, the only argument is fixed to ``.`` # arguments = ['./'+os.path.basename(geotop_wrapper_sh), 'input.tgz', executable_name ], arguments=_command, inputs=inputs, # outputs = gc3libs.ANY_OUTPUT, outputs=outputs, output_dir=os.path.join(simulation_dir, 'tmp'), stdout='ggeotop.log', join=True, tags=['APPS/EARTH/GEOTOP-1.224'], **extra_args)
def create_run_jobs(self, user_name, job_collection, verbosity, duration, memory, cores): '''Creates jobs for the parallel "run" phase of the step. Parameters ---------- user_name: str name of the submitting user job_collection: tmlib.workflow.job.RunPhase empty collection of *run* jobs that should be populated verbosity: int logging verbosity for jobs duration: str computational time that should be allocated for a single job; in HH:MM:SS format memory: int amount of memory in Megabyte that should be allocated for a single job cores: int number of CPU cores that should be allocated for a single job Returns ------- tmlib.workflow.jobs.RunPhase collection of jobs ''' logger.info('create "run" jobs for submission %d', job_collection.submission_id) if cores > cfg.resource.max_cores_per_job: logger.warn('requested cores exceed available cores per job: %s', cfg.resource.max_cores_per_job) logger.debug('lowering number of requested cores to %d', cfg.resource.max_cores_per_job) cores = cfg.resource.max_cores_per_job # Until issue gc3pie#624 is fixed, `max_memory_per_core` # doubles up as "total memory per node" max_memory_per_node = cfg.resource.max_memory_per_core.amount( Memory.MB) if memory > max_memory_per_node: logger.warn( 'requested memory exceeds available memory per node: %d MB', max_memory_per_node) logger.debug('lowering requested memory to %d MB', max_memory_per_node) memory = max_memory_per_node logger.debug('allocated time for run jobs: %s', duration) logger.debug('allocated memory for run jobs: %s', memory) logger.debug('allocated cores for run jobs: %d', cores) job_ids = self.get_run_job_ids() for j in job_ids: job_collection.add( RunJob(**self._get_run_job_args( step_name=self.step_name, arguments=self._build_run_command(j, verbosity), output_dir=self.log_location, job_id=j, submission_id=job_collection.submission_id, user_name=user_name, parent_id=job_collection.persistent_id, requested_walltime=Duration(duration), requested_memory=Memory(memory, Memory.MB), requested_cores=cores, ))) return job_collection
def _parse_time_duration(val):
    """
    Convert the output of common Linux/UNIX system utilities into a
    GC3Pie `Duration` object.

    Any of the time formats *DD-HH:MM:SS* (days, hours, minutes,
    seconds), *HH:MM:SS* (hours, minutes, seconds), or *MM:SS*
    (minutes, seconds), or even just the number of seconds are
    acceptable::

      >>> _parse_time_duration('25-00:31:05') == Duration('25d') + Duration('31m') + Duration('5s')
      True

      >>> _parse_time_duration('1:02:03') == Duration('1h') + Duration('2m') + Duration('3s')
      True

      >>> _parse_time_duration('01:02') == Duration('1m') + Duration('2s')
      True

      >>> _parse_time_duration('42') == Duration(42, unit=Duration.s)
      True

    The *seconds* portion of the time string can be followed by
    decimal digits for greater precision::

      >>> _parse_time_duration('0:00.00') == Duration(0, unit=Duration.s)
      True

      >>> _parse_time_duration('4.20') == Duration(4.20, unit=Duration.s)
      True

    When only the number of seconds is given, an optional trailing
    unit specifier `s` is allowed::

      >>> _parse_time_duration('4.20s') == Duration(4.20, unit=Duration.s)
      True

    Among the programs whose output can be parsed by this function,
    there are:

    - GNU time's `%e` format specifier;
    - output of `ps -o etime=` (on both GNU/Linux and MacOSX)
    """
    n = val.count(':')
    if 2 == n:
        if '-' in val:
            days, timespan = val.split('-')
            return (Duration(days + 'd') + Duration(timespan))
        else:
            # Duration's ctor can natively parse this
            return Duration(val)
    elif 1 == n:
        # AA:BB is rejected as ambiguous by `Duration`'s built-in
        # parser; work around it
        mm, ss = val.split(':')
        return (Duration(int(mm, 10), unit=Duration.m)
                + Duration(float(ss), unit=Duration.s))
    elif 0 == n:
        # remove final unit spec, if present
        if val.endswith('s'):
            val = val[:-1]
        # number of seconds with up to 2 decimal precision
        return Duration(float(val), unit=Duration.s)
    else:
        raise ValueError("Expecting duration in the form HH:MM:SS, MM:SS,"
                         " or just number of seconds,"
                         " got {val} instead".format(val=val))
import shutil
# import csv

from pkg_resources import Requirement, resource_filename

import gc3libs
import gc3libs.exceptions
from gc3libs import Application, Run, Task
from gc3libs.cmdline import SessionBasedScript, executable_file
import gc3libs.utils
from gc3libs.quantity import Memory, kB, MB, MiB, GB, Duration, hours, minutes, seconds
from gc3libs.workflow import RetryableTask

DEFAULT_CORES = 1
DEFAULT_MEMORY = Memory(1500, MB)
DEFAULT_WALLTIME = Duration(300, hours)


## custom application class

class GrdockApplication(Application):
    """
    Custom class to wrap the execution of rdock.

    The wrapper script that will be executed on the remote end is
    organised in two steps:
      step 1: cavity creation (will use rbcavity)
      step 2: docking (will use rbdock)

    The application takes the input ligand file and a ligand index;
    the ligand index is used to create an output file that maintains
    the same ligand index as a suffix, e.g.:
      input ligand: Docking1.sd -> output: Docked1.sd
def create_run_jobs(self, user_name, job_collection, verbosity,
                    duration, memory, cores):
    '''Creates jobs for the parallel "run" phase of the step.

    The `illuminati` step is special in the sense that it implements
    multiple sequential runs within the "run" phase to build one pyramid
    zoom level after another.

    Parameters
    ----------
    user_name: str
        name of the submitting user
    job_collection: tmlib.workflow.jobs.RunPhase
        empty collection for "run" jobs
    verbosity: int
        logging verbosity for jobs
    duration: str
        computational time that should be allocated for a single job;
        in HH:MM:SS format
    memory: int
        amount of memory in Megabyte that should be allocated for a
        single job
    cores: int
        number of CPU cores that should be allocated for a single job

    Returns
    -------
    tmlib.workflow.jobs.RunPhase
        collection of jobs
    '''
    logger.info('create "run" jobs for submission %d',
                job_collection.submission_id)
    logger.debug('allocated time for "run" jobs: %s', duration)
    logger.debug('allocated memory for "run" jobs: %d MB', memory)
    logger.debug('allocated cores for "run" jobs: %d', cores)
    multi_run_jobs = collections.defaultdict(list)
    job_ids = self.get_run_job_ids()
    for j in job_ids:
        batch = self.get_run_batch(j)
        multi_run_jobs[batch['index']].append(j)
    for index, job_ids in multi_run_jobs.iteritems():
        subjob_collection = SingleRunPhase(
            step_name=self.step_name,
            index=index,
            submission_id=job_collection.submission_id,
            parent_id=job_collection.persistent_id)
        for j in job_ids:
            job = RunJob(step_name=self.step_name,
                         arguments=self._build_run_command(j, verbosity),
                         output_dir=self.log_location,
                         job_id=j,
                         index=index,
                         submission_id=subjob_collection.submission_id,
                         parent_id=subjob_collection.persistent_id,
                         user_name=user_name)
            if duration:
                job.requested_walltime = Duration(duration)
            if memory:
                job.requested_memory = Memory(memory, Memory.MB)
            if cores:
                if not isinstance(cores, int):
                    raise TypeError('Argument "cores" must have type int.')
                if not cores > 0:
                    raise ValueError(
                        'The value of "cores" must be positive.')
                job.requested_cores = cores
            subjob_collection.add(job)
        job_collection.add(subjob_collection)
    return job_collection