def __check_param(app, param_dict, param):
    """Test whether a tool parameter satisfies a rule.

    ``param_dict`` should be a series of nested dicts. ``param`` is a rule dict with:

    - ``name``: dotted path into ``param_dict``,
      e.g. 'reference_source.reference_source_selector'
    - ``value``: a single value or a list of values; with a list the result is
      the logical OR of the check against every entry
    - ``op`` (optional, defaults to '=='): key into ``OPERATIONS`` selecting the
      comparison
    - ``type`` (optional): 'data_table_lookup' resolves the runtime value through
      ``__data_table_lookup`` before comparing

    Returns False when the dotted path cannot be resolved in ``param_dict``.
    """
    dotted_name = param['name']
    expected = param['value']
    op_key = param.get('op', '==')
    kind = param.get('type')

    # While descending, any list met on the way is replaced by its first element.
    # This covers repeats (only the first repeat element is checked) and things
    # like Trinity paired inputs, which are lists of (single?) HDAs. More complex
    # rules are not expressible this way, but it is good enough for current needs.
    node = param_dict.copy()
    try:
        for segment in dotted_name.split('.'):
            node = node[segment]
            if isinstance(node, list):
                local.log.warning(
                    "Converting list param element '%s' to single (first) element: %s", segment, dotted_name)
                node = node[0]
    except (KeyError, IndexError):
        return False
    actual = node

    candidates = expected if isinstance(expected, list) else [expected]

    # TODO: probably shouldn't assume size, but that is all we are interested in
    # for now. If another property were needed, a 'property' key could name the
    # property of the param to check.
    compare_sizes = True
    if kind == 'data_table_lookup':
        # TODO: any way to automatically detect if a param is a data table value
        actual = __data_table_lookup(app, param, actual)
    elif isinstance(actual, model.HistoryDatasetCollectionAssociation):
        # TODO: this is probably only valid for pairs; maybe use
        # sum([x.get_size() for x in runtime_value.dataset_instances]) instead?
        actual = actual.dataset_instances[0].get_size()
    elif isinstance(actual, model.DatasetCollectionElement):
        actual = actual.first_dataset_instance().get_size()
    elif isinstance(actual, model.DatasetInstance) or hasattr(actual, 'get_size'):
        # hasattr is there for tests -- is there a better way to mock it?
        actual = actual.get_size()
    else:
        compare_sizes = False

    if compare_sizes:
        # TODO: maybe store the converted values since they never change, but the
        # YAML is reloaded frequently via the caching function, so that is easier
        # said than done for probably negligible gain.
        candidates = [size_to_bytes(str(candidate)) for candidate in candidates]

    compare = OPERATIONS[op_key]
    return any([compare(actual, candidate) for candidate in candidates])
def _parse_native_specs(job_id, native_spec):
    """
    Determine requested run time and memory from native specs.

    job_id       the job ID (only used for logging)
    native_spec  the native specification string passed to GE
                 (e.g. h_rt=01:00:02 -l h_vmem=1G)

    return time,mem (or None,None if nothing found)

    A value that is present but cannot be parsed is logged as an error and
    reported as None.
    """
    tme = None
    mem = None

    # parse time
    m = re.search(r"rt=([0-9:]+)[\s,]*", native_spec)
    if m is not None:
        tme = _parse_time(m.group(1))
        if tme is None:
            log.error(
                "DRMAAUniva: job {job_id} has unparsable time native spec {spec}"
                .format(job_id=job_id, spec=native_spec))

    # parse memory
    m = re.search(r"mem=([\d.]+[KGMT]?)[\s,]*", native_spec)
    if m is not None:
        try:
            mem = util.size_to_bytes(m.group(1))
        except (AssertionError, ValueError):
            # size_to_bytes signals unparsable input by raising (e.g. for
            # "1..2G", which the pattern above accepts); treat it as "no value"
            # so the error below is logged instead of the exception escaping.
            mem = None
        if mem is None:
            log.error(
                "DRMAAUniva: job {job_id} has unparsable memory native spec {spec}"
                .format(job_id=job_id, spec=native_spec))
    return tme, mem
def __override_params(selections, destination_config, override_allowed):
    """Turn user-selected resource params into the values that will be applied.

    For every param in ``selections`` with a non-zero value, the value is
    converted with ``__convert_resource_param`` and capped at a limit taken from
    ``destination_config``: the 'override' entry when ``override_allowed`` is
    true, falling back to the 'max' entry when no override limit is set. If the
    destination defines a 'normalize' size for the param, the value is rounded
    down to a whole multiple of it.

    Returns a dict of param -> value holding only values greater than zero;
    anything else is logged as not allowed and ignored.
    """
    accepted = {}
    for param, requested in selections.items():
        if requested == 0:
            continue

        limit = 0
        if override_allowed:
            # With no override configured for this param, it can still be
            # raised up to the max (picked up just below).
            limit = destination_config.get('override', {}).get(param, 0)
        if limit == 0:
            limit = destination_config.get('max', {}).get(param, 0)

        amount = min(
            __convert_resource_param(param, requested),
            __convert_tool_mapping_param(param, limit),
        )

        unit = destination_config.get('normalize', {}).get(param, None)
        if unit:
            local.log.debug("Normalizing '%s bytes' by '%s'", amount, unit)
            unit_bytes = size_to_bytes(str(unit))
            multiples = int(amount / unit_bytes)
            amount = multiples * unit_bytes
            local.log.debug("Normalized to '%s * %s = %s'", multiples, unit_bytes, amount)

        if amount > 0:
            accepted[param] = amount
            local.log.debug("Value of param '%s' set by user: %s", param, amount)
        else:
            local.log.warning(
                "User set param '%s' to '%s' but that is not allowed, so it will be ignored", param, requested)
    return accepted
def _parse_amount(self, amount: str) -> Optional[Union[int, bool]]:
    """Convert a user-supplied quota amount.

    Returns None when ``amount`` is one of 'unlimited', 'none' or 'no limit'
    (case-insensitive), the result of ``util.size_to_bytes`` otherwise, and
    False when ``util.size_to_bytes`` rejects the string.
    """
    if amount.lower() in ('unlimited', 'none', 'no limit'):
        return None
    try:
        return util.size_to_bytes(amount)
    except (AssertionError, ValueError):
        # Both exception types are treated as "unparsable", matching edit_quota.
        return False
def _create_quota(self, params):
    """Validate ``params`` and create a new quota with its associations.

    Validation happens in a fixed order (name/description, unique name, amount
    present, amount parsable, operation, default type, operation constraints for
    default and unlimited quotas); the first failing check raises
    ``ActionInputError``.

    When ``params.default`` is not 'no' the quota is registered as a default
    quota through the quota agent; otherwise user and group associations are
    created from ``params.in_users`` and ``params.in_groups``.

    Returns a ``(quota, message)`` tuple.
    """
    amount = params.amount
    if not amount:
        # A missing amount is reported by the validation below; guarding here
        # keeps a None amount from failing on .lower() before that check runs.
        create_amount = False
    elif amount.lower() in ('unlimited', 'none', 'no limit'):
        create_amount = None
    else:
        try:
            create_amount = util.size_to_bytes(amount)
        except (AssertionError, ValueError):
            # Both exception types are treated as "unparsable", matching edit_quota.
            create_amount = False

    quota_model = self.app.model.Quota
    if not params.name or not params.description:
        raise ActionInputError("Enter a valid name and a description.")
    if self.sa_session.query(quota_model).filter(quota_model.table.c.name == params.name).first():
        raise ActionInputError(
            "Quota names must be unique and a quota with that name already exists, so choose another name."
        )
    if not params.get('amount', None):
        raise ActionInputError("Enter a valid quota amount.")
    if create_amount is False:
        raise ActionInputError("Unable to parse the provided amount.")
    if params.operation not in quota_model.valid_operations:
        raise ActionInputError("Enter a valid operation.")
    if params.default != 'no' and params.default not in self.app.model.DefaultQuotaAssociation.types.__dict__.values():
        raise ActionInputError("Enter a valid default type.")
    if params.default != 'no' and params.operation != '=':
        raise ActionInputError("Operation for a default quota must be '='.")
    if create_amount is None and params.operation != '=':
        raise ActionInputError("Operation for an unlimited quota must be '='.")

    # Create the quota
    quota = quota_model(name=params.name,
                        description=params.description,
                        amount=create_amount,
                        operation=params.operation)
    self.sa_session.add(quota)
    if params.default != 'no':
        # This is a default quota: create the DefaultQuotaAssociation
        self.app.quota_agent.set_default_quota(params.default, quota)
    else:
        # Create the UserQuotaAssociations
        for user in [self.sa_session.query(self.app.model.User).get(x) for x in params.in_users]:
            uqa = self.app.model.UserQuotaAssociation(user, quota)
            self.sa_session.add(uqa)
        # Create the GroupQuotaAssociations
        for group in [self.sa_session.query(self.app.model.Group).get(x) for x in params.in_groups]:
            gqa = self.app.model.GroupQuotaAssociation(group, quota)
            self.sa_session.add(gqa)
    self.sa_session.flush()
    message = "Quota '%s' has been created with %d associated users and %d associated groups." % \
        (quota.name, len(params.in_users), len(params.in_groups))
    return quota, message
def to_destination_if_size(job, max_size, to_destination_id, fallback_destination_id):
    """
    Routing rule: pick a destination based on the job's total input size.

    The sizes of ``job.input_datasets`` and ``job.input_library_datasets`` are
    added up. ``to_destination_id`` is returned when that total does not exceed
    ``max_size`` (converted with ``size_to_bytes``); otherwise
    ``fallback_destination_id`` is returned.
    """
    history_bytes = __calculate_dataset_total(job.input_datasets)
    library_bytes = __calculate_dataset_total(job.input_library_datasets)
    fits = (history_bytes + library_bytes) <= size_to_bytes(max_size)
    return to_destination_id if fits else fallback_destination_id
def _create_quota(self, params):
    """Create a quota from ``params`` after validating them.

    The checks run in this order and the first failure raises
    ``ActionInputError``: name and description present, name unique, amount
    present, amount parsable, operation valid, default type valid, and '=' as
    the operation for default and for unlimited quotas.

    A quota whose ``params.default`` is not 'no' is registered as default quota
    via the quota agent; any other quota is associated with the users and groups
    listed in ``params.in_users`` / ``params.in_groups``.

    Returns the tuple ``(quota, message)``.
    """
    requested = params.amount
    if not requested:
        # Leave reporting to the "Enter a valid quota amount." check below; this
        # only prevents a None amount from raising AttributeError on .lower().
        create_amount = False
    elif requested.lower() in ('unlimited', 'none', 'no limit'):
        create_amount = None
    else:
        try:
            create_amount = util.size_to_bytes(requested)
        except (AssertionError, ValueError):
            # size_to_bytes may reject input with either exception type
            create_amount = False

    if not params.name or not params.description:
        raise ActionInputError("Enter a valid name and a description.")
    elif self.sa_session.query(self.app.model.Quota).filter(self.app.model.Quota.table.c.name == params.name).first():
        raise ActionInputError("Quota names must be unique and a quota with that name already exists, so choose another name.")
    elif not params.get('amount', None):
        raise ActionInputError("Enter a valid quota amount.")
    elif create_amount is False:
        raise ActionInputError("Unable to parse the provided amount.")
    elif params.operation not in self.app.model.Quota.valid_operations:
        raise ActionInputError("Enter a valid operation.")
    elif params.default != 'no' and params.default not in self.app.model.DefaultQuotaAssociation.types.__dict__.values():
        raise ActionInputError("Enter a valid default type.")
    elif params.default != 'no' and params.operation != '=':
        raise ActionInputError("Operation for a default quota must be '='.")
    elif create_amount is None and params.operation != '=':
        raise ActionInputError("Operation for an unlimited quota must be '='.")
    else:
        # Create the quota
        quota = self.app.model.Quota(
            name=params.name,
            description=params.description,
            amount=create_amount,
            operation=params.operation,
        )
        self.sa_session.add(quota)
        # If this is a default quota, create the DefaultQuotaAssociation
        if params.default != 'no':
            self.app.quota_agent.set_default_quota(params.default, quota)
        else:
            # Create the UserQuotaAssociations
            users = [self.sa_session.query(self.app.model.User).get(x) for x in params.in_users]
            for user in users:
                self.sa_session.add(self.app.model.UserQuotaAssociation(user, quota))
            # Create the GroupQuotaAssociations
            groups = [self.sa_session.query(self.app.model.Group).get(x) for x in params.in_groups]
            for group in groups:
                self.sa_session.add(self.app.model.GroupQuotaAssociation(group, quota))
        self.sa_session.flush()
        message = "Quota '%s' has been created with %d associated users and %d associated groups." % \
            (quota.name, len(params.in_users), len(params.in_groups))
        return quota, message
def _edit_quota(self, quota, params):
    """Update the amount and operation of ``quota`` from ``params``.

    ``params.amount`` may be 'unlimited', 'none' or 'no limit'
    (case-insensitive), which stores None as the amount; any other value is
    converted with ``util.size_to_bytes``.

    Raises ``ActionInputError`` when the amount is missing, when it cannot be
    parsed, or when ``params.operation`` is not a valid quota operation.

    Returns a message describing the updated quota.
    """
    requested = params.amount
    if not requested:
        # Checked first so a None amount yields the intended input error rather
        # than an AttributeError from .lower().
        raise ActionInputError('Enter a valid amount')

    if requested.lower() in ('unlimited', 'none', 'no limit'):
        new_amount = None
    else:
        try:
            new_amount = util.size_to_bytes(requested)
        except (AssertionError, ValueError):
            # size_to_bytes may reject input with either exception type
            new_amount = False

    if new_amount is False:
        raise ActionInputError('Unable to parse the provided amount')
    if params.operation not in self.app.model.Quota.valid_operations:
        raise ActionInputError('Enter a valid operation')

    quota.amount = new_amount
    quota.operation = params.operation
    self.sa_session.add(quota)
    self.sa_session.flush()
    message = "Quota '%s' is now '%s'" % (quota.name, quota.operation + quota.display_amount)
    return message
def edit_quota(self, quota, params) -> str:
    """Update the amount and operation of ``quota`` from ``params``.

    ``params.amount`` may be 'unlimited', 'none' or 'no limit'
    (case-insensitive), which stores None as the amount; any other value is
    converted with ``util.size_to_bytes``.

    Raises ``ActionInputError`` when the amount is missing, when it cannot be
    parsed, or when ``params.operation`` is not in ``model.Quota.valid_operations``.

    Returns a message describing the updated quota.
    """
    requested = params.amount
    if not requested:
        # Checked first so a None amount yields the intended input error rather
        # than an AttributeError from .lower().
        raise ActionInputError('Enter a valid amount.')

    if requested.lower() in ('unlimited', 'none', 'no limit'):
        new_amount = None
    else:
        try:
            new_amount = util.size_to_bytes(requested)
        except (AssertionError, ValueError):
            new_amount = False

    if new_amount is False:
        raise ActionInputError('Unable to parse the provided amount.')
    if params.operation not in model.Quota.valid_operations:
        raise ActionInputError('Enter a valid operation.')

    quota.amount = new_amount
    quota.operation = params.operation
    self.sa_session.add(quota)
    self.sa_session.flush()
    message = f"Quota '{quota.name}' is now '{quota.operation}{quota.display_amount}'."
    return message
def _parse_native_specs(job_id, native_spec):
    """
    Determine requested run time and memory from native specs.

    job_id       the job ID (only used for logging)
    native_spec  the native specification string passed to GE
                 (e.g. h_rt=01:00:02 -l h_vmem=1G)

    return time,mem (or None,None if nothing found)

    An entry that is found but cannot be parsed is logged as an error and its
    value is returned as None.
    """
    tme = None
    mem = None

    # parse time
    time_match = re.search(r"rt=([0-9:]+)[\s,]*", native_spec)
    if time_match is not None:
        tme = _parse_time(time_match.group(1))
        if tme is None:
            log.error("DRMAAUniva: job {job_id} has unparsable time native spec {spec}".format(job_id=job_id, spec=native_spec))

    # parse memory
    mem_match = re.search(r"mem=([\d.]+[KGMT]?)[\s,]*", native_spec)
    if mem_match is not None:
        try:
            mem = util.size_to_bytes(mem_match.group(1))
        except (AssertionError, ValueError):
            # The pattern accepts strings such as "1..2G" that size_to_bytes
            # rejects by raising; report them through the log message below
            # instead of letting the exception propagate.
            mem = None
        if mem is None:
            log.error("DRMAAUniva: job {job_id} has unparsable memory native spec {spec}".format(job_id=job_id, spec=native_spec))
    return tme, mem
def _get_drmaa_state_qacct(self, job_id, extinfo):
    '''
    Get the job (drmaa) state with qacct.

    extinfo: dict that is filled with details about the job:
    - signal         name of the signal that ended the job; set when the exit
                     status is larger than 128 (signal number = exit status - 128).
                     If that number is not a known signal, a placeholder of the
                     form UNKNOWN_SIGNAL_<number> is stored.
    - exit_status    set to the exit status if it is larger than 0 and less
                     than 129
    - deleted        set to True if the job was deleted (otherwise not set at all)
    - time_wasted    time used, parsed from wallclock by _parse_time
    - memory_wasted  memory used by the program in bytes (taken from maxvmem)
    - slots          number of slots (taken from slots)

    return state
    - UNDETERMINED is returned right away if qacct fails (after the retries)
    - FAILED is returned right away if the job was deleted
    otherwise the state is initialised with UNDETERMINED and changed in the
    following cases
    - DONE if exit status < 1
    - FAILED if exit status > 0
    - RUNNING if failed in 24,25
    - FAILED if failed not in [0,24,25,100]
    The "failed" field is evaluated last and therefore takes precedence.
    '''
    # signal number -> signal name. Walking the names in reverse-sorted order
    # makes the alphabetically first name win when several share a number.
    signals = {
        k: v
        for v, k in reversed(sorted(signal.__dict__.items()))
        if v.startswith('SIG') and not v.startswith('SIG_')
    }
    cmd = ['qacct', '-j', job_id]
    slp = 1
    # run qacct -j JOBID (since the accounting data for the job might not be
    # available immediately a simple retry mechanism is implemented ..
    # max wait is approx 1min)
    while True:
        try:
            stdout = commands.execute(cmd).strip()
        except commands.CommandLineException as e:
            if slp <= 32 and f"job id {job_id} not found" in e.stderr:
                time.sleep(slp)
                slp *= 2
                continue
            else:
                log.error(unicodify(e))
                return self.drmaa.JobState.UNDETERMINED
        else:
            break

    qacct = dict()
    for line in stdout.split("\n"):
        # remove header
        if line.startswith("=") or line == "":
            continue
        fields = line.split()
        if not fields:
            # whitespace-only line: nothing to record
            continue
        qacct[fields[0]] = " ".join(fields[1:])

    # qacct has three fields of interest: failed, exit_status, deleted_by
    # experiments
    #            failed  exit_status deleted_by
    # BASH ------------------------------------
    # time-limit 100     137
    # mem-limit  0       2
    # python --------------------------------------------------------------
    # time-limit
    # mem-limit  0       1
    # C -------------------------------------------------------------------
    # time-limit
    # mem-limit  0       C programm either have segfault (139) or allocated memory is checked for NULL
    #                    (then a programmer defined message/exit code is given)
    #                    note that max_vmem might not be reliable, since the program never gets the memory.
    # C++ -----------------------------------------------------------------
    # time-limit
    # mem-limit  0       same as for C programs
    # JAVA ----------------------------------------------------------------
    # time-limit
    # mem-limit
    # perl ----------------------------------------------------------------
    # time-limit
    # mem-limit
    # bash other tests ----------------------------------------------------
    # qdel       100     137         user@mail
    extinfo["time_wasted"] = _parse_time(qacct["wallclock"])
    extinfo["memory_wasted"] = size_to_bytes(qacct["maxvmem"])
    extinfo["slots"] = int(qacct["slots"])

    # deleted_by
    # If the job (the array task) has been deleted via qdel, "<username>@<hostname>", else
    # "NONE". If qdel was called multiple times, every invocation is recorded in a comma
    # separated list.
    if "deleted_by" in qacct and qacct["deleted_by"] != "NONE":
        log.info(f"DRMAAUniva: job {job_id} was aborted by {qacct['deleted_by']}")
        extinfo["deleted"] = True
        return self.drmaa.JobState.FAILED

    state = self.drmaa.JobState.UNDETERMINED
    # exit_status
    # Exit status of the job script (or Univa Grid Engine specific status in case of certain
    # error conditions). The exit status is determined by following the normal shell conventions
    # If the command terminates normally the value of the command is its exit status.
    # However, in the case that the command exits abnormally, a value of 0200 (octal), 128
    # (decimal) is added to the value of the command to make up the exit status.
    # For example: If a job dies through signal 9 (SIGKILL) then the exit status
    # becomes 128 + 9 = 137.
    if "exit_status" in qacct:
        qacct["exit_status"] = int(qacct["exit_status"])
        if qacct["exit_status"] < 1:
            log.error(f"DRMAAUniva: job {job_id} has exit status {qacct['exit_status']}")
            state = self.drmaa.JobState.DONE
        elif 0 < qacct["exit_status"] < 129:
            log.error(f"DRMAAUniva: job {job_id} has exit status {qacct['exit_status']}")
            extinfo['exit_status'] = qacct["exit_status"]
            state = self.drmaa.JobState.FAILED
        else:
            log.error(f"DRMAAUniva: job {job_id} was killed by signal {qacct['exit_status'] - 128}")
            state = self.drmaa.JobState.FAILED
            signum = qacct["exit_status"] - 128
            # A status above 128 need not correspond to a real signal (a script
            # may simply `exit 255`); fall back to a placeholder name instead
            # of failing with a KeyError.
            extinfo["signal"] = signals.get(signum, f"UNKNOWN_SIGNAL_{signum}")

    # failed
    # Indicates the problem which occurred in case a job could not be started on the execution
    # host (e.g. because the owner of the job did not have a valid account on that
    # machine). If Univa Grid Engine tries to start a job multiple times, this may lead to
    # multiple entries in the accounting file corresponding to the same job ID.
    # for the codes see https://docs.oracle.com/cd/E19957-01/820-0699/chp11-2/index.html
    if "failed" in qacct:
        code = int(qacct["failed"].split()[0])
        # this happens in case of no error or exit_code!=0 (0) or a signal (100).
        # both cases are covered already
        if code in [0, 100]:
            pass
        # these seem to be OK as well
        elif code in [24, 25]:
            state = self.drmaa.JobState.RUNNING
        else:
            log.error(f"DRMAAUniva: job {job_id} failed with failure {qacct['failed']}")
            state = self.drmaa.JobState.FAILED
    return state
def _get_drmaa_state_qacct(self, job_id, extinfo):
    '''
    Get the job (drmaa) state with qacct.

    extinfo: dict that is filled with details about the job:
    - signal         name of the signal that ended the job; set when the exit
                     status is larger than 128 (signal number = exit status - 128).
                     If that number is not a known signal, a placeholder of the
                     form UNKNOWN_SIGNAL_<number> is stored.
    - exit_status    set to the exit status if it is larger than 0 and less
                     than 129
    - deleted        set to True if the job was deleted (otherwise not set at all)
    - time_wasted    time used, parsed from wallclock by _parse_time
    - memory_wasted  memory used by the program in bytes (taken from maxvmem)
    - slots          number of slots (taken from slots)

    return state
    - UNDETERMINED is returned right away if qacct fails (after the retries)
    - FAILED is returned right away if the job was deleted
    otherwise the state is initialised with UNDETERMINED and changed in the
    following cases
    - DONE if exit status < 1
    - FAILED if exit status > 0
    - RUNNING if failed in 24,25
    - FAILED if failed not in [0,24,25,100]
    The "failed" field is evaluated last and therefore takes precedence.
    '''
    # signal number -> signal name. Walking the names in reverse-sorted order
    # makes the alphabetically first name win when several share a number.
    signals = dict((k, v) for v, k in reversed(sorted(signal.__dict__.items()))
                   if v.startswith('SIG') and not v.startswith('SIG_'))
    cmd = ['qacct', '-j', job_id]
    slp = 1
    # run qacct -j JOBID (since the accounting data for the job might not be
    # available immediately a simple retry mechanism is implemented ..
    # max wait is approx 1min)
    while True:
        # universal_newlines=True makes the pipes return text, so the substring
        # search and the split("\n") below also work on Python 3, where the
        # pipes would otherwise deliver bytes.
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
        stdout, stderr = p.communicate()
        stderr = stderr.strip()
        if p.returncode != 0:
            if slp <= 32 and "job id {jobid} not found".format(jobid=job_id) in stderr:
                time.sleep(slp)
                slp *= 2
                continue
            else:
                # log.error rather than log.exception: no exception is being
                # handled here, so there is no traceback to attach.
                log.error('`%s` returned %s, stderr: %s' % (' '.join(cmd), p.returncode, stderr))
                return self.drmaa.JobState.UNDETERMINED
        else:
            break

    qacct = dict()
    for line in stdout.split("\n"):
        # remove header
        if line.startswith("=") or line == "":
            continue
        fields = line.split()
        if not fields:
            # whitespace-only line: nothing to record
            continue
        qacct[fields[0]] = " ".join(fields[1:])

    # qacct has three fields of interest: failed, exit_status, deleted_by
    # experiments
    #            failed  exit_status deleted_by
    # BASH ------------------------------------
    # time-limit 100     137
    # mem-limit  0       2
    # python --------------------------------------------------------------
    # time-limit
    # mem-limit  0       1
    # C -------------------------------------------------------------------
    # time-limit
    # mem-limit  0       C programm either have segfault (139) or allocated memory is checked for NULL
    #                    (then a programmer defined message/exit code is given)
    #                    note that max_vmem might not be reliable, since the program never gets the memory.
    # C++ -----------------------------------------------------------------
    # time-limit
    # mem-limit  0       same as for C programs
    # JAVA ----------------------------------------------------------------
    # time-limit
    # mem-limit
    # perl ----------------------------------------------------------------
    # time-limit
    # mem-limit
    # bash other tests ----------------------------------------------------
    # qdel       100     137         user@mail
    extinfo["time_wasted"] = _parse_time(qacct["wallclock"])
    extinfo["memory_wasted"] = util.size_to_bytes(qacct["maxvmem"])
    extinfo["slots"] = int(qacct["slots"])

    # deleted_by
    # If the job (the array task) has been deleted via qdel, "<username>@<hostname>", else
    # "NONE". If qdel was called multiple times, every invocation is recorded in a comma
    # separated list.
    if "deleted_by" in qacct and qacct["deleted_by"] != "NONE":
        log.error("DRMAAUniva: job {job_id} was aborted by {culprit}".format(job_id=job_id, culprit=qacct["deleted_by"]))
        extinfo["deleted"] = True
        return self.drmaa.JobState.FAILED

    state = self.drmaa.JobState.UNDETERMINED
    # exit_status
    # Exit status of the job script (or Univa Grid Engine specific status in case of certain
    # error conditions). The exit status is determined by following the normal shell conventions
    # If the command terminates normally the value of the command is its exit status.
    # However, in the case that the command exits abnormally, a value of 0200 (octal), 128
    # (decimal) is added to the value of the command to make up the exit status.
    # For example: If a job dies through signal 9 (SIGKILL) then the exit status
    # becomes 128 + 9 = 137.
    if "exit_status" in qacct:
        qacct["exit_status"] = int(qacct["exit_status"])
        if qacct["exit_status"] < 1:
            log.error("DRMAAUniva: job {job_id} has exit status {status}".format(job_id=job_id, status=qacct["exit_status"]))
            state = self.drmaa.JobState.DONE
        elif 0 < qacct["exit_status"] < 129:
            log.error("DRMAAUniva: job {job_id} has exit status {status}".format(job_id=job_id, status=qacct["exit_status"]))
            extinfo['exit_status'] = qacct["exit_status"]
            state = self.drmaa.JobState.FAILED
        else:
            log.error("DRMAAUniva: job {job_id} was killed by signal {signal}".format(job_id=job_id, signal=qacct["exit_status"] - 128))
            state = self.drmaa.JobState.FAILED
            signum = qacct["exit_status"] - 128
            # A status above 128 need not correspond to a real signal (a script
            # may simply `exit 255`); fall back to a placeholder name instead
            # of failing with a KeyError.
            extinfo["signal"] = signals.get(signum, "UNKNOWN_SIGNAL_{0}".format(signum))

    # failed
    # Indicates the problem which occurred in case a job could not be started on the execution
    # host (e.g. because the owner of the job did not have a valid account on that
    # machine). If Univa Grid Engine tries to start a job multiple times, this may lead to
    # multiple entries in the accounting file corresponding to the same job ID.
    # for the codes see https://docs.oracle.com/cd/E19957-01/820-0699/chp11-2/index.html
    if "failed" in qacct:
        code = int(qacct["failed"].split()[0])
        # this happens in case of no error or exit_code!=0 (0) or a signal (100).
        # both cases are covered already
        if code in [0, 100]:
            pass
        # these seem to be OK as well
        elif code in [24, 25]:
            state = self.drmaa.JobState.RUNNING
        else:
            log.error("DRMAAUniva: job {job_id} failed with failure {failure}".format(job_id=job_id, failure=qacct["failed"]))
            state = self.drmaa.JobState.FAILED
    return state
} NATIVE_SPEC_PARAMS = ( 'submit_native_specification', 'native_specification', 'nativeSpecification', ) RESOURCE_PARAM_CONVERSIONS = { # in: int in GB; out: int in bytes 'mem': lambda x: x * (1024**3), } TOOL_MAPPING_PARAM_CONVERSIONS = { # in: size str; out: int in bytes 'mem': lambda x: size_to_bytes(str(x)), } NATIVE_SPEC_PARAM_CONVERSIONS = { # in: int in bytes or size str; out: int in mb 'mem': lambda x: int(size_to_bytes(str(x)) / (1024**2)), 'time': lambda x: '{}:00:00'.format(x) if isinstance(x, int) else x, } deferred_jobs = {} share_job_counts = {} class JobLogger(logging.LoggerAdapter): """Custom logger adapter to prepend job id to all messages""" def process(self, msg, kwargs):