def requirement(self):
    ssh = ScaleTools.Ssh(host=self.getConfig(self.configCondorServer),
                         username=self.getConfig(self.configCondorUser),
                         key=self.getConfig(self.configCondorKey))

    # Target.Requirements can't be filtered with -constraint since it would require
    # ClassAd based regex matching.
    # TODO: Find a more generic way to match resources/requirements (condor_q -slotads ??)
    # cmd_idle = "condor_q -constraint 'JobStatus == 1' -slotads slotads_bwforcluster " \
    #            "-analyze:summary,reverse | tail -n1 | awk -F ' ' " \
    #            "'{print $3 "\n" $4}'| sort -n | head -n1"
    constraint = "( %s ) && ( %s )" % (self._query_constraints,
                                       self.getConfig(self.configCondorConstraint))
    cmd = "condor_q -global -constraint '%s' %s" % (constraint, self._query_format_string)
    result = ssh.handleSshCall(call=cmd, quiet=True)

    if result[0] != 0:
        self.logger.warning("Could not get HTCondor queue status! %d: %s"
                            % (result[0], result[2]))
        return None
    elif any(error_string in result[1] for error_string in self._CLI_error_strings):
        self.logger.warning("condor_q request timed out.")
        return None

    # One comma-separated record per job: "<JobStatus>,<RequestCpus>,<Requirements>".
    # maxsplit=2 keeps commas inside the requirement expression intact.
    queue_line = (entry.split(",", 2) for entry in str(result[1]).splitlines())
    converted_line = ((int(status), int(cores), requirement)
                      for status, cores, requirement in queue_line)

    if self.getConfig(self.configCondorRequirement):
        # TODO: We could use ClassAd bindings to check requirement(s).
        filtered_line = ((status, cores)
                         for status, cores, requirement in converted_line
                         if self.getConfig(self.configCondorRequirement) in requirement)
    else:
        filtered_line = ((status, cores)
                         for status, cores, requirement in converted_line)

    required_cpus_total = 0
    required_cpus_idle_jobs = 0
    required_cpus_running_jobs = 0

    try:
        for job_status, requested_cpus in filtered_line:
            required_cpus_total += requested_cpus
            if job_status == self.condorStatusIdle:
                required_cpus_idle_jobs += requested_cpus
            elif job_status == self.condorStatusRunning:
                required_cpus_running_jobs += requested_cpus
    except ValueError:
        # This error should only occur if the result was empty AND CondorRequirement is initial.
        required_cpus_total = 0
        required_cpus_idle_jobs = 0
        required_cpus_running_jobs = 0

    self.logger.debug("HTCondor queue: Idle: %d; Running: %d."
                      % (required_cpus_idle_jobs, required_cpus_running_jobs))

    # cores -> machines: machine definition required for RequirementAdapter.
    # Negating the core count turns floor division into a ceiling division:
    # ceil(total / cores) == -(total // -cores).
    n_cores = -int(self.getConfig(self.configMachines)[self.getNeededMachineType()]["cores"])
    self._curRequirement = -(required_cpus_total // n_cores)

    with Logging.JsonLog() as json_log:
        json_log.addItem(self.getNeededMachineType(), "jobs_idle", required_cpus_idle_jobs)
        json_log.addItem(self.getNeededMachineType(), "jobs_running", required_cpus_running_jobs)

    return self._curRequirement
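# --------------------------------------------------------------------------
# Illustrative sketch (not part of the adapter): the parsing above assumes
# condor_q emits one comma-separated record per job, e.g. via a format string
# such as "-autoformat:, JobStatus RequestCpus Requirements" (an assumption
# here; the actual value lives in self._query_format_string). A hypothetical
# stand-alone run of the same generator pipeline:
#
#   sample = '1,4,( TARGET.Site == "foo" )\n2,8,( TARGET.Site == "foo" )'
#   queue_line = (entry.split(",", 2) for entry in sample.splitlines())
#   converted = [(int(s), int(c), r) for s, c, r in queue_line]
#   # -> [(1, 4, '( TARGET.Site == "foo" )'), (2, 8, '( TARGET.Site == "foo" )')]
#
# With HTCondor's JobStatus codes (1 = Idle, 2 = Running) this counts
# 4 idle and 8 running CPUs, i.e. 12 required CPUs in total.
# --------------------------------------------------------------------------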
def requirement(self):
    ssh = ScaleTools.Ssh(host=self.getConfig(self.configSlurmServer),
                         username=self.getConfig(self.configSlurmUser),
                         key=self.getConfig(self.configSlurmKey))

    self.logger.info("Checking requirements in partition {}".format(
        self.getConfig(self.configSlurmPartition)))
    # %T: job state, %r: reason, %c: minimum number of CPUs requested.
    cmd = 'squeue -p {} --noheader --format="%T %r %c"'.format(
        self.getConfig(self.configSlurmPartition))
    result = ssh.handleSshCall(call=cmd, quiet=True)

    if result[0] != 0:
        self.logger.warning("Could not get Slurm queue status! %d: %s"
                            % (result[0], result[2]))
        return None
    elif any(error_string in result[1] for error_string in self._CLI_error_strings):
        self.logger.warning("squeue request timed out.")
        return None

    required_cpus_total = 0
    required_cpus_idle_jobs = 0
    required_cpus_running_jobs = 0
    cpus_dependency_jobs = 0

    for line in result[1].splitlines():
        values = line.split()
        if len(values) != 3:
            continue
        if "Dependency" in values[1]:
            # Jobs waiting on a dependency are tracked separately and do not
            # count towards the requirement.
            cpus_dependency_jobs += int(values[2])
        elif "PartitionTimeLimit" in values[1]:
            continue
        elif "PENDING" in values[0]:
            required_cpus_total += int(values[2])
            required_cpus_idle_jobs += int(values[2])
        elif "RUNNING" in values[0]:
            required_cpus_total += int(values[2])
            required_cpus_running_jobs += int(values[2])
        else:
            self.logger.warning("Unknown job state: %s. Ignoring.", values[0])

    self.logger.debug("Slurm queue in partition %s: Idle: %d; Running: %d."
                      % (self.getConfig(self.configSlurmPartition),
                         required_cpus_idle_jobs, required_cpus_running_jobs))

    # cores -> machines: machine definition required for RequirementAdapter.
    # Negating the core count turns floor division into a ceiling division.
    n_cores = -int(self.getConfig(self.configMachines)[self.getNeededMachineType()]["cores"])
    self._curRequirement = -(required_cpus_total // n_cores)

    self.logger.debug("Required CPUs total=%s" % required_cpus_total)
    self.logger.debug("Required CPUs idle jobs=%s" % required_cpus_idle_jobs)
    self.logger.debug("Required CPUs running jobs=%s" % required_cpus_running_jobs)
    self.logger.debug("CPUs dependency jobs=%s" % cpus_dependency_jobs)

    with Logging.JsonLog() as json_log:
        json_log.addItem(self.getNeededMachineType(), "jobs_idle", required_cpus_idle_jobs)
        json_log.addItem(self.getNeededMachineType(), "jobs_running", required_cpus_running_jobs)

    return self._curRequirement
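# --------------------------------------------------------------------------
# Illustrative sketch (not part of the adapter): squeue with
# --format="%T %r %c" prints one job per line as "<STATE> <REASON> <MIN_CPUS>".
# Hypothetical output:
#
#   PENDING Resources 4
#   PENDING Dependency 2
#   RUNNING None 8
#
# The loop above would count 4 idle CPUs (the Dependency job is tracked
# separately and excluded from the total), 8 running CPUs, and 12 required
# CPUs in total. With a 4-core machine type the requirement becomes
# -(12 // -4) == 3 machines, the negated-floor-division form of a ceiling.
# --------------------------------------------------------------------------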