def get_new_events(self, start):
    """Get all new Events from Slurm DB since start datetime.

    Parameter start must be a valid datetime. Returns a list of Events.
    The list is empty if none found.

    Raises HPCStatsSourceError when an event refers to a node absent
    from the loaded architecture, or when the cpu count cannot be
    extracted from the event tres string (new schema only).
    """
    self.log.info("searching new events since %s", str(start))

    # Slurm DBD stores event times as epoch timestamps.
    timestamp = int(round(time.mktime(start.timetuple())))

    old_schema = self._is_old_schema()

    events = []

    # Old Slurm DBD schema has a dedicated cpu_count column; newer
    # schema packs the cpu count into the tres string.
    if old_schema:
        cpu_field = 'cpu_count'
    else:
        cpu_field = 'tres'

    req = """
            SELECT time_start,
                   time_end,
                   node_name,
                   %s,
                   state,
                   reason
              FROM %s_event_table
             WHERE node_name <> ''
               AND time_start >= %%s
          ORDER BY time_start
           """ % (cpu_field, self.cluster.name)
    params = (timestamp, )

    self.cur.execute(req, params)

    # Iterate rows until fetchone() returns None (cursor exhausted).
    for row in iter(self.cur.fetchone, None):
        datetime_start = datetime.fromtimestamp(row[0])

        timestamp_end = row[1]
        # A zero end timestamp means the event is still open.
        if timestamp_end == 0:
            datetime_end = None
        else:
            datetime_end = datetime.fromtimestamp(timestamp_end)

        node_name = row[2]
        searched_node = Node(node_name, self.cluster,
                             None, None, None, None, None)
        node = self.app.arch.find_node(searched_node)
        if node is None:
            raise HPCStatsSourceError(
                "event node %s not found in loaded nodes" % (node_name))

        if old_schema:
            nb_cpu = row[3]
        else:
            nb_cpu = extract_tres_cpu(row[3])
            if nb_cpu == -1:
                raise HPCStatsSourceError(
                    "unable to extract cpu_count from event tres")

        event_type = EventImporterSlurm.txt_slurm_event_type(row[4])
        reason = row[5]

        event = Event(node=node,
                      cluster=self.cluster,
                      nb_cpu=nb_cpu,
                      start_datetime=datetime_start,
                      end_datetime=datetime_end,
                      event_type=event_type,
                      reason=reason)
        events.append(event)

    return self.merge_successive_events(events)
def get_jobs_after_batchid(self, batchid, window_size=0):
    """Fill the jobs attribute with the list of Jobs found in Slurm DB
    whose id_job is over or equals to the batchid in parameter.
    Returns the last found batch_id.

    The window_size parameter, when non-zero, limits the number of
    jobs loaded in one call. Raises HPCStatsSourceError on unresolved
    tres, unknown user, or (in strict modes) unknown account, wckey,
    project or business code.
    """
    self.jobs = []

    # Optional LIMIT clause to load jobs by windows of fixed size.
    if window_size:
        limit = "LIMIT %d" % (window_size)
    else:
        limit = ''

    last_batch_id = -1

    old_schema = self._is_old_schema()

    # Old Slurm DBD schema has a dedicated cpus_alloc column; newer
    # schema packs the allocated cpus into the tres_alloc string.
    if old_schema:
        cpu_field = 'cpus_alloc'
    else:
        cpu_field = 'tres_alloc'

    # Optionally restrict the request to a set of partitions, using
    # parameterized placeholders so values are safely escaped.
    if not self.partitions:
        partitions_clause = ''
    else:
        partitions_clause = "AND job.partition IN (%s)" % \
            ','.join(['%s'] * len(self.partitions))

    req = """
            SELECT job_db_inx,
                   id_job,
                   id_user,
                   id_group,
                   time_submit,
                   time_start,
                   time_end,
                   timelimit,
                   nodes_alloc,
                   %s,
                   job.partition,
                   qos.name AS qos,
                   job.account,
                   state,
                   nodelist,
                   assoc.user,
                   job_name,
                   wckey
              FROM %s_job_table job,
                   %s_assoc_table assoc,
                   qos_table qos
             WHERE job_db_inx >= %%s
                   %s
               AND assoc.id_assoc = job.id_assoc
               AND qos.id = job.id_qos
          ORDER BY job_db_inx %s
           """ % (cpu_field, self.prefix, self.prefix,
                  partitions_clause, limit)
    params = (batchid, ) + tuple(self.partitions)

    self.cur.execute(req, params)

    # Iterate rows until fetchone() returns None (cursor exhausted).
    for row in iter(self.cur.fetchone, None):
        self.nb_loaded_jobs += 1

        batch_id = last_batch_id = row[0]
        sched_id = row[1]

        # Zero timestamps mean the corresponding step did not happen.
        submission_t = row[4]
        if submission_t == 0:
            submission = None
        else:
            submission = datetime.fromtimestamp(submission_t)

        start_t = row[5]
        if start_t == 0:
            start = None
        else:
            start = datetime.fromtimestamp(start_t)

        end_t = row[6]
        if end_t == 0:
            end = None
        else:
            end = datetime.fromtimestamp(end_t)

        # Some jobs in Slurm DBD have an end but no start. Typically,
        # this concernes the jobs that have been cancelled before
        # starting. For these jobs, we set the start equal to the end.
        if start is None and end is not None:
            start = end

        wall_t = row[7]
        if wall_t == 0:
            walltime = None
        elif wall_t >= 2147483648:
            # >= 2**31 is the Slurm sentinel for an infinite timelimit.
            walltime = "-1"
        else:
            walltime = str(wall_t)

        name = row[16]

        if old_schema:
            nbcpu = row[9]
        else:
            nbcpu = extract_tres_cpu(row[9])
            if nbcpu == -1:
                raise HPCStatsSourceError(
                    "unable to extract cpus_alloc from job tres")

        state = JobImporterSlurm.get_job_state_from_slurm_state(row[13])

        nodelist = row[14]
        # Slurm DBD uses these markers when no node was assigned.
        if nodelist == "(null)" or nodelist == "None assigned":
            nodelist = None

        partition = self.job_partition(sched_id, row[10], nodelist)
        qos = row[11]
        queue = "%s-%s" % (partition, qos)
        job_acct = row[12]

        login = row[15]

        searched_user = User(login, None, None, None)
        searched_account = Account(searched_user, self.cluster,
                                   None, None, None, None)
        account = self.app.users.find_account(searched_account)
        if account is None:
            msg = "account %s not found in loaded accounts" \
                  % (login)
            if self.strict_job_account_binding:
                raise HPCStatsSourceError(msg)
            elif login not in self.unknown_accounts:
                # warn only once per unknown login
                self.unknown_accounts.append(login)
                self.log.warn(Errors.E_J0001, msg)
            self.nb_excluded_jobs += 1
            continue
        user = self.app.users.find_user(searched_user)
        if user is None:
            msg = "user %s not found in loaded users" % (login)
            raise HPCStatsSourceError(msg)
        job_department = user.department

        wckey = row[17]

        # empty wckey must be considered as None
        if wckey == '':
            wckey = None

        if wckey is None:
            project = None
            business = None
        else:
            # A valid wckey has the format project:business_code.
            wckey_items = wckey.split(':')
            if len(wckey_items) != 2:
                msg = "format of wckey %s is not valid" % (wckey)
                if self.strict_job_wckey_format:
                    raise HPCStatsSourceError(msg)
                elif wckey not in self.invalid_wckeys:
                    # warn only once per invalid wckey
                    self.invalid_wckeys.append(wckey)
                    self.log.warn(Errors.E_J0002, msg)
                project = None
                business = None
            else:
                project_code = wckey_items[0]
                searched_project = Project(None, project_code, None)
                project = self.app.projects.find_project(searched_project)
                if project is None:
                    msg = "project %s not found in loaded projects" \
                          % (project_code)
                    if self.strict_job_project_binding:
                        raise HPCStatsSourceError(msg)
                    elif project_code not in self.unknown_projects:
                        # warn only once per unknown project code
                        self.unknown_projects.append(project_code)
                        self.log.warn(Errors.E_J0003, msg)

                business_code = wckey_items[1]
                searched_business = Business(business_code, None)
                business = self.app.business.find(searched_business)
                if business is None:
                    msg = "business code %s not found in loaded " \
                          "business codes" % (business_code)
                    if self.strict_job_businesscode_binding:
                        raise HPCStatsSourceError(msg)
                    elif business_code not in self.unknown_businesses:
                        # warn only once per unknown business code
                        self.unknown_businesses.append(business_code)
                        self.log.warn(Errors.E_J0004, msg)

        job = Job(account, project, business, sched_id,
                  str(batch_id), name, nbcpu, state, queue, job_acct,
                  job_department, submission, start, end, walltime)

        self.jobs.append(job)

        if nodelist is not None:
            self.create_runs(nodelist, job)

    return last_batch_id
def get_new_events(self, start):
    """Get all new Events from Slurm DB since start datetime.

    Parameter start must be a valid datetime. Returns a list of Events.
    The list is empty if none found.

    Events on nodes unknown in the loaded cluster architecture are
    logged and skipped. Raises HPCStatsSourceError when the cpu count
    cannot be extracted from the event tres string (new schema only).
    """
    self.log.info("searching new events since %s", str(start))

    # Slurm DBD stores event times as epoch timestamps.
    timestamp = int(round(time.mktime(start.timetuple())))

    old_schema = self._is_old_schema()

    events = []

    # Old Slurm DBD schema has a dedicated cpu_count column; newer
    # schema packs the cpu count into the tres string.
    if old_schema:
        cpu_field = 'cpu_count'
    else:
        cpu_field = 'tres'

    req = """
            SELECT time_start,
                   time_end,
                   node_name,
                   %s,
                   state,
                   reason
              FROM %s_event_table
             WHERE node_name <> ''
               AND time_start >= %%s
          ORDER BY time_start
           """ % (cpu_field, self.prefix)
    params = (timestamp, )

    self.cur.execute(req, params)

    # Iterate rows until fetchone() returns None (cursor exhausted).
    for row in iter(self.cur.fetchone, None):
        datetime_start = datetime.fromtimestamp(row[0])

        timestamp_end = row[1]
        # A zero end timestamp means the event is still open.
        if timestamp_end == 0:
            datetime_end = None
        else:
            datetime_end = datetime.fromtimestamp(timestamp_end)

        node_name = row[2]
        searched_node = Node(node_name, self.cluster,
                             None, None, None, None, None)
        node = self.app.arch.find_node(searched_node)
        if node is None:
            # Unknown node: skip the event instead of failing.
            self.log.warn(Errors.E_E0001,
                          "event node %s is unknown in cluster %s "
                          "architecture, ignoring this event",
                          node_name, self.cluster.name)
            continue

        if old_schema:
            nb_cpu = row[3]
        else:
            nb_cpu = extract_tres_cpu(row[3])
            if nb_cpu == -1:
                raise HPCStatsSourceError(
                    "unable to extract cpu_count from event tres")

        event_type = EventImporterSlurm.txt_slurm_event_type(row[4])
        reason = row[5]

        event = Event(node=node,
                      cluster=self.cluster,
                      nb_cpu=nb_cpu,
                      start_datetime=datetime_start,
                      end_datetime=datetime_end,
                      event_type=event_type,
                      reason=reason)
        events.append(event)

    return self.merge_successive_events(events)
def get_jobs_after_batchid(self, batchid, window_size=0):
    """Fill the jobs attribute with the list of Jobs found in Slurm DB
    whose id_job is over or equals to the batchid in parameter.
    Returns the last found batch_id.

    The window_size parameter, when non-zero, limits the number of
    jobs loaded in one call. Also resets the runs attribute. Raises
    HPCStatsSourceError on unresolved tres or (in strict modes)
    unknown account, wckey, project or business code.
    """
    self.jobs = []
    self.runs = []

    # Optional LIMIT clause to load jobs by windows of fixed size.
    if window_size:
        limit = "LIMIT %d" % (window_size)
    else:
        limit = ''

    last_batch_id = -1

    old_schema = self._is_old_schema()

    # Old Slurm DBD schema has a dedicated cpus_alloc column; newer
    # schema packs the allocated cpus into the tres_alloc string.
    if old_schema:
        cpu_field = 'cpus_alloc'
    else:
        cpu_field = 'tres_alloc'

    req = """
            SELECT job_db_inx,
                   id_job,
                   id_user,
                   id_group,
                   time_submit,
                   time_start,
                   time_end,
                   nodes_alloc,
                   %s,
                   job.partition,
                   qos.name AS qos,
                   state,
                   nodelist,
                   assoc.user,
                   job_name,
                   wckey
              FROM %s_job_table job,
                   %s_assoc_table assoc,
                   qos_table qos
             WHERE job_db_inx >= %%s
               AND assoc.id_assoc = job.id_assoc
               AND qos.id = job.id_qos
          ORDER BY job_db_inx %s
           """ % (cpu_field, self.cluster.name, self.cluster.name,
                  limit)
    params = (batchid, )

    self.cur.execute(req, params)

    # Iterate rows until fetchone() returns None (cursor exhausted).
    for row in iter(self.cur.fetchone, None):
        self.nb_loaded_jobs += 1

        batch_id = last_batch_id = row[0]
        sched_id = row[1]

        # Zero timestamps mean the corresponding step did not happen.
        submission_t = row[4]
        if submission_t == 0:
            submission = None
        else:
            submission = datetime.fromtimestamp(submission_t)

        start_t = row[5]
        if start_t == 0:
            start = None
        else:
            start = datetime.fromtimestamp(start_t)

        end_t = row[6]
        if end_t == 0:
            end = None
        else:
            end = datetime.fromtimestamp(end_t)

        # Some jobs in Slurm DBD have an end but no start. Typically,
        # this concernes the jobs that have been cancelled before
        # starting. For these jobs, we set the start equal to the end.
        if start is None and end is not None:
            start = end

        name = row[14]

        if old_schema:
            nbcpu = row[8]
        else:
            nbcpu = extract_tres_cpu(row[8])
            if nbcpu == -1:
                raise HPCStatsSourceError(
                    "unable to extract cpus_alloc from job tres")

        state = JobImporterSlurm.get_job_state_from_slurm_state(row[11])

        nodelist = row[12]
        # Slurm DBD uses these markers when no node was assigned.
        if nodelist == "(null)" or nodelist == "None assigned":
            nodelist = None

        partition = self.job_partition(sched_id, row[9], nodelist)
        qos = row[10]
        queue = "%s-%s" % (partition, qos)

        login = row[13]

        searched_user = User(login, None, None, None)
        searched_account = Account(searched_user, self.cluster,
                                   None, None, None, None)
        account = self.app.users.find_account(searched_account)
        if account is None:
            msg = "account %s not found in loaded accounts" \
                  % (login)
            if self.strict_job_account_binding:
                raise HPCStatsSourceError(msg)
            elif login not in self.unknown_accounts:
                # warn only once per unknown login
                self.unknown_accounts.append(login)
                self.log.warn(Errors.E_J0001, msg)
            self.nb_excluded_jobs += 1
            continue

        wckey = row[15]

        # empty wckey must be considered as None
        if wckey == '':
            wckey = None

        if wckey is None:
            project = None
            business = None
        else:
            # A valid wckey has the format project:business_code.
            wckey_items = wckey.split(':')
            if len(wckey_items) != 2:
                msg = "format of wckey %s is not valid" % (wckey)
                if self.strict_job_wckey_format:
                    raise HPCStatsSourceError(msg)
                elif wckey not in self.invalid_wckeys:
                    # warn only once per invalid wckey
                    self.invalid_wckeys.append(wckey)
                    self.log.warn(Errors.E_J0002, msg)
                project = None
                business = None
            else:
                project_code = wckey_items[0]
                searched_project = Project(None, project_code, None)
                project = self.app.projects.find_project(searched_project)
                if project is None:
                    msg = "project %s not found in loaded projects" \
                          % (project_code)
                    if self.strict_job_project_binding:
                        raise HPCStatsSourceError(msg)
                    elif project_code not in self.unknown_projects:
                        # warn only once per unknown project code
                        self.unknown_projects.append(project_code)
                        self.log.warn(Errors.E_J0003, msg)

                business_code = wckey_items[1]
                searched_business = Business(business_code, None)
                business = self.app.business.find(searched_business)
                if business is None:
                    msg = "business code %s not found in loaded " \
                          "business codes" % (business_code)
                    if self.strict_job_businesscode_binding:
                        raise HPCStatsSourceError(msg)
                    elif business_code not in self.unknown_businesses:
                        # warn only once per unknown business code
                        self.unknown_businesses.append(business_code)
                        self.log.warn(Errors.E_J0004, msg)

        job = Job(account, project, business, sched_id,
                  str(batch_id), name, nbcpu, state, queue,
                  submission, start, end)

        self.jobs.append(job)

        if nodelist is not None:
            self.create_runs(nodelist, job)

    return last_batch_id