    def get_new_events(self, start):
        """Get all new Events from Slurm DB since start datetime. Parameter
           start must be a valid datetime. Returns a list of Events. The list
           is empty if none found.
        """

        self.log.info("searching new events since %s", str(start))
        timestamp = int(round(time.mktime(start.timetuple())))

        old_schema = self._is_old_schema()

        events = []

        if old_schema is True:
            cpu_field = "cpu_count"
        else:
            cpu_field = "tres"

        req = """
               SELECT time_start,
                      time_end,
                      node_name,
                      %s,
                      state,
                      reason
                 FROM %s_event_table
                WHERE node_name <> ''
                  AND time_start >= %%s
                ORDER BY time_start
              """ % (
            cpu_field,
            self.cluster.name,
        )
        params = (timestamp,)

        self.cur.execute(req, params)

        while True:
            row = self.cur.fetchone()
            if row is None:
                break

            datetime_start = datetime.fromtimestamp(row[0])

            timestamp_end = row[1]
            if timestamp_end == 0:
                datetime_end = None
            else:
                datetime_end = datetime.fromtimestamp(timestamp_end)

            node_name = row[2]
            searched_node = Node(node_name, self.cluster, None, None, None, None, None)
            node = self.app.arch.find_node(searched_node)
            if node is None:
                raise HPCStatsSourceError("event node %s not found in loaded nodes" % (node_name))

            if old_schema is True:
                nb_cpu = row[3]
            else:
                nb_cpu = extract_tres_cpu(row[3])
                if nb_cpu == -1:
                    raise HPCStatsSourceError("unable to extract cpu_count from event tres")

            event_type = EventImporterSlurm.txt_slurm_event_type(row[4])
            reason = row[5]

            event = Event(
                node=node,
                cluster=self.cluster,
                nb_cpu=nb_cpu,
                start_datetime=datetime_start,
                end_datetime=datetime_end,
                event_type=event_type,
                reason=reason,
            )
            events.append(event)

        return self.merge_successive_events(events)
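Note: the extract_tres_cpu() helper used above is not shown in this listing. As a rough, hedged sketch, and assuming the Slurm DBD stores TRES values as a comma-separated list of "id=count" pairs in which TRES id 1 designates CPUs, it could look like the following; the actual HPCStats helper may differ, and -1 signals that no CPU entry was found, matching the error check above.

def extract_tres_cpu(tres):
    """Illustrative sketch only: return the CPU count from a Slurm TRES
       string such as "1=32,2=128000,4=2", or -1 if no CPU entry is found.
       TRES id 1 is assumed to designate CPUs."""
    for item in tres.split(','):
        try:
            tres_id, count = item.split('=', 1)
        except ValueError:
            continue
        if tres_id == '1':
            return int(count)
    return -1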
Example #2
    def get_jobs_after_batchid(self, batchid, window_size=0):
        """Fill the jobs attribute with the list of Jobs found in Slurm DB
           whose id_job is greater than or equal to the batchid parameter.
           Returns the last found batch_id.
        """

        self.jobs = []

        if window_size:
            limit = "LIMIT %d" % (window_size)
        else:
            limit = ''

        last_batch_id = -1

        old_schema = self._is_old_schema()
        if old_schema is True:
            cpu_field = 'cpus_alloc'
        else:
            cpu_field = 'tres_alloc'

        if not self.partitions:
            partitions_clause = ''
        else:
            partitions_clause = "AND job.partition IN (%s)" % \
                                ','.join(['%s'] * len(self.partitions))

        req = """
                SELECT job_db_inx,
                       id_job,
                       id_user,
                       id_group,
                       time_submit,
                       time_start,
                       time_end,
                       timelimit,
                       nodes_alloc,
                       %s,
                       job.partition,
                       qos.name AS qos,
                       job.account,
                       state,
                       nodelist,
                       assoc.user,
                       job_name,
                       wckey
                  FROM %s_job_table job,
                       %s_assoc_table assoc,
                       qos_table qos
                 WHERE job_db_inx >= %%s
                   %s
                   AND assoc.id_assoc = job.id_assoc
                   AND qos.id = job.id_qos
              ORDER BY job_db_inx %s
              """ % (cpu_field, self.prefix, self.prefix, partitions_clause,
                     limit)
        params = (batchid, ) + tuple(self.partitions)
        self.cur.execute(req, params)
        while True:
            row = self.cur.fetchone()
            if row is None:
                break

            self.nb_loaded_jobs += 1

            batch_id = last_batch_id = row[0]
            sched_id = row[1]

            submission_t = row[4]
            if submission_t == 0:
                submission = None
            else:
                submission = datetime.fromtimestamp(submission_t)

            start_t = row[5]
            if start_t == 0:
                start = None
            else:
                start = datetime.fromtimestamp(start_t)

            end_t = row[6]
            if end_t == 0:
                end = None
            else:
                end = datetime.fromtimestamp(end_t)

            # Some jobs in the Slurm DBD have an end but no start. Typically,
            # this concerns jobs that were cancelled before starting. For
            # these jobs, we set the start equal to the end.
            if start is None and end is not None:
                start = end

            wall_t = row[7]
            if wall_t == 0:
                walltime = None
            elif wall_t >= 2147483648:
                # values at or above 2**31 correspond to Slurm sentinel values
                # (e.g. an unlimited timelimit); flag them with "-1"
                walltime = "-1"
            else:
                walltime = str(wall_t)

            name = row[16]
            if old_schema is True:
                nbcpu = row[9]
            else:
                nbcpu = extract_tres_cpu(row[9])
                if nbcpu == -1:
                    raise HPCStatsSourceError(
                        "unable to extract cpus_alloc from job tres")

            state = JobImporterSlurm.get_job_state_from_slurm_state(row[13])

            nodelist = row[14]
            if nodelist == "(null)" or nodelist == "None assigned":
                nodelist = None

            partition = self.job_partition(sched_id, row[10], nodelist)
            qos = row[11]
            queue = "%s-%s" % (partition, qos)
            job_acct = row[12]

            login = row[15]

            searched_user = User(login, None, None, None)
            searched_account = Account(searched_user, self.cluster, None, None,
                                       None, None)
            account = self.app.users.find_account(searched_account)
            if account is None:
                msg = "account %s not found in loaded accounts" \
                        % (login)
                if self.strict_job_account_binding:
                    raise HPCStatsSourceError(msg)
                elif login not in self.unknown_accounts:
                    self.unknown_accounts.append(login)
                    self.log.warn(Errors.E_J0001, msg)
                self.nb_excluded_jobs += 1
                continue
            user = self.app.users.find_user(searched_user)
            if user is None:
                msg = "user %s not found in loaded users" % (login)
                raise HPCStatsSourceError(msg)
            job_department = user.department

            wckey = row[17]

            # empty wckey must be considered as None
            if wckey == '':
                wckey = None

            if wckey is None:
                project = None
                business = None
            else:
                wckey_items = wckey.split(':')
                if len(wckey_items) != 2:
                    msg = "format of wckey %s is not valid" % (wckey)
                    if self.strict_job_wckey_format:
                        raise HPCStatsSourceError(msg)
                    elif wckey not in self.invalid_wckeys:
                        self.invalid_wckeys.append(wckey)
                        self.log.warn(Errors.E_J0002, msg)
                    project = None
                    business = None
                else:
                    project_code = wckey_items[0]
                    searched_project = Project(None, project_code, None)
                    project = self.app.projects.find_project(searched_project)
                    if project is None:
                        msg = "project %s not found in loaded projects" \
                                % (project_code)
                        if self.strict_job_project_binding:
                            raise HPCStatsSourceError(msg)
                        elif project_code not in self.unknown_projects:
                            self.unknown_projects.append(project_code)
                            self.log.warn(Errors.E_J0003, msg)

                    business_code = wckey_items[1]
                    searched_business = Business(business_code, None)
                    business = self.app.business.find(searched_business)

                    if business is None:
                        msg = "business code %s not found in loaded " \
                              "business codes" % (business_code)
                        if self.strict_job_businesscode_binding:
                            raise HPCStatsSourceError(msg)
                        elif business_code not in self.unknown_businesses:
                            self.unknown_businesses.append(business_code)
                            self.log.warn(Errors.E_J0004, msg)

            job = Job(account, project, business, sched_id, str(batch_id),
                      name, nbcpu, state, queue, job_acct, job_department,
                      submission, start, end, walltime)
            self.jobs.append(job)

            if nodelist is not None:
                self.create_runs(nodelist, job)

        return last_batch_id
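For context, here is a hedged usage sketch of how this method could drive a windowed import. The driver below is hypothetical and not part of HPCStats; it simply resumes from the last job_db_inx returned until the method reports no further rows by returning -1.

def import_all_jobs(importer, window=10000):
    """Hypothetical driver: fetch jobs in windows of `window` rows, resuming
       from the last batch id seen, until get_jobs_after_batchid() finds no
       more rows and returns -1."""
    batchid = 0
    while True:
        last_batch_id = importer.get_jobs_after_batchid(batchid,
                                                        window_size=window)
        if last_batch_id < batchid:
            break  # -1 returned: no jobs at or after batchid
        # ... persist or post-process importer.jobs here ...
        batchid = last_batch_id + 1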
    def get_new_events(self, start):
        """Get all new Events from Slurm DB since start datetime. Parameter
           start must be a valid datetime. Returns a list of Events. The list
           is empty if none found.
        """

        self.log.info("searching new events since %s", str(start))
        timestamp = int(round(time.mktime(start.timetuple())))

        old_schema = self._is_old_schema()

        events = []

        if old_schema is True:
            cpu_field = 'cpu_count'
        else:
            cpu_field = 'tres'

        req = """
               SELECT time_start,
                      time_end,
                      node_name,
                      %s,
                      state,
                      reason
                 FROM %s_event_table
                WHERE node_name <> ''
                  AND time_start >= %%s
                ORDER BY time_start
              """ % (cpu_field, self.prefix)
        params = (timestamp, )

        self.cur.execute(req, params)

        while True:
            row = self.cur.fetchone()
            if row is None:
                break

            datetime_start = datetime.fromtimestamp(row[0])

            timestamp_end = row[1]
            if timestamp_end == 0:
                datetime_end = None
            else:
                datetime_end = datetime.fromtimestamp(timestamp_end)

            node_name = row[2]
            searched_node = Node(node_name, self.cluster, None, None, None,
                                 None, None)
            node = self.app.arch.find_node(searched_node)
            if node is None:
                self.log.warn(
                    Errors.E_E0001, "event node %s is unknown in cluster %s "
                    "architecture, ignoring this event", node_name,
                    self.cluster.name)
                continue

            if old_schema is True:
                nb_cpu = row[3]
            else:
                nb_cpu = extract_tres_cpu(row[3])
                if nb_cpu == -1:
                    raise HPCStatsSourceError(
                        "unable to extract cpu_count from event tres")

            event_type = EventImporterSlurm.txt_slurm_event_type(row[4])
            reason = row[5]

            event = Event(node=node,
                          cluster=self.cluster,
                          nb_cpu=nb_cpu,
                          start_datetime=datetime_start,
                          end_datetime=datetime_end,
                          event_type=event_type,
                          reason=reason)
            events.append(event)

        return self.merge_successive_events(events)
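As a hedged usage sketch (not part of HPCStats), the events returned by this method could be reported per node as below; the attribute names on Event are assumed to mirror the constructor keywords used above.

from datetime import datetime, timedelta

def report_recent_events(importer, days=7):
    """Hypothetical reporting helper: print node events from the last
       `days` days. Assumes Event exposes the constructor keywords
       (node, event_type, start_datetime, end_datetime, reason) as
       attributes of the same names."""
    since = datetime.now() - timedelta(days=days)
    for event in importer.get_new_events(since):
        end = event.end_datetime or "ongoing"
        print("%s: %s from %s to %s (%s)"
              % (event.node, event.event_type, event.start_datetime,
                 end, event.reason))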
Example #4
    def get_jobs_after_batchid(self, batchid, window_size=0):
        """Fill the jobs attribute with the list of Jobs found in Slurm DB
           whose id_job is greater than or equal to the batchid parameter.
           Returns the last found batch_id.
        """

        self.jobs = []
        self.runs = []

        if window_size:
            limit = "LIMIT %d" % (window_size)
        else:
            limit = ''

        last_batch_id = -1

        old_schema = self._is_old_schema()
        if old_schema is True:
            cpu_field = 'cpus_alloc'
        else:
            cpu_field = 'tres_alloc'

        req = """
                SELECT job_db_inx,
                       id_job,
                       id_user,
                       id_group,
                       time_submit,
                       time_start,
                       time_end,
                       nodes_alloc,
                       %s,
                       job.partition,
                       qos.name AS qos,
                       state,
                       nodelist,
                       assoc.user,
                       job_name,
                       wckey
                  FROM %s_job_table job,
                       %s_assoc_table assoc,
                       qos_table qos
                 WHERE job_db_inx >= %%s
                   AND assoc.id_assoc = job.id_assoc
                   AND qos.id = job.id_qos
              ORDER BY job_db_inx %s
              """ % (cpu_field,
                     self.cluster.name,
                     self.cluster.name,
                     limit)
        params = (batchid, )
        self.cur.execute(req, params)
        while True:
            row = self.cur.fetchone()
            if row is None:
                break

            self.nb_loaded_jobs += 1

            batch_id = last_batch_id = row[0]
            sched_id = row[1]

            submission_t = row[4]
            if submission_t == 0:
                submission = None
            else:
                submission = datetime.fromtimestamp(submission_t)

            start_t = row[5]
            if start_t == 0:
                start = None
            else:
                start = datetime.fromtimestamp(start_t)

            end_t = row[6]
            if end_t == 0:
                end = None
            else:
                end = datetime.fromtimestamp(end_t)

            # Some jobs in the Slurm DBD have an end but no start. Typically,
            # this concerns jobs that were cancelled before starting. For
            # these jobs, we set the start equal to the end.
            if start is None and end is not None:
                start = end

            name = row[14]
            if old_schema is True:
                nbcpu = row[8]
            else:
                nbcpu = extract_tres_cpu(row[8])
                if nbcpu == -1:
                    raise HPCStatsSourceError(
                        "unable to extract cpus_alloc from job tres")

            state = JobImporterSlurm.get_job_state_from_slurm_state(row[11])

            nodelist = row[12]
            if nodelist == "(null)" or nodelist == "None assigned":
                nodelist = None

            partition = self.job_partition(sched_id, row[9], nodelist)
            qos = row[10]
            queue = "%s-%s" % (partition, qos)

            login = row[13]

            searched_user = User(login, None, None, None)
            searched_account = Account(searched_user, self.cluster,
                                       None, None, None, None)
            account = self.app.users.find_account(searched_account)
            if account is None:
                msg = "account %s not found in loaded accounts" \
                        % (login)
                if self.strict_job_account_binding:
                    raise HPCStatsSourceError(msg)
                elif login not in self.unknown_accounts:
                    self.unknown_accounts.append(login)
                    self.log.warn(Errors.E_J0001, msg)
                self.nb_excluded_jobs += 1
                continue

            wckey = row[15]

            # empty wckey must be considered as None
            if wckey == '':
                wckey = None

            if wckey is None:
                project = None
                business = None
            else:
                wckey_items = wckey.split(':')
                if len(wckey_items) != 2:
                    msg = "format of wckey %s is not valid" % (wckey)
                    if self.strict_job_wckey_format:
                        raise HPCStatsSourceError(msg)
                    elif wckey not in self.invalid_wckeys:
                        self.invalid_wckeys.append(wckey)
                        self.log.warn(Errors.E_J0002, msg)
                    project = None
                    business = None
                else:
                    project_code = wckey_items[0]
                    searched_project = Project(None, project_code, None)
                    project = self.app.projects.find_project(searched_project)
                    if project is None:
                        msg = "project %s not found in loaded projects" \
                                % (project_code)
                        if self.strict_job_project_binding:
                            raise HPCStatsSourceError(msg)
                        elif project_code not in self.unknown_projects:
                            self.unknown_projects.append(project_code)
                            self.log.warn(Errors.E_J0003, msg)

                    business_code = wckey_items[1]
                    searched_business = Business(business_code, None)
                    business = self.app.business.find(searched_business)

                    if business is None:
                        msg = "business code %s not found in loaded " \
                              "business codes" % (business_code)
                        if self.strict_job_businesscode_binding:
                            raise HPCStatsSourceError(msg)
                        elif business_code not in self.unknown_businesses:
                            self.unknown_businesses.append(business_code)
                            self.log.warn(Errors.E_J0004, msg)

            job = Job(account, project, business, sched_id, str(batch_id),
                      name, nbcpu, state, queue, submission, start, end)
            self.jobs.append(job)

            if nodelist is not None:
                self.create_runs(nodelist, job)

        return last_batch_id
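create_runs() receives the raw nodelist string, which Slurm stores in its compact hostlist form (e.g. "cn[001-003],login1"). HPCStats most likely expands it with a dedicated hostlist library such as ClusterShell's NodeSet; purely to illustrate the format, a naive, hypothetical expansion could look like this:

import re

def expand_nodelist(nodelist):
    """Naive, illustrative expansion of a compact Slurm nodelist such as
       "cn[001-003],login1" into ['cn001', 'cn002', 'cn003', 'login1']."""
    nodes = []
    # split on commas that are not inside a [...] range expression
    for part in re.split(r',(?![^\[]*\])', nodelist):
        match = re.match(r'^(.*)\[([0-9,-]+)\]$', part)
        if match is None:
            nodes.append(part)
            continue
        prefix, ranges = match.groups()
        for rng in ranges.split(','):
            if '-' in rng:
                start, end = rng.split('-')
                for i in range(int(start), int(end) + 1):
                    nodes.append("%s%0*d" % (prefix, len(start), i))
            else:
                nodes.append(prefix + rng)
    return nodes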