Example #1
    def load(self):
        """Load BusinessCodes from CSV files in businesses attribute. Raise
           HPCStatsSourceError if error in encountered.
        """

        self.businesses = []

        self.check()

        with open(self._business_file, 'r') as csvfile:

            file_reader = csv.reader(csvfile, delimiter=';', quotechar='|')
            for row in file_reader:
                if len(row) != 2:
                    raise HPCStatsSourceError( \
                            "business line format in CSV is invalid")
                code = row[0].strip()
                description = row[1].strip()
                if len(code) == 0:
                    raise HPCStatsSourceError( \
                            "business code in CSV is empty")
                if len(description) == 0:
                    description = None
                business = Business(code, description)
                self.businesses.append(business)
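
For reference, here is a minimal standalone sketch of the parsing rules the loader above enforces. The file name business.csv and the sample rows are assumptions for illustration, not taken from the project.

import csv

# Hypothetical business.csv: two ';'-separated columns, code then
# description, e.g.
#
#   B001;Research and development
#   B002;
#
with open('business.csv', 'r') as csvfile:
    for row in csv.reader(csvfile, delimiter=';', quotechar='|'):
        code = row[0].strip()                 # empty code is a source error
        description = row[1].strip() or None  # empty description -> None
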
Example #2
    def create_runs(self, nodelist, job):
        """Create all Runs objects for the job in parameter and all the nodes
           in nodelist.
        """

        if is_bg_nodelist(nodelist):
            nodeset = compute_bg_nodelist(nodelist)
        else:
            try:
                nodeset = NodeSet(nodelist)
            except NodeSetParseRangeError:
                raise HPCStatsSourceError( \
                        "could not parse nodeset %s for job %s" \
                          % (nodelist, job.batch_id))

        for nodename in nodeset:
            searched_node = Node(nodename, self.cluster, None, None, None,
                                 None, None)
            node = self.app.arch.find_node(searched_node)
            if node is None:
                self.log.warn(Errors.E_J0006,
                              "unable to find node %s for job %s in loaded " \
                              "nodes", nodename, job.batch_id)
            else:
                run = Run(self.cluster, node, job)
                job.runs.append(run)
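
As a reminder, the nodeset iterated above comes from ClusterShell's folded node range syntax; a short sketch with a hypothetical nodelist:

from ClusterShell.NodeSet import NodeSet

# A folded nodelist such as "cn[001-003]" expands to individual node
# names, which is what the for loop above iterates over.
nodeset = NodeSet("cn[001-003]")
list(nodeset)  # ['cn001', 'cn002', 'cn003']
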
Example #3
    def check(self):
        """Check if CSV file exists and is a proper flat file."""

        if not os.path.isfile(self._business_file):
            raise HPCStatsSourceError( \
                    "business CSV file %s does not exist" \
                      % (self._business_file))
Example #4
class FSUsageImporterSSH(FSUsageImporter):
    """This class imports FSUsage data from a CSV file available through
       SSH on a remote server.
    """
    def __init__(self, app, db, config, cluster):

        super(FSUsageImporterSSH, self).__init__(app, db, config, cluster)

        section = self.cluster.name + "/fsusage"

        self.ssh_host = config.get(section, 'host')
        self.ssh_user = config.get(section, 'user')
        self.ssh_pkey = config.get(section, 'pkey')
        self.fsfile = config.get(section, 'file')

        self.timestamp_fmt = config.get_default(section, 'timestamp_fmt',
                                                '%Y-%m-%dT%H:%M:%S.%fZ')

        self.filesystems = None  # loaded filesystems
        self.fsusages = None  # loaded fsusages

    def connect_ssh(self):
        """Connect through SSH to remote server and return connection handler.
           Raises HPCStatsSourceError in case of problem.
        """

        try:
            self.log.debug("ssh connection to %s@%s", self.ssh_user,
                           self.ssh_host)
            ssh = paramiko.SSHClient()
            # automatically add unknown host keys to the known_hosts file
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            ssh.connect(self.ssh_host,
                        username=self.ssh_user,
                        key_filename=self.ssh_pkey)
        except (paramiko.AuthenticationException, paramiko.SSHException,
                socket.error) as err:
            raise HPCStatsSourceError( \
                    "unable to connect by SSH to %s@%s: %s" % \
                      (self.ssh_user, self.ssh_host, err))
        return ssh

    def check(self):
        """Check if the remote SSH server is available for connections, if
           the remote CSV file can be opened and is not empty.
           Raises HPCStatsSourceError in case of problem.
        """
        ssh = self.connect_ssh()
        try:
            sftp = ssh.open_sftp()
        except paramiko.SFTPError as err:
            raise HPCStatsSourceError( \
                    "Error while opening SFTP connection: %s" \
                      % (err))
        try:
            sftpfile = sftp.open(self.fsfile, 'r')
        except IOError as err:
            raise HPCStatsSourceError( \
                    "Error while opening file %s by SFTP: %s" \
                      % (self.fsfile, err))
        if sftpfile.readline() == "":
            raise HPCStatsSourceError( \
                    "Remote file %s is empty" \
                      % (self.fsfile))
        sftp.close()
        ssh.close()
Example #5
    def check(self):
        """Checks if archfile actually exists or raises HPCStatsSourceError if
           not.
        """

        if not os.path.isfile(self.archfile):
            raise HPCStatsSourceError( \
                    "Architecture file %s does not exist" \
                      % (self.archfile))
Example #6
    def config_get(self, section, option, isint=False):
        """Static method to get option/section in architecture file and raise
           HPCStatsSourceError when a problem occurs.
        """

        try:
            if isint:
                return self.arch.getint(section, option)
            else:
                return self.arch.get(section, option)
        except ConfigParser.NoSectionError:
            raise HPCStatsSourceError( \
                    "missing section %s in architecture file" \
                      % (section))
        except ConfigParser.NoOptionError:
            raise HPCStatsSourceError( \
                    "missing option %s in section %s of " \
                    "architecture file" \
                      % (option, section))
Example #7
    def check(self):
        """Check that the remote SSH server accepts connections and that the
           remote CSV file can be opened and is not empty.
           Raises HPCStatsSourceError in case of problem.
        """
        ssh = self.connect_ssh()
        try:
            sftp = ssh.open_sftp()
        except paramiko.SFTPError as err:
            raise HPCStatsSourceError( \
                    "Error while opening SFTP connection: %s" \
                      % (err))
Example #8
    def load(self):
        """Load Filesystems and FSQuotas from CSV logfile read through SSH.
           Raises HPCStatsSourceError if any error is encountered.
        """

        self.filesystems = []
        self.fsquotas = []

        ssh = self.connect_ssh()

        # The remote file is accessed through SFTP. We could have used
        # Paramiko sftp.open() but iterating over a long file (line by line)
        # is quite slow. We prefer to download the full file to a local
        # temporary file and then read/parse this local file.
        try:
            sftp = ssh.open_sftp()
        except paramiko.SFTPError as err:
            raise HPCStatsSourceError( \
                    "Error while opening SFTP connection: %s" \
                      % (err))
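
The comment above motivates downloading the whole file instead of iterating over sftp.open(). A minimal sketch of that pattern with paramiko's SFTPClient.get(); remote_path and the temporary file handling are illustrative assumptions, not code from the project:

import os
import tempfile

# Fetch the remote CSV into a local temporary file, then parse the local
# copy line by line, which is much faster than iterating over the SFTP
# file object directly. remote_path is a hypothetical placeholder for the
# importer's configured file path.
fd, tmp_path = tempfile.mkstemp(suffix='.csv')
os.close(fd)
sftp.get(remote_path, tmp_path)
with open(tmp_path, 'r') as csvfile:
    pass  # read/parse with csv.reader(), as elsewhere in the importers
os.remove(tmp_path)
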
Example #9
    def connect_db(self, cluster):
        """Connect to a cluster Slurm database and set conn/cur attributes
           accordingly.
        """

        try:
            conn_params = {
               'host': self.clusters_db[cluster]['dbhost'],
               'user': self.clusters_db[cluster]['dbuser'],
               'db':   self.clusters_db[cluster]['dbname'],
               'port': self.clusters_db[cluster]['dbport'],
            }
            if self.clusters_db[cluster]['dbpass'] is not None:
                conn_params['passwd'] = self.clusters_db[cluster]['dbpass']

            self.conn = MySQLdb.connect(**conn_params)
            self.cur = self.conn.cursor()
        except _mysql_exceptions.OperationalError as error:
            raise HPCStatsSourceError( \
                    "connection to Slurm DBD MySQL failed: %s" % (error))
Example #10
    def connect_db(self):
        """Connect to cluster Slurm database and set conn/cur attribute
           accordingly. Raises HPCStatsSourceError in case of problem.
        """

        try:
            conn_params = {
                'host': self._dbhost,
                'user': self._dbuser,
                'db': self._dbname,
                'port': self._dbport,
            }
            if self._dbpass is not None:
                conn_params['passwd'] = self._dbpass

            self.conn = MySQLdb.connect(**conn_params)
            self.cur = self.conn.cursor()
        except _mysql_exceptions.OperationalError as error:
            raise HPCStatsSourceError( \
                    "connection to Slurm DBD MySQL failed: %s" % (error))
Example #11
    def connect_ssh(self):
        """Connect through SSH to remote server and return connection handler.
           Raises HPCStatsSourceError in case of problem.
        """

        try:
            self.log.debug("ssh connection to %s@%s", self.ssh_user,
                           self.ssh_host)
            ssh = paramiko.SSHClient()
            # automatically add unknown host keys to the known_hosts file
            ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
            ssh.connect(self.ssh_host,
                        username=self.ssh_user,
                        key_filename=self.ssh_pkey)
        except (paramiko.AuthenticationException, paramiko.SSHException,
                socket.error) as err:
            raise HPCStatsSourceError( \
                    "unable to connect by SSH to %s@%s: %s" % \
                      (self.ssh_user, self.ssh_host, err))
        return ssh
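
A hedged usage sketch of connect_ssh(); the importer variable and the try/finally cleanup are illustrative:

# Open the connection, use SFTP, and always close the underlying
# transport even when the transfer fails. 'importer' is hypothetical.
ssh = importer.connect_ssh()
try:
    sftp = ssh.open_sftp()
    # ... sftp.open()/sftp.get() as in the importers above ...
    sftp.close()
finally:
    ssh.close()
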
Example #12
    def load(self):
        """Load Cluster, Nodes and partitions from Architecture files. Raises
           HPCStatsRuntimeError or HPCStatsSourceError if an error is encountered
           while loading data from sources. It sets attributes cluster, nodes
           and partitions with loaded data.
        """

        self.cluster = Cluster(self.cluster_name)
        self.nodes = []
        self.partitions = {}

        self.read_arch()
        config_get = self.config_get
        partitions = config_get(self.cluster.name, "partitions").split(',')

        for partition in partitions:

            part_sect = self.cluster.name + "/" + partition

            nodegroups = config_get(part_sect, "nodegroups").split(',')
            job_partitions = config_get(part_sect, "job_partitions") \
                               .split(',')

            nodeset_part = NodeSet() # nodeset for the partitions attribute

            for nodegroup in nodegroups:

                nodegroup_sect = self.cluster.name + "/" + partition \
                                 + "/" + nodegroup
                nodenames = config_get(nodegroup_sect, "names")
                nodeset_part.add(nodenames)

                sockets = config_get(nodegroup_sect, "sockets", isint=True)
                cores_per_socket = config_get(nodegroup_sect,
                                              "corespersocket",
                                              isint=True)
                cpu = sockets * cores_per_socket

                float_instructions = config_get(nodegroup_sect,
                                                "floatinstructions",
                                                isint=True)

                freq_str = config_get(nodegroup_sect, "frequency")
                freq = ArchitectureImporterArchfile.convert_freq(freq_str)
                if freq is None:
                    raise HPCStatsSourceError( \
                            "format of frequency for nodeset %s/%s/%s (%s) " \
                            "'%s' is not valid" \
                              % ( self.cluster.name,
                                  partition,
                                  nodegroup,
                                  nodenames,
                                  freq_str ))

                flops = sockets * cores_per_socket * float_instructions * freq

                mem_str = config_get(nodegroup_sect, "memory")
                mem = ArchitectureImporterArchfile.convert_mem(mem_str)
                if mem is None:
                    raise HPCStatsSourceError( \
                            "format of memory for nodeset %s/%s/%s (%s) " \
                            "'%s' is not valid" \
                              % ( self.cluster.name,
                                  partition,
                                  nodegroup,
                                  nodenames,
                                  mem_str ))

                model = config_get(nodegroup_sect, "model")

                nodeset_group = NodeSet(nodenames)
                for nodename in nodeset_group:
                    # create and append node
                    new_node = Node(name=nodename,
                                    cluster=self.cluster,
                                    model=model,
                                    partition=partition,
                                    cpu=cpu,
                                    memory=mem,
                                    flops=flops)
                    self.nodes.append(new_node)

            self.partitions[str(nodeset_part)] = job_partitions
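
convert_freq() is not shown in this excerpt; here is a hypothetical sketch of what such a parser could look like, returning Hz from strings like '2.6GHz' and None on unrecognized input. The regex, unit handling, and function name are assumptions, not the project's actual implementation:

import re

# Hypothetical sketch in the spirit of convert_freq(): parse "2.6GHz" or
# "800MHz" into Hz, returning None when the format is invalid, which the
# caller above turns into an HPCStatsSourceError.
def convert_freq_sketch(freq_str):
    match = re.match(r"^\s*([\d.]+)\s*(GHz|MHz)\s*$", freq_str, re.I)
    if match is None:
        return None
    value = float(match.group(1))
    factor = 10**9 if match.group(2).lower() == 'ghz' else 10**6
    return int(value * factor)
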
Example #13
        ssh = self.connect_ssh()
        try:
            sftp = ssh.open_sftp()
        except paramiko.SFTPError as err:
            raise HPCStatsSourceError( \
                    "Error while opening SFTP connection: %s" \
                      % (err))
        try:
            sftpfile = sftp.open(self.fqfile, 'r')
        except IOError as err:
            raise HPCStatsSourceError( \
                    "Error while opening file %s by SFTP: %s" \
                      % (self.fqfile, err))
        if sftpfile.readline() == "":
            raise HPCStatsSourceError( \
                    "Remote file %s is empty" \
                      % (self.fqfile))
        sftp.close()
        ssh.close()

    def load(self):
        """Load Filesystems and FSQuotas from CSV logfile read through SSH.
           Raises HPCStatsSourceError if any error is encountered.
        """

        self.filesystems = []
        self.fsquotas = []

        ssh = self.connect_ssh()

        # The remote file is accessed through SFTP. We could have used
        # Paramiko sftp.open() but iterating over a long file (line by line)
        # is quite slow. We prefer to download the full file to a local
        # temporary file and then read/parse this local file.
Example #14
    def get_new_events(self, start):
        """Get all new Events from Slurm DB since start datetime. Parameter
           start must be a valid datetime. Returns a list of Events. The list
           is empty if none found.
        """

        self.log.info("searching new events since %s", str(start))
        timestamp = int(round(time.mktime(start.timetuple())))

        old_schema = self._is_old_schema()

        events = []

        if old_schema is True:
            cpu_field = 'cpu_count'
        else:
            cpu_field = 'tres'

        req = """
               SELECT time_start,
                      time_end,
                      node_name,
                      %s,
                      state,
                      reason
                 FROM %s_event_table
                WHERE node_name <> ''
                  AND time_start >= %%s
                ORDER BY time_start
              """ % (cpu_field, self.prefix)
        params = (timestamp, )

        self.cur.execute(req, params)

        while True:
            row = self.cur.fetchone()
            if row is None:
                break

            datetime_start = datetime.fromtimestamp(row[0])

            timestamp_end = row[1]
            if timestamp_end == 0:
                datetime_end = None
            else:
                datetime_end = datetime.fromtimestamp(timestamp_end)

            node_name = row[2]
            searched_node = Node(node_name, self.cluster, None, None, None,
                                 None, None)
            node = self.app.arch.find_node(searched_node)
            if node is None:
                self.log.warn(
                    Errors.E_E0001, "event node %s is unknown in cluster %s "
                    "architecture, ignoring this event", node_name,
                    self.cluster.name)
                continue

            if old_schema is True:
                nb_cpu = row[3]
            else:
                nb_cpu = extract_tres_cpu(row[3])
                if nb_cpu == -1:
                    raise HPCStatsSourceError( \
                            "unable to extract cpu_count from event tres")

            event_type = EventImporterSlurm.txt_slurm_event_type(row[4])
            reason = row[5]

            event = Event(node=node,
                          cluster=self.cluster,
                          nb_cpu=nb_cpu,
                          start_datetime=datetime_start,
                          end_datetime=datetime_end,
                          event_type=event_type,
                          reason=reason)
            events.append(event)

        return self.merge_successive_events(events)
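
extract_tres_cpu() is referenced but not shown. In Slurm's TRES encoding, a tres field is a list of "id=count" pairs such as "1=32,2=64000", where TRES id 1 is the CPU count; a hypothetical sketch under that assumption:

# Hypothetical sketch of extract_tres_cpu(): scan the "id=count" pairs for
# TRES id 1 (cpu) and return its count, or -1 when absent, matching the
# error handling above.
def extract_tres_cpu_sketch(tres):
    for item in tres.split(','):
        parts = item.split('=')
        if len(parts) == 2 and parts[0] == '1':
            return int(parts[1])
    return -1
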
Example #15
    def get_jobs_after_batchid(self, batchid, window_size=0):
        """Fill the jobs attribute with the list of Jobs found in Slurm DB
           whose job_db_inx is greater than or equal to the given batchid.
           Returns the last found batch_id.
        """

        self.jobs = []

        if window_size:
            limit = "LIMIT %d" % (window_size)
        else:
            limit = ''

        last_batch_id = -1

        old_schema = self._is_old_schema()
        if old_schema is True:
            cpu_field = 'cpus_alloc'
        else:
            cpu_field = 'tres_alloc'

        if not len(self.partitions):
            partitions_clause = ''
        else:
            partitions_clause = "AND job.partition IN (%s)" % \
                                ','.join(['%s'] * len(self.partitions))

        req = """
                SELECT job_db_inx,
                       id_job,
                       id_user,
                       id_group,
                       time_submit,
                       time_start,
                       time_end,
                       timelimit,
                       nodes_alloc,
                       %s,
                       job.partition,
                       qos.name AS qos,
                       job.account,
                       state,
                       nodelist,
                       assoc.user,
                       job_name,
                       wckey
                  FROM %s_job_table job,
                       %s_assoc_table assoc,
                       qos_table qos
                 WHERE job_db_inx >= %%s
                   %s
                   AND assoc.id_assoc = job.id_assoc
                   AND qos.id = job.id_qos
              ORDER BY job_db_inx %s
              """ % (cpu_field, self.prefix, self.prefix, partitions_clause,
                     limit)
        params = (batchid, ) + tuple(self.partitions)
        self.cur.execute(req, params)
        while True:
            row = self.cur.fetchone()
            if row is None:
                break

            self.nb_loaded_jobs += 1

            batch_id = last_batch_id = row[0]
            sched_id = row[1]

            submission_t = row[4]
            if submission_t == 0:
                submission = None
            else:
                submission = datetime.fromtimestamp(submission_t)

            start_t = row[5]
            if start_t == 0:
                start = None
            else:
                start = datetime.fromtimestamp(start_t)

            end_t = row[6]
            if end_t == 0:
                end = None
            else:
                end = datetime.fromtimestamp(end_t)

            # Some jobs in Slurm DBD have an end but no start. Typically, this
            # concerns the jobs that have been cancelled before starting. For
            # these jobs, we set the start equal to the end.
            if start is None and end is not None:
                start = end

            wall_t = row[7]
            if wall_t == 0:
                walltime = None
            elif wall_t >= 2147483648:
                walltime = "-1"
            else:
                walltime = str(wall_t)

            name = row[16]
            if old_schema is True:
                nbcpu = row[9]
            else:
                nbcpu = extract_tres_cpu(row[9])
                if nbcpu == -1:
                    raise HPCStatsSourceError( \
                            "unable to extract cpus_alloc from job tres")

            state = JobImporterSlurm.get_job_state_from_slurm_state(row[13])

            nodelist = row[14]
            if nodelist == "(null)" or nodelist == "None assigned":
                nodelist = None

            partition = self.job_partition(sched_id, row[10], nodelist)
            qos = row[11]
            queue = "%s-%s" % (partition, qos)
            job_acct = row[12]

            login = row[15]

            searched_user = User(login, None, None, None)
            searched_account = Account(searched_user, self.cluster, None, None,
                                       None, None)
            account = self.app.users.find_account(searched_account)
            if account is None:
                msg = "account %s not found in loaded accounts" \
                        % (login)
                if self.strict_job_account_binding is True:
                    raise HPCStatsSourceError(msg)
                elif login not in self.unknown_accounts:
                    self.unknown_accounts.append(login)
                    self.log.warn(Errors.E_J0001, msg)
                self.nb_excluded_jobs += 1
                continue
            user = self.app.users.find_user(searched_user)
            if user is None:
                msg = "user %s not found in loaded users" % (login)
                raise HPCStatsSourceError(msg)
            job_department = user.department

            wckey = row[17]

            # empty wckey must be considered as None
            if wckey == '':
                wckey = None

            if wckey is None:
                project = None
                business = None
            else:
                wckey_items = wckey.split(':')
                if len(wckey_items) != 2:
                    msg = "format of wckey %s is not valid" % (wckey)
                    if self.strict_job_wckey_format is True:
                        raise HPCStatsSourceError(msg)
                    elif wckey not in self.invalid_wckeys:
                        self.invalid_wckeys.append(wckey)
                        self.log.warn(Errors.E_J0002, msg)
                    project = None
                    business = None
                else:
                    project_code = wckey_items[0]
                    searched_project = Project(None, project_code, None)
                    project = self.app.projects.find_project(searched_project)
                    if project is None:
                        msg = "project %s not found in loaded projects" \
                                % (project_code)
                        if self.strict_job_project_binding is True:
                            raise HPCStatsSourceError(msg)
                        elif project_code not in self.unknown_projects:
                            self.unknown_projects.append(project_code)
                            self.log.warn(Errors.E_J0003, msg)

                    business_code = wckey_items[1]
                    searched_business = Business(business_code, None)
                    business = self.app.business.find(searched_business)

                    if business is None:
                        msg = "business code %s not found in loaded " \
                              "business codes" % (business_code)
                        if self.strict_job_businesscode_binding is True:
                            raise HPCStatsSourceError(msg)
                        elif business_code not in self.unknown_businesses:
                            self.unknown_businesses.append(business_code)
                            self.log.warn(Errors.E_J0004, msg)

            job = Job(account, project, business, sched_id, str(batch_id),
                      name, nbcpu, state, queue, job_acct, job_department,
                      submission, start, end, walltime)
            self.jobs.append(job)

            if nodelist is not None:
                self.create_runs(nodelist, job)

        return last_batch_id
Example #16
    def load(self):
        """Open CSV file and load project out of it.
           Raises Exceptions if error is found in the file.
           Returns the list of Projects with their Domains.
        """

        self.check()

        self.domains = []
        self.projects = []

        with open(self.csv_file, 'r') as csvfile:

            file_reader = csv.reader(csvfile, delimiter=';', quotechar='|')

            for row in file_reader:

                if len(row) < 3:
                    raise HPCStatsSourceError( \
                            "project line format in CSV is invalid")

                project_code = row[0]
                project_name = row[1]

                # domains
                domain_str = row[2]
                domain_m = re.match(r"\[(.*)\](.*)", domain_str)
                if domain_m:
                    domain_key = domain_m.group(1)
                    domain_name = domain_m.group(2)
                else:
                    raise HPCStatsSourceError( \
                            "Project CSV %s domain format is invalid" \
                              % (project_code))

                domain_key = domain_key.strip()
                domain_name = domain_name.strip()
                if len(domain_key) == 0:
                    raise HPCStatsSourceError( \
                            "Project CSV %s domain key is empty" \
                              % (project_code))
                if len(domain_name) == 0:
                    raise HPCStatsSourceError( \
                            "Project CSV %s domain name is empty" \
                              % (project_code))

                # Create the Domain and search for it among the already
                # existing ones. If not found, append to the list of Domains.
                new_domain = Domain(key=domain_key, name=domain_name)
                domain = self.find_domain(new_domain)
                if domain is None:
                    domain = new_domain
                    self.domains.append(domain)

                # Create the Project and search for it among the already
                # existing ones. If found, raise HPCStatsSourceError
                project = Project(domain=domain,
                                  code=project_code,
                                  description=project_name)
                # check for duplicate project and raise error if found
                if self.find_project(project):
                    raise HPCStatsSourceError( \
                              "duplicated project code %s in CSV file" \
                                  % (project_code))

                self.projects.append(project)

        return self.projects
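
For clarity, a sketch of the domain column format the regex above accepts; the code and names are invented:

import re

# Hypothetical example of the "[KEY] Name" domain format parsed by load()
# above, e.g. from a CSV row "PRJ042;Ocean modelling;[ENV] Environmental
# sciences".
domain_m = re.match(r"\[(.*)\](.*)", "[ENV] Environmental sciences")
domain_key = domain_m.group(1).strip()    # 'ENV'
domain_name = domain_m.group(2).strip()   # 'Environmental sciences'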