Example #1
 def open_db_conn(self):
     """Return a database connection"""
     #  PG Defaults in libpq connection string / dsn parameters:
     #  DbUser,user: same as UNIX user
     #  DbName,dbname: DbUser
     #  DbHost,host: UNIX socket
     #  DbPort,port: 5432
     # Other optional PG parameters:
     # http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS
     dburl = 'dbname=%s user=%s host=%s' % (self._static_info['DbName'],
                                            self._static_info['DbUser'],
                                            self._static_info['DbHost'])
     if self._static_info['DbPort']:
         dburl += ' port=%s' % self._static_info['DbPort']
     if self._static_info['DbPassword']:
         dburl += ' password=%s' % self._static_info['DbPassword']
     DebugPrint(4, "Connecting to PgSQL database: %s" % dburl)
     try:
         self._connection = psycopg2.connect(dburl)
         self._cursor = self._get_cursor(self._connection)
     except Exception:
         tblist = traceback.format_exception(*sys.exc_info())
         errmsg = 'Failed to connect to %s:\n%s' % (dburl,
                                                    "\n".join(tblist))
         DebugPrint(1, errmsg)
         raise
         # Masking connection failure
         #self._connection = None
     return self._connection
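
A note on the DSN built above: psycopg2.connect() also accepts the same libpq parameters as keyword arguments, which avoids assembling the string by hand (and makes it easier to keep the password out of the debug log). A minimal sketch, with illustrative values standing in for the self._static_info entries:

    import psycopg2

    # Hypothetical values; the probe reads these from self._static_info.
    conn = psycopg2.connect(dbname='gratia', user='gratia',
                            host='localhost', port=5432)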
Example #2
    def process_record(self, record):
        #TODO: yield the value for processing to gratia ()
        # logfile attribute (if present) is used to keep track and delete files

        DebugPrint(5, "Creating JUR for %s" % record)

        # Filter out uninteresting records (and remove their files)
        if False:  # placeholder: replace with a test for uninteresting records
            if 'gratia_logfile' in record:
                DebugPrint(
                    1, 'Deleting transient record file: ' +
                    record["gratia_logfile"])
                file_utils.RemoveFile(record['gratia_logfile'])
            raise IgnoreRecordException("Ignoring record.")

        # Define the record
        # UsageRecord is defined in https://twiki.opensciencegrid.org/bin/view/Accounting/ProbeDevelopement
        # setters have the name of the attribute
        # Set resource type ( Batch, BatchPilot, GridMonitor, Storage, ActiveTape )
        resource_type = "Batch"
        r = Gratia.UsageRecord(resource_type)

        # fill r using the values in record

        # remember to specify the transient file (that will be removed if the record
        # is acquired successfully)
        if 'gratia_logfile' in record:
            r.AddTransientInputFile(record['gratia_logfile'])

        return r
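
The method above is a template: the actual mapping from record fields to r is left to the probe author. A hedged sketch of what that filling might look like, assuming setters named after the attributes (as the comment above notes) and hypothetical record keys:

    # Assumed setter and key names, for illustration only.
    r.LocalJobId(record['id_job'])
    r.LocalUserId(record['user'])
    r.WallDuration(record['wall_time'])
    r.StartTime(record['time_start'])
    r.EndTime(record['time_end'])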
Example #3
    def do_process_recovery(self, start_time=None, end_time=None):
        """ Recovery procedure
        the recovery command outputs the records, which are then
        processed and sent to Gratia by process_data_fd
        """
        rec_command = None
        if start_time is not None and end_time is not None:
            rec_command = self.RECOVERY_COMMAND % {
                'data': "",
                'start': start_time,
                'end': end_time
            }
        else:
            rec_command = self.RECOVERY_COMMAND % {'data': ""}
        DebugPrint(-1, "RUNNING: %s" % rec_command)
        fd = os.popen(rec_command)
        submit_count, found_count = self.process_data_fd(fd)
        if fd.close():
            DebugPrint(-1, "Recovery mode ERROR: Call to rec " \
                           "failed: %s" % rec_command)

        DebugPrint(-1, "Recovery mode: Records submitted: " \
                       "%d" % submit_count)
        DebugPrint(-1, "Recovery mode: Records found: " \
                       "%d" % found_count)
Example #4
 def logfiles_to_process(self, args):
     """List all the log files. args is a list of file names or directory names
     """
     for arg in args:
         if os.path.isfile(arg) and os.stat(arg).st_size:
             DebugPrint(5, "Processing logfile %s" % arg)
             yield arg
         elif os.path.isdir(arg):
             DebugPrint(5, "Processing directory %s." % arg)
             for logfile in os.listdir(arg):
                 m = self.LOGFILE_RE.match(logfile)
                 if m:
                     DebugPrint(5, "Processing logfile %s" % logfile)
                     yield os.path.join(arg, logfile)
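
LOGFILE_RE is not defined in this snippet; it is expected to be a compiled pattern on the probe class. A minimal sketch under that assumption (the real pattern is probe-specific):

    import re

    # Hypothetical pattern: accept any file ending in .log
    LOGFILE_RE = re.compile(r'^.*\.log$')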
Example #5
 def __init__(self, target=None):
     """
     Create a checkpoint file
     target - checkpoint filename (optionally null)
     """
     # No checkpointing when target is None
     self._fp = None
     self._val = None
     if target:
         try:
             fd = os.open(target, os.O_RDWR | os.O_CREAT)
             self._fp = os.fdopen(fd, 'r+')
             self._val = int(self._fp.readline())
             DebugPrint(1, "Resuming from checkpoint in %s" % target)
         except IOError:
             raise IOError("Could not open checkpoint file %s" % target)
         except ValueError:
             DebugPrint(1, "Failed to read checkpoint file %s" % target)
Example #6
    def __init__(self):
        try:
            self.opts, self.args = self.parse_opts()
        except Exception as e:
            print(e, file=sys.stderr)
            sys.exit(1)

        # Initialize Gratia
        if not self.opts.gratia_config or not os.path.exists(
                self.opts.gratia_config):
            raise Exception("Gratia config, %s, does not exist." %
                            self.opts.gratia_config)
        Gratia.Initialize(self.opts.gratia_config)

        if self.opts.verbose:
            Gratia.Config.set_DebugLevel(5)

        # Sanity checks for the probe's runtime environment.
        GratiaWrapper.CheckPreconditions()

        if self.opts.sleep:
            rnd = random.randint(1, int(self.opts.sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
            time.sleep(rnd)

        # Make sure we have an exclusive lock for this probe.
        GratiaWrapper.ExclusiveLock()

        self.register_gratia("slurm_meter")

        # Find the checkpoint filename (if enabled)
        if self.opts.checkpoint:
            checkpoint_file = os.path.join(Gratia.Config.get_WorkingFolder(),
                                           "checkpoint")
        else:
            checkpoint_file = None

        # Open the checkpoint file
        self.checkpoint = SlurmCheckpoint(checkpoint_file)

        # Only process DataFileExpiration days of history
        # (unless we're resuming from a checkpoint file)
        if self.checkpoint.val is None:
            self.checkpoint.val = int(time.time() -
                                      (Gratia.Config.get_DataFileExpiration() *
                                       86400))

        # Connect to database
        self.conn = self.get_db_conn()

        self.cluster = Gratia.Config.getConfigAttribute('SlurmCluster')

        # SLURM made changes to the accounting database schema
        slurm_version = self.get_slurm_version()
        if LooseVersion(slurm_version) < LooseVersion("15.08.0"):
            # Original schema
            self.sacct = SlurmAcct_v1(self.conn, self.cluster, slurm_version)
        else:
            # Added TRES (Trackable resources) in 15.08.0pre5
            self.sacct = SlurmAcct_v2(self.conn, self.cluster, slurm_version)
Example #7
 def get_version(self):
     """Return the input version (LRM version, server version). Normally form an external program.
     This is not the probe version"""
     #For error:    raise Exception("Unable to invoke %s" % cmd)
     DebugPrint(
         2,
         "Called ProbeInput get_version instead of the Probe specific one.")
     return ProbeInput.UNKNOWN
Example #8
 def do_test(self, static_info=None):
     """Prepare the input for testing, e.g. replacing some methods with stubs,
     increasing verbosity, limiting actions, ...
     Invoked after init (the object has been created and initialized) and before
     start and get_records (static_info from the config file has not been passed
     yet and final initialization is not done)
     """
     DebugPrint(4, "ProbeInput test invoked but not defined")
Example #9
    def register_gratia(self, name):
        Gratia.RegisterReporter(name)

        try:
            slurm_version = self.get_slurm_version()
        except Exception as e:
            DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
            raise
Example #10
    def _jobs(self, where, having='1=1'):
        cursor = self._conn.cursor()

        # Note: When jobs are preempted, multiple cluster_job_table records
        #       are inserted, each with distinct start and end times.
        #       We consider the walltime to be the total time running,
        #       adding up all the records.

        sql = '''SELECT j.id_job
            , j.exit_code
            , j.id_group
            , j.id_user
            , j.job_name
            , j.cpus_alloc
            , j.partition
            , j.state
            , MIN(j.time_start) AS time_start
            , MAX(j.time_end) AS time_end
            , SUM(j.time_suspended) AS time_suspended
            , SUM(CASE WHEN j.time_end < j.time_start + j.time_suspended
                       THEN 0
                       ELSE j.time_end - j.time_start - j.time_suspended
                  END) AS wall_time
            , a.acct
            , a.user
            , ( SELECT MAX(s.max_rss)
                FROM %(cluster)s_step_table s
                WHERE s.job_db_inx = j.job_db_inx
                /* Note: Will underreport mem for jobs with simultaneous steps */
              ) AS max_rss
            , ( SELECT SUM(s.user_sec) + SUM(s.user_usec/1000000)
                FROM %(cluster)s_step_table s
                WHERE s.job_db_inx = j.job_db_inx
              ) AS cpu_user
            , ( SELECT SUM(s.sys_sec) + SUM(s.sys_usec/1000000)
                FROM %(cluster)s_step_table s
                WHERE s.job_db_inx = j.job_db_inx
              ) AS cpu_sys
            FROM %(cluster)s_job_table as j
            LEFT JOIN %(cluster)s_assoc_table AS a ON j.id_assoc = a.id_assoc
            WHERE %(where)s
            GROUP BY id_job
            HAVING %(having)s
            ORDER BY j.time_end
        ''' % {
            'cluster': self._cluster,
            'where': where,
            'having': having
        }

        DebugPrint(5, "Executing SQL: %s" % sql)
        cursor.execute(sql)

        for r in cursor:
            # Add handy data to job record
            r['cluster'] = self._cluster
            self._addUserInfoIfMissing(r)
            yield r
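
A hedged usage sketch: the where and having arguments are raw SQL fragments interpolated into the query above (sacct and the epoch bounds are hypothetical names here):

    for job in sacct._jobs(where="j.time_end >= %d AND j.time_end < %d"
                                 % (start_epoch, end_epoch)):
        print(job['id_job'], job['wall_time'], job['acct'])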
Example #11
    def query(self, sql):
        """Generator returning one row at the time as pseudo-dictionary (DictCursor).

        psycopg2.extras.DictCursor is a tuple, accessible by indexes and returned as
        values, not keys, in a loop (for i in row) but row.keys() lists the columns
        and row['column_name'] accesses the column.
        It is compatible w/ standard cursors
        For proper dictionary see psycopg2.extras.RealDictCursor
        NOTE that the values are not mutable (cannot be changed)

        :param sql: string w/ the SQL query
        :return: row as psycopg2.extras.DictCursor (tuple and dictionary)
        """
        if not sql:
            DebugPrint(2, "WARNING: No SQL provided: no query.")
            return
        if not self._connection:
            DebugPrint(
                4,
                "WARNING: No connection provided: trying to (re)open connection."
            )
            if not self.open_db_conn():
                DebugPrint(2, "WARNING: Unable to open connection: no query.")
                return
        if not self._cursor:
            self._cursor = self._get_cursor(self._connection)
            if not self._cursor:
                DebugPrint(2, "WARNING: Unable to get cursor: no query.")
                return
        cursor = self._cursor
        DebugPrint(4, "Executing SQL: %s" % sql)
        try:
            cursor.execute(sql)
        except psycopg2.ProgrammingError as er:
            DebugPrint(2, "ERROR, error running the query: %s" % er)
        if cursor.rowcount is None:
            DebugPrint(2, "WARNING, problems running the query: %s" % sql)
        elif cursor.rowcount <= 0:
            DebugPrint(
                3,
                "WARNING, no rows returned by the query (rowcount: %s). OK for iterators."
                % cursor.rowcount)
        # resultset = self._cur.fetchall()
        if self.support_itersize:
            for r in cursor:
                yield r
        else:
            # implement itersize manually (for psycopg < 2.4)
            # normal iteration would be inefficient, fetching one record at a time
            while True:
                resultset = cursor.fetchmany()
                if not resultset:
                    break
                for r in resultset:
                    yield r
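
A short sketch of how the yielded DictCursor rows behave (the table and column names are hypothetical):

    for row in pg.query("SELECT id_job, wall_time FROM jobs"):
        print(row['id_job'])      # access by column name
        print(row[1])             # same value, access by position
        id_job, wall_time = row   # unpacks like a tuple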
Example #12
 def process_data_file(self, logfile):
     # Open the file and send it to process
     try:
         fd = open(logfile, 'r')
     except IOError as ie:
         DebugPrint(
             2, "Cannot process %s: (errno=%d) %s" %
             (logfile, ie.errno, ie.strerror))
         return 0, 0
Example #13
 def add_static_info(self, static_info):
     if not static_info:
         return
     for k in static_info:
         if k in self._static_info:
             DebugPrint(
                 4, "Updating probe %s from %s to %s" %
                 (k, self._static_info[k], static_info[k]))
         self._static_info[k] = static_info[k]
Example #14
def lines_to_record(lines):
    """Parse one or more lines of data into a record (data structure)
    Here regular expressions are used to match values for a dictionary
    The input stream is a series of "name = value" lines with
    empty lines or the end of the stream separating records
    '#' at the beginning of a line marks a comment (skipped)

    :param lines:
    :return:
    """
    # dictionary, caseless_dictionary, sorted dictionary, array
    # are all possible structures, be consistent with what you use in process_record
    record = {}
    if not isinstance(lines, list):
        lines = [lines]
    for line in lines:
        line = line.strip()
        m = val_bool_re.match(line)
        if m:
            attr, val = m.groups()
            record[attr] = "true" in val.lower()
            continue
        m = val_int_re.match(line)
        if m:
            attr, val = m.groups()
            record[attr] = int(val)
            continue
        m = val_double_re.match(line)
        if m:
            attr, val = m.groups()
            record[attr] = float(val)
            continue
        m = val_string_re.match(line)
        if m:
            attr, val = m.groups()
            record[attr] = str(val)
            continue
        m = val_catchall_re.match(line)
        if m:
            attr, val = m.groups()
            record[attr] = str(val)
            continue
        if not line:
            yield record
            record = {}
            continue
        if line[0] == '#':
            continue
        DebugPrint(2, "Invalid line in record stream: %s" % line)

    yield record
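
The val_*_re patterns are not shown in this snippet. A minimal sketch of plausible definitions for the "name = value" format described in the docstring (assumptions, not the probe's actual patterns; the order they are tried in matters, from most to least specific):

import re

val_bool_re = re.compile(r'^(\w+)\s*=\s*(true|false)$', re.IGNORECASE)
val_int_re = re.compile(r'^(\w+)\s*=\s*([+-]?\d+)$')
val_double_re = re.compile(r'^(\w+)\s*=\s*([+-]?\d+\.\d*(?:[eE][+-]?\d+)?)$')
val_string_re = re.compile(r'^(\w+)\s*=\s*"(.*)"$')
val_catchall_re = re.compile(r'^(\w+)\s*=\s*(.*)$')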
Example #15
    def register_gratia(self, name):
        Gratia.RegisterReporter(name)

        try:
            slurm_version = self.get_slurm_version()
        except Exception as e:
            DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
            raise

        Gratia.RegisterService("SLURM", slurm_version)
        Gratia.setProbeBatchManager("slurm")
Example #16
    def process_data_fd(self, fd, filename=None):
        """
        Process records from a file descriptor.  
        If filename is None there are no transient files (e.g. recovery mode)
        Otherwise filename is a transient file Gratia will attempt to cleanup 
        afterward.
        Transient files are associated with the first record in the file. This
        works well only if transient files have only one record; otherwise they
        will be deleted if the first record is processed successfully (or
        deemed uninteresting), quarantined if the first record fails to process.
        """
        count_submit = 0
        count_found = 0
        if filename:
            added_transient = False
        else:
            added_transient = True

        for record in lines_to_record(fd):
            count_found += 1
            if not record:
                DebugPrint(5, "Ignoring empty record from file: %s" % fd.name)
                continue
            if not added_transient:
                record['gratia_logfile'] = filename
                added_transient = True
            try:
                yield record
            except KeyboardInterrupt:
                raise
            except SystemExit:
                raise
            except IgnoreRecordException as e:
                DebugPrint(3, "Ignoring Record: %s" % str(e))
                count_submit += 1
                continue
            except Exception as e:
                DebugPrint(
                    2, "Exception while processing the record: %s" % str(e))
                continue
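
The try/except around the yield above only fires when the consumer throws an exception back into the generator with gen.throw(); exceptions raised in the consumer's own loop body never reach it. A self-contained sketch of that pattern (the probe's real consumer is not shown on this page):

    class IgnoreRecordException(Exception):
        pass

    def records():
        for rec in ['good', 'bad', 'good']:
            try:
                yield rec
            except IgnoreRecordException as e:
                print("generator ignored: %s" % e)  # and keeps going

    gen = records()
    rec = next(gen, None)
    while rec is not None:
        if rec == 'bad':
            try:
                # throw() resumes the generator and returns its next value
                rec = gen.throw(IgnoreRecordException('uninteresting'))
            except StopIteration:
                break
            continue
        print("processed: %s" % rec)
        rec = next(gen, None)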
Example #17
    def query(self, sql):
        """Generator returning one row at the time as pseudo-dictionary (DictCursor).

        psycopg2.extras.DictCursor is a tuple, accessible by indexes and returned as
        values, not keys, in a loop (for i in row) but row.keys() lists the columns
        and row['column_name'] accesses the column.
        It is compatible w/ standard cursors
        For proper dictionary see psycopg2.extras.RealDictCursor
        NOTE that the values are not mutable (cannot be changed)

        :param sql: string w/ the SQL query
        :return: row as psycopg2.extras.DictCursor (tuple and dictionary)
        """
        if not sql:
            DebugPrint(2, "WARNING: No SQL provided: no query.")
            return
        if not self._connection:
            DebugPrint(
                4,
                "WARNING: No connection provided: trying to (re)open connection."
            )
            if not self.open_db_conn():
                DebugPrint(2, "WARNING: Unable to open connection: no query.")
                return
        if not self._cursor:
            self._cursor = self._get_cursor(self._connection)
            if not self._cursor:
                DebugPrint(2, "WARNING: Unable to get cursor: no query.")
                return
        cursor = self._cursor
        DebugPrint(4, "Executing SQL: %s" % sql)
        try:
            cursor.execute(sql)
        except psycopg2.ProgrammingError as er:
            DebugPrint(2, "ERROR, error running the query: %s" % er)
Example #18
    def process_record(self, record):
        #TODO: yield the value for processing to gratia ()
        # logfile attribute (if present) is used to keep track and delete files

        DebugPrint(5, "Creating JUR for %s" % record)

        # Filter out uninteresting records (and remove their files)
        if False:  # placeholder: replace with a test for uninteresting records
            if 'gratia_logfile' in record:
                DebugPrint(
                    1, 'Deleting transient record file: ' +
                    record["gratia_logfile"])
                file_utils.RemoveFile(record['gratia_logfile'])
            raise IgnoreRecordException("Ignoring record.")

        # Define the record
        # UsageRecord is defined in https://twiki.opensciencegrid.org/bin/view/Accounting/ProbeDevelopement
        # setters have the name of the attribute
        # Set resource type ( Batch, BatchPilot, GridMonitor, Storage, ActiveTape )
        resource_type = "Batch"
        r = Gratia.UsageRecord(resource_type)

        # fill r using the values in record

        # remember to specify the transient file (that will be removed if the record
        # is acquired successfully)
        if 'gratia_logfile' in record:
            r.AddTransientInputFile(record['gratia_logfile'])

        return r


# TODO: end of part to remove
#############################################################

# Some references
# http://seann.herdejurgen.com/resume/samag.com/html/v11/i04/a6.htm
# http://stackoverflow.com/questions/14863224/efficient-reading-of-800-gb-xml-file-in-python-2-7
# http://radimrehurek.com/2014/03/data-streaming-in-python-generators-iterators-iterables/
Example #19
    def _users(self, where):
        cursor = self._conn.cursor()

        # Default GROUP_CONCAT() maximum length is 1024 chars
        # Increase it to 64MB
        cursor.execute('SET SESSION group_concat_max_len=64*1024*1024;')

        # See enum job_states in slurm/slurm.h for state values
        sql = '''SELECT j.id_user
            , j.id_group
            , (SELECT SUM(cpus_req)   FROM %(cluster)s_job_table WHERE
                  id_user = j.id_user AND state IN (0,2)) AS cpus_pending
            , (SELECT GROUP_CONCAT('|', tres_alloc) FROM %(cluster)s_job_table WHERE
                  id_user = j.id_user AND state IN (1)  ) AS tres_alloc_list
            , MAX(j.time_end) AS time_end
            , a.acct
            , a.user
            FROM %(cluster)s_job_table as j
            LEFT JOIN %(cluster)s_assoc_table AS a ON j.id_assoc = a.id_assoc
            WHERE %(where)s
            GROUP BY id_user
            ORDER BY time_end
        ''' % {
            'cluster': self._cluster,
            'where': where
        }

        DebugPrint(5, "Executing SQL: %s" % sql)
        cursor.execute(sql)

        for r in cursor:
            # Add handy data to job record
            r['cluster'] = self._cluster

            # Extract cpus_alloc from tres_alloc and sum to get cpus_running
            # We were formerly relying on SQL to sum the cpus_alloc.
            # Now we get a list of tres_alloc parameters, parse them, and sum
            # the CPU count ourselves.
            r['cpus_running'] = 0
            if r['tres_alloc_list']:
                for tres_txt in r['tres_alloc_list'].split('|'):
                    tres = self._parse_tres(tres_txt)
                    # tres_types_t.TRES_CPU = 1
                    r['cpus_running'] += tres.get(1, 0)

            # Return 0 instead of None where we don't have values
            if r['cpus_pending'] is None:
                r['cpus_pending'] = 0
            self._addUserInfoIfMissing(r)
            yield r
Example #20
    def process_data_dirs(self, dirs=None):
        submit_count = 0
        found_count = 0
        logs_found = 0
        logfile_errors = 0
        # Note we are not ordering logfiles by type, as we don't want to
        # pull them all into memory at once.
        DebugPrint(
            4,
            "We will process the following directories: %s." % ", ".join(dirs))
        for log in self.logfiles_to_process(dirs):
            logs_found += 1
            _, logfile_name = os.path.split(log)
            # This should actually not be needed (done in the iterator)
            # Make sure the filename is in a reasonable format
            m = self.LOGFILE_RE.match(logfile_name)
            if not m:
                DebugPrint(2, "Ignoring log file with invalid name: %s" % log)
                continue
            cnt_submit, cnt_found = self.process_data_file(log)
            if cnt_submit == cnt_found and cnt_submit > 0:
                DebugPrint(
                    5, "Processed %i records from file %s" % (cnt_submit, log))
            else:
                DebugPrint(
                    2,
                    "Unable to process records from file (will add to quarantine): %s.  Submit count %d; found count %d"
                    % (log, cnt_submit, cnt_found))
                GratiaCore.QuarantineFile(log, False)
                logfile_errors += 1
            submit_count += cnt_submit
            found_count += cnt_found

        DebugPrint(2, "Number of logfiles processed: %d" % logs_found)
        DebugPrint(2, "Number of logfiles with errors: %d" % logfile_errors)
        DebugPrint(2, "Number of usage records submitted: %d" % submit_count)
        DebugPrint(2, "Number of usage records found: %d" % found_count)
Example #21
    def _parse_tres(self, tres):
        """Parse SLURM database tres_alloc job data into dict"""
        # SLURM 15 changed its job_table.cpus_alloc database column to tres_alloc
        #    and converted the data to a comma separated list of "key=value" pairs
        # Keys are defined in tres_types_t in src/common/slurmdb_defs.h
        #    1 => CPU, 2 => MEM, 3 => ENERGY, 4 => NODE

        ret = dict()

        for item in tres.split(','):
            # Skip blank entries
            if not item:
                continue

            try:
                k, v = item.split('=', 1)
                ret[int(k)] = int(v)
            except ValueError:
                # TRES string is damaged? Continuing.
                DebugPrint(1, "Error parsing TRES string '%s'" % tres)

        return ret
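
For example, with the key mapping listed above:

    # self._parse_tres("1=16,2=64000,4=2") -> {1: 16, 2: 64000, 4: 2}
    # A damaged entry is logged and skipped:
    # self._parse_tres("1=16,garbage")     -> {1: 16}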
Example #22
    def _users(self, where):
        cursor = self._conn.cursor()

        # See enum job_states in slurm/slurm.h for state values
        sql = '''SELECT j.id_user
            , j.id_group
            , (SELECT SUM(cpus_req)   FROM %(cluster)s_job_table WHERE
                  id_user = j.id_user AND state IN (0,2)) AS cpus_pending
            , (SELECT SUM(cpus_alloc) FROM %(cluster)s_job_table WHERE
                  id_user = j.id_user AND state IN (1)  ) AS cpus_running
            , MAX(j.time_end) AS time_end
            , a.acct
            , a.user
            FROM %(cluster)s_job_table as j
            LEFT JOIN %(cluster)s_assoc_table AS a ON j.id_assoc = a.id_assoc
            WHERE %(where)s
            GROUP BY id_user
            ORDER BY time_end
        ''' % {
            'cluster': self._cluster,
            'where': where
        }

        DebugPrint(5, "Executing SQL: %s" % sql)
        cursor.execute(sql)

        for r in cursor:
            # Add handy data to job record
            r['cluster'] = self._cluster
            # Return 0 instead of None where we don't have values
            if r['cpus_pending'] is None:
                r['cpus_pending'] = 0
            if r['cpus_running'] is None:
                r['cpus_running'] = 0
            self._addUserInfoIfMissing(r)
            yield r
Example #23
class SlurmProbe:

    opts = None
    args = None
    checkpoint = None
    conn = None
    cluster = None
    sacct = None

    def __init__(self):
        try:
            self.opts, self.args = self.parse_opts()
        except Exception as e:
            print(str(e), file=sys.stderr)
            sys.exit(1)

        # Initialize Gratia
        if not self.opts.gratia_config or not os.path.exists(
                self.opts.gratia_config):
            raise Exception("Gratia config, %s, does not exist." %
                            self.opts.gratia_config)
        Gratia.Initialize(self.opts.gratia_config)

        if self.opts.verbose:
            Gratia.Config.set_DebugLevel(5)

        # Sanity checks for the probe's runtime environment.
        GratiaWrapper.CheckPreconditions()

        if self.opts.sleep:
            rnd = random.randint(1, int(self.opts.sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
            time.sleep(rnd)

        # Make sure we have an exclusive lock for this probe.
        GratiaWrapper.ExclusiveLock()

        self.register_gratia("slurm_meter")

        # Find the checkpoint filename (if enabled)
        if self.opts.checkpoint:
            checkpoint_file = os.path.join(Gratia.Config.get_WorkingFolder(),
                                           "checkpoint")
        else:
            checkpoint_file = None

        # Open the checkpoint file
        self.checkpoint = SlurmCheckpoint(checkpoint_file)

        # Only process DataFileExpiration days of history
        # (unless we're resuming from a checkpoint file)
        if self.checkpoint.val is None:
            self.checkpoint.val = int(time.time() -
                                      (Gratia.Config.get_DataFileExpiration() *
                                       86400))

        # Connect to database
        self.conn = self.get_db_conn()

        self.cluster = Gratia.Config.getConfigAttribute('SlurmCluster')
        self.sacct = SlurmAcct(self.conn, self.cluster)
Example #24
    def _jobs(self, where, having='1=1'):
        cursor = self._conn.cursor()

        # Note: When jobs are preempted, multiple cluster_job_table records
        #       are inserted, each with distinct start and end times.
        #       We consider the walltime to be the total time running,
        #       adding up all the records.

        if LooseVersion(self._slurm_version) < LooseVersion("18"):
            max_rss = '''( SELECT MAX(s.max_rss)
                FROM `%(cluster)s_step_table` s
                WHERE s.job_db_inx = j.job_db_inx
                /* Note: Will underreport mem for jobs with simultaneous steps */
              )''' % {
                'cluster': self._cluster
            }
        else:
            max_rss = '''MAX(j.mem_req)'''

        sql = '''SELECT j.id_job
            , j.exit_code
            , j.id_group
            , j.id_user
            , j.job_name
            , j.tres_alloc
            , j.partition
            , j.state
            , MIN(j.time_start) AS time_start
            , MAX(j.time_end) AS time_end
            , SUM(j.time_suspended) AS time_suspended
            , SUM(CASE WHEN j.time_end < j.time_start + j.time_suspended
                       THEN 0
                       ELSE j.time_end - j.time_start - j.time_suspended
                  END) AS wall_time
            , a.acct
            , a.user
            , %(max_rss)s AS max_rss
            , ( SELECT SUM(s.user_sec) + SUM(s.user_usec/1000000)
                FROM `%(cluster)s_step_table` s
                WHERE s.job_db_inx = j.job_db_inx
              ) AS cpu_user
            , ( SELECT SUM(s.sys_sec) + SUM(s.sys_usec/1000000)
                FROM `%(cluster)s_step_table` s
                WHERE s.job_db_inx = j.job_db_inx
              ) AS cpu_sys
            FROM `%(cluster)s_job_table` as j
            LEFT JOIN `%(cluster)s_assoc_table` AS a ON j.id_assoc = a.id_assoc
            WHERE %(where)s
            GROUP BY j.id_job
                   , j.exit_code
                   , j.id_group
                   , j.id_user
                   , j.job_name
                   , j.tres_alloc
                   , j.partition
                   , j.state
                   , a.acct
                   , a.user
                   , j.job_db_inx
            HAVING %(having)s
            ORDER BY j.time_end
        ''' % {
            'cluster': self._cluster,
            'where': where,
            'having': having,
            'max_rss': max_rss
        }

        DebugPrint(5, "Executing SQL: %s" % sql)
        cursor.execute(sql)

        for r in cursor:
            # Add handy data to job record
            r['cluster'] = self._cluster

            # Extract cpus_alloc from tres_alloc
            tres = self._parse_tres(r['tres_alloc'])
            r['cpus_alloc'] = tres.get(1, 0)  # tres_types_t.TRES_CPU = 1

            self._addUserInfoIfMissing(r)
            yield r
Example #25
 def _get_version(self,
                  rpm_package_name=None,
                  version_command=None,
                  version_command_filter=None):
     """Get program version looking in order for:
     0. self._version (caching the value form previous executions)
     1. rpm -q
     2. the output (stdout only, not stderr) of version_command filtered by version_command_filter
     3. the value in the config file (stored in self._static_info['version']
     This is a protected method
     """
     DebugPrint(
         5, "Called get_version (%s, %s; %s, %s, %s)" %
         (self._version, self._static_info['version'], rpm_package_name,
          version_command, version_command_filter))
     if self._version:
         return self._version
     if rpm_package_name:
         # Use RPM version, as specified in
         # http://fedoraproject.org/wiki/Packaging%3aNamingGuidelines#Package_Versioning
         # rpm --queryformat "%{NAME} %{VERSION} %{RELEASE} %{ARCH}" -q
         # %% to escape %
         fd = os.popen(
             'rpm --queryformat "%%{NAME} %%{VERSION} %%{RELEASE} %%{ARCH}" -q %s'
             % rpm_package_name)
         version = fd.read()
         if fd.close():
             DebugPrint(
                 4, "Unable to invoke rpm to retrieve the %s version" %
                 rpm_package_name)
             #raise Exception("Unable to invoke rpm to retrieve version")
         else:
             rpm_version_re = re.compile(r"^(.*)\s+(.*)\s+(.*)\s+(.*)$")
             m = rpm_version_re.match(version.strip())
             if m:
                 self._version = "%s-%s" % (m.groups()[1], m.groups()[2])
                 return self._version
             DebugPrint(
                 4, "Unable to parse the %s version from 'rpm -q'" %
                 rpm_package_name)
     if version_command:
         # Use version command
         fd = os.popen(version_command)
         version = fd.read()
         if fd.close():
             DebugPrint(
                 4, "Unable to invoke '%s' to retrieve the version" %
                 version_command)
             #raise Exception("Unable to invoke command")
         else:
             if version_command_filter:
                 version = version_command_filter(version.strip())
             if version:
                 self._version = version
                 return self._version
             DebugPrint(
                 4,
                 "Unable to parse the version from '%s'" % version_command)
     # If the others fail, try the version attribute
     retv = self._static_info['version']
     if not retv:
         DebugPrint(
             2, "Unable to retrieve the ProbeInput (%s) version" %
             type(self).__name__)
         # raise Exception("Unable to parse condor_version output: %s" % version)
         return ProbeInput.UNKNOWN
     self._version = retv
     return retv
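
A hedged usage sketch for a SLURM probe (the package and command names are illustrative, not necessarily what the probe ships with):

    version = self._get_version(
        rpm_package_name='slurm',
        version_command='sbatch --version',
        # e.g. turns "slurm 15.08.2" into "15.08.2"
        version_command_filter=lambda out: out.split()[-1])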
Example #26
class PgInput(DbInput):
    """PostgreSQL input.
    Database name, host, user are mandatory parameters. Port (5432) and password are optional

    Type conversion is done by psycopg2 (http://initd.org/psycopg/docs/usage.html)::

        Python                  PostgreSQL
        ------                  ----------
        None                    NULL
        bool                    bool
        float                   real, double
        int                     smallint
        long                    integer, bigint
        Decimal                 numeric
        str                     varchar
        unicode                 text
        buffer, memoryview, bytearray, bytes, Buffer protocol   bytea
        date                    date
        time                    time
        datetime                timestamp, timestamptz
        timedelta               interval
        list                    ARRAY
        tuple, namedtuple       Composite types
        dict                    hstore
        Psycopg's Range         range
        Anything(TM)            json
        uuid                    uuid
    """
    def __init__(self, conn=None):
        DbInput.__init__(self)
        # psycopg2 2.4 or greater supports itersize, so an iterable named cursor
        # does not fetch only one row at a time
        self.support_itersize = True
        self._cursor = None
        if conn:
            self._connection = conn
        else:
            self._connection = None

    def open_db_conn(self):
        """Return a database connection"""
        #  PG Defaults in libpq connection string / dsn parameters:
        #  DbUser,user: same as UNIX user
        #  DbName,dbname: DbUser
        #  DbHost,host: UNIX socket
        #  DbPort,port: 5432
        # Other optional PG parameters:
        # http://www.postgresql.org/docs/current/static/libpq-connect.html#LIBPQ-PARAMKEYWORDS
        dburl = 'dbname=%s user=%s host=%s' % (self._static_info['DbName'],
                                               self._static_info['DbUser'],
                                               self._static_info['DbHost'])
        if self._static_info['DbPort']:
            dburl += ' port=%s' % self._static_info['DbPort']
        if self._static_info['DbPassword']:
            dburl += ' password=%s' % self._static_info['DbPassword']
        DebugPrint(4, "Connecting to PgSQL database: %s" % dburl)
        try:
            self._connection = psycopg2.connect(dburl)
            self._cursor = self._get_cursor(self._connection)
        except Exception:
            tblist = traceback.format_exception(*sys.exc_info())
            errmsg = 'Failed to connect to %s:\n%s' % (dburl,
                                                       "\n".join(tblist))
            DebugPrint(1, errmsg)
            raise
            # Masking connection failure
            #self._connection = None
        return self._connection

    def _get_cursor(self, connection, buffer_size=None):
        """Return a cursor for the given connection

        :param connection: PG connection
        :param buffer_size: size used when fetching resultsets (None for the default one)
        :return: cursor
        """
        # give the cursor a unique name which will invoke server side cursors
        # TODO: should this be unique each time or for input?
        cursor = connection.cursor(name='cur%s' %
                                   str(uuid.uuid4()).replace('-', ''),
                                   cursor_factory=psycopg2.extras.DictCursor)
        #cursor.tzinfo_factory = None
        if not buffer_size:
            cursor.arraysize = self._max_select_mem()
        else:
            cursor.arraysize = buffer_size
        try:
            cursor.itersize = cursor.arraysize
        except AttributeError:
            self.support_itersize = False
        return cursor

    def close_db_conn(self):
        """Explicitly close the connection.
        Connection is closed automatically at del
        """
        # NOTE: uncommitted operations are rolled back but inputs are read only
        if self._connection is not None:
            if self._cursor is not None:
                try:
                    self._cursor.close()
                except psycopg2.InterfaceError:
                    # was already closed
                    pass
                self._cursor = None
            try:
                self._connection.close()
            except psycopg2.InterfaceError:
                # was already closed
                pass
            self._connection = None

    def status_ok(self):
        """Return True if OK, False if the connection is closed"""
        if self._connection is None or self._cursor is None:
            return False
        # TODO: do a select 1 test? The only way to really test
        # try:
        #    self._cursor.execute("SELECT 1")
        #    return True
        #except:
        #    return False
        return True

    def status_string(self):
        """Return a string describing the current status"""
        if self._connection is None:
            return "NOT CONNECTED"
        if self._cursor is None:
            return "NO CURSOR"
        retv = "CONNECTED"
        # The STATUS_* constants describe connection.status, not the cursor
        trans_status = self._connection.status
        trans_string = ""
        if trans_status == psycopg2.extensions.STATUS_READY:
            trans_string = "STATUS_READY"
        elif trans_status == psycopg2.extensions.STATUS_BEGIN:
            trans_string = "STATUS_BEGIN"
        elif trans_status == psycopg2.extensions.STATUS_IN_TRANSACTION:
            trans_string = "STATUS_IN_TRANSACTION"
        elif trans_status == psycopg2.extensions.STATUS_PREPARED:
            trans_string = "STATUS_PREPARED"
        if trans_status is not None:
            retv = "%s (%s/%s)" % (retv, trans_status, trans_string)
        return retv

    def query(self, sql):
        """Generator returning one row at the time as pseudo-dictionary (DictCursor).

        psycopg2.extras.DictCursor is a tuple, accessible by indexes and returned as
        values, not keys, in a loop (for i in row) but row.keys() lists the columns
        and row['column_name'] accesses the column.
        It is compatible w/ standard cursors
        For proper dictionary see psycopg2.extras.RealDictCursor
        NOTE that the values are not mutable (cannot be changed)

        :param sql: string w/ the SQL query
        :return: row as psycopg2.extras.DictCursor (tuple and dictionary)
        """
        if not sql:
            DebugPrint(2, "WARNING: No SQL provided: no query.")
            return
        if not self._connection:
            DebugPrint(
                4,
                "WARNING: No connection provided: trying to (re)open connection."
            )
            if not self.open_db_conn():
                DebugPrint(2, "WARNING: Unable to open connection: no query.")
                return
        if not self._cursor:
            self._cursor = self._get_cursor(self._connection)
            if not self._cursor:
                DebugPrint(2, "WARNING: Unable to get cursor: no query.")
                return
        cursor = self._cursor
        DebugPrint(4, "Executing SQL: %s" % sql)
        try:
            cursor.execute(sql)
        except psycopg2.ProgrammingError as er:
            DebugPrint(2, "ERROR, error running the query: %s" % er)
        if cursor.rowcount is None:
            DebugPrint(2, "WARNING, problems running the query: %s" % sql)
        elif cursor.rowcount <= 0:
            DebugPrint(
                3,
                "WARNING, no rows returned by the query (rowcount: %s). OK for iterators."
                % cursor.rowcount)
        # resultset = self._cur.fetchall()
        if self.support_itersize:
            for r in cursor:
                yield r
        else:
            # implement itersize manually (for psycopg < 2.4)
            # normal iteration would be inefficient, fetching one record at a time
            while True:
                resultset = cursor.fetchmany()
                if not resultset:
                    break
                for r in resultset:
                    yield r
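
A hedged end-to-end sketch of PgInput (the connection parameters are illustrative; add_static_info is shown in Example #13 above, and DbInput.__init__, not shown here, is assumed to set up self._static_info):

    pg = PgInput()
    pg.add_static_info({'DbName': 'gratia', 'DbUser': 'gratia',
                        'DbHost': 'localhost', 'DbPort': '',
                        'DbPassword': ''})
    pg.open_db_conn()
    try:
        for row in pg.query('SELECT 1 AS one'):
            print(row['one'])
    finally:
        pg.close_db_conn()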