def register_gratia(self, name):
    Gratia.RegisterReporter(name)

    try:
        slurm_version = self.get_slurm_version()
    except Exception as e:
        DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
        raise

    Gratia.RegisterService("SLURM", slurm_version)
    Gratia.setProbeBatchManager("slurm")
def __init__(self):
    try:
        self.opts, self.args = self.parse_opts()
    except Exception as e:
        print(e, file=sys.stderr)
        sys.exit(1)

    # Initialize Gratia
    if not self.opts.gratia_config or not os.path.exists(self.opts.gratia_config):
        raise Exception("Gratia config, %s, does not exist." %
                        self.opts.gratia_config)
    Gratia.Initialize(self.opts.gratia_config)

    if self.opts.verbose:
        Gratia.Config.set_DebugLevel(5)

    # Sanity checks for the probe's runtime environment.
    GratiaWrapper.CheckPreconditions()

    if self.opts.sleep:
        rnd = random.randint(1, int(self.opts.sleep))
        DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
        time.sleep(rnd)

    # Make sure we have an exclusive lock for this probe.
    GratiaWrapper.ExclusiveLock()

    self.register_gratia("slurm_meter")

    # Find the checkpoint filename (if enabled)
    if self.opts.checkpoint:
        checkpoint_file = os.path.join(Gratia.Config.get_WorkingFolder(),
                                       "checkpoint")
    else:
        checkpoint_file = None

    # Open the checkpoint file
    self.checkpoint = SlurmCheckpoint(checkpoint_file)

    # Only process DataFileExpiration days of history
    # (unless we're resuming from a checkpoint file)
    if self.checkpoint.val is None:
        self.checkpoint.val = int(time.time() -
                                  (Gratia.Config.get_DataFileExpiration() * 86400))

    # Connect to database
    self.conn = self.get_db_conn()

    self.cluster = Gratia.Config.getConfigAttribute('SlurmCluster')

    # SLURM made changes to the accounting database schema
    slurm_version = self.get_slurm_version()
    if LooseVersion(slurm_version) < LooseVersion("15.08.0"):
        # Original schema
        self.sacct = SlurmAcct_v1(self.conn, self.cluster, slurm_version)
    else:
        # Added TRES (Trackable resources) in 15.08.0pre5
        self.sacct = SlurmAcct_v2(self.conn, self.cluster, slurm_version)
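# A minimal sketch (not part of the probe) of the schema gate above:
# LooseVersion orders dotted version strings numerically, so any SLURM
# release before 15.08.0 selects the original SlurmAcct_v1 schema.
from distutils.version import LooseVersion

assert LooseVersion("14.11.9") < LooseVersion("15.08.0")    # -> SlurmAcct_v1
assert LooseVersion("15.08.2") >= LooseVersion("15.08.0")   # -> SlurmAcct_v2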
def __init__(self, cp):
    global has_gratia
    global Gratia
    global StorageElement
    global StorageElementRecord
    if not has_gratia:
        try:
            Gratia = __import__("Gratia")
            StorageElement = __import__("StorageElement")
            StorageElementRecord = __import__("StorageElementRecord")
            has_gratia = True
        except:
            raise
    if not has_gratia:
        print "Unable to import Gratia and Storage modules!"
        sys.exit(1)
    Gratia.Initialize()
    try:
        if Gratia.Config.get_SiteName().lower().find('generic') >= 0:
            Gratia.Config.setSiteName(socket.getfqdn())
    except:
        pass
    try:
        if Gratia.Config.get_ProbeName().lower().find('generic') >= 0:
            Gratia.Config.setProbeName('dCache-storage:%s' % socket.getfqdn())
    except:
        pass
def process_record(self, record):
    # TODO: yield the value for processing to gratia ()
    # logfile attribute (if present) is used to keep track of and delete files
    DebugPrint(5, "Creating JUR for %s" % record)

    # Filter out uninteresting records (and remove their files)
    if False:
        if 'gratia_logfile' in record:
            DebugPrint(1, 'Deleting transient record file: ' +
                       record['gratia_logfile'])
            file_utils.RemoveFile(record['gratia_logfile'])
        raise IgnoreRecordException("Ignoring record.")

    # Define the record
    # UsageRecord is defined in
    # https://twiki.opensciencegrid.org/bin/view/Accounting/ProbeDevelopement
    # setters have the name of the attribute
    # Set resource type (Batch, BatchPilot, GridMonitor, Storage, ActiveTape)
    resource_type = "Batch"
    r = Gratia.UsageRecord(resource_type)

    # fill r using the values in record

    # remember to specify the transient file (that will be removed if the
    # record is acquired successfully)
    if 'gratia_logfile' in record:
        r.AddTransientInputFile(record['gratia_logfile'])

    return r
def register_gratia(self, name):
    Gratia.RegisterReporter(name)

    try:
        slurm_version = self.get_slurm_version()
    except Exception, e:
        DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
        raise

    Gratia.RegisterService("SLURM", slurm_version)
    Gratia.setProbeBatchManager("slurm")
def register_gratia(self):
    """Register in Gratia the Reporter (gratia probe), ReporterLibrary
    (Gratia library version) and the Service (input)

    :return:
    """
    Gratia.RegisterReporter(self.probe_name)

    try:
        input_version = self.get_version()
    except SystemExit:
        raise
    except KeyboardInterrupt:
        raise
    except Exception as e:
        DebugPrint(0, "Unable to get input version: %s" % str(e))
        raise

    # TODO: check the meaning of RegisterReporter vs RegisterService
    Gratia.RegisterService(self._probeinput.get_name(), input_version)
def _processDBRow(self, row):
    """
    Completely process a single DB row.  Take the row, convert it to a
    UsageRecord, and send it up to Gratia.  Process any recoverable errors
    which occurred during the process.

    Note we skip a row if it is an intra-site transfer and we are
    instructed not to send them.

    Otherwise, we process the row in Gratia or exit the probe.

    @return: The number of jobs in this row, regardless of whether we sent
        them successfully or not.
    """
    # Skip intra-site transfers if required
    if self._skipIntraSiteXfer(row):
        return row['njobs']

    if TestContainer.isTest():
        if self._summarize:
            TestContainer.sendInterrupt(15)
        return TestContainer.processRow(row, self._log)

    usageRecord = self._convertBillingInfoToGratiaUsageRecord(row)

    # Send to gratia, and see what it says.
    response = Gratia.Send(usageRecord)
    baseMsg = "Record: %s, %s, njobs %i" % (str(row['datestamp']),
                                            row['transaction'], row['njobs'])
    if response == "Fatal Error: too many pending files":
        # The server is currently not accepting records and Gratia.py was
        # not able to store the record, so we will need to resend it.
        # For now take a long nap, after which the caller retries this record.
        self._log.error("Error sending : too many pending files")
        longsleep = 15 * 60
        self._log.warn("sleeping for = %i seconds." % longsleep)
        sleep_check(longsleep, self._stopFileName)
    elif response.startswith('Fatal Error') or \
            response.startswith('Internal Error'):
        self._log.critical('error sending ' + baseMsg +
                           '\ngot response ' + response)
        sys.exit(2)
    self._log.debug('sent ' + baseMsg)

    # If we got a non-fatal error, slow down since the server
    # might be overloaded.
    if response[:2] != 'OK':
        self._log.error('error sending ' + baseMsg +
                        '\ngot response ' + response)
    return row['njobs']
def GetRecord(jobid=0):
    """ Create a sample Gratia record """
    record = Gratia.UsageRecord('Batch')
    record.LocalUserId('cmsuser000')
    record.GlobalUsername('john ainsworth')
    record.DN('CN=john ainsworth, L=MC, OU=Manchester, O=eScience, C=UK')
    record.LocalJobId('PBS.1234.0bad')
    record.LocalJobId('PBS.1234.' + str(jobid))  # overwrite the previous entry
    record.JobName('cmsreco', 'this is not a real job name')
    record.Charge('1240')
    record.Status('4')
    record.Status(4)
    record.Njobs(3, 'Aggregation over 10 days')
    record.Network(3.5, 'Gb', 30, 'total')
    # record.Disk(3.5, "Gb", 13891, "max")
    # record.Memory(650000, "KB", "min")
    # record.Swap(1.5, "GB", "max")
    record.ServiceLevel('BottomFeeder', 'QOS')
    record.TimeDuration(24, 'submit')
    record.TimeInstant('2005-11-02T15:48:39Z', 'submit')
    record.WallDuration(6000 * 3600 * 25 + 63 * 60 + 21.2,
                        'Was entered in seconds')
    record.CpuDuration('PT23H12M1.75S', 'user', 'Was entered as text')
    record.CpuDuration('PT12M1.75S', 'sys', 'Was entered as text')
    record.NodeCount(3)  # default to total
    record.Processors(3, .75, 'total')
    record.StartTime(1130946550, 'Was entered in seconds')
    record.EndTime('2005-11-03T17:52:55Z', 'Was entered as text')
    record.MachineName('flxi02.fnal.gov')
    record.SubmitHost('patlx7.fnal.gov')
    record.Host('flxi02.fnal.gov', True)
    record.Queue('CepaQueue')
    record.ProjectName('cms reco')
    record.AdditionalInfo('RemoteWallTime', 94365)
    record.Resource('RemoteCpuTime', 'PT23H')
    return record
def GetRecord(jobid=0):
    r = Gratia.UsageRecord("Batch")
    r.LocalUserId("cmsuser000")
    r.GlobalUsername("john ainsworth")
    r.DN("CN=john ainsworth, L=MC, OU=Manchester, O=eScience, C=UK")
    r.LocalJobId("PBS.1234.0bad")
    r.LocalJobId("PBS.1234.5." + str(jobid))  # overwrite the previous entry
    r.JobName("cmsreco", "this is not a real job name")
    r.Charge("1240")
    r.Status("4")
    r.Status(4)
    r.Njobs(3, "Aggregation over 10 days")
    r.Network(3.5, "Gb", 30, "total")
    # r.Disk(3.5, "Gb", 13891, "max")
    # r.Memory(650000, "KB", "min")
    # r.Swap(1.5, "GB", "max")
    r.ServiceLevel("BottomFeeder", "QOS")
    r.TimeDuration(24, "submit")
    r.TimeInstant("2005-11-02T15:48:39Z", "submit")
    r.WallDuration(6000 * 3600 * 25 + 63 * 60 + 21.2, "Was entered in seconds")
    r.CpuDuration("PT23H12M1.75S", "user", "Was entered as text")
    r.CpuDuration("PT12M1.75S", "sys", "Was entered as text")
    r.NodeCount(3)  # default to total
    r.Processors(3, .75, "total")
    r.StartTime(1130946550, "Was entered in seconds")
    r.EndTime("2005-11-03T17:52:55Z", "Was entered as text")
    r.MachineName("flxi02.fnal.gov")
    r.SubmitHost("patlx7.fnal.gov")
    r.Host("flxi02.fnal.gov", True)
    r.Queue("CepaQueue")
    r.ProjectName("cms reco")
    r.AdditionalInfo("RemoteWallTime", 94365)
    r.Resource("RemoteCpuTime", "PT23H")
    return r
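# Sketch of how a sample record like the ones above might be submitted.
# Gratia.Send() returns a status string (responses starting with 'OK'
# indicate success); _processDBRow elsewhere in this section shows fuller
# response handling.
for jobid in range(3):
    response = Gratia.Send(GetRecord(jobid))
    print("job %d -> %s" % (jobid, response))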
def process_record(self, record):
    # TODO: yield the value for processing to gratia ()
    # logfile attribute (if present) is used to keep track of and delete files
    DebugPrint(5, "Creating JUR for %s" % record)

    # Filter out uninteresting records (and remove their files)
    if False:
        if 'gratia_logfile' in record:
            DebugPrint(1, 'Deleting transient record file: ' +
                       record['gratia_logfile'])
            file_utils.RemoveFile(record['gratia_logfile'])
        raise IgnoreRecordException("Ignoring record.")

    # Define the record
    # UsageRecord is defined in
    # https://twiki.opensciencegrid.org/bin/view/Accounting/ProbeDevelopement
    # setters have the name of the attribute
    # Set resource type (Batch, BatchPilot, GridMonitor, Storage, ActiveTape)
    resource_type = "Batch"
    r = Gratia.UsageRecord(resource_type)

    # fill r using the values in record

    # remember to specify the transient file (that will be removed if the
    # record is acquired successfully)
    if 'gratia_logfile' in record:
        r.AddTransientInputFile(record['gratia_logfile'])

    return r
    # TODO: end of part to remove

#############################################################
# Some references
# http://seann.herdejurgen.com/resume/samag.com/html/v11/i04/a6.htm
# http://stackoverflow.com/questions/14863224/efficient-reading-of-800-gb-xml-file-in-python-2-7
# http://radimrehurek.com/2014/03/data-streaming-in-python-generators-iterators-iterables/
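# One way the template above could be driven (sketch only): get_records() is
# a hypothetical input iterator; IgnoreRecordException and process_record
# come from the probe code above.
for record in probeinput.get_records():
    try:
        r = probe.process_record(record)
    except IgnoreRecordException:
        continue
    Gratia.Send(r)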
lines = fd.readlines()
boincjob = False
for var2 in lines:
    if var2.count("QDate") > 0:
        starttime = var2.split()[2]
    elif var2.count("RemoteWallClockTime") > 0:
        walltime = var2.split()[2]
    elif var2.count("CompletionDate") > 0:
        endtime = var2.split()[2]
    elif var2.count("Owner") > 0:
        if var2.split()[2] == '"boinc"':
            boincjob = True

if boincjob:
    Gratia.setProbeBatchManager("Condor")
    Gratia.Initialize()

    r = Gratia.UsageRecord("Condor")
    r.ResourceType("Backfill")

    # parsing the filenames for the hostname/localjobid.
    # the files are in the format: history.<hostname>#<localjobid>#1#<localjobid>
    host = var.partition(".")[2].partition("#")[0]
    localjobid = var.partition(".")[2].partition("#")[2].partition("#")[0]

    # print 'endtime: ' + endtime
    # print 'starttime: ' + starttime
    # print 'walltime: ' + walltime

    # Gratia likes ints, not strings, for times.
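    # Possible continuation (sketch only): convert the parsed strings to
    # ints, fill the record with the setters shown in the GetRecord
    # examples above, and send it.
    r.LocalJobId(localjobid)
    r.MachineName(host)
    r.StartTime(int(starttime))          # QDate, epoch seconds
    r.EndTime(int(endtime))              # CompletionDate, epoch seconds
    r.WallDuration(int(float(walltime)), "Was entered in seconds")
    Gratia.Send(r)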
def _convertBillingInfoToGratiaUsageRecord(self, row):
    """
    Take a record returned from the database and convert it to a Gratia
    UsageRecord.

    @param row: A dictionary-like object describing the Billing DB entry.
    @return: UsageRecord equivalent to the input row
    """
    # Convert date to UTC.  This can't be done perfectly, alas, since we
    # don't have the original timezone; we assume localtime.
    # row['datestamp'] should be a datetime.datetime object.

    # make the time into a float
    fltTime = time.mktime(row['datestamp'].timetuple())
    startTime = time.strftime('%Y-%m-%dT%H:%M:%S', time.gmtime(fltTime))
    # NOTE WELL: we need the time accurate to milliseconds, so we append
    # the microseconds to the UTC time.
    startTime = startTime + "." + \
        locale.format("%06d", row['datestamp'].microsecond) + "Z"

    # convert the connection time in milliseconds to a decimal in seconds
    connectTime = float(row['connectiontime']) / 1000.0
    connectionTimeStr = 'PT' + str(connectTime) + 'S'

    # Check for the link to the doorinfo table being bad and log a
    # warning in the hope that somebody notices a bug has crept in.
    if row['doorlink'] == '<undefined>' and \
            not row['protocol'].startswith('DCap'):
        self._log.warn('billinginfo record with datestamp ' + startTime +
                       ' contained undefined initiator field')

    # Work out the end points of the data transfer.
    thisHost = str(row['cellname']) + '@' + self._dCacheSvrHost
    if row['isnew']:
        srcHost = row['client']
        dstHost = thisHost
        isNew = 1
    else:
        srcHost = thisHost
        dstHost = row['client']
        isNew = 0

    rec = Gratia.UsageRecord('Storage')
    rec.Njobs(row['njobs'])
    rec.AdditionalInfo('Source', srcHost)
    rec.AdditionalInfo('Destination', dstHost)
    rec.AdditionalInfo('Protocol', row['protocol'])
    rec.AdditionalInfo('IsNew', isNew)
    rec.LocalJobId(row['transaction'])
    if row['protocol'].startswith("DCap"):
        rec.Grid("Local")
    else:
        # Set the grid name to the default in the ProbeConfig
        rec.Grid(self._grid)
    rec.StartTime(startTime)
    rec.Network(row['transfersize'], 'b', connectionTimeStr, 'total',
                row['action'])
    rec.WallDuration(connectionTimeStr)

    # only send the initiator if it is known.
    if row['initiator'] != 'unknown':
        rec.DN(row['initiator'])
    # if the initiator host is "unknown", make it "Unknown".
    initiatorHost = row['initiatorhost']
    if initiatorHost == 'unknown':
        initiatorHost = 'Unknown'
    rec.SubmitHost(initiatorHost)
    rec.Status(row['errorcode'])

    # If we include the mapped uid as the local user id, then
    # Gratia will make a best effort to map this to the VO name.
    mappedUID = row['mappeduid']
    mappedGID = row['mappedgid']
    if row['protocol'] == 'NFS4-4.1':
        username = row['initiator']
        rec.LocalUserId(username)
        return rec
    try:
        username = '******'
        if row['initiator'] != 'unknown':
            username = row['initiator']
        if mappedUID is not None and int(mappedUID) >= 0:
            try:
                info = pwd.getpwuid(int(mappedUID))
                username = info[0]
            except:
                try:
                    mtime = os.stat(self._unix_gid_list_file_name).st_mtime
                    if self.__gid_file_mod_time != mtime:
                        self.__gid_file_mod_time = mtime
                        self.__refresh_group_map()
                    username = self.__group_map.get(str(mappedGID))
                    if not username:
                        self._log.warn(
                            "UID %s and GID %s not found locally; make sure "
                            "/etc/passwd or %s on this host and your dCache "
                            "are using the same UIDs,GIDs!"
                            % (str(int(mappedUID)), str(int(mappedGID)),
                               self._unix_gid_list_file_name))
                except:
                    self._log.warn(
                        "UID %s not found locally in /etc/passwd, and %s "
                        "does not exist or is inaccessible"
                        % (str(int(mappedUID)), self._unix_gid_list_file_name))
        rec.LocalUserId(username)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self._log.info("Failed to map UID %s to VO." % mappedUID)
    return rec
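# Illustrative (made-up) billing row showing the keys the converter above
# reads; real rows come from the dCache billing database query:
import datetime

sample_row = {
    'datestamp': datetime.datetime(2011, 5, 2, 15, 48, 39, 123456),
    'connectiontime': 5420,             # milliseconds
    'cellname': 'DCap00-srv',
    'doorlink': 'door-0001',
    'isnew': 1,                         # 1 = write into dCache
    'client': 'client.example.org',
    'njobs': 1,
    'protocol': 'DCap-3.0',
    'transaction': 'pool-0001:transfer',
    'transfersize': 1048576,            # bytes
    'action': 'transfer',
    'initiator': 'unknown',
    'initiatorhost': 'node.example.org',
    'errorcode': 0,
    'mappeduid': 500,
    'mappedgid': 500,
}
# rec = self._convertBillingInfoToGratiaUsageRecord(sample_row)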
    if p.returncode != 0:
        raise Exception("Unable to invoke %s" % cmd)
    name, version = output.split()
    return version

def register_gratia(self, name):
    Gratia.RegisterReporter(name)

    try:
        slurm_version = self.get_slurm_version()
    except Exception, e:
        DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
        raise

    Gratia.RegisterService("SLURM", slurm_version)
    Gratia.setProbeBatchManager("slurm")

class SlurmCheckpoint(object):
    """Read and write a checkpoint file

    If class is instantiated without a filename, class works as expected
    but data is not stored to disk
    """
    _val = None
    _fp = None

    def __init__(self, target=None):
        """Create a checkpoint file
def main():
    # We need the logger variable in the exception handler,
    # so we create it here.
    logger = logging.getLogger('DCacheAggregator')

    # Ignore hangup signals.  We shouldn't die just because our parent
    # shell logs out.
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    # Try to catch common signals and send email before we die
    signal.signal(signal.SIGINT, warn_of_signal)
    signal.signal(signal.SIGQUIT, warn_of_signal)
    signal.signal(signal.SIGTERM, warn_of_signal)

    # Defined before the try block so the shutdown path below still works
    # if we fail before the alarm is configured.
    terminationAlarm = None

    try:
        # Tell Gratia what versions we are using.
        # CHRIS: is there a way to automate the version extraction
        # using the pkg_resource package?
        Gratia.RegisterReporterLibrary("psycopg2", "2.0.6")
        #Gratia.RegisterReporterLibrary( "SQLAlchemy", "0.4.1" )
        rev = Gratia.ExtractCvsRevision("$Revision: 1.13 $")
        tag = Gratia.ExtractCvsRevision("$Name: $")
        Gratia.RegisterReporter("dCacheBillingAggregator.py",
                                str(rev) + " (tag " + str(tag) + ")")

        # BRIAN: attempt to pull the dCache version from RPM.
        version = "UNKNOWN"
        try:
            version = os.popen("rpm -q --qf '%{VERSION}-%{RELEASE}' "
                               "dcache-server").read()
        except:
            pass
        Gratia.RegisterService("dCache", version)

        # Initialize gratia before attempting to read its config file.
        Gratia.Initialize()
        # Extract the configuration information into local variables.
        myconf = dCacheProbeConfig()

        # Get the name of the directory where we are to store the log files.
        logDir = myconf.get_LogFolder()

        # Make sure that the logging directory is present
        if not os.path.isdir(logDir):
            os.mkdir(logDir, 0755)

        logFileName = os.path.join(logDir, "dcacheTransfer.log")

        # Set up an alarm to send an email if the program terminates.
        termSubject = "dCache-transfer probe is going down"
        termMessage = "The dCache transfer probe for Gratia has " + \
                      "terminated.\nPlease check the logfile\n\n " + \
                      logFileName + \
                      "\n\nfor the cause.\n"

        terminationAlarm = Alarm(myconf.get_EmailServerHost(),
                                 myconf.get_EmailFromAddress(),
                                 myconf.get_EmailToList(),
                                 termSubject, termMessage, 0, 0, False)

        # Set up the logger with a suitable format
        hdlr = RotatingFileHandler(logFileName, 'a', 512000, 10)
        formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
        hdlr.setFormatter(formatter)
        logger.addHandler(hdlr)
        logger.setLevel(myconf.get_AggrLogLevel())
        logger.info("starting " + ProgramName)

        stopFileName = myconf.get_StopFileName()
        updateFreq = float(myconf.get_UpdateFrequency())
        logger.warn("update freq = %.2f" % updateFreq)

        # Create the aggregator instance that we will use.
        dataDir = myconf.get_DataFolder()
        aggregator = DCacheAggregator(myconf, dataDir)

        # If profiling was requested, turn it on.
        profiling = sys.argv.count('-profile') > 0
        if profiling:
            profiler = hotshot.Profile("profile.dat")
            logger.info("Enabling Profiling")

        # Now aggregate new records, then sleep, until somebody creates
        # the stop file...
        while 1:
            # Make sure we (still) have a connection to Gratia.
            if not TestContainer.isTest():  # no need for that during self test
                Gratia.Maintenance()

            if profiling:
                profiler.run("aggregator.sendBillingInfoRecordsToGratia()")
            else:
                try:
                    aggregator.sendBillingInfoRecordsToGratia()
                except TestContainer.SimInterrupt:
                    logger.info("BillingRecSimulator.SimInterrupt caught, "
                                "restarting")
                    aggregator = DCacheAggregator(myconf, dataDir)
                    continue
            # Are we shutting down?
            if os.path.exists(stopFileName):
                break

            if TestContainer.isTest():
                break

            logger.warn("sleeping for = %.2f seconds" % updateFreq)
            sleep_check(updateFreq, stopFileName)

        # If we are profiling, print the results...
        if profiling:
            profiler.close()
            stats = hotshot.stats.load("profile.dat")
            stats.sort_stats('time', 'calls')
            stats.print_stats()

        logger.warn(ProgramName + " stop file detected.")
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        # format the traceback into a string
        tblist = traceback.format_exception(sys.exc_type,
                                            sys.exc_value,
                                            sys.exc_traceback)
        msg = ProgramName + " caught an exception:\n" + "".join(tblist)
        print msg
        logger.error(msg)

    TestContainer.dumpStatistics(logger)

    # shut down the logger to make sure nothing is lost.
    logger.critical(ProgramName + " shutting down.")
    logging.shutdown()

    # try to send an email warning of the shutdown.
    if terminationAlarm is not None:
        terminationAlarm.event()

    sys.exit(1)
    name, version = output.split()
    return version

def register_gratia(self, name):
    Gratia.RegisterReporter(name, "%s (tag %s)" %
                            (prog_revision, prog_version))

    try:
        slurm_version = self.get_slurm_version()
    except Exception, e:
        DebugPrint(0, "Unable to get SLURM version: %s" % str(e))
        raise

    Gratia.RegisterService("SLURM", slurm_version)
    Gratia.setProbeBatchManager("slurm")

class SlurmCheckpoint(object):
    """Read and write a checkpoint file

    If class is instantiated without a filename, class works as expected
    but data is not stored to disk
    """
    _val = None
    _fp = None

    def __init__(self, target=None):
        """Create a checkpoint file

        target - checkpoint filename (optionally null)
        """
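# How the probe uses the checkpoint (sketch; the path is hypothetical and
# the 31-day window mirrors the DataFileExpiration default that
# SlurmProbe.__init__ applies):
import time

cp = SlurmCheckpoint("/var/lib/gratia/tmp/checkpoint")  # None = in-memory only
if cp.val is None:
    cp.val = int(time.time() - 31 * 86400)  # resume 31 days back by default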
BASEDIR = '/home/gprobe/Data/'
flist = os.listdir(BASEDIR)

# test file
# file = 'history.COES-MCAD120-1#1256755408#1#1256755408'

# Fields that we're going to populate
starttime = ''
walltime = ''
localjobid = ''
endtime = ''
user = '******'

rev = '$Revision: 3273 $'
Gratia.RegisterReporterLibrary('myprobe.py', Gratia.ExtractSvnRevision(rev))

for var in flist:
    if var.count('history') > 0:
        fd = open(BASEDIR + var)
        lines = fd.readlines()
        boincjob = False
        for var2 in lines:
            if var2.count('QDate') > 0:
                starttime = var2.split()[2]
            elif var2.count('RemoteWallClockTime') > 0:
                walltime = var2.split()[2]
            elif var2.count('CompletionDate') > 0:
                endtime = var2.split()[2]
            elif var2.count('Owner') > 0:
                if var2.split()[2] == '"boinc"':
                    boincjob = True
def send(self, record):
    Gratia.Send(record)
BASEDIR = '/home/gprobe/Data/'
flist = os.listdir(BASEDIR)

# test file
# file = 'history.COES-MCAD120-1#1256755408#1#1256755408'

# Fields that we're going to populate
starttime = ''
walltime = ''
localjobid = ''
endtime = ''
user = '******'

Gratia.RegisterReporterLibrary('myprobe.py')

for var in flist:
    if var.count('history') > 0:
        fd = open(BASEDIR + var)
        lines = fd.readlines()
        boincjob = False
        for var2 in lines:
            if var2.count('QDate') > 0:
                starttime = var2.split()[2]
            elif var2.count('RemoteWallClockTime') > 0:
                walltime = var2.split()[2]
            elif var2.count('CompletionDate') > 0:
                endtime = var2.split()[2]
            elif var2.count('Owner') > 0:
                if var2.split()[2] == '"boinc"':
                    boincjob = True
def start(self):
    """Initializes Gratia (to read the option file), does random sleep (if
    any), acquires the lock, initializes the input and registers Gratia.

    Must be invoked after options and parameters are parsed (the option
    file name is needed).
    """
    ### Initialize Gratia
    if not self._opts or not self._opts.gratia_config or not os.path.exists(
            self._opts.gratia_config):
        # TODO: print a message instead of an exception?
        raise Exception("Gratia config file (%s) does not exist." %
                        self._opts.gratia_config)
    # Print options and initial conditions
    DebugPrint(5, "Initial options: %s" % self._opts)

    # Initialization parses the config file. No debug print will work before this
    Gratia.Initialize(self._opts.gratia_config)

    # Set to verbose in case the config changed it
    self.set_verbose()

    # Sanity checks for the probe's runtime environment.
    if self._opts.enable:
        GratiaWrapper.CheckPreconditions(check_enabled=False)
    else:
        GratiaWrapper.CheckPreconditions()

    if self._opts.sleep:
        rnd = random.randint(1, int(self._opts.sleep))
        DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
        time.sleep(rnd)

    # Make sure we have an exclusive lock for this probe.
    GratiaWrapper.ExclusiveLock()

    ### Initialize input (config file must be available)
    # Input must specify which parameters it requires from the config file.
    # The probe provides static information from the config file.
    if not self._probeinput:
        self._probeinput = ProbeInput()
    input_parameters = self._probeinput.get_init_params()
    input_ini = self.get_config_att_list(input_parameters)
    # Check for test mode: start and other methods may change
    if 'input' in self._opts.test:
        DebugPrint(3, "Running input in test mode")
        self._probeinput.do_test()
    # Finish input initialization, including DB connection (if used)
    self._probeinput.start(input_ini)

    # get_DataFileExpiration() returns the value in the config file or 31
    # TODO: Do we want to always ignore values older than 31 days, or only
    # when checkpointing is enabled?
    # data_expiration = Gratia.Config.get_DataFileExpiration()

    # Find the checkpoint filename (if enabled) - after initializing the input!
    if self._opts.checkpoint:
        checkpoint_file = self.get_config_attribute('CheckpointFile')
        full_checkpoint_name = True
        if not checkpoint_file:
            full_checkpoint_name = False
            checkpoint_file = os.path.join(
                Gratia.Config.get_WorkingFolder(), "checkpoint")
        data_expiration = Gratia.Config.get_DataFileExpiration()
        # Only process DataFileExpiration days of history
        # (unless we're resuming from a checkpoint file)
        # TODO: is DataFileExpiration a maximum value or a default (if no
        # checkpoint is specified)? Do we want both?
        # Open the checkpoint file
        self._probeinput.add_checkpoint(checkpoint_file,
                                        default_val=data_expiration,
                                        fullname=full_checkpoint_name)

    ### Complete Gratia initialization
    # This uses the input version (after Input initialization)
    self.register_gratia()
class SlurmProbe:

    opts = None
    args = None
    checkpoint = None
    conn = None
    cluster = None
    sacct = None

    def __init__(self):
        try:
            self.opts, self.args = self.parse_opts()
        except Exception, e:
            print >> sys.stderr, str(e)
            sys.exit(1)

        # Initialize Gratia
        if not self.opts.gratia_config or not os.path.exists(
                self.opts.gratia_config):
            raise Exception("Gratia config, %s, does not exist." %
                            self.opts.gratia_config)
        Gratia.Initialize(self.opts.gratia_config)

        if self.opts.verbose:
            Gratia.Config.set_DebugLevel(5)

        # Sanity checks for the probe's runtime environment.
        GratiaWrapper.CheckPreconditions()

        if self.opts.sleep:
            rnd = random.randint(1, int(self.opts.sleep))
            DebugPrint(2, "Sleeping for %d seconds before proceeding." % rnd)
            time.sleep(rnd)

        # Make sure we have an exclusive lock for this probe.
        GratiaWrapper.ExclusiveLock()

        self.register_gratia("slurm_meter")

        # Find the checkpoint filename (if enabled)
        if self.opts.checkpoint:
            checkpoint_file = os.path.join(
                Gratia.Config.get_WorkingFolder(), "checkpoint")
        else:
            checkpoint_file = None

        # Open the checkpoint file
        self.checkpoint = SlurmCheckpoint(checkpoint_file)

        # Only process DataFileExpiration days of history
        # (unless we're resuming from a checkpoint file)
        if self.checkpoint.val is None:
            self.checkpoint.val = int(time.time() -
                (Gratia.Config.get_DataFileExpiration() * 86400))

        # Connect to database
        self.conn = self.get_db_conn()

        self.cluster = Gratia.Config.getConfigAttribute('SlurmCluster')
        self.sacct = SlurmAcct(self.conn, self.cluster)