class SimdInsTimeseries(Plugin):
    """ Generate the SIMD instruction usage as timeseries data """

    name = property(lambda x: "simdins")
    mode = property(lambda x: "timeseries")
    requiredMetrics = property(lambda x: [SNB_METRICS, NHM_METRICS])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(SimdInsTimeseries, self).__init__(job)
        self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime)
        self._hostdata = {}
        self._hostdevnames = {}
        self._error = None

    def process(self, nodemeta, timestamp, data, description):
        if len(data[0]) > 0 and data[0][0] == 0:
            # If active == 0 then the PMDA was switched off due to user request
            self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE
            return False

        if len(data[1]) == 0:
            # Ignore timesteps where data was not available
            return True

        hostidx = nodemeta.nodeindex

        if nodemeta.nodeindex not in self._hostdata:
            self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[1])))
            self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[1][0], description[1][1]))

        if len(data) == len(NHM_METRICS):
            flops = numpy.array(data[1])
        else:
            flops = 4.0 * data[1] + 2.0 * data[2] + data[3] + data[4]

        insertat = self._data.adddata(hostidx, timestamp, numpy.sum(flops))
        if insertat is not None:
            self._hostdata[hostidx][insertat] = flops

            if insertat > 1:
                if numpy.any(flops - self._hostdata[hostidx][insertat - 1] < 0.0):
                    self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB
                    return False

        return True

    def results(self):
        if self._error is not None:
            return {"error": self._error}

        values = self._data.get()
        rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0])

        if len(self._hostdata) > 64:
            # Compute min, max & median data and only save the host data
            # for these hosts
            sortarr = numpy.argsort(rates.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], rates),
                "max": self.collatedata(sortarr[:, -1], rates),
                "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], rates),
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
            includelist = uniqhosts.keys()
        else:
            # Save data for all hosts
            retdata = {
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist()
            retdata['hosts'][str(hostidx)]['dev'] = {}

            for devid in self._hostdevnames[hostidx].iterkeys():
                dpnts = len(values[hostidx, :, 0])
                retdata['hosts'][str(hostidx)]['dev'][devid] = (numpy.diff(self._hostdata[hostidx][:dpnts, int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist()

            retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx]

        return retdata

    @staticmethod
    def collatedata(args, rates):
        """ build output data """
        result = []

        for timepoint, hostidx in enumerate(args):
            try:
                result.append([rates[hostidx, timepoint], int(hostidx)])
            except IndexError:
                pass

        return result
class RateConvertingTimeseriesPlugin(Plugin):
    """ A base abstract class for generating a timeseries summary for values
        that should be converted to rates, one per node. The plugin name, list
        of required metrics and generator function must be provided by the
        implementation.
    """
    __metaclass__ = ABCMeta

    mode = property(lambda x: "timeseries")

    def __init__(self, job):
        super(RateConvertingTimeseriesPlugin, self).__init__(job)
        self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime)
        self._hostdata = {}

    @abstractmethod
    def computetimepoint(self, data):
        """ Called with the data for each timepoint on each host """
        pass

    def process(self, nodemeta, timestamp, data, description):
        if nodemeta.nodeindex not in self._hostdata:
            self._hostdata[nodemeta.nodeindex] = 1

        datum = self.computetimepoint(data)
        if datum is not None:
            self._data.adddata(nodemeta.nodeindex, timestamp, datum)

    def results(self):
        if len(self._hostdata) != self._job.nodecount:
            return {"error": ProcessingError.INSUFFICIENT_HOSTDATA}

        values = self._data.get()

        if len(values[0, :, 0]) < 3:
            return {"error": ProcessingError.JOB_TOO_SHORT}

        rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0])

        if len(self._hostdata) > 64:
            # Compute min, max & median data and only save the host data
            # for these hosts
            sortarr = numpy.argsort(rates.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], rates),
                "max": self.collatedata(sortarr[:, -1], rates),
                "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], rates),
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
            includelist = uniqhosts.keys()
        else:
            # Save data for all hosts
            retdata = {
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist()

        return retdata

    @staticmethod
    def collatedata(args, rates):
        """ build output data """
        result = []

        for timepoint, hostidx in enumerate(args):
            try:
                result.append([rates[hostidx, timepoint], int(hostidx)])
            except IndexError:
                pass

        return result
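# A minimal sketch of a concrete RateConvertingTimeseriesPlugin subclass, shown
# only to illustrate how the abstract base class is meant to be used: the
# subclass supplies the plugin name, the required metric list and a
# computetimepoint() that returns one value per node per timestep, and the base
# class handles accumulation and rate conversion in results(). The plugin name
# "netbytes" and the metric "network.interface.total.bytes" are illustrative
# assumptions, not part of the original code.
class NetworkBytesTimeseries(RateConvertingTimeseriesPlugin):
    """ Example: per-node network traffic as a rate timeseries """

    name = property(lambda x: "netbytes")
    requiredMetrics = property(lambda x: ["network.interface.total.bytes"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(NetworkBytesTimeseries, self).__init__(job)

    def computetimepoint(self, data):
        if len(data[0]) == 0:
            # Returning None tells the base class to skip this timestep
            return None
        # Sum the byte counter over all interfaces; results() converts the
        # accumulated counters to rates
        return numpy.sum(data[0])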
class GpuUsageTimeseries(Plugin):
    """ Generate the GPU usage as timeseries data """

    name = property(lambda x: "gpu_usage")
    mode = property(lambda x: "timeseries")
    requiredMetrics = property(lambda x: ["nvidia.gpuactive"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(GpuUsageTimeseries, self).__init__(job)
        self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime)
        self._hostdata = {}
        self._hostdevnames = {}

    def process(self, nodemeta, timestamp, data, description):
        hostidx = nodemeta.nodeindex

        if len(data[0]) == 0:
            # Skip data point with no data
            return True

        if nodemeta.nodeindex not in self._hostdata:
            self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0])))
            self._hostdevnames[hostidx] = dict((str(k), str(v)) for k, v in zip(description[0][0], description[0][1]))

        avg_usage = numpy.mean(data[0])
        insertat = self._data.adddata(hostidx, timestamp, avg_usage)
        if insertat is not None:
            self._hostdata[hostidx][insertat] = data[0]

        return True

    def results(self):
        values = self._data.get()

        if len(self._hostdata) > 64:
            # Compute min, max & median data and only save the host data
            # for these hosts
            memdata = values[:, :, 1]
            sortarr = numpy.argsort(memdata.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], memdata),
                "max": self.collatedata(sortarr[:, -1], memdata),
                "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], memdata),
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
            includelist = uniqhosts.keys()
        else:
            # Save data for all hosts
            retdata = {
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist()
            retdata['hosts'][str(hostidx)]['dev'] = {}

            for devid in self._hostdevnames[hostidx].iterkeys():
                dpnts = len(values[hostidx, :, 0])
                retdata['hosts'][str(hostidx)]['dev'][devid] = self._hostdata[hostidx][:dpnts, int(devid)].tolist()

            retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx]

        return retdata

    @staticmethod
    def collatedata(args, rates):
        """ build output data """
        result = []

        for timepoint, hostidx in enumerate(args):
            try:
                result.append([rates[hostidx, timepoint], int(hostidx)])
            except IndexError:
                pass

        return result
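# A minimal sketch of how one of these timeseries plugins is driven, assuming
# the caller supplies a job object (with nodecount and walltime) and, per
# timestep, a nodemeta object exposing a nodeindex attribute plus the data and
# description structures described above. FakeNodeMeta and summarize_gpu() are
# illustrative only and not part of the original summarization harness.
import collections

FakeNodeMeta = collections.namedtuple('FakeNodeMeta', ['nodeindex'])

def summarize_gpu(job, samples):
    """ samples: iterable of (nodeindex, timestamp, data, description) tuples """
    plugin = GpuUsageTimeseries(job)
    for nodeindex, timestamp, data, description in samples:
        plugin.process(FakeNodeMeta(nodeindex), timestamp, data, description)
    return plugin.results()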
class CgroupMemTimeseries(Plugin):
    """ Generate timeseries summary for memory usage viewed from the cgroup.
        This plugin supports both the Slurm and the PBS/Torque cgroup naming
        conventions.
    """

    name = property(lambda x: "process_mem_usage")
    mode = property(lambda x: "timeseries")
    requiredMetrics = property(lambda x: ["cgroup.memory.usage"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(CgroupMemTimeseries, self).__init__(job)
        self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime)
        self._hostdata = {}
        self._hostcounts = {}
        if job.acct['resource_manager'] == 'pbs':
            self._expectedcgroup = "/torque/{0}".format(job.job_id)
        elif job.acct['resource_manager'] == 'slurm':
            self._expectedcgroup = "/slurm/uid_{0}/job_{1}".format(job.acct['uid'], job.job_id)
        else:
            raise NotApplicableError

    def process(self, nodemeta, timestamp, data, description):
        hostidx = nodemeta.nodeindex

        if len(data[0]) == 0:
            # Skip data point with no data
            return True

        if nodemeta.nodeindex not in self._hostdata:
            self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, 1))
            self._hostcounts[hostidx] = {'missing': 0, 'present': 0}

        dataidx = None
        for idx, desc in enumerate(description[0][1]):
            if re.match(r"^" + re.escape(self._expectedcgroup) + r"($|\.)", desc):
                dataidx = idx
                break

        if dataidx is None:
            # No cgroup info at this datapoint
            self._hostcounts[hostidx]['missing'] += 1
            return True

        nodemem_gb = data[0][dataidx] / 1073741824.0
        self._hostcounts[hostidx]['present'] += 1

        insertat = self._data.adddata(hostidx, timestamp, nodemem_gb)
        if insertat is not None:
            self._hostdata[hostidx][insertat] = nodemem_gb

        return True

    def results(self):
        if len(self._hostdata) != self._job.nodecount:
            return {'error': ProcessingError.RAW_COUNTER_UNAVAILABLE}

        for hcount in self._hostcounts.itervalues():
            if hcount['missing'] > hcount['present']:
                return {'error': ProcessingError.CPUSET_UNKNOWN}

        values = self._data.get()

        if len(self._hostdata) > 64:
            # Compute min, max & median data and only save the host data
            # for these hosts
            memdata = values[:, :, 1]
            sortarr = numpy.argsort(memdata.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], memdata),
                "max": self.collatedata(sortarr[:, -1], memdata),
                "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], memdata),
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
            includelist = uniqhosts.keys()
        else:
            # Save data for all hosts
            retdata = {
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist()

        return retdata

    @staticmethod
    def collatedata(args, rates):
        """ build output data """
        result = []

        for timepoint, hostidx in enumerate(args):
            try:
                result.append([rates[hostidx, timepoint], int(hostidx)])
            except IndexError:
                pass

        return result
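# Illustrative only: the uid, job id and instance names below are made up to
# show what the expected-cgroup match in CgroupMemTimeseries.process() accepts.
# The pattern matches the job's own cgroup exactly, or with a suffix starting
# with '.', but not a different job id that merely shares a prefix.
import re

_example_expected = "/slurm/uid_1000/job_123"
_example_pattern = r"^" + re.escape(_example_expected) + r"($|\.)"

assert re.match(_example_pattern, "/slurm/uid_1000/job_123")
assert re.match(_example_pattern, "/slurm/uid_1000/job_123.scope")
assert not re.match(_example_pattern, "/slurm/uid_1000/job_1234")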
class SlurmCgroupMemTimeseries(Plugin):
    """ Generate timeseries summary for memory usage viewed from the cgroup.
        This code is SLURM-specific because of the SLURM cgroup naming convention.
    """

    name = property(lambda x: "process_mem_usage")
    mode = property(lambda x: "timeseries")
    requiredMetrics = property(lambda x: ["cgroup.memory.usage"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(SlurmCgroupMemTimeseries, self).__init__(job)
        self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime)
        self._hostdata = {}
        self._hostcounts = {}
        self._expectedcgroup = "/slurm/uid_{0}/job_{1}".format(job.acct['uid'], job.job_id)

    def process(self, nodemeta, timestamp, data, description):
        hostidx = nodemeta.nodeindex

        if len(data[0]) == 0:
            # Skip data point with no data
            return True

        if nodemeta.nodeindex not in self._hostdata:
            self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, 1))
            self._hostcounts[hostidx] = {'missing': 0, 'present': 0}

        try:
            dataidx = description[0][1].index(self._expectedcgroup)
            nodemem_gb = data[0][dataidx] / 1073741824.0
            self._hostcounts[hostidx]['present'] += 1
        except ValueError:
            # No cgroup info at this datapoint
            self._hostcounts[hostidx]['missing'] += 1
            return True

        insertat = self._data.adddata(hostidx, timestamp, nodemem_gb)
        if insertat is not None:
            self._hostdata[hostidx][insertat] = nodemem_gb

        return True

    def results(self):
        if len(self._hostdata) != self._job.nodecount:
            return {'error': ProcessingError.RAW_COUNTER_UNAVAILABLE}

        for hcount in self._hostcounts.itervalues():
            if hcount['missing'] > hcount['present']:
                return {'error': ProcessingError.CPUSET_UNKNOWN}

        values = self._data.get()

        if len(self._hostdata) > 64:
            # Compute min, max & median data and only save the host data
            # for these hosts
            memdata = values[:, :, 1]
            sortarr = numpy.argsort(memdata.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], memdata),
                "max": self.collatedata(sortarr[:, -1], memdata),
                "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], memdata),
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
            includelist = uniqhosts.keys()
        else:
            # Save data for all hosts
            retdata = {
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist()

        return retdata

    @staticmethod
    def collatedata(args, rates):
        """ build output data """
        result = []

        for timepoint, hostidx in enumerate(args):
            try:
                result.append([rates[hostidx, timepoint], int(hostidx)])
            except IndexError:
                pass

        return result
class PowerUsageTimeseries(Plugin):
    """ Generate the power usage as timeseries data """

    name = property(lambda x: "power")
    mode = property(lambda x: "timeseries")
    requiredMetrics = property(lambda x: ["ipmi.dcmi.power"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(PowerUsageTimeseries, self).__init__(job)
        self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime)
        self._hostdata = {}

    @staticmethod
    def computetimepoint(data):
        """ Get the power usage from the data """
        if data[0][0] < numpy.finfo(numpy.float64).eps:
            return None

        return data[0][0]

    def process(self, nodemeta, timestamp, data, description):
        if not data[0]:
            # Skip data point with no data
            return True

        if nodemeta.nodeindex not in self._hostdata:
            self._hostdata[nodemeta.nodeindex] = 1

        datum = self.computetimepoint(data)
        if datum is not None:
            self._data.adddata(nodemeta.nodeindex, timestamp, datum)

        return True

    def results(self):
        if len(self._hostdata) != self._job.nodecount:
            return {"error": ProcessingError.INSUFFICIENT_HOSTDATA}

        values = self._data.get()

        if len(values[0, :, 0]) < 3:
            return {"error": ProcessingError.JOB_TOO_SHORT}

        power = values[:, :, 1]

        if len(self._hostdata) > 64:
            # Compute min, max & median data and only save the host data
            # for these hosts
            sortarr = numpy.argsort(power.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], power),
                "max": self.collatedata(sortarr[:, -1], power),
                "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], power),
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
            includelist = uniqhosts.keys()
        else:
            # Save data for all hosts
            retdata = {
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = power[hostidx, :].tolist()

        return retdata

    @staticmethod
    def collatedata(args, rates):
        """ build output data """
        result = []

        for timepoint, hostidx in enumerate(args):
            try:
                result.append([rates[hostidx, timepoint], int(hostidx)])
            except IndexError:
                pass

        return result
class CpuUserTimeseries(Plugin):
    """ Generate the CPU usage as timeseries data """

    name = property(lambda x: "cpuuser")
    mode = property(lambda x: "timeseries")
    requiredMetrics = property(lambda x: ["kernel.percpu.cpu.user"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(CpuUserTimeseries, self).__init__(job)
        self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime)
        self._hostdata = {}
        self._hostdevnames = {}
        self._cpusallowed = None

    def initcpus(self):
        if self._job.getdata('proc'):
            self._cpusallowed = self._job.getdata('proc')['cpusallowed']
        else:
            self._cpusallowed = {}

    def process(self, nodemeta, timestamp, data, description):
        if self._cpusallowed is None:
            self.initcpus()

        if len(data[0]) == 0:
            # Skip datapoints that have no values
            return True

        if nodemeta.nodename in self._cpusallowed and 'error' not in self._cpusallowed[nodemeta.nodename]:
            cpudata = data[0][self._cpusallowed[nodemeta.nodename]]
        else:
            cpudata = data[0]

        hostidx = nodemeta.nodeindex

        if nodemeta.nodeindex not in self._hostdata:
            self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(cpudata)))
            if nodemeta.nodename in self._cpusallowed and 'error' not in self._cpusallowed[nodemeta.nodename]:
                self._hostdevnames[hostidx] = {}
                for i, cpuidx in enumerate(self._cpusallowed[nodemeta.nodename]):
                    self._hostdevnames[hostidx][str(i)] = description[0][1][cpuidx]
            else:
                self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[0][0], description[0][1]))

        insertat = self._data.adddata(hostidx, timestamp, numpy.mean(cpudata) / 10.0)
        if insertat is not None:
            self._hostdata[hostidx][insertat] = cpudata / 10.0

        return True

    def results(self):
        values = self._data.get()

        if len(values[0, :, 0]) < 3:
            return {"error": ProcessingError.JOB_TOO_SHORT}

        rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0])

        if len(self._hostdata) > 64:
            # Compute min, max & median data and only save the host data
            # for these hosts
            sortarr = numpy.argsort(rates.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], rates),
                "max": self.collatedata(sortarr[:, -1], rates),
                "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], rates),
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
            includelist = uniqhosts.keys()
        else:
            # Save data for all hosts
            retdata = {
                "times": values[0, 1:, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist()
            retdata['hosts'][str(hostidx)]['dev'] = {}

            for devid in self._hostdevnames[hostidx].iterkeys():
                dpnts = len(values[hostidx, :, 0])
                retdata['hosts'][str(hostidx)]['dev'][devid] = (numpy.diff(self._hostdata[hostidx][:dpnts, int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist()

            retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx]

        return retdata

    @staticmethod
    def collatedata(args, rates):
        """ build output data """
        result = []

        for timepoint, hostidx in enumerate(args):
            try:
                result.append([rates[hostidx, timepoint], int(hostidx)])
            except IndexError:
                pass

        return result
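# Illustrative only: how the cpusallowed filtering in CpuUserTimeseries.process()
# restricts the per-CPU counters to the CPUs assigned to the job. The counter
# values and the allowed-CPU list below are made-up example data.
import numpy

_example_percpu = numpy.array([110.0, 250.0, 30.0, 270.0])  # counters for CPUs 0-3
_example_allowed = [1, 3]                                    # CPUs assigned to this job

# Fancy indexing selects only the allowed CPUs, so the mean (and the per-device
# series) reflect the job's cpuset rather than the whole node.
_example_filtered = _example_percpu[_example_allowed]        # -> array([250., 270.])
_example_mean = numpy.mean(_example_filtered) / 10.0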
class MemUsageTimeseries(Plugin):
    """ Generate the memory usage as timeseries data """

    name = property(lambda x: "memused_minus_diskcache")
    mode = property(lambda x: "timeseries")
    requiredMetrics = property(lambda x: ["mem.numa.util.used", "mem.numa.util.filePages", "mem.numa.util.slab"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(MemUsageTimeseries, self).__init__(job)
        self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime)
        self._hostdata = {}
        self._hostdevnames = {}

    def process(self, nodemeta, timestamp, data, description):
        hostidx = nodemeta.nodeindex

        if len(data[0]) == 0:
            # Skip data point with no data
            return True

        if nodemeta.nodeindex not in self._hostdata:
            self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0])))
            self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[0][0], description[0][1]))

        nodemem_kb = numpy.sum(data[0]) - numpy.sum(data[1]) - numpy.sum(data[2])
        insertat = self._data.adddata(hostidx, timestamp, nodemem_kb / 1048576.0)
        if insertat is not None:
            self._hostdata[hostidx][insertat] = (data[0] - data[1] - data[2]) / 1048576.0

        return True

    def results(self):
        values = self._data.get()

        if len(self._hostdata) > 64:
            # Compute min, max & median data and only save the host data
            # for these hosts
            memdata = values[:, :, 1]
            sortarr = numpy.argsort(memdata.T, axis=1)

            retdata = {
                "min": self.collatedata(sortarr[:, 0], memdata),
                "max": self.collatedata(sortarr[:, -1], memdata),
                "med": self.collatedata(sortarr[:, sortarr.shape[1] / 2], memdata),
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }

            uniqhosts = Counter(sortarr[:, 0])
            uniqhosts.update(sortarr[:, -1])
            uniqhosts.update(sortarr[:, sortarr.shape[1] / 2])
            includelist = uniqhosts.keys()
        else:
            # Save data for all hosts
            retdata = {
                "times": values[0, :, 0].tolist(),
                "hosts": {}
            }
            includelist = self._hostdata.keys()

        for hostidx in includelist:
            retdata['hosts'][str(hostidx)] = {}
            retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist()
            retdata['hosts'][str(hostidx)]['dev'] = {}

            for devid in self._hostdevnames[hostidx].iterkeys():
                dpnts = len(values[hostidx, :, 0])
                retdata['hosts'][str(hostidx)]['dev'][devid] = self._hostdata[hostidx][:dpnts, int(devid)].tolist()

            retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx]

        return retdata

    @staticmethod
    def collatedata(args, rates):
        """ build output data """
        result = []

        for timepoint, hostidx in enumerate(args):
            try:
                result.append([rates[hostidx, timepoint], int(hostidx)])
            except IndexError:
                pass

        return result
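# Unit handling note (an assumption about the source metrics, not stated in the
# code): the mem.numa.util.* metrics are normally reported by PCP in kibibytes,
# so MemUsageTimeseries divides by 1048576.0 (KiB per GiB) to report gibibytes,
# while the cgroup memory plugins divide byte counters by 1073741824.0 (bytes
# per GiB).
_KIB_PER_GIB = 1048576.0
_BYTES_PER_GIB = 1073741824.0
assert _KIB_PER_GIB * 1024 == _BYTES_PER_GIB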