def getJobs(self, cont, schema_name, start, end):
    """Retrieve the unique job_ids present in a schema over a time range"""
    schema = cont.schema_by_name(schema_name)
    attr = schema.attr_by_name("job_id")
    if attr is None:
        return None
    src = SosDataSource()
    src.config(cont=cont)
    where_ = []
    where_.append([ 'job_id', Sos.COND_GT, 1 ])
    where_.append([ 'timestamp', Sos.COND_GT, start ])
    if end > 0:
        where_.append([ 'timestamp', Sos.COND_LE, end ])
    src.select([ 'job_id' ],
               from_ = [ schema_name ],
               where = where_,
               order_by = 'time_job_comp')
    jobs = src.get_df(limit=8128)
    if jobs is None:
        return {0}
    job_ids = np.unique(jobs['job_id'])
    result = {}
    for job_id in job_ids:
        result[str(int(job_id))] = int(job_id)
    return result

def getComponents(self, cont, schema_name, start, end):
    """Retrieve the unique component_ids present in a schema over a time range"""
    schema = cont.schema_by_name(schema_name)
    attr = schema.attr_by_name("component_id")
    if attr is None:
        return {0}
    src = SosDataSource()
    src.config(cont=cont)
    where = []
    if start > 0:
        where.append([ 'timestamp', Sos.COND_GE, start ])
    if end > 0:
        where.append([ 'timestamp', Sos.COND_LE, end ])
    src.select([ 'component_id' ],
               from_ = [ schema_name ],
               where = where,
               order_by = 'timestamp')
    comps = src.get_df()
    if comps is None:
        return {0}
    comp_ids = np.unique(comps['component_id'])
    result = {}
    for comp_id in comp_ids:
        result[str(int(comp_id))] = int(comp_id)
    return result

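# Hypothetical usage sketch for the two discovery helpers above. It assumes an
# already-open SOS container `cont` and UNIX-timestamp bounds `start`/`end`
# (none of which are defined in this file); `query` stands in for whatever
# object these methods belong to.
#
#     jobs = query.getJobs(cont, 'meminfo', start, end)        # e.g. { '1234' : 1234, ... }
#     comps = query.getComponents(cont, 'meminfo', start, end) # e.g. { '10' : 10, '11' : 11, ... }
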
class Analysis(object):
    def __init__(self, cont, start, end, schema=None, maxDataPoints=4096):
        self.cont = cont
        self.schema = schema
        self.start = start
        self.end = end
        self.src = SosDataSource()
        self.src.config(cont=self.cont)
        self.mdp = maxDataPoints

    def parse_params(self, params):
        # Simple parameter-string parser. It assumes params carries at most one
        # 'key=value' token (e.g. 'threshold=10' or 'bins=30') plus optional
        # flag words such as 'idle', 'summary' or 'meta'.
        if params is None:
            self.threshold = 5
            self.idle = False
            self.summary = False
            self._meta = False
            self.bins = 20
            return
        if 'threshold' in params:
            self.threshold = int(params.split('=')[1])
        else:
            self.threshold = 5
        if 'idle' in params:
            self.idle = True
        else:
            self.idle = False
        if 'summary' in params:
            self.summary = True
        else:
            self.summary = False
        if 'meta' in params:
            self._meta = True
        else:
            self._meta = False
        if 'bins' in params:
            self.bins = int(params.split('=')[1])
        else:
            self.bins = 20

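# Hypothetical illustration of parse_params() under the single 'key=value'
# assumption noted above; `cont`, `start` and `end` are assumed to exist and
# are not defined in this file.
#
#     a = Analysis(cont, start, end, schema='meminfo')
#     a.parse_params('threshold=10')   # threshold=10, idle/summary/meta False, bins=20
#     a.parse_params('bins=30')        # threshold=5 (default), bins=30
#     a.parse_params(None)             # all defaults
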
class SHA256_Mapper:
    def __init__(self, cont):
        """Implements a SHA256 ---> String mapping service

        kernel_names and job_tag are stored as SHA256 hash values
        because the associated strings can be very large
        """
        self.src = SosDataSource()
        self.src.config(cont=cont)

    def string(self, sha256):
        self.src.select([ '*' ],
                        from_ = [ 'sha256_string' ],
                        where = [
                            [ 'sha256', Sos.COND_EQ, sha256 ],
                        ],
                        order_by = 'sha256')
        res = self.src.get_results(limit=1)
        if res:
            return res.array('string')[0]
        return ""

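# Hypothetical usage sketch: look up the human-readable string behind a stored
# SHA256 digest. `cont` (an open SOS container) and `digest` (a SHA256 value
# taken from a kernel_name or job_tag attribute) are assumed, not defined here.
#
#     mapper = SHA256_Mapper(cont)
#     kernel_name = mapper.string(digest)   # returns "" if the digest is unknown
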
def getJobComponents(self, job_id):
    """Get components for a particular job"""
    src = SosDataSource()
    src.config(cont=self.cont)
    src.select([ 'component_id' ],
               from_ = [ self.schemaName ],
               where = [ [ 'job_id', Sos.COND_EQ, job_id ] ],
               order_by = 'job_comp_time')
    res = src.get_df(limit=4096)
    # get_df() returns a DataFrame; test it explicitly rather than relying on
    # truthiness, which is ambiguous for DataFrames
    if res is not None and not res.empty:
        ucomps = np.unique(res['component_id'])
        return ucomps
    return None

def getTable(self, index, metricNames, start, end):
    src = SosDataSource()
    src.config(cont=self.cont)
    if self.schemaName == 'kokkos_app':
        # kokkos_app records are keyed by start_time rather than timestamp
        src.select(metricNames,
                   from_ = [ self.schemaName ],
                   where = [
                       [ 'start_time', Sos.COND_GE, start ],
                   ],
                   order_by = 'time_job_comp')
    else:
        src.select(metricNames,
                   from_ = [ self.schemaName ],
                   where = [
                       [ 'timestamp', Sos.COND_GE, start ],
                       [ 'timestamp', Sos.COND_LE, end ]
                   ],
                   order_by = 'time_job_comp')
    res = src.get_results()
    return res

def getJobMarkers(self, start, end, jobId=None, compId=None):
    """Query Job Marker annotations

    Positional Parameters:
    -- The start of the date/time range
    -- The end of the date/time range

    Keyword Parameters:
    jobId  - Show only markers for the specified job
    compId - Show only markers for the specified component
    """
    src = SosDataSource()
    src.config(cont=self.cont)
    by = 'comp_time'
    if jobId is not None:
        # ignore the start/end time for the job markers
        jobId = int(jobId)
        where = [ [ 'job_id', Sos.COND_EQ, jobId ] ]
        by = 'job_rank_time'
    elif compId is not None:
        where = [
            [ 'component_id', Sos.COND_EQ, compId ],
            [ 'timestamp', Sos.COND_GE, start ],
            [ 'timestamp', Sos.COND_LE, end ],
        ]
    else:
        where = [
            [ 'timestamp', Sos.COND_GE, start ],
            [ 'timestamp', Sos.COND_LE, end ],
        ]
        by = 'time_job'
    src.select([ 'job_id', 'job_start', 'job_end', 'component_id' ],
               from_ = [ 'mt-slurm' ],
               where = where,
               order_by = by)
    x = Transform(src, None, limit=12384)
    res = x.begin()
    if not res:
        return res
    result = x.dup()
    x.min([ 'job_start' ], group_name='job_id',
          keep=[ 'component_id' ], xfrm_suffix='')
    result.concat(x.pop())
    x.max([ 'job_end' ], group_name='job_id',
          keep=[ 'component_id' ], xfrm_suffix='')
    result.concat(x.pop())
    # convert job start/end from seconds to milliseconds for the UI
    nda = result.array('job_start')
    nda *= 1000
    nda1 = result.array('job_end')
    nda1 *= 1000
    return result

def getComponents(self, start, end):
    """Return unique components with data for this schema"""
    src = SosDataSource()
    src.config(cont=self.cont)
    src.select([ 'component_id' ],
               from_ = [ self.schemaName ],
               where = [
                   [ 'timestamp', Sos.COND_GE, start ],
                   [ 'timestamp', Sos.COND_LE, end ]
               ],
               order_by = 'time_comp')
    res = src.get_results(limit=4096)
    if res:
        ucomps = np.unique(res['component_id'])
        return ucomps
    return None

def getJobCompEnd(self, job_id):
    """Get job end"""
    src = SosDataSource()
    src.config(cont=self.cont)
    src.select([ self.schemaName + '.*' ],
               from_ = [ self.schemaName ],
               where = [
                   [ 'job_id', Sos.COND_EQ, job_id ],
                   [ 'job_status', Sos.COND_EQ, 2 ]
               ],
               order_by = 'job_comp_time')
    res = src.get_results()
    if res is None:
        return None
    xfrm = Transform(src, None, limit=4096)
    res = xfrm.begin()
    xfrm.max([ 'job_end' ], group_name='component_id')
    comp_time = xfrm.pop()
    # label each component row with a sequential node index
    nodes = np.arange(comp_time.get_series_size())
    comp_time.append_array(comp_time.get_series_size(), 'node_id', nodes)
    return comp_time

def derived_metrics(self, job_id):
    """Calculate derived papi metrics for a given job_id"""
    try:
        self.derived_names = [ "tot_ins", "tot_cyc", "ld_ins", "sr_ins",
                               "br_ins", "fp_ops", "l1_icm", "l1_dcm",
                               "l2_ica", "l2_tca", "l2_tcm", "l3_tca",
                               "l3_tcm" ]
        src = SosDataSource()
        src.config(cont=self.cont)
        src.select([ 'PAPI_TOT_INS[timestamp]',
                     'PAPI_TOT_INS[component_id]',
                     'PAPI_TOT_INS[job_id]',
                     'PAPI_TOT_INS[rank]' ] + list(self.event_name_map.keys()),
                   from_=list(self.event_name_map.keys()),
                   where=[ [ 'job_id', Sos.COND_EQ, int(job_id) ] ],
                   order_by='job_rank_time')
        xfrm = Transform(src, None)
        res = xfrm.begin()
        if res is None:
            # Job was too short to record data
            return (None, None)
        while res is not None:
            res = next(xfrm)
            if res is not None:
                # concatenate TOP and TOP~1
                xfrm.concat()
        # result now on top of stack
        result = xfrm.pop()
        # "Normalize" the event names
        for name in self.event_name_map:
            result.rename(name, self.event_name_map[name])
        xfrm.push(result)
        job = xfrm.pop()
        # cpi = tot_cyc / tot_ins
        job <<= job['tot_cyc'] / job['tot_ins'] >> 'cpi'
        # memory accesses
        mem_acc = job['ld_ins'] + job['sr_ins'] >> 'mem_acc'
        # uopi = (ld_ins + sr_ins) / tot_ins
        job <<= mem_acc / job['tot_ins'] >> 'uopi'
        # l1_miss_rate = (l1_icm + l1_dcm) / tot_ins
        l1_tcm = job['l1_icm'] + job['l1_dcm']
        job <<= l1_tcm / job['tot_ins'] >> 'l1_miss_rate'
        # l1_miss_ratio = (l1_icm + l1_dcm) / (ld_ins + sr_ins)
        job <<= l1_tcm / mem_acc >> 'l1_miss_ratio'
        # l2_miss_rate = l2_tcm / tot_ins
        job <<= job['l2_tcm'] / job['tot_ins'] >> 'l2_miss_rate'
        # l2_miss_ratio = l2_tcm / mem_acc
        job <<= job['l2_tcm'] / mem_acc >> 'l2_miss_ratio'
        # l3_miss_rate = l3_tcm / tot_ins
        job <<= job['l3_tcm'] / job['tot_ins'] >> 'l3_miss_rate'
        # l3_miss_ratio = l3_tcm / mem_acc
        job <<= job['l3_tcm'] / mem_acc >> 'l3_miss_ratio'
        # l2_bandwidth = l2_tca * 64e-6
        job <<= job['l2_tca'] * 64e-6 >> 'l2_bw'
        # l3_bandwidth = l3_tca * 64e-6
        job <<= job['l3_tca'] * 64e-6 >> 'l3_bw'
        # floating point
        job <<= job['fp_ops'] / job['tot_ins'] >> 'fp_rate'
        # branch
        job <<= job['br_ins'] / job['tot_ins'] >> 'branch_rate'
        # load
        job <<= job['ld_ins'] / job['tot_ins'] >> 'load_rate'
        # store
        job <<= job['sr_ins'] / job['tot_ins'] >> 'store_rate'
        return xfrm, job
    except Exception as e:
        a, b, c = sys.exc_info()
        print('derived_metrics: Error: ' + str(e) + ' ' + str(c.tb_lineno))
        # keep the return shape consistent with the no-data path
        return None, None

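# Hypothetical sketch of consuming derived_metrics(); the owning analysis object
# is called `papi` here and `job_id` is assumed to exist. Neither is defined in
# this file.
#
#     xfrm, job = papi.derived_metrics(job_id)
#     if job is not None:
#         cpi = job.array('cpi')            # cycles per instruction, one value per sample
#         l2_miss = job.array('l2_miss_rate')
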
class metricRateBin(Analysis):
    def __init__(self, cont, start, end, schema='Lustre_Client', maxDataPoints=4096):
        self.schema = schema
        self.src = SosDataSource()
        self.src.config(cont=cont)
        self.start = start
        self.end = end
        self.mdp = maxDataPoints

    def get_data(self, metrics, job_id=0, user_id=0, params='bins=10'):
        self.bins = 10
        result = []
        datapoints = []
        time_range = self.end - self.start
        # pad the query window by 1% (at least one second) so the first
        # difference is not lost at the edges
        offset = time_range * .01
        if offset < 1:
            offset = 1
        where_ = [ [ 'timestamp', Sos.COND_GE, self.start - offset ],
                   [ 'timestamp', Sos.COND_LE, self.end + offset ] ]
        if job_id > 0:
            where_.append([ 'job_id', Sos.COND_EQ, job_id ])
        try:
            self.src.select(metrics + [ 'timestamp', 'component_id' ],
                            from_=[ self.schema ],
                            where=where_,
                            order_by='time_comp_job')
            inp = None
            # default for now is dataframe - will update with dataset vs dataframe option
            self.xfrm = Transform(self.src, None, limit=self.mdp)
            resp = self.xfrm.begin()
            if resp is None:
                print('resp == None')
                return None
            while resp is not None:
                resp = next(self.xfrm)
                if resp is not None:
                    self.xfrm.concat()
            self.xfrm.dup()
            data = self.xfrm.pop()
            # per-component first difference converts cumulative counters into deltas
            self.xfrm.diff(metrics, group_name="component_id",
                           keep=[ 'timestamp' ], xfrm_suffix='')
            data = self.xfrm.pop()
            hsum = None
            data_time = (data.array('timestamp')[-1].astype('float') -
                         data.array('timestamp')[0].astype('float'))
            data_time = data_time / 1000000
            if data_time < time_range:
                bins = int(data_time / time_range * 20)
                if bins < 2:
                    bins = 2
            else:
                bins = 20
            for met_diff in metrics:
                os = data.array(met_diff)
                h = np.histogram(data.array('timestamp').astype('float'),
                                 bins=bins, weights=os, density=False)
                if hsum is None:
                    ts = h[1][:-1] / 1000
                    hsum = np.zeros(h[0].shape)
                hsum += h[0]
            res = DataSet()
            res.append_array(len(hsum), str(metrics), hsum)
            res.append_array(len(ts), 'timestamp', ts)
            return res
        except Exception as e:
            a, b, c = sys.exc_info()
            print(str(e) + ' ' + str(c.tb_lineno))

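# Hypothetical usage sketch for metricRateBin; `cont`, `start`, `end` and a valid
# job id are assumed, and 'read_bytes' is a placeholder for a metric name that
# exists in the schema. None of these are defined in this file.
#
#     mrb = metricRateBin(cont, start, end, schema='Lustre_Client')
#     ds = mrb.get_data([ 'read_bytes' ], job_id=1234)
#     # ds is a DataSet with one binned series (named after the metric list)
#     # plus a 'timestamp' series
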
def papiGetLikeJobs(self, job_id, start, end):
    """Return jobs similar to requested job_id based on similar instance data"""
    try:
        src = SosDataSource()
        src.config(cont=self.cont)
        src.select([ 'inst_data' ],
                   from_ = [ 'kokkos_app' ],
                   where = [
                       [ 'job_id', Sos.COND_EQ, job_id ]
                   ],
                   order_by = 'job_comp_time')
        res = src.get_results()
        if res is None:
            return None
        result = {}
        jobData = SosDataSource()
        jobData.config(cont=self.cont)
        where = [ [ 'inst_data', Sos.COND_EQ, res.array('inst_data')[0] ] ]
        jobData.select([ 'job_id', 'user_id', 'job_name' ],
                       from_ = [ 'kokkos_app' ],
                       where = where,
                       order_by = 'inst_job_app_time',
                       unique = True)
        res = jobData.get_results()
        result["columns"] = [ { "text" : "Job Id" },
                              { "text" : "User Id" },
                              { "text" : "Name" } ]
        result["rows"] = res.tolist()
        result["type"] = "table"
        return [ result ]
    except Exception as e:
        a, b, c = sys.exc_info()
        log.write('PapiLikeJobs ' + str(e) + ' ' + str(c.tb_lineno))
        return None

def getCompTimeseries(self, compIds, metricNames, start, end,
                      intervalMs, maxDataPoints, jobId=0):
    """Return time series data for a particular component/s"""
    src = SosDataSource()
    src.config(cont=self.cont)
    if type(metricNames) != list:
        metricNames = [ metricNames ]
    result = []
    if compIds:
        if type(compIds) != list:
            compIds = [ int(compIds) ]
    elif jobId != 0:
        src.select([ 'component_id' ],
                   from_ = [ self.schemaName ],
                   where = [ [ 'job_id', Sos.COND_EQ, jobId ] ],
                   order_by = 'job_time_comp')
        comps = src.get_df(limit=maxDataPoints)
        if comps.empty:
            compIds = np.zeros(1)
        else:
            compIds = np.unique(comps['component_id'])
    else:
        src.select([ 'component_id' ],
                   from_ = [ self.schemaName ],
                   where = [
                       [ 'timestamp', Sos.COND_GE, start ],
                       [ 'timestamp', Sos.COND_LE, end ],
                   ],
                   order_by = 'time_comp_job')
        comps = src.get_df(limit=maxDataPoints)
        if comps.empty:
            compIds = np.zeros(1)
        else:
            compIds = np.unique(comps['component_id'])
    for comp_id in compIds:
        for metric in metricNames:
            if comp_id != 0:
                where_ = [ [ 'component_id', Sos.COND_EQ, comp_id ] ]
            else:
                where_ = []
            if jobId != 0:
                self.index = "job_comp_time"
                where_.append([ 'job_id', Sos.COND_EQ, int(jobId) ])
            else:
                self.index = "time_comp"
                where_.append([ 'timestamp', Sos.COND_GE, start ])
                where_.append([ 'timestamp', Sos.COND_LE, end ])
            src.select([ metric, 'timestamp' ],
                       from_ = [ self.schemaName ],
                       where = where_,
                       order_by = self.index)
            time_delta = end - start
            res = src.get_df(limit=100000)
            # check for an empty result before touching its columns
            if res is None:
                continue
            # convert timestamps to milliseconds for the UI
            res['timestamp'] = res['timestamp'].values.astype(np.int64) / int(1e6)
            result.append({ "target" : '[' + str(comp_id) + ']' + metric,
                            "datapoints" : res.to_numpy().tolist() })
    return result

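# Hypothetical illustration of getCompTimeseries() output; `query` stands in for
# the owning object, `start`/`end` are assumed UNIX timestamps, and 'Active' is a
# placeholder metric name from the schema.
#
#     series = query.getCompTimeseries([ 10, 11 ], 'Active', start, end,
#                                      intervalMs=1000, maxDataPoints=4096)
#     # series is a list of dicts shaped for Grafana, e.g.
#     # [ { "target" : "[10]Active", "datapoints" : [ [value, time_ms], ... ] }, ... ]
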
class compMinMeanMax(Analysis):
    def __init__(self, cont, start, end, schema='meminfo', maxDataPoints=4096):
        self.schema = schema
        self.src = SosDataSource()
        self.src.config(cont=cont)
        self.start = start
        self.end = end
        self.maxDataPoints = maxDataPoints

    def get_data(self, metric, job_id, user_id=0, params=None):
        metric = metric[0]
        if job_id == 0:
            return [ { 'target' : 'Error: Please specify valid job_id',
                       'datapoints' : [] } ]
        # Get components with data during given time range
        self.src.select([ 'component_id' ],
                        from_=[ self.schema ],
                        where=[ [ 'job_id', Sos.COND_EQ, job_id ],
                                [ 'timestamp', Sos.COND_GE, self.start - 300 ],
                                [ 'timestamp', Sos.COND_LE, self.end + 300 ] ],
                        order_by='job_time_comp')
        comps = self.src.get_results(limit=self.maxDataPoints)
        if not comps:
            return [ { 'target' : 'Error: component_id not found for Job ' + str(job_id),
                       'datapoints' : [] } ]
        else:
            compIds = np.unique(comps['component_id'].tolist())
        print(compIds)
        result = []
        datapoints = []
        time_range = self.end - self.start
        if time_range > 4096:
            bin_width = int(time_range // 200)
        else:
            bin_width = 1
        dfs = []
        for comp_id in compIds:
            where_ = [ [ 'component_id', Sos.COND_EQ, comp_id ],
                       [ 'job_id', Sos.COND_EQ, job_id ],
                       [ 'timestamp', Sos.COND_GE, self.start ],
                       [ 'timestamp', Sos.COND_LE, self.end ] ]
            self.src.select([ metric, 'timestamp' ],
                            from_=[ self.schema ],
                            where=where_,
                            order_by='job_comp_time')
            # default for now is dataframe - will update with dataset vs dataframe option
            res = self.src.get_df(limit=self.maxDataPoints, index='timestamp')
            if res is None:
                continue
            rs = res.resample(str(bin_width) + 'S').fillna("backfill")
            dfs.append(rs)
        df = pd.concat(dfs, axis=1, ignore_index=True)
        res_ = DataSet()
        min_datapoints = df.min(axis=1, skipna=True)
        mean_datapoints = df.mean(axis=1, skipna=True)
        max_datapoints = df.max(axis=1, skipna=True)
        res_ = pd.DataFrame({ "min_" + metric : min_datapoints.values,
                              "mean_" + metric : mean_datapoints.values,
                              "max_" + metric : max_datapoints.values,
                              "timestamp" : min_datapoints.index })
        return res_

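# Hypothetical usage sketch for compMinMeanMax; `cont`, `start`, `end` and a
# valid job id are assumed, and 'Active' is a placeholder metric name from the
# meminfo schema.
#
#     mmm = compMinMeanMax(cont, start, end, schema='meminfo')
#     df = mmm.get_data([ 'Active' ], job_id=1234)
#     # df columns: min_Active, mean_Active, max_Active, timestamp (one row per bin)
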
class lustreData(Analysis):
    def __init__(self, cont, start, end, schema='Lustre_Client', maxDataPoints=4096):
        self.start = start
        self.end = end
        self.schema = schema
        self.src = SosDataSource()
        self.src.config(cont=cont)
        self.job_metrics = [ 'mt-slurm[job_name]', 'mt-slurm[job_user]',
                             'mt-slurm[job_id]', 'mt-slurm[job_start]',
                             'mt-slurm[job_end]', 'mt-slurm[job_size]' ]
        self.where_ = [ [ 'job_id', Sos.COND_GT, 1 ] ]
        if self.start > 0:
            self.where_.append([ 'timestamp', Sos.COND_GE, self.start ])
        if self.end > 0:
            self.where_.append([ 'timestamp', Sos.COND_LE, self.end ])

    def get_data(self, metrics, job_id=None, user_id=0, params=None):
        self.user_id = user_id
        self.parse_params(params)
        res = self.get_lustre_avg(metrics)
        return res

    def _sum_metrics(self, metrics):
        '''Return the summed metric deltas per job (nd array aligned with
        self.xfrm.job_ids), or None on error'''
        try:
            self.src.select([ 'job_id', 'component_id' ] + metrics,
                            from_=[ self.schema ],
                            where=self.where_,
                            order_by='time_job_comp')
            self.xfrm = Xfrm(self.src, None)
            # set metrics in Xfrm class
            self.xfrm.set_metrics(metrics)
            resp = self.xfrm.begin()
            if resp is None:
                return None
            while resp is not None:
                resp = next(self.xfrm)
                if resp is not None:
                    self.xfrm.concat()
            self.xfrm.for_each(series_list=[ 'job_id' ], xfrm_fn=self.xfrm.job_diff)
            return self.xfrm.sum_
        except Exception as e:
            a, b, c = sys.exc_info()
            print(str(e) + ' ' + str(c.tb_lineno))
            return None

    def get_lustre_avg(self, metrics):
        try:
            sumbytes = self._sum_metrics(metrics)
            if sumbytes is None:
                return None
            ret_bps = []
            ret_jobs = []
            ret_name = []
            ret_start = []
            ret_end = []
            ret_user = []
            ret_state = []
            ret_size = []
            i = 0
            jids = self.xfrm.job_ids
            res = []
            while i < self.threshold:
                if len(sumbytes) < 1:
                    break
                index, val = max(enumerate(sumbytes), key=operator.itemgetter(1))
                where_ = [ [ 'job_id', Sos.COND_EQ, jids[index] ] ]
                if self.user_id != 0:
                    where_.append([ 'uid', Sos.COND_EQ, self.user_id ])
                self.src.select(self.job_metrics,
                                from_=[ 'mt-slurm' ],
                                where=where_,
                                order_by='job_rank_time')
                job = self.src.get_results()
                res.append(job)
                if job is None:
                    sumbytes = np.delete(sumbytes, index)
                    jids = np.delete(jids, index)
                    continue
                job_start = np.min(job.array('job_start'))
                if job.array('job_end')[0] < 1:
                    # job has not ended yet
                    job_end = time.time()
                    ret_end.append(job_end * 1000)
                    ret_state.append("In process")
                else:
                    job_end = np.max(job.array('job_end'))
                    ret_end.append(job_end * 1000)
                    ret_state.append("Completed")
                ret_bps.append(val / (job_end - job_start))
                ret_jobs.append(job.array('job_id')[0])
                ret_size.append(job.array('job_size')[0])
                ret_name.append(job.array('job_name')[0].decode())
                ret_start.append(job_start * 1000)
                ret_user.append(job.array('job_user')[0].decode())
                # remove job with highest bps from list of jobs
                sumbytes = np.delete(sumbytes, index)
                jids = np.delete(jids, index)
                i += 1
            res_ = DataSet()
            if not self._meta:
                res_.append_array(len(ret_bps), 'bps', ret_bps)
            else:
                res_.append_array(len(ret_bps), 'ios', ret_bps)
            res_.append_array(len(ret_jobs), 'job_id', ret_jobs)
            res_.append_array(len(ret_size), 'ranks', ret_size)
            res_.append_array(len(ret_name), 'job_name', ret_name)
            res_.append_array(len(ret_user), 'job_user', ret_user)
            res_.append_array(len(ret_start), 'job_start', ret_start)
            res_.append_array(len(ret_end), 'job_end', ret_end)
            res_.append_array(len(ret_state), 'job_state', ret_state)
            return res_
        except Exception as e:
            a, b, c = sys.exc_info()
            print(str(e) + ' ' + str(c.tb_lineno))
            return None

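# Hypothetical usage sketch for lustreData: rank the top N jobs by Lustre
# bytes/sec. `cont`, `start` and `end` are assumed, and the metric names are
# placeholders for counters present in the Lustre_Client schema.
#
#     lus = lustreData(cont, start, end)
#     top = lus.get_data([ 'read_bytes', 'write_bytes' ], params='threshold=5')
#     # top is a DataSet with bps, job_id, ranks, job_name, job_user,
#     # job_start, job_end and job_state series for the top 5 jobs
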
class meanMetricRate(Analysis):
    def __init__(self, cont, start, end, schema='Lustre_Client', maxDataPoints=4096):
        self.schema = schema
        self.src = SosDataSource()
        self.src.config(cont=cont)
        self.start = start
        self.end = end
        self.maxDataPoints = maxDataPoints

    def get_data(self, metrics, job_id=0, user_id=0, params=None):
        result = []
        datapoints = []
        where_ = [ [ 'timestamp', Sos.COND_GE, self.start ],
                   [ 'timestamp', Sos.COND_LE, self.end ] ]
        self.src.select(metrics + [ 'timestamp' ],
                        from_=[ self.schema ],
                        where=where_,
                        order_by='time_comp_job')
        inp = None
        # default for now is dataframe - will update with dataset vs dataframe option
        res = self.src.get_df()
        if res is None:
            return None
        mets = res.drop(res.tail(1).index)
        mets = mets.mean()
        time_range = self.end - self.start
        if time_range > 4096:
            bin_width = int(time_range / 200)
        else:
            bin_width = 1
        start_d = dt.datetime.utcfromtimestamp(self.start).strftime('%m/%d/%Y %H:%M:%S')
        end_d = dt.datetime.utcfromtimestamp(self.end).strftime('%m/%d/%Y %H:%M:%S')
        ts = pd.date_range(start=start_d, end=end_d, periods=len(mets.values))
        series = pd.DataFrame(mets.values, index=ts, dtype=float)
        rs = series.resample(str(bin_width) + 'S').mean()
        dps = rs.values.flatten()
        if len(dps) > 1:
            dps = np.diff(dps)
        tstamp = rs.index
        i = 0
        tstamps = []
        if len(tstamp) > 1:
            x = 1
        else:
            x = 0
        while i < len(tstamp[x:]):
            ts = pd.Timestamp(tstamp[i])
            ts = np.int_(ts.timestamp() * 1000)
            tstamps.append(ts)
            i += 1
        res_ = DataSet()
        res_.append_array(len(dps), str(metrics) + " Rate", dps)
        res_.append_array(len(tstamps), 'timestamp', tstamps)
        return res_

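# Hypothetical usage sketch for meanMetricRate; `cont`, `start` and `end` are
# assumed, and 'read_bytes' is a placeholder metric name from the schema.
#
#     mmr = meanMetricRate(cont, start, end, schema='Lustre_Client')
#     rate = mmr.get_data([ 'read_bytes' ])
#     # rate is a DataSet with a "['read_bytes'] Rate" series and a 'timestamp' series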