def job_params(self, job, firsttime=True):
    """Request the Slurm REST API of the cluster to get job params.

    Raises IndexError if the job is not found, or ValueError if the API
    sends badly formatted JSON data.
    """
    profiler = Profiler()
    profiler.start('slurm_auth')
    self.ensure_auth()
    profiler.stop('slurm_auth')
    url = "{base}/job/{job}".format(base=self.base_url, job=job)
    try:
        profiler.start('slurm_req')
        if self.auth_enabled is True:
            payload = {'token': self.auth_token}
            resp = requests.post(url=url, json=payload)
        else:
            resp = requests.post(url=url)
        profiler.stop('slurm_req')
    except ConnectionError as err:
        # reformat the exception
        raise ValueError("connection error while trying to connect to "
                         "{url}: {error}".format(url=url, error=err))
def job_params(self, job, firsttime=True):
    """Request the Slurm REST API of the cluster to get job params.

    Raises IndexError if the job is not found, or ValueError if the API
    sends badly formatted JSON data.
    """
    profiler = Profiler()
    profiler.start('slurm_auth')
    self.ensure_auth()
    profiler.stop('slurm_auth')
    url = "{base}/job/{job}".format(base=self.base_url, job=job)
    try:
        profiler.start('slurm_req')
        if self.auth_enabled is True:
            headers = {'Authorization': "Bearer %s" % self.auth_token}
            resp = requests.get(url=url, headers=headers,
                                verify=self.ca_filepath)
        else:
            resp = requests.get(url=url, verify=self.ca_filepath)
        profiler.stop('slurm_req')
    except ConnectionError as err:
        # reformat the exception
        raise ValueError("connection error while trying to connect to "
                         "{url}: {error}".format(url=url, error=err))
    if resp.status_code == 404:
        raise IndexError("job ID {jobid} not found in API {api}".format(
            jobid=job, api=self.base_url))
    if resp.status_code == 403:
        logger.debug("Error 403 received: %s", resp.content)
        if firsttime:
            # We probably got this error because of an invalidated token.
            # Invalidate the cache, trigger check_auth() and call this
            # method again.
            logger.info("token in cache invalidated")
            self.auth_token = None
            self.auth_enabled = None
            self.cache.invalidate()
            return self.job_params(job, firsttime=False)
        else:
            # We have already tried twice. This means the app is not able
            # to authenticate on the slurm-web API with the current
            # params. Just throw the error and give up here.
            raise Exception(
                "got 403/forbidden from {url} with new token".format(
                    url=self.base_url))
    try:
        json_job = json.loads(resp.text)
    except ValueError:
        # reformat the exception
        raise ValueError("no JSON data for GET {url}".format(url=url))
    return json_job
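# A minimal usage sketch for job_params() above, assuming it is a SlurmAPI
# method as its use of self.base_url suggests. The Conf, Cache and SlurmAPI
# constructor signatures are taken from the metrics() view below; the
# cluster name and job ID are illustrative, not from the source.
def example_job_params():
    conf = Conf()
    cache = Cache(conf.cache_path)
    api = SlurmAPI(conf, 'cluster1', cache.get('cluster1'))
    try:
        # retries once with a fresh token on 403, then gives up
        return api.job_params(1234)
    except IndexError as err:
        print("job not found: %s" % err)
    except ValueError as err:
        print("API error: %s" % err)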
def metrics(cluster, jobid, period):
    if 'JOBMETRICS_CONF_FILE' in request.environ:
        conf = Conf(request.environ['JOBMETRICS_CONF_FILE'])
    elif 'JOBMETRICS_CONF_FILE' in os.environ:
        conf = Conf(os.environ.get('JOBMETRICS_CONF_FILE'))
    else:
        conf = Conf()
    init_logger(conf)
    cache = Cache(conf.cache_path)
    cluster_cache = cache.get(cluster)
    slurm_api = SlurmAPI(conf, cluster, cluster_cache)
    profiler = Profiler()
    job = JobParams(jobid)
    app.logger.info("GET cluster %s jobid %d" % (cluster, jobid))
    try:
        job.request_params(slurm_api)
    except IndexError as err:
        # IndexError here means the job is unknown according to the Slurm
        # API. Return 404 with an error message.
        abort(404, {'error': str(err)})
    except (ValueError, ConnectionError, Exception) as err:
        # ValueError means the Slurm API responded with something that was
        # not JSON formatted. ConnectionError means there was a problem
        # while connecting to the Slurm API. Return 500 with an error
        # message.
        abort(500, {'error': str(err)})
    # Write the cache at this point since it will not be modified after.
    cache.write()
    # Check the period given in parameter is valid. If not, return 500.
    if period not in periods:
        abort(500, {'error': "period %s is not valid" % period})
    try:
        db = MetricsDB(conf)
        job_data = JobData(cluster, job, period)
        job_data.request(db)
        resp = {}
        resp['data'] = job_data.dump()
        resp['debug'] = profiler.dump()
        return jsonify(resp)
    except Exception as err:
        app.logger.exception(err)
        abort(500, {'error': str(err)})
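# A hedged sketch of exercising the metrics() view with Flask's built-in
# test client. The /metrics/<cluster>/<jobid>/<period> route is an
# assumption inferred from the view signature, not confirmed by the
# source; '1h' reuses a period value seen elsewhere in this code.
def example_metrics_request():
    with app.test_client() as client:
        resp = client.get('/metrics/cluster1/1234/1h')
        # 200 with {'data': ..., 'debug': ...} on success,
        # 404 for an unknown job, 500 for an invalid period
        print(resp.status_code)
        if resp.status_code == 200:
            print(resp.get_json()['data'])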
def metrics(cluster, jobid, period):
    conf = Conf()
    init_logger(conf)
    cache = Cache(conf.cache_path)
    cluster_cache = cache.get(cluster)
    slurm_api = SlurmAPI(conf, cluster, cluster_cache)
    profiler = Profiler()
    job = JobParams(jobid)
    app.logger.info("GET cluster %s jobid %d" % (cluster, jobid))
    try:
        job.request_params(slurm_api)
    except IndexError as err:
        # IndexError here means the job is unknown according to the Slurm
        # API. Return 404 with an error message.
        abort(404, {'error': str(err)})
def request(self, db):
    (self.metrics, self.nodeset) = \
        db.get_metrics_results(self.cluster,
                               self.job,
                               ['cpus', 'cpu-user', 'cpu-system',
                                'memory-pss'],
                               self.period)
    self.stack_cpu_idle()
    profiler = Profiler()
    profiler.meta('producers', str(self.nodeset))
    profiler.meta('nodes', str(self.job.nodeset))
    profiler.meta('mutes', str(self.job.nodeset - self.nodeset))
def request(self, db):
    (self.metrics, self.nodeset) = \
        db.get_metrics_results(self.cluster,
                               self.job,
                               ['cpu-system', 'cpu-iowait', 'cpu-user',
                                'cpu-softirq', 'cpu-idle', 'memory-pss',
                                'memory-rss', 'utilization_gpu',
                                'utilization_memory', 'cpus'],
                               self.period)
    # self.stack_cpu_idle()
    profiler = Profiler()
    profiler.meta('producers', str(self.nodeset))
    profiler.meta('nodes', str(self.job.nodeset))
    profiler.meta('mutes', str(self.job.nodeset - self.nodeset))
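# An illustration of the 'mutes' metadata computed above: ClusterShell's
# NodeSet supports set difference, so job.nodeset - nodeset yields the
# nodes allocated to the job that produced no metrics. Node names here
# are made up for the example.
from ClusterShell.NodeSet import NodeSet
allocated = NodeSet('cn[1-4]')   # nodes allocated to the job
producers = NodeSet('cn[1-3]')   # nodes found in the InfluxDB series
print(allocated - producers)     # -> cn4, the mute node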
def get_metrics_results(self, cluster, job, metrics, period):
    """Get the metrics of the job on the cluster for the period given in
    parameters. It sends an HTTP request to the InfluxDB service to
    download the metric values in JSON format and returns a
    (results, nodeset) tuple.
    """
    time_group = periods[period]
    profiler = Profiler()
    metrics_s = "\"" + "\", \"".join(metrics) + "\""
    req = "select mean(value) from {metrics} " \
          "where time > now() - {period} " \
          "and cluster = '{cluster}' " \
          "and job = 'job_{job}' " \
          "group by time({time_group}), node fill(0)" \
          .format(metrics=metrics_s,
                  period=period,
                  cluster=cluster,
                  job=job.jobid,
                  time_group=time_group)
    profiler.meta('metrics_req', req)
    payload = {'db': self.db, 'q': req, 'epoch': 'ms'}
    profiler.start('metrics_req')
    resp = requests.get(url=self.url, params=payload)
    profiler.stop('metrics_req')
    if resp.status_code == 404:
        raise LookupError("metrics not found for job {job} on cluster "
                          "{cluster}".format(job=job.jobid,
                                             cluster=cluster))
    profiler.start('metrics_proc')
    data = json.loads(resp.text)
    # data is a dict with a 'results' key that is itself a list of dicts
    # with a 'series' key that is as well a list of dicts, one dict per
    # metric/node association. Each dict has its own list of values. We
    # have to compute the sum of the values of all nodes at every
    # timestamp, for each metric.
    #
    # Ex:
    #
    # { "results": [
    #   { "series": [
    #     { "name": "cpu-system",
    #       "tags": {"node":"cn1"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ...
    #         ["2015-10-16T12:37:10Z",0],
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #     { "name": "cpu-system",
    #       "tags": {"node":"cn2"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ["2015-10-16T11:37:30Z",0],
    #         ...
    #         ["2015-10-16T12:37:10Z",0],
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then cpu-system for cn3 ...)
    #
    #     { "name": "cpu-user",
    #       "tags": {"node":"cn1"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ["2015-10-16T11:37:30Z",0],
    #         ...
    #         ["2015-10-16T12:37:10Z",0],
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then cpu-user for cn[2-3] ...)
    #
    #     { "name": "cpus",
    #       "tags": {"node":"admin"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ["2015-10-16T11:37:30Z",0],
    #         ...
    #         ["2015-10-16T12:37:10Z",6],
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #     { "name": "memory-pss",
    #       "tags": {"node":"cn1"},
    #       "columns": ["time","mean"],
    #       "values": [
    #         ["2015-10-16T11:37:20Z",0],
    #         ["2015-10-16T11:37:30Z",0],
    #         ...
    #         ["2015-10-16T12:37:10Z",0],
    #         ["2015-10-16T12:37:20Z",0]
    #       ]
    #     },
    #
    #     ( ... then memory-pss for cn[2-3] ...)
    #
    #   ]}
    # ]}
    series = data['results'][0]['series']
    results = {}
    nodeset = NodeSet()
    for serie in series:
        metric = serie['name']
        node = serie['tags']['node'].encode('utf-8')
        if node not in nodeset:
            nodeset.update(node)
        for pair in serie['values']:
            timestamp = str(pair[0])
            value = pair[1]
            if timestamp not in results:
                results[timestamp] = list()
                for xidx in range(len(metrics)):
                    if xidx == metrics.index(metric):
                        results[timestamp].append(value)
                    else:
                        results[timestamp].append(0)
            else:
                # The cpus/nodes metrics can be produced by several batch
                # servers and thus returned multiple times by the InfluxDB
                # server in the result of the request. We must take care
                # not to add the multiple results of this metric here!
                if metric in ['cpus', 'nodes']:
                    results[timestamp][metrics.index(metric)] = value
                else:
                    results[timestamp][metrics.index(metric)] += value
    profiler.stop('metrics_proc')
    return (results, nodeset)
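# A self-contained sketch of the aggregation loop above, run on mock
# series: values are summed across nodes per timestamp, except
# 'cpus'/'nodes', which every producer reports in full and which are
# therefore overwritten instead of added. All data here is made up.
mock_metrics = ['cpus', 'cpu-user']
mock_series = [
    {'name': 'cpu-user', 'tags': {'node': 'cn1'}, 'values': [['t0', 10]]},
    {'name': 'cpu-user', 'tags': {'node': 'cn2'}, 'values': [['t0', 20]]},
    {'name': 'cpus', 'tags': {'node': 'cn1'}, 'values': [['t0', 4]]},
    {'name': 'cpus', 'tags': {'node': 'cn2'}, 'values': [['t0', 4]]},
]
mock_results = {}
for serie in mock_series:
    for timestamp, value in serie['values']:
        row = mock_results.setdefault(timestamp, [0] * len(mock_metrics))
        idx = mock_metrics.index(serie['name'])
        if serie['name'] in ['cpus', 'nodes']:
            row[idx] = value   # same total reported by each producer
        else:
            row[idx] += value  # per-node shares are summed
print(mock_results)  # {'t0': [4, 30]}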
def get_metrics_results(self, cluster, job, metrics, period):
    """Get the metrics of the job on the cluster for the period given in
    parameters. It sends an HTTP request to the InfluxDB service to
    download the metric values in JSON format and returns a
    (results, nodeset) tuple.
    """
    timejob = job.end_time - job.start_time
    logger.debug("time job: %d", timejob)
    if timejob < 3600:
        period = "1h"
    elif timejob < 21600:
        period = "6h"
    time_group = periods[period]
    profiler = Profiler()
    metrics_s = "\"" + "\", \"".join(metrics) + "\""
    req = "select mean(value) from {metrics} " \
          "where cluster = '{cluster}' " \
          "and (( job = 'job_{job}' and time > now() - {period} ) or" \
          " ( job = 'none' and plugin = 'cuda' " \
          "and time >= {start_time}000000000 " \
          "and time <= {end_time}000000000 " \
          "and node = '{nodes}' )) " \
          "group by time({time_group}), node fill(0)" \
          .format(metrics=metrics_s,
                  period=period,
                  cluster=cluster,
                  job=job.jobid,
                  nodes=job.nodeset,
                  start_time=job.start_time,
                  end_time=job.end_time,
                  time_group=time_group)
    logger.debug("req influx: %s", req)
    profiler.meta('metrics_req', req)
    payload = {'db': self.db, 'q': req, 'epoch': 'ms'}
    profiler.start('metrics_req')
    resp = requests.get(url=self.url, params=payload)
    profiler.stop('metrics_req')
    if resp.status_code == 404:
        raise LookupError("metrics not found for job {job} on cluster "
                          "{cluster}".format(job=job.jobid,
                                             cluster=cluster))
    profiler.start('metrics_proc')
    json_data = json.loads(resp.text)
    # json_data has the same layout as the example shown in the previous
    # version of this method above: a dict with a 'results' key that is
    # itself a list of dicts with a 'series' key, one series dict per
    # metric/node association, each with its own list of values. We have
    # to compute the sum of the values of all nodes at every timestamp,
    # for each metric.
    results = {}
    nodeset = NodeSet()
    for result in json_data['results']:
        if 'series' in result:
            series = result['series']
        else:
            logger.warning("No series in one result for query: %s", req)
            series = []
        for serie in series:
            metric = serie['name']
            node = serie['tags']['node'].encode('utf-8')
            if node not in nodeset:
                nodeset.update(node)
            for pair in serie['values']:
                timestamp = str(pair[0])
                value = pair[1]
                if timestamp not in results:
                    # init all values for this timestamp to 0
                    results[timestamp] = [0] * len(metrics)
                # The cpus/nodes metrics can be produced by several batch
                # servers and thus returned multiple times by the InfluxDB
                # server in the result of the request. We must take care
                # not to add the multiple results of this metric here!
                if metric in ['cpus', 'nodes']:
                    results[timestamp][metrics.index(metric)] = value
                else:
                    results[timestamp][metrics.index(metric)] += value
    profiler.stop('metrics_proc')
    return (results, nodeset)
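# The '{start_time}000000000' padding in the query above converts Slurm's
# epoch-second timestamps to the epoch-nanosecond precision InfluxQL
# expects for bare integer time bounds; a quick check with an illustrative
# timestamp:
start_time = 1445000000            # epoch seconds (illustrative)
print('%d000000000' % start_time)  # 1445000000000000000, epoch nanoseconds
print(start_time * 10**9)          # same value, as arithmetic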