def _topology_deatails_loader(self, ids):
    """Queue Storm topology, spout and bolt metrics for each topology id.

    For every id in ``ids`` the ``topology_details`` REST path is requested
    with ``window=600``. One datapoint per configured metric is queued for
    each ``topologyStats`` entry whose ``window`` equals ``'600'``, for each
    spout and for each bolt. If anything raises, a ``storm.state`` datapoint
    with value 1 is queued, the exception is logged and the remaining ids
    are not processed.
    """
    try:
        for topology_id in ids:
            details_url = '%s%s?%s' % (
                REST_API["topology_details"], topology_id, "window=600")
            details = self.request(details_url)
            sampled_at = time.time()
            if not details:
                continue

            for stats in details['topologyStats']:
                for stat_name in TOPOLOGY_DETAILS['topologyStats']:
                    # Only the 600 second window is reported.
                    if stats['window'] != '600':
                        continue
                    point = 'storm.topology.topologyStats.%s %d %d topology_id=%s' % (
                        stat_name, sampled_at, stats[stat_name],
                        utils.remove_invalid_characters(topology_id))
                    self._readq.nput(point)

            for spout in details['spouts']:
                for stat_name in TOPOLOGY_DETAILS['spouts']:
                    point = 'storm.topology.spouts.%s %d %d id=%s topology_id=%s' % (
                        stat_name, sampled_at, spout[stat_name],
                        spout['spoutId'],
                        utils.remove_invalid_characters(topology_id))
                    self._readq.nput(point)

            for bolt in details['bolts']:
                for stat_name in TOPOLOGY_DETAILS['bolts']:
                    point = 'storm.topology.bolts.%s %d %d id=%s topology_id=%s' % (
                        stat_name, sampled_at, bolt[stat_name],
                        bolt['boltId'],
                        utils.remove_invalid_characters(topology_id))
                    self._readq.nput(point)
    except Exception as e:
        self._readq.nput("storm.state %s %s" % (int(time.time()), '1'))
        self.log_exception(
            'exception collecting storm topology details metric \n %s' % e)
def parse(self, json_dict, readq, port):
    """Forward every non-reserved MBean value in ``json_dict`` to ``_process``.

    ``json_dict`` must carry ``status``, ``timestamp`` and ``value`` keys,
    where ``value`` maps MBean names (for example
    ``Catalina:J2EEApplication=none,WebModule=*,name=jsp,type=JspMonitor``)
    to their values. The ``WebModule``, ``context`` and ``name`` parts of
    each name become tags. An MBean is skipped when its ``WebModule`` or
    ``context`` value ends with one of ``self.reserved_modules``.

    Raises:
        IOError: if ``status`` is not 200.
    """
    status = json_dict["status"]
    if status != 200:
        raise IOError("status code %d" % status)
    ts = json_dict["timestamp"]
    vals = json_dict["value"]

    domain_prefix = "Catalina:"
    module_keys = ("WebModule", "context")

    for mbean_name_str, val in vals.iteritems():
        # Strip the domain only when it really is a prefix; searching for it
        # anywhere in the name would chop unrelated leading characters.
        if mbean_name_str.startswith(domain_prefix):
            mbean_name_str = mbean_name_str[len(domain_prefix):]

        additional_tag = ""
        reserved = False
        for mbean_part in mbean_name_str.split(","):
            # partition keeps '=' characters inside the value and yields an
            # empty value (rather than raising) when the part has no '='.
            part_key, _, part_value = mbean_part.partition("=")
            if part_key in module_keys:
                additional_tag += " %s=%s" % (
                    part_key, utils.remove_invalid_characters(part_value))
                if any(part_value.endswith(module)
                       for module in self.reserved_modules):
                    reserved = True
            elif part_key == "name":
                additional_tag += " %s=%s" % (
                    part_key, utils.remove_invalid_characters(part_value))

        if not reserved:
            self._process(readq, port, ts, val, additional_tag)
def _mapreduce_job_metrics(self, running_apps):
    """Queue metrics for each MapReduce job and describe the running jobs.

    ``running_apps`` maps an application id to an
    ``(app_name, tracking_url)`` pair. Returns a dictionary of the form::

        {job_id: {'job_name': ..., 'app_name': ...,
                  'user_name': ..., 'tracking_url': ...}}

    If anything raises, a ``mapreduce.state`` datapoint with value 1 is
    queued, the exception is logged, and whatever was gathered so far is
    returned.
    """
    running_jobs = {}
    try:
        for _app_id, (app_name, tracking_url) in running_apps.iteritems():
            ts = time.time()
            jobs_url = "%s%s" % (tracking_url, REST_API['MAPREDUCE_JOBS_PATH'])
            metrics_json = self.request_url(jobs_url)
            if not metrics_json.get('jobs'):
                continue
            if not metrics_json['jobs'].get('job'):
                continue

            for job_json in metrics_json['jobs']['job']:
                job_id = job_json.get('id')
                job_name = job_json.get('name')
                user_name = job_json.get('user')
                if not (job_id and job_name and user_name):
                    continue

                # Build the structure to hold the information for each job ID
                job_url = "%s%s/%s" % (
                    tracking_url, REST_API['MAPREDUCE_JOBS_PATH'], job_id)
                running_jobs[str(job_id)] = {
                    'job_name': str(job_name),
                    'app_name': str(app_name),
                    'user_name': str(user_name),
                    'tracking_url': job_url,
                }

                for metric in JOB:
                    point = 'mapreduce.job.%s %d %d app_name=%s user_name=%s job_name=%s' % (
                        metric, ts, job_json[metric],
                        utils.remove_invalid_characters(str(app_name)),
                        utils.remove_invalid_characters(str(user_name)),
                        utils.remove_invalid_characters(str(job_name)))
                    self._readq.nput(point)
    except Exception as e:
        self._readq.nput("mapreduce.state %s %s" % (int(time.time()), '1'))
        self.log_exception(
            'exception collecting mapreduce jobs metric \n %s', e)
    return running_jobs
def __call__(self):
    """Probe every configured URL and queue response-time/state datapoints.

    ``self.urls`` maps a service name to a dict with ``url`` and
    ``timeout_sec`` entries. On success the measured duration and a
    ``responsetime.state`` of 0 are queued. On any failure the configured
    timeout is queued as the duration together with a state of 1.
    """
    for service in self.urls:
        target = self.urls[service]
        started = time.time()
        tag = "url=%s" % remove_invalid_characters(service)
        # Reset per URL so the failure branch never reads an unbound name or
        # the response left over from the previous iteration.
        response = None
        try:
            response = requests.get(
                target['url'], timeout=int(target['timeout_sec']))
            self.response_time[service] = time.time() - started
            response.raise_for_status()
            self.log_info("The requested url is " + target['url'] +
                          ",and status code is " + str(response.status_code))
            self._readq.nput("responsetime.duration %s %s %s" % (
                int(time.time()), self.response_time[service], tag))
            self._readq.nput("responsetime.state %s %s %s" % (
                int(time.time()), "0", tag))
        except Exception as e:
            self._readq.nput("responsetime.duration %s %s %s" % (
                int(time.time()), str(target['timeout_sec']), tag))
            self._readq.nput("responsetime.state %s %s %s" % (
                int(time.time()), "1", tag))
            if response is not None:
                self.log_error("The requested url is " + target['url'] +
                               ",and status code is " +
                               str(response.status_code))
            else:
                # No response object exists when the request itself failed
                # (for example on a timeout or connection error).
                self.log_error("The requested url is " + target['url'] +
                               ",and the request failed: " + str(e))
def _send_metrics(self):
    """Run ``self.hbase_cmd`` and turn its output into region metrics.

    Does nothing when ``self.set_region_on_host()`` is falsy. Output lines
    are pushed on a stack until a line is seen for which
    ``self.recomplite`` finds a first match of length 12. That line is
    treated as a comma-separated list of ``name=value`` metrics, and the
    most recently stacked line supplies the table / region tags. Metrics
    whose name ends in ``Count`` are handed to ``self.table_stats``; all
    others are queued directly. Finally ``self.table_stats.send_metrics`` is
    called.
    """
    if not self.set_region_on_host():
        return  # this is not a master

    with psutil.Popen(self.hbase_cmd, stdout=subprocess.PIPE) as proc:
        out = proc.stdout
        pending = []
        while True:
            line = out.readline()
            if not line:
                break

            matches = self.recomplite.findall(line)
            if not (matches and len(matches[0]) == 12):
                pending.append(line)
                continue

            header = pending.pop()
            fields = header.replace('"', '').split(',')
            first_field = fields[0].strip()
            last_field_parts = fields[-1].strip().split('.')
            table_name = utils.remove_invalid_characters(first_field)
            if first_field == 'hbase:meta':
                # The hbase:meta header is pushed back for later metric lines.
                pending.append(header)
                table_region_node = last_field_parts[-1]
            else:
                table_region_node = last_field_parts[-2]

            region_host = self.get_region_host(table_region_node)
            if region_host != '_':
                region_host = utils.remove_invalid_characters(region_host)
            tags = self.tags % (table_name, table_region_node, region_host)

            # NOTE(review): str.replace is literal, so the second replace only
            # removes the literal text \s+ and does not strip whitespace.
            cleaned = line.replace('NaN', "0").replace(r'\s+', '')
            for metric in cleaned.split(','):
                metric_name = utils.remove_invalid_characters(
                    metric.split('=')[0].strip())
                value = metric.split('=')[1].strip()
                if metric_name.endswith("Count"):
                    self.table_stats.add_stat(
                        table_name, metric_name, table_region_node,
                        region_host, value)
                else:
                    self._readq.nput(
                        "hbase.regionserver.info.%s %d %s %s" %
                        (metric_name, self.ts, value, tags))

    self.table_stats.send_metrics(self.ts)
def process(self, line, metric):
    """Emit one datapoint for a whitespace-separated process listing line.

    Lines containing both ``COMMAND`` and ``PID`` are skipped, as are lines
    with fewer than three tokens (for example blank lines), which used to
    raise ``IndexError``. Otherwise the second token is the pid, the third
    is the numeric value (cpu or mem), and the remaining tokens are
    concatenated without a separator to form the command used in the tag.

    Args:
        line: one line of output to parse.
        metric: metric name passed through to ``self.print_metric``.
    """
    if "COMMAND" in line and "PID" in line:
        return
    tokens = line.split()
    if len(tokens) < 3:
        # Nothing usable on this line.
        return
    pid = tokens[1]
    value = float(tokens[2])  # cpu or mem
    full_command = ''.join(tokens[3:])  # full command
    tag = "pid_cmd=%s_%s" % (
        pid, utils.remove_invalid_characters(full_command))
    self.print_metric(metric, int(time.time()), value, tag)
def swap(self):
    """Report services that disappeared and roll the current list over.

    Every service present in ``last_started_services`` but absent from
    ``curr_started_services`` gets a ``service.stopAtSec`` datapoint with
    value 1. Afterwards the current list becomes the last list and a fresh
    empty current list is started.
    """
    stopped = []
    for name in self.last_started_services:
        if name not in self.curr_started_services:
            stopped.append(name)

    for name in stopped:
        service_tag = "service=%s" % utils.remove_invalid_characters(name)
        self._readq.nput(
            "service.stopAtSec %d %s %s" % (int(time.time()), 1, service_tag))

    self.last_started_services = self.curr_started_services
    self.curr_started_services = []
def parse(self, json_dict, readq, port):
    """Forward every non-reserved MBean value in ``json_dict`` to ``_process``.

    ``json_dict`` must carry ``status``, ``timestamp`` and ``value`` keys,
    where ``value`` maps MBean names (for example
    ``Catalina:J2EEApplication=none,WebModule=*,name=jsp,type=JspMonitor``)
    to their values. The ``WebModule``, ``context`` and ``name`` parts of
    each name become tags. An MBean is skipped when its ``WebModule`` or
    ``context`` value ends with one of ``self.reserved_modules``.

    Raises:
        IOError: if ``status`` is not 200.
    """
    status = json_dict["status"]
    if status != 200:
        raise IOError("status code %d" % status)
    ts = json_dict["timestamp"]
    vals = json_dict["value"]

    domain_prefix = "Catalina:"
    module_keys = ("WebModule", "context")

    for mbean_name_str, val in vals.iteritems():
        # Strip the domain only when it really is a prefix; searching for it
        # anywhere in the name would chop unrelated leading characters.
        if mbean_name_str.startswith(domain_prefix):
            mbean_name_str = mbean_name_str[len(domain_prefix):]

        additional_tag = ""
        reserved = False
        for mbean_part in mbean_name_str.split(","):
            # partition keeps '=' characters inside the value and yields an
            # empty value (rather than raising) when the part has no '='.
            part_key, _, part_value = mbean_part.partition("=")
            if part_key in module_keys:
                additional_tag += " %s=%s" % (
                    part_key, utils.remove_invalid_characters(part_value))
                if any(part_value.endswith(module)
                       for module in self.reserved_modules):
                    reserved = True
            elif part_key == "name":
                additional_tag += " %s=%s" % (
                    part_key, utils.remove_invalid_characters(part_value))

        if not reserved:
            self._process(readq, port, ts, val, additional_tag)
def __call__(self):
    """Run the configured Oracle queries and queue their results.

    Each statement in ``self.sqls`` contributes one ``oracle.<metric>``
    datapoint taken from the first column of the first row (skipped when
    that value is None). Each statement in ``self.customize`` contributes
    one datapoint per row: column 0 is the value and the remaining columns
    become tags named after the ``|``-separated fields of the key, starting
    at the third field. A failing statement is logged and skipped. After all
    statements ``oracle.state 0`` is queued; if connecting or cleanup fails,
    ``oracle.state 1`` is queued instead.
    """
    try:
        self.db_connect()
        try:
            for key, sql in self.sqls.iteritems():
                try:
                    self.cur.execute(sql)
                    first_row = self.cur.fetchone()
                    metric = key.split("|")[0]
                    if first_row[0] is None:
                        continue
                    self._readq.nput("oracle.%s %s %s" % (
                        metric, int(time.time()), first_row[0]))
                except Exception as e:
                    self.log_error(
                        "Some exception when execute exception, key=%s %s"
                        % (key, e))

            for cus_key, cus_sql in self.customize.iteritems():
                try:
                    self.cur.execute(cus_sql)
                    rows = self.cur.fetchall()
                    key_fields = cus_key.split("|")
                    cus_metric = key_fields[0]
                    for row in rows:
                        tag_chunks = [
                            " %s=%s" % (
                                key_fields[position],
                                utils.remove_invalid_characters(str(column)))
                            for position, column in enumerate(row[1:], 2)
                        ]
                        tags_str = " " + "".join(tag_chunks)
                        self.log_info(cus_metric)
                        self.log_info(row[0])
                        self.log_info(tags_str)
                        self._readq.nput("oracle.%s %s %s %s" % (
                            cus_metric, int(time.time()), row[0], tags_str))
                except Exception as e:
                    self.log_error(
                        "Some exception when customer sql execute exception, key=%s %s"
                        % (cus_key, e))

            self._readq.nput("oracle.state %s 0" % int(time.time()))
        finally:
            self.cur.close()
            self.db_close()
    except Exception as e:
        self.log_error(e)
        self._readq.nput("oracle.state %s 1" % int(time.time()))
        self.log_error("oralce collector does not work")
def process(self, line):
    """Record the start time of every configured service named in ``line``.

    For each entry of ``self.services`` (stripped) that occurs in ``line``,
    the service is appended to ``self.curr_started_services`` and tokens
    2 to 5 of the line are parsed as ``%b %d %H:%M:%S %Y``. The resulting
    epoch second is passed twice to ``self.print_metric`` together with a
    ``service=<first token>.<service>`` tag.
    """
    for configured in self.services:
        service = configured.strip()
        if service not in line:
            continue

        self.curr_started_services.append(service)
        tokens = line.split()
        month, day, clock, year = tokens[2], tokens[3], tokens[4], tokens[5]
        started = time.strptime(
            "%s %s %s %s" % (month, day, clock, year), "%b %d %H:%M:%S %Y")
        startup_sec = int(time.mktime(started))
        service_tag = "service=%s.%s" % (
            tokens[0], utils.remove_invalid_characters(service))
        self.print_metric(startup_sec, startup_sec, service_tag)
def __call__(self):
    """Queue a ``<service>.state`` datapoint for every configured service.

    ``self.svc_name_to_cmd_map`` maps a service name to its unique command.
    A service's state is 0 when that command is a substring of some running
    process's command line, and 1 otherwise. The service name is sanitised
    before it is used as the metric prefix. Any exception is logged via
    ``self.log_error``.
    """
    try:
        for svc in self.svc_name_to_cmd_map:
            svc_cmd = self.svc_name_to_cmd_map[svc]
            svc_state = 1  # assume not there.
            for proc in psutil.process_iter():
                try:
                    cmd = " ".join(proc.cmdline())
                except Exception:
                    # Fall back to the process name when the command line
                    # cannot be read.
                    cmd = proc.name()
                self.log_info("svcCmd=%s, cmd=%s" % (svc_cmd, cmd))
                # if find the svc cmd in list of process, its state is 0
                if svc_cmd in cmd:
                    svc_state = 0
                    break
            # Use the sanitised name; the result used to be discarded.
            metric_prefix = utils.remove_invalid_characters(svc)
            self._readq.nput("%s.state %s %s" % (
                metric_prefix, int(time.time()), svc_state))
    except Exception as e:
        self.log_error("can't find processes. %s" % e)
def send_metrics(self, ts):
    """Queue spread/deviation datapoints per table metric, then reset state.

    ``self.tables`` maps a table name to ``{metric: datapoint}`` where
    ``datapoint[1]``, ``[2]`` and ``[3]`` are read as average, minimum and
    maximum. For every metric a ``.delta`` datapoint (largest distance of
    min/max from the average) and a ``.dev`` datapoint (delta relative to
    the average, or 0.0 when the average is at most 0.001 or the delta does
    not exceed ``self.threshold``) are queued with the sanitised table name
    in the ``host`` tag. For metrics ending in ``Count`` a per-region
    ``...Cnt`` datapoint is also queued: the absolute change against
    ``self.prev_state`` for known regions, or the raw count for new ones.
    Finally ``self.save_state()`` is called and ``self.tables`` is emptied.

    Args:
        ts: timestamp written into every datapoint.
    """
    for table in self.tables:
        metrics = self.tables[table]
        # The sanitised table name does not depend on the metric.
        tbl = utils.remove_invalid_characters(table)
        for metric in metrics:
            datapoint = metrics[metric]
            avg = float(datapoint[1])
            low = float(datapoint[2])
            high = float(datapoint[3])
            spread = max(abs(high - avg), abs(avg - low))
            self.readq.nput(
                "hbase.regionserver.info.%s.delta %d %f host=%s" %
                (metric, ts, spread, tbl))
            if avg > 0.001 and spread > float(self.threshold):
                self.readq.nput(
                    "hbase.regionserver.info.%s.dev %d %f host=%s" %
                    (metric, ts, spread / abs(avg), tbl))
            else:
                self.readq.nput(
                    "hbase.regionserver.info.%s.dev %d 0.0 host=%s" %
                    (metric, ts, tbl))

            if not metric.endswith("Count"):
                continue
            try:
                real_metric = metric.replace("Count", "Cnt")
                curr_regions = self.curr_state[table][metric]
                prev_regions = self.prev_state[table][metric]
                for region in curr_regions:
                    tags = self.tags % (tbl, region, curr_regions[region][1])
                    if region in prev_regions:
                        _cnt = abs(int(curr_regions[region][0]) -
                                   int(prev_regions[region][0]))
                    else:
                        _cnt = int(curr_regions[region][0])
                    self.readq.nput(
                        "hbase.regionserver.info.%s %d %d %s" %
                        (real_metric, ts, _cnt, tags))
            except Exception:
                # Best effort: per-region counts are skipped on any error,
                # but KeyboardInterrupt/SystemExit are no longer swallowed.
                pass
    self.save_state()
    self.tables = {}
def _mapreduce_task_metrics(self, running_jobs):
    """Queue a progress datapoint for every MAP and REDUCE task.

    ``running_jobs`` maps a job id to a dict with ``tracking_url``,
    ``app_name``, ``user_name`` and ``job_name`` entries. The ``/tasks``
    resource below each tracking URL is requested and the ``progress`` of
    each MAP or REDUCE task is queued. Nothing is returned. If anything
    raises, a ``mapreduce.state`` datapoint with value 1 is queued and the
    exception is logged.
    """
    try:
        for _job_id, job_stats in running_jobs.iteritems():
            ts = time.time()
            tasks_url = "%s%s" % (job_stats['tracking_url'], '/tasks')
            metrics_json = self.request_url(tasks_url)
            if not metrics_json.get('tasks'):
                continue
            if not metrics_json['tasks'].get('task'):
                continue

            for task in metrics_json['tasks']['task']:
                task_type = task.get('type')
                if not task_type:
                    continue
                if task_type == 'MAP':
                    line_format = 'mapreduce.job.map.task.progress %d %d app_name=%s user_name=%s job_name=%s task_type=%s'
                elif task_type == 'REDUCE':
                    line_format = 'mapreduce.job.reduce.task.progress %d %d app_name=%s user_name=%s job_name=%s task_type=%s'
                else:
                    continue
                self._readq.nput(line_format % (
                    ts, task['progress'],
                    utils.remove_invalid_characters(job_stats.get('app_name')),
                    utils.remove_invalid_characters(job_stats.get('user_name')),
                    utils.remove_invalid_characters(job_stats.get('job_name')),
                    utils.remove_invalid_characters(str(task_type).lower())))
    except Exception as e:
        self._readq.nput("mapreduce.state %s %s" % (int(time.time()), '1'))
        self.log_exception(
            'exception collecting mapreduce task metric \n %s', e)
def _nodes_loader(self):
    """Queue one ``yarn.nodes.*`` datapoint per metric for every node.

    The ``nodes`` REST path is requested. When the response is empty, or
    its ``nodes`` / ``node`` entries are None, nothing is queued. If
    anything raises, a ``yarn.state`` datapoint with value 1 is queued and
    the exception is logged together with the request URL.
    """
    try:
        metrics_json = self.request(REST_API["nodes"])
        ts = time.time()
        if not metrics_json:
            return
        if metrics_json['nodes'] is None:
            return
        node_list = metrics_json['nodes']['node']
        if node_list is None:
            return
        for node_json in node_list:
            for metric in NODES:
                point = 'yarn.nodes.%s %d %d id=%s' % (
                    metric, ts, node_json[metric],
                    utils.remove_invalid_characters(node_json['id']))
                self._readq.nput(point)
    except Exception as e:
        self._readq.nput("yarn.state %s %s" % (int(time.time()), '1'))
        request_url = '%s%s' % (self.http_prefix, REST_API["nodes"])
        self.log_exception(
            'exception collecting yarn metric form : %s \n %s' %
            (request_url, e))
def _get_metrics(self, data, object_type):
    """Convert one API object into queued ``rabbitmq.*`` datapoints.

    Tags are built from the ``data`` keys listed in
    ``TAGS_MAP[object_type]``; a key whose value cannot be sanitised is
    reported with a warning and left out. Each
    ``(attribute, metric_name, operation)`` entry of
    ``ATTRIBUTES[object_type]`` is then looked up in ``data`` (``/``
    separates nesting levels) and, unless the value is missing or of type
    ``unicode``, queued as ``operation(value)``.
    """
    # Converting data to metrics
    collected = []
    tag_names = TAGS_MAP[object_type]
    for source_key in tag_names:
        try:
            cleaned = remove_invalid_characters(data.get(source_key))
            if cleaned:
                rendered = '%s_%s=%s' % (
                    TAG_PREFIX, tag_names[source_key], cleaned)
                collected.append(rendered.encode('utf-8'))
        except Exception as e:
            msg = ("Warning executing _get_metrics: {}. it is because of missing {}. "
                   "You can specify the regular regression in conf.").format(
                       e, source_key)
            self.log_warn(msg)

    # Regulating tag: render the list, then drop its brackets, quotes and
    # commas so only the space-separated tags remain.
    list_text = str(collected)
    tags = list_text[1:-1].replace("'", "").replace(",", "")

    for attribute, metric_name, operation in ATTRIBUTES[object_type]:
        # Walk down through the data path, e.g. foo/bar => d['foo']['bar']
        keys = attribute.split('/')
        node = data
        for step in keys[:-1]:
            node = node.get(step, {})
        value = node.get(keys[-1], None)
        if type(value) == unicode:
            value = None
        ts = time.time()

        # If value exists, send metrics to OpenTSDB
        if value is None:
            continue
        try:
            self._readq.nput('rabbitmq.%s.%s %d %d %s' % (
                METRIC_SUFFIX[object_type], metric_name, ts,
                operation(value), tags))
        except ValueError as e:
            self.log_error(
                "Caught ValueError for %s %s = %s with tags: %s" %
                (METRIC_SUFFIX[object_type], attribute, value, str(tags)))
            self.log_error(e)
def _apps_loader(self):
    """Queue one ``yarn.apps.*`` datapoint per metric for every application.

    The ``apps`` REST path is requested. When the response is empty, or its
    ``apps`` / ``app`` entries are None, nothing is queued. If anything
    raises, a ``yarn.state`` datapoint with value 1 is queued and the
    exception is logged together with the request URL.
    """
    try:
        metrics_json = self.request(REST_API["apps"])
        ts = time.time()
        if not metrics_json:
            return
        if metrics_json['apps'] is None:
            return
        app_list = metrics_json['apps']['app']
        if app_list is None:
            return
        for app_json in app_list:
            for metric in APPS:
                point = 'yarn.apps.%s %d %d name=%s' % (
                    metric, ts, app_json[metric],
                    utils.remove_invalid_characters(app_json['name']))
                self._readq.nput(point)
    except Exception as e:
        self._readq.nput("yarn.state %s %s" % (int(time.time()), '1'))
        request_url = '%s%s' % (self.http_prefix, REST_API["apps"])
        self.log_exception(
            'exception collecting yarn metric form : %s \n %s' %
            (request_url, e))
def _mapreduce_task_metrics(self, running_jobs):
    """Queue a progress datapoint for every MAP and REDUCE task.

    ``running_jobs`` maps a job id to a dict with ``tracking_url``,
    ``app_name``, ``user_name`` and ``job_name`` entries. The ``/tasks``
    resource below each tracking URL is requested and the ``progress`` of
    each MAP or REDUCE task is queued. Nothing is returned; any exception
    is logged.
    """
    try:
        for _job_id, job_stats in running_jobs.iteritems():
            ts = time.time()
            tasks_url = "%s%s" % (job_stats['tracking_url'], '/tasks')
            metrics_json = self.request_url(tasks_url)
            if not metrics_json.get('tasks'):
                continue
            if not metrics_json['tasks'].get('task'):
                continue

            for task in metrics_json['tasks']['task']:
                task_type = task.get('type')
                if not task_type:
                    continue
                if task_type == 'MAP':
                    line_format = 'mapreduce.job.map.task.progress %d %d app_name=%s user_name=%s job_name=%s task_type=%s'
                elif task_type == 'REDUCE':
                    line_format = 'mapreduce.job.reduce.task.progress %d %d app_name=%s user_name=%s job_name=%s task_type=%s'
                else:
                    continue
                self._readq.nput(line_format % (
                    ts, task['progress'],
                    utils.remove_invalid_characters(job_stats.get('app_name')),
                    utils.remove_invalid_characters(job_stats.get('user_name')),
                    utils.remove_invalid_characters(job_stats.get('job_name')),
                    utils.remove_invalid_characters(str(task_type).lower())))
    except Exception as e:
        self.log_exception(
            'exception collecting mapreduce task metric \n %s', e)
def metric_name(self, name):
    """Return ``name`` sanitised and prefixed with ``weblogic.ejb.``."""
    prefix = "weblogic.ejb"
    cleaned = utils.remove_invalid_characters(name)
    return "%s.%s" % (prefix, cleaned)
def _mapreduce_job_metrics(self, running_apps):
    """Queue metrics for each MapReduce job and describe the running jobs.

    ``running_apps`` maps an application id to an
    ``(app_name, tracking_url)`` pair. Returns a dictionary of the form::

        {job_id: {'job_name': ..., 'app_name': ...,
                  'user_name': ..., 'tracking_url': ...}}

    Any exception is logged, and whatever was gathered so far is returned.
    """
    running_jobs = {}
    try:
        for _app_id, (app_name, tracking_url) in running_apps.iteritems():
            ts = time.time()
            jobs_url = "%s%s" % (tracking_url, REST_API['MAPREDUCE_JOBS_PATH'])
            metrics_json = self.request_url(jobs_url)
            if not metrics_json.get('jobs'):
                continue
            if not metrics_json['jobs'].get('job'):
                continue

            for job_json in metrics_json['jobs']['job']:
                job_id = job_json.get('id')
                job_name = job_json.get('name')
                user_name = job_json.get('user')
                if not (job_id and job_name and user_name):
                    continue

                # Build the structure to hold the information for each job ID
                job_url = "%s%s/%s" % (
                    tracking_url, REST_API['MAPREDUCE_JOBS_PATH'], job_id)
                running_jobs[str(job_id)] = {
                    'job_name': str(job_name),
                    'app_name': str(app_name),
                    'user_name': str(user_name),
                    'tracking_url': job_url,
                }

                for metric in JOB:
                    point = 'mapreduce.job.%s %d %d app_name=%s user_name=%s job_name=%s' % (
                        metric, ts, job_json[metric],
                        utils.remove_invalid_characters(str(app_name)),
                        utils.remove_invalid_characters(str(user_name)),
                        utils.remove_invalid_characters(str(job_name)))
                    self._readq.nput(point)
    except Exception as e:
        self.log_exception(
            'exception collecting mapreduce jobs metric \n %s', e)
    return running_jobs
def _mapreduce_job_counters_metrics(self, running_jobs):
    """Queue the configured counter metrics for every running job.

    ``running_jobs`` maps a job id to a dict with ``job_name``,
    ``tracking_url``, ``app_name`` and ``user_name`` entries. Jobs without
    a name are skipped. For the rest, the ``/counters`` resource below the
    tracking URL is requested and every metric in ``JOB_COUNTER`` is queued
    for each counter of each named counter group. Any exception is logged.
    """
    try:
        for _job_id, job_metrics in running_jobs.iteritems():
            ts = time.time()
            job_name = job_metrics['job_name']
            if not job_name:
                continue

            counters_url = "%s%s" % (job_metrics['tracking_url'], '/counters')
            metrics_json = self.request_url(counters_url)
            if not metrics_json.get('jobCounters'):
                continue
            if not metrics_json['jobCounters'].get('counterGroup'):
                continue

            for counter_group in metrics_json['jobCounters']['counterGroup']:
                group_name = counter_group.get('counterGroupName')
                if not group_name:
                    continue
                if not counter_group.get('counter'):
                    continue
                for counter in counter_group['counter']:
                    counter_name = counter.get('name')
                    for metric in JOB_COUNTER:
                        point = 'mapreduce.job.counter.%s %d %d app_name=%s user_name=%s job_name=%s counter_name=%s' % (
                            metric, ts, counter[metric],
                            utils.remove_invalid_characters(job_metrics.get('app_name')),
                            utils.remove_invalid_characters(job_metrics.get('user_name')),
                            utils.remove_invalid_characters(job_name),
                            utils.remove_invalid_characters(str(counter_name).lower()))
                        self._readq.nput(point)
    except Exception as e:
        self.log_exception(
            'exception collecting mapreduce jobs counter metric \n %s', e)
def _apps_loader(self):
    """Queue one ``yarn.apps.*`` datapoint per metric for every application.

    The ``apps`` REST path is requested. When the response is empty, or its
    ``apps`` / ``app`` entries are None, nothing is queued. Any exception is
    logged together with the request URL.
    """
    try:
        metrics_json = self.request(REST_API["apps"])
        ts = time.time()
        if not metrics_json:
            return
        if metrics_json['apps'] is None:
            return
        app_list = metrics_json['apps']['app']
        if app_list is None:
            return
        for app_json in app_list:
            for metric in APPS:
                point = 'yarn.apps.%s %d %d name=%s' % (
                    metric, ts, app_json[metric],
                    utils.remove_invalid_characters(app_json['name']))
                self._readq.nput(point)
    except Exception as e:
        request_url = '%s%s' % (self.http_prefix, REST_API["apps"])
        self.log_exception(
            'exception collecting yarn metric form : %s \n %s' %
            (request_url, e))
def __call__(self):
    """Queue ``cpu.topN`` and ``mem.topN`` datapoints for the busiest processes.

    Samples ``Win32_PerfFormattedData_PerfProc_Process`` through
    ``self.WMISampler`` and takes the 7 entries with the highest
    ``PercentProcessorTime`` and the 7 with the highest ``PrivateBytes``.
    The ``_Total`` and ``Idle`` entries are never reported. When ``_Total``
    is among the top CPU entries, its ``PrivateBytes`` becomes the
    denominator for the memory fraction; otherwise 1000000000 is used.

    Each ``cpu.topN`` datapoint is queued once; the second, identical CPU
    pass that used to queue every point twice has been removed.
    """
    sampler = self.WMISampler(
        "Win32_PerfFormattedData_PerfProc_Process",
        ["IDProcess", "Name", "PercentProcessorTime", "PrivateBytes"],
        provider="64", timeout_duration=50)
    sampler.sample()
    ts = int(time.time())

    top_n = 7
    # Each entry is (pid, name, cpu percent, private bytes).
    processes = [
        (sample.get("IDProcess"), sample.get("Name"),
         sample.get("PercentProcessorTime"), sample.get("PrivateBytes"))
        for sample in sampler
    ]

    # Top N processes by CPU usage.
    total_mem_B = 1000000000
    by_cpu = sorted(processes, key=lambda entry: entry[2], reverse=True)
    for pid, raw_name, cpu, mem_bytes in by_cpu[:top_n]:
        name = utils.remove_invalid_characters(raw_name)
        if name == "_Total":
            total_mem_B = mem_bytes
        elif name != "Idle":
            self._readq.nput(
                "cpu.topN %d %f pid_cmd=%d_%s" % (ts, cpu, pid, name))

    # Top N processes by memory usage, as a fraction of the total.
    by_mem = sorted(processes, key=lambda entry: entry[3], reverse=True)
    for pid, raw_name, _cpu, mem_bytes in by_mem[:top_n]:
        name = utils.remove_invalid_characters(raw_name)
        if name != "_Total" and name != "Idle":
            mem_pct = mem_bytes * 1.0 / total_mem_B
            self._readq.nput(
                "mem.topN %d %f pid_cmd=%d_%s" % (ts, mem_pct, pid, name))
def _topology_deatails_loader(self,ids):
    """Queue Storm topology, spout and bolt metrics for each topology id.

    For every id in ``ids`` the ``topology_details`` REST path is requested
    with ``window=600``. One datapoint per configured metric is queued for
    each ``topologyStats`` entry whose ``window`` equals ``'600'``, for each
    spout and for each bolt. If anything raises, the exception is logged
    and the remaining ids are not processed.
    """
    try:
        for topology_id in ids:
            details_url = '%s%s?%s' % (
                REST_API["topology_details"], topology_id, "window=600")
            details = self.request(details_url)
            sampled_at = time.time()
            if not details:
                continue

            for stats in details['topologyStats']:
                for stat_name in TOPOLOGY_DETAILS['topologyStats']:
                    # Only the 600 second window is reported.
                    if stats['window'] != '600':
                        continue
                    point = 'storm.topology.topologyStats.%s %d %d topology_id=%s' % (
                        stat_name, sampled_at, stats[stat_name],
                        utils.remove_invalid_characters(topology_id))
                    self._readq.nput(point)

            for spout in details['spouts']:
                for stat_name in TOPOLOGY_DETAILS['spouts']:
                    point = 'storm.topology.spouts.%s %d %d id=%s topology_id=%s' % (
                        stat_name, sampled_at, spout[stat_name],
                        spout['spoutId'],
                        utils.remove_invalid_characters(topology_id))
                    self._readq.nput(point)

            for bolt in details['bolts']:
                for stat_name in TOPOLOGY_DETAILS['bolts']:
                    point = 'storm.topology.bolts.%s %d %d id=%s topology_id=%s' % (
                        stat_name, sampled_at, bolt[stat_name],
                        bolt['boltId'],
                        utils.remove_invalid_characters(topology_id))
                    self._readq.nput(point)
    except Exception as e:
        self.log_exception(
            'exception collecting storm topology details metric \n %s' % e)
def _nodes_loader(self):
    """Queue one ``yarn.nodes.*`` datapoint per metric for every node.

    The ``nodes`` REST path is requested. When the response is empty, or
    its ``nodes`` / ``node`` entries are None, nothing is queued. Any
    exception is logged together with the request URL.
    """
    try:
        metrics_json = self.request(REST_API["nodes"])
        ts = time.time()
        if not metrics_json:
            return
        if metrics_json['nodes'] is None:
            return
        node_list = metrics_json['nodes']['node']
        if node_list is None:
            return
        for node_json in node_list:
            for metric in NODES:
                point = 'yarn.nodes.%s %d %d id=%s' % (
                    metric, ts, node_json[metric],
                    utils.remove_invalid_characters(node_json['id']))
                self._readq.nput(point)
    except Exception as e:
        request_url = '%s%s' % (self.http_prefix, REST_API["nodes"])
        self.log_exception(
            'exception collecting yarn metric form : %s \n %s' %
            (request_url, e))