# Shared imports for the collectors below. NOTE: "common" and "scraper" are
# assumed names for this project's local modules providing MetricCollector,
# CommonMetricCollector, and ScrapeMetrics; adjust the import paths to match
# the actual repository layout.
import re

import yaml
from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily

from common import MetricCollector, CommonMetricCollector
from scraper import ScrapeMetrics
class NameNodeMetricCollector(MetricCollector):
    def __init__(self, cluster, urls):
        MetricCollector.__init__(self, cluster, "hdfs", "namenode")
        self.target = "-"
        self.urls = urls
        self.dns = set()
        self.hadoop_namenode_metrics = {}
        for i in range(len(self.file_list)):
            self.hadoop_namenode_metrics.setdefault(self.file_list[i], {})
        self.common_metric_collector = CommonMetricCollector(
            cluster, "hdfs", "namenode")
        self.scrape_metrics = ScrapeMetrics(urls)

    def collect(self):
        isSetup = False
        beans_list = self.scrape_metrics.scrape()
        for beans in beans_list:
            if not isSetup:
                self.common_metric_collector.setup_labels(beans)
                self.setup_metrics_labels(beans)
                isSetup = True
            for i in range(len(beans)):
                if 'tag.Hostname' in beans[i]:
                    self.target = beans[i]["tag.Hostname"]
                    break
            self.hadoop_namenode_metrics.update(
                self.common_metric_collector.get_metrics(beans, self.target))
            self.get_metrics(beans)
        for i in range(len(self.merge_list)):
            service = self.merge_list[i]
            if service in self.hadoop_namenode_metrics:
                for metric in self.hadoop_namenode_metrics[service]:
                    yield self.hadoop_namenode_metrics[service][metric]

    def setup_nnactivity_labels(self):
        num_namenode_flag, avg_namenode_flag, ops_namenode_flag = 1, 1, 1
        for metric in self.metrics['NameNodeActivity']:
            label = ["cluster", "method", "_target"]
            if "NumOps" in metric:
                if num_namenode_flag:
                    key = "MethodNumOps"
                    name = "_".join(
                        [self.prefix, "nnactivity_method_ops_total"])
                    description = "Total number of times the method is called."
                    self.hadoop_namenode_metrics['NameNodeActivity'][
                        key] = GaugeMetricFamily(name, description, labels=label)
                    num_namenode_flag = 0
                else:
                    continue
            elif "AvgTime" in metric:
                if avg_namenode_flag:
                    key = "MethodAvgTime"
                    name = "_".join(
                        [self.prefix, "nnactivity_method_avg_time_milliseconds"])
                    description = "Average turnaround time of the method in milliseconds."
                    self.hadoop_namenode_metrics['NameNodeActivity'][
                        key] = GaugeMetricFamily(name, description, labels=label)
                    avg_namenode_flag = 0
                else:
                    continue
            elif ops_namenode_flag:
                key = "Operations"
                name = "_".join([self.prefix, "nnactivity_operations_total"])
                description = "Total number of each operation."
                self.hadoop_namenode_metrics['NameNodeActivity'][
                    key] = GaugeMetricFamily(name, description, labels=label)
                ops_namenode_flag = 0

    def setup_startupprogress_labels(self):
        sp_count_flag, sp_elapsed_flag, sp_total_flag, sp_complete_flag = 1, 1, 1, 1
        for metric in self.metrics['StartupProgress']:
            snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            if "ElapsedTime" == metric:
                key = "ElapsedTime"
                name = "total_elapsed_time_milliseconds"
                descriptions = "Total elapsed time in milliseconds."
            elif "PercentComplete" == metric:
                key = "PercentComplete"
                name = "complete_rate"
                descriptions = "Current completion rate of NameNode startup progress (the max value is 1.0, not 100)."
            elif "Count" in metric:
                if sp_count_flag:
                    sp_count_flag = 0
                    key = "PhaseCount"
                    name = "phase_count"
                    descriptions = "Total number of steps completed in the phase."
                else:
                    continue
            elif "ElapsedTime" in metric:
                if sp_elapsed_flag:
                    sp_elapsed_flag = 0
                    key = "PhaseElapsedTime"
                    name = "phase_elapsed_time_milliseconds"
                    descriptions = "Total elapsed time in the phase in milliseconds."
                else:
                    continue
            elif "Total" in metric:
                if sp_total_flag:
                    sp_total_flag = 0
                    key = "PhaseTotal"
                    name = "phase_total"
                    descriptions = "Total number of steps in the phase."
                else:
                    continue
            elif "PercentComplete" in metric:
                if sp_complete_flag:
                    sp_complete_flag = 0
                    key = "PhasePercentComplete"
                    name = "phase_complete_rate"
                    descriptions = "Current completion rate of the phase (the max value is 1.0, not 100)."
                else:
                    continue
            else:
                key = metric
                name = snake_case
                descriptions = self.metrics['StartupProgress'][metric]
            label = ["cluster", "phase", "_target"]
            name = "_".join([self.prefix, "startup_process", name])
            self.hadoop_namenode_metrics['StartupProgress'][
                key] = GaugeMetricFamily(name, descriptions, labels=label)

    def setup_fsnamesystem_labels(self):
        cap_flag = 1
        for metric in self.metrics['FSNamesystem']:
            if metric.startswith('Capacity'):
                if cap_flag:
                    cap_flag = 0
                    key = "capacity"
                    label = ["cluster", "mode"]
                    name = "capacity_bytes"
                    descriptions = "Current DataNodes capacity in each mode in bytes"
                else:
                    continue
            else:
                key = metric
                label = ["cluster"]
                name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
                descriptions = self.metrics['FSNamesystem'][metric]
            label.append("_target")
            name = "_".join([self.prefix, "fsname_system", name])
            self.hadoop_namenode_metrics['FSNamesystem'][
                key] = GaugeMetricFamily(name, descriptions, labels=label)

    def setup_fsnamesystem_state_labels(self):
        num_flag = 1
        for metric in self.metrics['FSNamesystemState']:
            snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            if 'DataNodes' in metric:
                if num_flag:
                    num_flag = 0
                    key = "datanodes_num"
                    label = ["cluster", "state"]
                    descriptions = "Number of datanodes in each state"
                else:
                    continue
            else:
                key = metric
                label = ["cluster"]
                descriptions = self.metrics['FSNamesystemState'][metric]
            label.append("_target")
            name = "_".join([self.prefix, "fsname_system_state", snake_case])
            self.hadoop_namenode_metrics['FSNamesystemState'][
                key] = GaugeMetricFamily(name, descriptions, labels=label)

    def setup_retrycache_labels(self):
        cache_flag = 1
        for metric in self.metrics['RetryCache']:
            if cache_flag:
                cache_flag = 0
                key = "cache"
                label = ["cluster", "mode", "_target"]
                name = "_".join([self.prefix, "cache_total"])
                description = "Total number of RetryCache in each mode"
                self.hadoop_namenode_metrics['RetryCache'][
                    key] = GaugeMetricFamily(name, description, labels=label)

    def setup_nninfo_labels(self):
        for metric in self.metrics['NameNodeInfo']:
            if "LiveNodes" in metric:
                name = "_".join([self.prefix, "nninfo_live_nodes_count"])
                description = "Count of live data nodes"
                self.hadoop_namenode_metrics['NameNodeInfo'][
                    "LiveNodeCount"] = GaugeMetricFamily(
                        name, description, labels=["cluster", "_target"])
                label = [
                    "cluster", "datanode", "infoAddr", "infoSecureAddr",
                    "xferaddr", "version", "_target"
                ]
                items = [
                    "lastContact", "usedSpace", "adminState", "nonDfsUsedSpace",
                    "capacity", "numBlocks", "used", "remaining",
                    "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent",
                    "volfails"
                ]
                for item in items:
                    item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                    name = "_".join([self.prefix, "nninfo_live_nodes", item])
                    key = "LiveNodes-" + item
                    description = "Live node " + item
                    if item == "admin_state":
                        description += " (0: In Service, 1: Decommission In Progress, 2: Decommissioned)"
                    self.hadoop_namenode_metrics['NameNodeInfo'][
                        key] = GaugeMetricFamily(name, description, labels=label)
                continue
            elif "DeadNodes" in metric:
                name = "_".join([self.prefix, "nninfo_dead_nodes_count"])
                description = "Count of dead data nodes"
                self.hadoop_namenode_metrics['NameNodeInfo'][
                    "DeadNodeCount"] = GaugeMetricFamily(
                        name, description, labels=["cluster", "_target"])
                label = ["cluster", "datanode", "decommissioned", "xferaddr",
                         "_target"]
                name = "_".join([self.prefix, "nninfo_dead_nodes_last_contact"])
                key = "DeadNodes"
                description = "Dead node last contact"
                self.hadoop_namenode_metrics['NameNodeInfo'][
                    key] = GaugeMetricFamily(name, description, labels=label)
                continue
            elif "DecomNodes" in metric:
                name = "_".join([self.prefix, "nninfo_decom_nodes_count"])
                description = "Count of decommissioned data nodes"
                self.hadoop_namenode_metrics['NameNodeInfo'][
                    "DecomNodeCount"] = GaugeMetricFamily(
                        name, description, labels=["cluster", "_target"])
                label = ["cluster", "datanode", "xferaddr", "_target"]
                items = [
                    "underReplicatedBlocks", "decommissionOnlyReplicas",
                    "underReplicateInOpenFiles"
                ]
                for item in items:
                    item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                    name = "_".join([self.prefix, "nninfo_decom_nodes", item])
                    key = "DecomNodes-" + item
                    description = "Decom node " + item
                    self.hadoop_namenode_metrics['NameNodeInfo'][
                        key] = GaugeMetricFamily(name, description, labels=label)
                continue
            elif "EnteringMaintenanceNodes" in metric:
                name = "_".join([self.prefix, "nninfo_maintenance_nodes_count"])
                description = "Count of entering-maintenance data nodes"
                self.hadoop_namenode_metrics['NameNodeInfo'][
                    "MaintenanceNodeCount"] = GaugeMetricFamily(
                        name, description, labels=["cluster", "_target"])
                label = ["cluster", "datanode", "xferaddr", "_target"]
                items = [
                    "underReplicatedBlocks", "maintenanceOnlyReplicas",
                    "underReplicateInOpenFiles"
                ]
                for item in items:
                    item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                    name = "_".join([
                        self.prefix, "nninfo_entering_maintenance_nodes", item
                    ])
                    key = "EnteringMaintenanceNodes-" + item
                    description = "Entering maintenance node " + item
                    self.hadoop_namenode_metrics['NameNodeInfo'][
                        key] = GaugeMetricFamily(name, description, labels=label)
                continue
            elif "CorruptFiles" in metric:
                label = ["cluster", "_target"]
                name = "_".join([self.prefix, "nninfo_corrupt_file_count"])
                key = "CorruptFiles"
                description = "Corrupt file count"
                self.hadoop_namenode_metrics['NameNodeInfo'][
                    key] = GaugeMetricFamily(name, description, labels=label)
                continue
            elif "NodeUsage" in metric:
                label = ["cluster", "_target"]
                items = ["min", "median", "max", "stdDev"]
                for item in items:
                    item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                    name = "_".join([self.prefix, "nninfo_node_usage", item])
                    key = "NodeUsage-" + item
                    description = "Node usage " + item
                    self.hadoop_namenode_metrics['NameNodeInfo'][
                        key] = GaugeMetricFamily(name, description, labels=label)
                continue
            elif "SoftwareVersion" in metric:
                label = ["cluster", "software_version"]
                name = "_".join([self.prefix, "nninfo_software_version"])
                key = "SoftwareVersion"
            elif "Safemode" in metric:
                label = ["cluster"]
                name = "_".join([self.prefix, "nninfo_safe_mode"])
                key = "Safemode"
            else:
                label = ["cluster"]
                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
                name = "_".join([self.prefix, "nninfo", snake_case])
                key = metric
            label.append("_target")
            self.hadoop_namenode_metrics['NameNodeInfo'][key] = GaugeMetricFamily(
                name, self.metrics["NameNodeInfo"][metric], labels=label)

    def setup_metrics_labels(self, beans):
        for i in range(len(beans)):
            if 'NameNodeActivity' in beans[i]['name']:
                self.setup_nnactivity_labels()
            if 'StartupProgress' in beans[i]['name']:
                self.setup_startupprogress_labels()
            if 'FSNamesystem' in beans[i]['name']:
                self.setup_fsnamesystem_labels()
            if 'FSNamesystemState' in beans[i]['name']:
                self.setup_fsnamesystem_state_labels()
            if 'RetryCache' in beans[i]['name']:
                self.setup_retrycache_labels()
            if "NameNodeInfo" in beans[i]['name']:
                self.setup_nninfo_labels()

    def get_nnactivity_metrics(self, bean):
        for metric in self.metrics['NameNodeActivity']:
            if "NumOps" in metric:
                method = metric.split('NumOps')[0]
                key = "MethodNumOps"
            elif "AvgTime" in metric:
                method = metric.split('AvgTime')[0]
                key = "MethodAvgTime"
            else:
                if "Ops" in metric:
                    method = metric.split('Ops')[0]
                else:
                    method = metric
                key = "Operations"
            label = [self.cluster, method, self.target]
            self.hadoop_namenode_metrics['NameNodeActivity'][key].add_metric(
                label, bean[metric] if metric in bean else 0)

    def get_startupprogress_metrics(self, bean):
        for metric in self.metrics['StartupProgress']:
            if "Count" in metric:
                key = "PhaseCount"
                phase = metric.split("Count")[0]
            elif "ElapsedTime" in metric and "ElapsedTime" != metric:
                key = "PhaseElapsedTime"
                phase = metric.split("ElapsedTime")[0]
            elif "Total" in metric:
                key = "PhaseTotal"
                phase = metric.split("Total")[0]
            elif "PercentComplete" in metric and "PercentComplete" != metric:
                key = "PhasePercentComplete"
                phase = metric.split("PercentComplete")[0]
            else:
                key = metric
                phase = "-"
            label = [self.cluster, phase, self.target]
            self.hadoop_namenode_metrics['StartupProgress'][key].add_metric(
                label, bean[metric] if metric in bean else 0)

    def get_fsnamesystem_metrics(self, bean):
        for metric in self.metrics['FSNamesystem']:
            key = metric
            if 'HAState' in metric:
                label = [self.cluster]
                if 'initializing' == bean['tag.HAState']:
                    value = 0.0
                elif 'active' == bean['tag.HAState']:
                    value = 1.0
                elif 'standby' == bean['tag.HAState']:
                    value = 2.0
                elif 'stopping' == bean['tag.HAState']:
                    value = 3.0
                else:
                    value = 9999
                label.append(self.target)
                self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(
                    label, value)
            elif metric.startswith("Capacity"):
                key = 'capacity'
                mode = metric.split("Capacity")[1]
                label = [self.cluster, mode, self.target]
                self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(
                    label, bean[metric] if metric in bean else 0)
            else:
                label = [self.cluster, self.target]
                self.hadoop_namenode_metrics['FSNamesystem'][key].add_metric(
                    label, bean[metric] if metric in bean else 0)

    def get_fsnamesystem_state_metrics(self, bean):
        for metric in self.metrics['FSNamesystemState']:
            label = [self.cluster]
            key = metric
            if 'FSState' in metric:
                if 'Safemode' == bean['FSState']:
                    value = 0.0
                elif 'Operational' == bean['FSState']:
                    value = 1.0
                else:
                    value = 9999
                label.append(self.target)
                self.hadoop_namenode_metrics['FSNamesystemState'][
                    key].add_metric(label, value)
            elif "TotalSyncTimes" in metric:
                label.append(self.target)
                self.hadoop_namenode_metrics['FSNamesystemState'][
                    key].add_metric(
                        label,
                        float(re.sub(r'\s', '', bean[metric]))
                        if metric in bean and bean[metric] else 0)
            elif "DataNodes" in metric:
                key = 'datanodes_num'
                state = metric.split("DataNodes")[0].split("Num")[1]
                label = [self.cluster, state, self.target]
                self.hadoop_namenode_metrics['FSNamesystemState'][
                    key].add_metric(
                        label,
                        bean[metric] if metric in bean and bean[metric] else 0)
            else:
                label.append(self.target)
                self.hadoop_namenode_metrics['FSNamesystemState'][
                    key].add_metric(
                        label,
                        bean[metric] if metric in bean and bean[metric] else 0)

    def get_retrycache_metrics(self, bean):
        for metric in self.metrics['RetryCache']:
            key = "cache"
            label = [self.cluster, metric.split('Cache')[1], self.target]
            self.hadoop_namenode_metrics['RetryCache'][key].add_metric(
                label, bean[metric] if metric in bean and bean[metric] else 0)

    def get_nninfo_metrics(self, bean):
        for metric in self.metrics["NameNodeInfo"]:
            if "LiveNodes" in metric and "LiveNodes" in bean:
                live_node_dict = yaml.safe_load(bean["LiveNodes"])
                self.hadoop_namenode_metrics["NameNodeInfo"][
                    "LiveNodeCount"].add_metric([self.cluster, self.target],
                                                len(live_node_dict))
                dns = set()
                for node, info in live_node_dict.items():
                    label = [
                        self.cluster, node, info["infoAddr"],
                        info["infoSecureAddr"], info["xferaddr"],
                        info["version"], self.target
                    ]
                    items = [
                        "lastContact", "usedSpace", "adminState",
                        "nonDfsUsedSpace", "capacity", "numBlocks", "used",
                        "remaining", "blockScheduled", "blockPoolUsed",
                        "blockPoolUsedPercent", "volfails"
                    ]
                    dns.add("http://" + info["infoAddr"] + "/jmx")
                    for item in items:
                        value = info[item] if item in info else 0
                        if item == "adminState":
                            if value == "In Service":
                                value = 0
                            elif value == "Decommission In Progress":
                                value = 1
                            else:  # Decommissioned
                                value = 2
                        item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                        key = "LiveNodes-" + item
                        self.hadoop_namenode_metrics["NameNodeInfo"][
                            key].add_metric(label, value)
                self.dns = dns
            elif "DeadNodes" in metric and "DeadNodes" in bean:
                dead_node_dict = yaml.safe_load(bean["DeadNodes"])
                self.hadoop_namenode_metrics["NameNodeInfo"][
                    "DeadNodeCount"].add_metric([self.cluster, self.target],
                                                len(dead_node_dict))
                for node, info in dead_node_dict.items():
                    label = [
                        self.cluster, node, str(info["decommissioned"]),
                        info["xferaddr"], self.target
                    ]
                    value = info["lastContact"]
                    self.hadoop_namenode_metrics["NameNodeInfo"][
                        "DeadNodes"].add_metric(label, value)
            elif "DecomNodes" in metric and "DecomNodes" in bean:
                decom_node_dict = yaml.safe_load(bean["DecomNodes"])
                self.hadoop_namenode_metrics["NameNodeInfo"][
                    "DecomNodeCount"].add_metric([self.cluster, self.target],
                                                 len(decom_node_dict))
                for node, info in decom_node_dict.items():
                    label = [self.cluster, node, info["xferaddr"], self.target]
                    items = [
                        "underReplicatedBlocks", "decommissionOnlyReplicas",
                        "underReplicateInOpenFiles"
                    ]
                    for item in items:
                        value = info[item] if item in info else 0
                        item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                        key = "DecomNodes-" + item
                        self.hadoop_namenode_metrics["NameNodeInfo"][
                            key].add_metric(label, value)
            elif "EnteringMaintenanceNodes" in metric and "EnteringMaintenanceNodes" in bean:
                node_dict = yaml.safe_load(bean["EnteringMaintenanceNodes"])
                self.hadoop_namenode_metrics["NameNodeInfo"][
                    "MaintenanceNodeCount"].add_metric(
                        [self.cluster, self.target], len(node_dict))
                for node, info in node_dict.items():
                    label = [self.cluster, node, info["xferaddr"], self.target]
                    items = [
                        "underReplicatedBlocks", "maintenanceOnlyReplicas",
                        "underReplicateInOpenFiles"
                    ]
                    for item in items:
                        value = info[item] if item in info else 0
                        item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                        key = "EnteringMaintenanceNodes-" + item
                        self.hadoop_namenode_metrics["NameNodeInfo"][
                            key].add_metric(label, value)
            elif "CorruptFiles" in metric and "CorruptFiles" in bean:
                file_list = yaml.safe_load(bean["CorruptFiles"])
                label = [self.cluster, self.target]
                self.hadoop_namenode_metrics["NameNodeInfo"][
                    "CorruptFiles"].add_metric(label, len(file_list))
            elif "NodeUsage" in metric and "NodeUsage" in bean:
                node_usage_dict = yaml.safe_load(bean["NodeUsage"])["nodeUsage"]
                label = [self.cluster, self.target]
                items = ["min", "median", "max", "stdDev"]
                for item in items:
                    # Values arrive as percentage strings, e.g. "12.34%".
                    value = str(node_usage_dict.get(item, "0%"))
                    value = float(value.strip("%"))
                    item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                    key = "NodeUsage-" + item
                    self.hadoop_namenode_metrics["NameNodeInfo"][
                        key].add_metric(label, value)
            elif "SoftwareVersion" in metric and "SoftwareVersion" in bean:
                label = [self.cluster, bean["SoftwareVersion"], self.target]
                self.hadoop_namenode_metrics["NameNodeInfo"][
                    "SoftwareVersion"].add_metric(label, 0)
            elif "Safemode" in metric and "Safemode" in bean:
                label = [self.cluster, self.target]
                self.hadoop_namenode_metrics["NameNodeInfo"][
                    "Safemode"].add_metric(
                        label,
                        0 if metric in bean and bean[metric] == "" else 1)
            else:
                label = [self.cluster, self.target]
                self.hadoop_namenode_metrics['NameNodeInfo'][
                    metric].add_metric(
                        label,
                        bean[metric] if metric in bean and bean[metric] else 0)

    def get_metrics(self, beans):
        for i in range(len(beans)):
            if 'NameNodeActivity' in beans[i]['name']:
                self.get_nnactivity_metrics(beans[i])
            if 'StartupProgress' in beans[i]['name']:
                self.get_startupprogress_metrics(beans[i])
            if 'FSNamesystem' in beans[i][
                    'name'] and 'FSNamesystemState' not in beans[i]['name']:
                self.get_fsnamesystem_metrics(beans[i])
            if 'FSNamesystemState' in beans[i]['name']:
                self.get_fsnamesystem_state_metrics(beans[i])
            if 'RetryCache' in beans[i]['name']:
                self.get_retrycache_metrics(beans[i])
            if 'NameNodeInfo' in beans[i]['name']:
                self.get_nninfo_metrics(beans[i])
class JournalNodeMetricCollector(MetricCollector):
    def __init__(self, cluster, urls):
        MetricCollector.__init__(self, cluster, "hdfs", "journalnode")
        self.target = "-"
        self.urls = urls
        self.hadoop_journalnode_metrics = {}
        for i in range(len(self.file_list)):
            self.hadoop_journalnode_metrics.setdefault(self.file_list[i], {})
        self.common_metric_collector = CommonMetricCollector(
            cluster, "hdfs", "journalnode")
        self.scrape_metrics = ScrapeMetrics(urls)

    def collect(self):
        isSetup = False
        beans_list = self.scrape_metrics.scrape()
        for beans in beans_list:
            if not isSetup:
                self.common_metric_collector.setup_labels(beans)
                self.setup_metrics_labels(beans)
                isSetup = True
            for i in range(len(beans)):
                if 'tag.Hostname' in beans[i]:
                    self.target = beans[i]["tag.Hostname"]
                    break
            self.hadoop_journalnode_metrics.update(
                self.common_metric_collector.get_metrics(beans, self.target))
            self.get_metrics(beans)
        for i in range(len(self.merge_list)):
            service = self.merge_list[i]
            if service in self.hadoop_journalnode_metrics:
                for metric in self.hadoop_journalnode_metrics[service]:
                    yield self.hadoop_journalnode_metrics[service][metric]

    def setup_journalnode_labels(self):
        a_60_latency_flag, a_300_latency_flag, a_3600_latency_flag = 1, 1, 1
        for metric in self.metrics['JournalNode']:
            label = ["cluster", "host", "_target"]
            if 'Syncs60s' in metric:
                if a_60_latency_flag:
                    a_60_latency_flag = 0
                    key = "Syncs60"
                    name = "_".join(
                        [self.prefix, 'sync60s_latency_microseconds'])
                    descriptions = "The percentile of sync latency in microseconds in 60s granularity"
                    self.hadoop_journalnode_metrics['JournalNode'][
                        key] = HistogramMetricFamily(name, descriptions,
                                                     labels=label)
                else:
                    continue
            elif 'Syncs300s' in metric:
                if a_300_latency_flag:
                    a_300_latency_flag = 0
                    key = "Syncs300"
                    name = "_".join(
                        [self.prefix, 'sync300s_latency_microseconds'])
                    descriptions = "The percentile of sync latency in microseconds in 300s granularity"
                    self.hadoop_journalnode_metrics['JournalNode'][
                        key] = HistogramMetricFamily(name, descriptions,
                                                     labels=label)
                else:
                    continue
            elif 'Syncs3600s' in metric:
                if a_3600_latency_flag:
                    a_3600_latency_flag = 0
                    key = "Syncs3600"
                    name = "_".join(
                        [self.prefix, 'sync3600s_latency_microseconds'])
                    descriptions = "The percentile of sync latency in microseconds in 3600s granularity"
                    self.hadoop_journalnode_metrics['JournalNode'][
                        key] = HistogramMetricFamily(name, descriptions,
                                                     labels=label)
                else:
                    continue
            else:
                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
                name = "_".join([self.prefix, snake_case])
                self.hadoop_journalnode_metrics['JournalNode'][
                    metric] = GaugeMetricFamily(
                        name, self.metrics['JournalNode'][metric], labels=label)

    def setup_metrics_labels(self, beans):
        for i in range(len(beans)):
            if 'name=Journal-' in beans[i]['name']:
                self.setup_journalnode_labels()

    def get_metrics(self, beans):
        for i in range(len(beans)):
            if 'name=Journal-' in beans[i][
                    'name'] and 'JournalNode' in self.metrics:
                host = beans[i]['tag.Hostname']
                label = [self.cluster, host, self.target]
                a_60_sum, a_300_sum, a_3600_sum = 0.0, 0.0, 0.0
                a_60_count, a_300_count, a_3600_count = 0, 0, 0
                a_60_value, a_300_value, a_3600_value = [], [], []
                a_60_percentile, a_300_percentile, a_3600_percentile = [], [], []
                for metric in beans[i]:
                    if not metric[0].isupper():
                        continue
                    if "Syncs60s" in metric:
                        if 'NumOps' in metric:
                            a_60_count = beans[i][metric]
                        else:
                            # e.g. "Syncs60s75thPercentileLatencyMicros"
                            # -> percentile "0.75"
                            tmp = metric.split("thPercentileLatencyMicros")[
                                0].split("Syncs")[1].split("s")
                            a_60_percentile.append(str(float(tmp[1]) / 100.0))
                            a_60_value.append(beans[i][metric])
                            a_60_sum += beans[i][metric]
                    elif 'Syncs300' in metric:
                        if 'NumOps' in metric:
                            a_300_count = beans[i][metric]
                        else:
                            tmp = metric.split("thPercentileLatencyMicros")[
                                0].split("Syncs")[1].split("s")
                            a_300_percentile.append(str(float(tmp[1]) / 100.0))
                            a_300_value.append(beans[i][metric])
                            a_300_sum += beans[i][metric]
                    elif 'Syncs3600' in metric:
                        if 'NumOps' in metric:
                            a_3600_count = beans[i][metric]
                        else:
                            tmp = metric.split("thPercentileLatencyMicros")[
                                0].split("Syncs")[1].split("s")
                            a_3600_percentile.append(str(float(tmp[1]) / 100.0))
                            a_3600_value.append(beans[i][metric])
                            a_3600_sum += beans[i][metric]
                    elif metric in self.hadoop_journalnode_metrics['JournalNode']:
                        self.hadoop_journalnode_metrics['JournalNode'][
                            metric].add_metric(label, beans[i][metric])
                # zip() returns an iterator on Python 3, so materialize the
                # bucket pairs as sorted lists before appending the +Inf cap.
                a_60_bucket = sorted(zip(a_60_percentile, a_60_value))
                a_300_bucket = sorted(zip(a_300_percentile, a_300_value))
                a_3600_bucket = sorted(zip(a_3600_percentile, a_3600_value))
                a_60_bucket.append(("+Inf", a_60_count))
                a_300_bucket.append(("+Inf", a_300_count))
                a_3600_bucket.append(("+Inf", a_3600_count))
                self.hadoop_journalnode_metrics['JournalNode'][
                    'Syncs60'].add_metric(label, buckets=a_60_bucket,
                                          sum_value=a_60_sum)
                self.hadoop_journalnode_metrics['JournalNode'][
                    'Syncs300'].add_metric(label, buckets=a_300_bucket,
                                           sum_value=a_300_sum)
                self.hadoop_journalnode_metrics['JournalNode'][
                    'Syncs3600'].add_metric(label, buckets=a_3600_bucket,
                                            sum_value=a_3600_sum)
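
# How the sync-latency histogram above is assembled (illustration with
# assumed bean values, not data from a real cluster): a JMX attribute such
# as "Syncs60s75thPercentileLatencyMicros": 420 is parsed into the bucket
# pair ("0.75", 420); the pairs are sorted, capped with ("+Inf", NumOps),
# and handed to HistogramMetricFamily.add_metric(), which accepts a list of
# (bound, value) pairs plus a sum. Note the exporter stores percentile
# latencies in the bucket slots rather than true cumulative counts.
#
#   from prometheus_client.core import HistogramMetricFamily
#
#   h = HistogramMetricFamily("sync60s_latency_microseconds",
#                             "sync latency percentiles, 60s window",
#                             labels=["cluster", "host", "_target"])
#   h.add_metric(["demo-cluster", "jn1", "jn1"],
#                buckets=[("0.5", 310), ("0.75", 420), ("+Inf", 12345)],
#                sum_value=730.0)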
class NodeManagerMetricCollector(MetricCollector):
    def __init__(self, cluster, rmc):
        MetricCollector.__init__(self, cluster, "yarn", "nodemanager")
        self.target = "-"
        self.rmc = rmc
        self.hadoop_nodemanager_metrics = {}
        for i in range(len(self.file_list)):
            self.hadoop_nodemanager_metrics.setdefault(self.file_list[i], {})
        self.common_metric_collector = CommonMetricCollector(
            cluster, "yarn", "nodemanager")

    def collect(self):
        isSetup = False
        beans_list = ScrapeMetrics(self.rmc.nms).scrape()
        for beans in beans_list:
            if not isSetup:
                self.common_metric_collector.setup_labels(beans)
                self.setup_metrics_labels(beans)
                isSetup = True
            for i in range(len(beans)):
                if 'tag.Hostname' in beans[i]:
                    self.target = beans[i]["tag.Hostname"]
                    break
            self.hadoop_nodemanager_metrics.update(
                self.common_metric_collector.get_metrics(beans, self.target))
            self.get_metrics(beans)
        for i in range(len(self.merge_list)):
            service = self.merge_list[i]
            if service in self.hadoop_nodemanager_metrics:
                for metric in self.hadoop_nodemanager_metrics[service]:
                    yield self.hadoop_nodemanager_metrics[service][metric]

    def setup_metrics_labels(self, beans):
        for i in range(len(beans)):
            for service in self.metrics:
                if service in beans[i]['name']:
                    container_flag = 1
                    for metric in self.metrics[service]:
                        label = ["cluster", "host"]
                        if metric.startswith("Containers"):
                            if container_flag:
                                container_flag = 0
                                label.append("status")
                                key = "containers"
                                name = "_".join(
                                    [self.prefix, "container_count"])
                                description = "Count of containers"
                            else:
                                continue
                        else:
                            snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2',
                                                metric).lower()
                            name = "_".join([self.prefix, snake_case])
                            key = metric
                            description = self.metrics[service][metric]
                        label.append("target")
                        self.hadoop_nodemanager_metrics[service][
                            key] = GaugeMetricFamily(name, description,
                                                     labels=label)

    def get_metrics(self, beans):
        for i in range(len(beans)):
            for service in self.metrics:
                if service not in beans[i]['name']:
                    continue
                for metric in beans[i]:
                    if metric not in self.metrics[service]:
                        continue
                    label = [self.cluster, self.target]
                    if metric.startswith("Containers"):
                        key = "containers"
                        label.append(metric.split("Containers")[1])
                    else:
                        key = metric
                    label.append(self.target)
                    # Clamp negatives: vcore/memory gauges can dip below 0.
                    value = beans[i][metric] if beans[i][metric] > 0 else 0
                    self.hadoop_nodemanager_metrics[service][key].add_metric(
                        label, value)
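
# Wiring note: NodeManagerMetricCollector owns no URL list of its own;
# ScrapeMetrics(self.rmc.nms) reads the NodeManager /jmx endpoints that the
# ResourceManager collector discovers from the RM's LiveNodeManagers bean.
# A sketch with placeholder values (assuming collectors run in registration
# order, so the RM collector should be registered first):
#
#   rmc = ResourceManagerMetricCollector(
#       "demo-cluster", ["http://resourcemanager:8088/jmx"], ".*")
#   nmc = NodeManagerMetricCollector("demo-cluster", rmc)
#   REGISTRY.register(rmc)  # each scrape refreshes rmc.nms
#   REGISTRY.register(nmc)  # then fans out to the discovered NodeManagers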
class DataNodeMetricCollector(MetricCollector):
    def __init__(self, cluster, nnc):
        MetricCollector.__init__(self, cluster, "hdfs", "datanode")
        self.target = "-"
        self.nnc = nnc
        self.hadoop_datanode_metrics = {}
        for i in range(len(self.file_list)):
            self.hadoop_datanode_metrics.setdefault(self.file_list[i], {})
        self.common_metric_collector = CommonMetricCollector(
            cluster, "hdfs", "datanode")

    def collect(self):
        isSetup = False
        # Skip until the NameNode collector has discovered DataNode targets.
        if not self.nnc.dns:
            return
        beans_list = ScrapeMetrics(self.nnc.dns).scrape()
        for beans in beans_list:
            if not isSetup:
                self.common_metric_collector.setup_labels(beans)
                self.setup_metrics_labels(beans)
                isSetup = True
            for i in range(len(beans)):
                if 'tag.Hostname' in beans[i]:
                    self.target = beans[i]["tag.Hostname"]
                    break
            self.hadoop_datanode_metrics.update(
                self.common_metric_collector.get_metrics(beans, self.target))
            self.get_metrics(beans)
        for i in range(len(self.merge_list)):
            service = self.merge_list[i]
            if service in self.hadoop_datanode_metrics:
                for metric in self.hadoop_datanode_metrics[service]:
                    yield self.hadoop_datanode_metrics[service][metric]

    def setup_dninfo_labels(self):
        for metric in self.metrics['DataNodeInfo']:
            if 'VolumeInfo' in metric:
                label = ["cluster", "version", "path", "state"]
                name = "_".join([self.prefix, 'volume_state'])
            else:
                label = ["cluster", "version"]
                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
                name = "_".join([self.prefix, snake_case])
            label.append("_target")
            self.hadoop_datanode_metrics['DataNodeInfo'][
                metric] = GaugeMetricFamily(
                    name, self.metrics['DataNodeInfo'][metric], labels=label)

    def setup_dnactivity_labels(self):
        block_flag, client_flag = 1, 1
        for metric in self.metrics['DataNodeActivity']:
            if 'Blocks' in metric:
                if block_flag:
                    label = ['cluster', 'host', 'oper']
                    key = "Blocks"
                    name = "block_operations_total"
                    descriptions = "Total number of blocks in different operations"
                    block_flag = 0
                else:
                    continue
            elif 'Client' in metric:
                if client_flag:
                    label = ['cluster', 'host', 'oper', 'client']
                    key = "Client"
                    name = "from_client_total"
                    descriptions = "Total number of each operation from different clients"
                    client_flag = 0
                else:
                    continue
            else:
                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
                label = ['cluster', 'host']
                key = metric
                name = snake_case
                descriptions = self.metrics['DataNodeActivity'][metric]
            label.append("_target")
            self.hadoop_datanode_metrics['DataNodeActivity'][
                key] = GaugeMetricFamily("_".join([self.prefix, name]),
                                         descriptions, labels=label)

    def setup_fsdatasetstate_labels(self):
        for metric in self.metrics['FSDatasetState']:
            label = ['cluster', 'host', "_target"]
            if "Num" in metric:
                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2',
                                    metric.split("Num")[1]).lower()
            else:
                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            name = "_".join([self.prefix, snake_case])
            self.hadoop_datanode_metrics['FSDatasetState'][
                metric] = GaugeMetricFamily(
                    name, self.metrics['FSDatasetState'][metric], labels=label)

    def setup_metrics_labels(self, beans):
        for i in range(len(beans)):
            if 'DataNodeInfo' in beans[i]['name']:
                self.setup_dninfo_labels()
            if 'DataNodeActivity' in beans[i]['name']:
                self.setup_dnactivity_labels()
            if 'FSDatasetState' in beans[i]['name']:
                self.setup_fsdatasetstate_labels()

    def get_dninfo_metrics(self, bean):
        for metric in self.metrics['DataNodeInfo']:
            version = bean['Version']
            if 'VolumeInfo' in metric:
                if 'VolumeInfo' in bean:
                    volume_info_dict = yaml.safe_load(bean['VolumeInfo'])
                    for path, volume in volume_info_dict.items():
                        for state, value in volume.items():
                            label = [
                                self.cluster, version, path, state, self.target
                            ]
                            self.hadoop_datanode_metrics['DataNodeInfo'][
                                metric].add_metric(label, value)
                else:
                    continue
            else:
                label = [self.cluster, version, self.target]
                value = bean[metric]
                self.hadoop_datanode_metrics['DataNodeInfo'][
                    metric].add_metric(label, value)

    def get_dnactivity_metrics(self, bean):
        for metric in self.metrics['DataNodeActivity']:
            host = bean['tag.Hostname']
            label = [self.cluster, host]
            if 'Blocks' in metric:
                oper = metric.split("Blocks")[1]
                label.append(oper)
                key = "Blocks"
            elif 'Client' in metric:
                oper = metric.split("Client")[0].split("From")[0]
                client = metric.split("Client")[0].split("From")[1]
                label.extend([oper, client])
                key = "Client"
            else:
                key = metric
            label.append(self.target)
            self.hadoop_datanode_metrics['DataNodeActivity'][key].add_metric(
                label, bean[metric] if metric in bean else 0)

    def get_fsdatasetstate_metrics(self, bean):
        for metric in self.metrics['FSDatasetState']:
            label = [self.cluster, self.target, self.target]
            self.hadoop_datanode_metrics['FSDatasetState'][metric].add_metric(
                label, bean[metric] if metric in bean else 0)

    def get_metrics(self, beans):
        for i in range(len(beans)):
            if 'DataNodeInfo' in beans[i]['name']:
                self.get_dninfo_metrics(beans[i])
            if 'DataNodeActivity' in beans[i]['name']:
                self.get_dnactivity_metrics(beans[i])
            if 'FSDatasetState' in beans[i]['name']:
                self.get_fsdatasetstate_metrics(beans[i])
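
# Wiring note, mirroring the NodeManager case: DataNode /jmx targets come
# from nnc.dns, which NameNodeMetricCollector.get_nninfo_metrics fills from
# the NameNode's LiveNodes bean; until the first NameNode scrape succeeds,
# collect() returns without yielding anything. Placeholder sketch:
#
#   nnc = NameNodeMetricCollector("demo-cluster",
#                                 ["http://namenode:50070/jmx"])
#   dnc = DataNodeMetricCollector("demo-cluster", nnc)
#   REGISTRY.register(nnc)
#   REGISTRY.register(dnc)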
class ResourceManagerMetricCollector(MetricCollector):
    NODE_STATE = {
        'NEW': 1,
        'RUNNING': 2,
        'UNHEALTHY': 3,
        'DECOMMISSIONED': 4,
        'LOST': 5,
        'REBOOTED': 6,
    }

    def __init__(self, cluster, urls, queue_regexp):
        MetricCollector.__init__(self, cluster, "yarn", "resourcemanager")
        self.target = "-"
        self.queue_regexp = queue_regexp
        self.nms = set()
        self.hadoop_resourcemanager_metrics = {}
        for i in range(len(self.file_list)):
            self.hadoop_resourcemanager_metrics.setdefault(
                self.file_list[i], {})
        self.common_metric_collector = CommonMetricCollector(
            cluster, "yarn", "resourcemanager")
        self.scrape_metrics = ScrapeMetrics(urls)

    def collect(self):
        isSetup = False
        beans_list = self.scrape_metrics.scrape()
        for beans in beans_list:
            if not isSetup:
                self.common_metric_collector.setup_labels(beans)
                self.setup_metrics_labels(beans)
                isSetup = True
            for i in range(len(beans)):
                if 'tag.Hostname' in beans[i]:
                    self.target = beans[i]["tag.Hostname"]
                    break
            self.hadoop_resourcemanager_metrics.update(
                self.common_metric_collector.get_metrics(beans, self.target))
            self.get_metrics(beans)
        for i in range(len(self.merge_list)):
            service = self.merge_list[i]
            if service in self.hadoop_resourcemanager_metrics:
                for metric in self.hadoop_resourcemanager_metrics[service]:
                    yield self.hadoop_resourcemanager_metrics[service][metric]

    def setup_rmnminfo_labels(self):
        for metric in self.metrics['RMNMInfo']:
            label = ["cluster", "host", "version", "rack", "_target"]
            if 'NumContainers' in metric:
                name = "_".join([self.prefix, 'node_containers_total'])
            elif 'State' in metric:
                name = "_".join([self.prefix, 'node_state'])
            elif 'UsedMemoryMB' in metric:
                name = "_".join([self.prefix, 'node_memory_used_mb'])
            elif 'AvailableMemoryMB' in metric:
                name = "_".join([self.prefix, 'node_memory_available_mb'])
            else:
                continue
            self.hadoop_resourcemanager_metrics['RMNMInfo'][
                metric] = GaugeMetricFamily(
                    name, self.metrics['RMNMInfo'][metric], labels=label)

    def setup_queue_labels(self):
        running_flag, mb_flag, vcore_flag, container_flag, apps_flag = 1, 1, 1, 1, 1
        for metric in self.metrics['QueueMetrics']:
            label = ["cluster", "modeler_type", "queue", "user"]
            if "running_" in metric:
                if running_flag:
                    running_flag = 0
                    label.append("elapsed_time")
                    key = "running_app"
                    name = "_".join([self.prefix, "running_app_total"])
                    description = "Current number of running applications in each elapsed-time bucket (<60min, 60-300min, 300-1440min, >1440min)"
                else:
                    continue
            elif metric.endswith("VCores"):
                if vcore_flag:
                    vcore_flag = 0
                    label.append("status")
                    key = "vcore"
                    name = "_".join([self.prefix, "vcore_count"])
                    description = "Count of vcores"
                else:
                    continue
            elif metric.endswith("Containers"):
                if container_flag:
                    container_flag = 0
                    label.append("status")
                    key = "containers"
                    name = "_".join([self.prefix, "container_count"])
                    description = "Count of containers"
                else:
                    continue
            elif metric.endswith("MB"):
                if mb_flag:
                    mb_flag = 0
                    label.append("status")
                    key = "memory"
                    name = "_".join([self.prefix, "memory_in_mb"])
                    description = "Memory in MB"
                else:
                    continue
            elif metric.startswith("Apps"):
                if apps_flag:
                    apps_flag = 0
                    label.append("status")
                    key = "apps"
                    name = "_".join([self.prefix, "application_count"])
                    description = "Count of applications"
                else:
                    continue
            else:
                key = metric
                snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
                name = "_".join([self.prefix, snake_case])
                description = self.metrics['QueueMetrics'][metric]
            label.append("_target")
            self.hadoop_resourcemanager_metrics['QueueMetrics'][
                key] = GaugeMetricFamily(name, description, labels=label)

    def setup_cluster_labels(self):
        nm_flag, cm_num_flag, cm_avg_flag = 1, 1, 1
        for metric in self.metrics['ClusterMetrics']:
            if "NMs" in metric:
                if nm_flag:
                    nm_flag = 0
                    label = ["cluster", "status"]
                    key = "NMs"
                    name = "nodemanager_total"
                    description = "Current number of NodeManagers in each status"
                else:
                    continue
            elif "NumOps" in metric:
                if cm_num_flag:
                    cm_num_flag = 0
                    label = ["cluster", "oper"]
                    key = "NumOps"
                    name = "ams_total"
                    description = "Total number of ApplicationMasters in each operation"
                else:
                    continue
            elif "AvgTime" in metric:
                if cm_avg_flag:
                    cm_avg_flag = 0
                    label = ["cluster", "oper"]
                    key = "AvgTime"
                    name = "average_time_milliseconds"
                    description = "Average time in milliseconds the AM spends in each operation"
                else:
                    continue
            else:
                key = metric
                name = metric
                description = self.metrics['ClusterMetrics'][metric]
                label = ["cluster"]
            label.append("_target")
            self.hadoop_resourcemanager_metrics['ClusterMetrics'][
                key] = GaugeMetricFamily("_".join([self.prefix, name]),
                                         description, labels=label)

    def setup_metrics_labels(self, beans):
        for i in range(len(beans)):
            if 'RMNMInfo' in beans[i]['name']:
                self.setup_rmnminfo_labels()
            if 'QueueMetrics' in self.metrics:
                self.setup_queue_labels()
            if 'ClusterMetrics' in self.metrics:
                self.setup_cluster_labels()

    def get_rmnminfo_metrics(self, bean):
        for metric in self.metrics['RMNMInfo']:
            nms = set()
            live_nm_list = yaml.safe_load(bean['LiveNodeManagers'])
            for j in range(len(live_nm_list)):
                nms.add("http://" + live_nm_list[j]["NodeHTTPAddress"] + "/jmx")
                host = live_nm_list[j]['HostName']
                version = live_nm_list[j]['NodeManagerVersion']
                rack = live_nm_list[j]['Rack']
                label = [self.cluster, host, version, rack, self.target]
                if 'State' == metric:
                    value = self.NODE_STATE[live_nm_list[j]['State']]
                else:
                    value = live_nm_list[j][metric] if metric in live_nm_list[
                        j] else 0.0
                self.hadoop_resourcemanager_metrics['RMNMInfo'][
                    metric].add_metric(label, value)
            self.nms = nms

    def get_queue_metrics(self, bean):
        for metric in self.metrics['QueueMetrics']:
            label = [
                self.cluster,
                bean.get("modelerType", "-"),
                bean.get("tag.Queue", "-"),
                bean.get("tag.User", "-")
            ]
            if "running_0" in metric:
                key = "running_app"
                label.append("0to60")
            elif "running_60" in metric:
                key = "running_app"
                label.append("60to300")
            elif "running_300" in metric:
                key = "running_app"
                label.append("300to1440")
            elif "running_1440" in metric:
                key = "running_app"
                label.append("1440up")
            elif metric.endswith("VCores"):
                label.append(metric.split("VCores")[0])
                key = "vcore"
            elif metric.endswith("Containers"):
                label.append(metric.split("Containers")[0])
                key = "containers"
            elif metric.endswith("MB"):
                label.append(metric.split("MB")[0])
                key = "memory"
            elif metric.startswith("Apps"):
                label.append(metric.split("Apps")[1])
                key = "apps"
            else:
                key = metric
            label.append(self.target)
            self.hadoop_resourcemanager_metrics['QueueMetrics'][
                key].add_metric(label, bean[metric] if metric in bean else 0)

    def get_cluster_metrics(self, bean):
        for metric in self.metrics['ClusterMetrics']:
            label = [self.cluster]
            if "NMs" in metric:
                label.append(metric.split('NMs')[0].split('Num')[1])
                key = "NMs"
            elif "NumOps" in metric:
                key = "NumOps"
                label.append(metric.split("DelayNumOps")[0].split('AM')[1])
            elif "AvgTime" in metric:
                key = "AvgTime"
                label.append(metric.split("DelayAvgTime")[0].split('AM')[1])
            else:
                continue
            label.append(self.target)
            self.hadoop_resourcemanager_metrics['ClusterMetrics'][
                key].add_metric(label, bean[metric] if metric in bean else 0)

    def get_metrics(self, beans):
        for i in range(len(beans)):
            if 'RMNMInfo' in beans[i]['name']:
                self.get_rmnminfo_metrics(beans[i])
            if 'QueueMetrics' in beans[i]['name'] and re.match(
                    self.queue_regexp, beans[i]['tag.Queue']):
                self.get_queue_metrics(beans[i])
            if 'ClusterMetrics' in beans[i]['name']:
                self.get_cluster_metrics(beans[i])
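
# A minimal end-to-end example tying the collectors above together. This is
# a hedged sketch: the cluster label, JMX URLs, queue regexp, and listen
# port are placeholders, and the registration order follows the discovery
# dependencies noted above (NN before DN, RM before NM).
if __name__ == "__main__":
    import time

    from prometheus_client import start_http_server
    from prometheus_client.core import REGISTRY

    cluster = "demo-cluster"
    nnc = NameNodeMetricCollector(cluster, ["http://namenode:50070/jmx"])
    rmc = ResourceManagerMetricCollector(
        cluster, ["http://resourcemanager:8088/jmx"], queue_regexp=".*")
    REGISTRY.register(nnc)
    REGISTRY.register(DataNodeMetricCollector(cluster, nnc))
    REGISTRY.register(JournalNodeMetricCollector(
        cluster, ["http://journalnode:8480/jmx"]))
    REGISTRY.register(rmc)
    REGISTRY.register(NodeManagerMetricCollector(cluster, rmc))

    start_http_server(9131)  # placeholder port
    while True:
        time.sleep(60)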