import datetime
import json
import traceback

# Module-level dependencies such as logger, dbutil, metric_helper, the Django
# models (Region, RegionServer, Table, HBaseCluster) and the *_BEAN_NAME
# constants are provided elsewhere in this code base.


def analyze_hbase_region_server_metrics(metric_task, metrics):
  region_server_name = None
  region_operation_metrics_dict = {}
  replication_metrics_dict = {}
  for bean in metrics['beans']:
    try:
      # because the root and meta regions have the same name, we must use the
      # region server name and the region name to locate a region
      if bean['name'] == REGION_SERVER_BEAN_NAME:
        region_server_name = bean['ServerName']
      elif bean['name'] == REGION_SERVER_DYNAMIC_STATISTICS_BEAN_NAME:
        for metricName in bean.keys():
          if Region.is_region_operation_metric_name(metricName):
            encodeName = Region.get_encode_name_from_region_operation_metric_name(metricName)
            region_operation_metrics = region_operation_metrics_dict.setdefault(encodeName, {})
            region_operation_metrics[metricName] = bean[metricName]
      elif bean['name'].startswith(REGION_SERVER_REPLICATION_BEAN_NAME_PREFIX):
        peerId = metric_helper.parse_replication_source(bean['name'])
        replication_metrics = replication_metrics_dict.setdefault(peerId, {})
        for metricName in bean.keys():
          replication_metrics[metricName] = bean[metricName]
    except Exception as e:
      logger.warning("%r failed to analyze metrics: %r", metric_task, e)
      continue

  region_server = None
  if region_server_name is None:
    return
  else:
    try:
      region_server = RegionServer.objects.get(name = region_server_name)
    except RegionServer.DoesNotExist:
      logger.warning("%r failed to find region_server with region_server_name=%s",
        metric_task, region_server_name)
      return

  # save replication metrics for the region server
  region_server.replication_last_attempt_time = metric_task.last_attempt_time
  region_server.replicationMetrics = json.dumps(replication_metrics_dict)
  region_server.save()

  region_record_need_save = []
  for encodeName, operationMetrics in region_operation_metrics_dict.iteritems():
    region_record = dbutil.get_region_by_regionserver_and_encodename(
      region_server, encodeName)
    # the region record is created by the master task analysis, so we must wait
    # until it has been saved
    if region_record is None:
      continue
    region_record.analyze_from_region_server_operation_metrics(operationMetrics,
      metric_task.last_attempt_time)
    # we first buffer the regions that need updating, then do a batch update
    region_record_need_save.append(region_record)

  # do the batch update
  begin = datetime.datetime.now()
  dbutil.update_regions_for_region_server_metrics(region_record_need_save)
  logger.info("%r batch save region record for region_server, "
    "saved regions=%d, consume=%s",
    metric_task, len(region_record_need_save),
    str((datetime.datetime.now() - begin).total_seconds()))
def analyze_hbase_region_server_metrics(self, metrics):
  region_server_name = None
  region_operation_metrics_dict = {}
  replication_metrics_dict = {}
  for bean in metrics['beans']:
    try:
      # because the root and meta regions have the same name, we must use the
      # region server name and the region name to locate a region
      if bean['name'] == REGION_SERVER_BEAN_NAME:
        region_server_name = bean['ServerName']
      elif bean['name'] == REGION_SERVER_DYNAMIC_STATISTICS_BEAN_NAME:
        for metricName in bean.keys():
          if Region.is_region_operation_metric_name(metricName):
            encodeName = Region.get_encode_name_from_region_operation_metric_name(metricName)
            region_operation_metrics = region_operation_metrics_dict.setdefault(encodeName, {})
            region_operation_metrics[metricName] = bean[metricName]
      elif bean['name'].startswith(REGION_SERVER_REPLICATION_BEAN_NAME_PREFIX):
        peerId = metric_helper.parse_replication_source(bean['name'])
        replication_metrics = replication_metrics_dict.setdefault(peerId, {})
        for metricName in bean.keys():
          replication_metrics[metricName] = bean[metricName]
    except Exception as e:
      logger.warning("%r failed to analyze metrics: %r", self.task, e)
      continue

  region_server = None
  if region_server_name is None:
    return
  else:
    try:
      region_server = RegionServer.objects.get(name = region_server_name)
    except RegionServer.DoesNotExist:
      logger.warning("%r failed to find region_server with region_server_name=%s",
        self.task, region_server_name)
      return

  # save replication metrics for the region server
  region_server.replication_last_attempt_time = self.task.last_attempt_time
  region_server.replicationMetrics = json.dumps(replication_metrics_dict)
  region_server.save()

  region_record_need_save = []
  for encodeName, operationMetrics in region_operation_metrics_dict.iteritems():
    region_record = dbutil.get_region_by_regionserver_and_encodename(region_server,
      encodeName)
    # the region record is created by the master task analysis, so we must wait
    # until it has been saved
    if region_record is None:
      continue
    region_record.analyze_from_region_server_operation_metrics(operationMetrics,
      self.task.last_attempt_time)
    # we first buffer the regions that need updating, then do a batch update
    region_record_need_save.append(region_record)

  # do the batch update
  begin = datetime.datetime.now()
  dbutil.update_regions_for_region_server_metrics(region_record_need_save)
  logger.info("%r batch save region record for region_server, saved regions=%d, consume=%s",
    self.task, len(region_record_need_save),
    str((datetime.datetime.now() - begin).total_seconds()))
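# For reference, a minimal sketch of the JMX payload that the two
# analyze_hbase_region_server_metrics variants above consume. The bean names
# and metric keys below are illustrative placeholders, not values confirmed by
# this code base; the real names are whatever REGION_SERVER_BEAN_NAME,
# REGION_SERVER_DYNAMIC_STATISTICS_BEAN_NAME and
# REGION_SERVER_REPLICATION_BEAN_NAME_PREFIX resolve to.
EXAMPLE_REGION_SERVER_METRICS = {
  'beans': [
    # bean equal to REGION_SERVER_BEAN_NAME: supplies the region server name
    {
      'name': 'hadoop:service=RegionServer,name=RegionServer',
      'ServerName': 'rs-host-0,21600,1400000000000',
    },
    # bean equal to REGION_SERVER_DYNAMIC_STATISTICS_BEAN_NAME: per-region
    # operation metrics whose keys embed the region encode name
    {
      'name': 'hadoop:service=RegionServer,name=RegionServerDynamicStatistics',
      'tbl.test_table.region.0123456789abcdef.get_num_ops': 42,
    },
    # beans starting with REGION_SERVER_REPLICATION_BEAN_NAME_PREFIX: one per
    # replication peer, keyed by the peer id parsed from the bean name
    {
      'name': 'hadoop:service=Replication,name=ReplicationSource for 1',
      'ageOfLastShippedOp': 0,
    },
  ],
}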
def analyze_hbase_master_metrics(self, metrics):
  cluster = self.task.job.cluster
  hbase_cluster_record, created = HBaseCluster.objects.get_or_create(cluster = cluster)
  self.reset_aggregated_metrics(hbase_cluster_record)

  tables = {}
  region_record_need_save = []
  for bean in metrics['beans']:
    try:
      if 'RegionServers' not in bean:
        continue
      for rs_metrics in bean['RegionServers']:
        rs_name = rs_metrics['key']
        [rs_hostname, rs_port] = self.get_host_and_port_from_region_server_name(rs_name)
        rs_task = dbutil.get_task_by_host_and_port(rs_hostname, rs_port)
        rs_record, created = RegionServer.objects.get_or_create(cluster = cluster,
          task = rs_task)
        # the region server name includes its startTime, so the same region server
        # would lead to different RegionServer records after a restart.
        # Therefore, we don't create the region server record by its name.
        rs_record.name = rs_name

        rs_value = rs_metrics['value']
        rs_record.last_attempt_time = self.task.last_attempt_time
        rs_record.load = int(rs_value['load'])
        rs_record.numberOfRegions = int(rs_value['numberOfRegions'])
        self.reset_aggregated_metrics(rs_record)

        # read out all regions belonging to this region server and build a map
        all_regions_in_rs = Region.objects.filter(region_server = rs_record)
        all_regions_map = {}
        for region in all_regions_in_rs:
          all_regions_map[region.name] = region

        regionsLoad = rs_value['regionsLoad']
        for region_metrics in regionsLoad:
          region_value = region_metrics['value']
          region_name = region_value['nameAsString']
          table_name, startkey, region_id = region_name.split(',')
          region_metrics = {}

          if table_name not in tables:
            table_record, created = Table.objects.get_or_create(cluster = cluster,
              name = table_name)
            self.reset_aggregated_metrics(table_record)
            tables[table_name] = table_record
          table_record = tables[table_name]

          region_record = None
          if region_name in all_regions_map:
            region_record = all_regions_map[region_name]
          else:
            # if the region record is not in the buffer, get_or_create it from the db
            begin = datetime.datetime.now()
            region_record, created = Region.objects.get_or_create(table = table_record,
              name = region_name, encodeName = Region.get_encode_name(region_name),
              defaults = {"region_server": rs_record})
            logger.info("%r get_or_create region in region_server from mysql, "
              "consume=%s, region_name=%s, buffered_rs=%s, get_rs=%s",
              self.task, str((datetime.datetime.now() - begin).total_seconds()),
              region_name, rs_record.name, region_record.region_server.name)

          region_record.region_server = rs_record
          region_record.analyze_region_record(region_value, self.task.last_attempt_time)
          # buffer the regions that need updating for a batch update
          region_record_need_save.append(region_record)
          self.aggregate_metrics(region_record, rs_record)
          self.aggregate_metrics(region_record, table_record)
          self.aggregate_metrics(region_record, hbase_cluster_record)
        rs_record.save()

      for table_record in tables.itervalues():
        table_record.last_attempt_time = self.task.last_attempt_time
        table_record.availability = dbutil.getTableAvailability(table_record.cluster.name,
          table_record.name)
        table_record.save()
      hbase_cluster_record.save()

      # do the batch update
      begin = datetime.datetime.now()
      dbutil.update_regions_for_master_metrics(region_record_need_save)
      logger.info("%r batch save region record for master, saved regions=%d, consume=%s",
        self.task, len(region_record_need_save),
        str((datetime.datetime.now() - begin).total_seconds()))
    except Exception as e:
      traceback.print_exc()
      logger.warning("%r failed to analyze metrics: %r", self.task, e)
      continue
def analyze_hbase_master_metrics(metric_task, metrics):
  cluster = metric_task.job.cluster
  hbase_cluster_record, created = HBaseCluster.objects.get_or_create(cluster=cluster)
  reset_aggregated_metrics(hbase_cluster_record)

  tables = {}
  region_record_need_save = []
  for bean in metrics['beans']:
    try:
      if 'RegionServers' not in bean:
        continue
      for rs_metrics in bean['RegionServers']:
        rs_name = rs_metrics['key']
        [rs_hostname, rs_port] = get_host_and_port_from_region_server_name(rs_name)
        rs_task = dbutil.get_task_by_host_and_port(rs_hostname, rs_port)
        rs_record, created = RegionServer.objects.get_or_create(cluster=cluster,
          task=rs_task)
        # the region server name includes its startTime, so the same region server
        # would lead to different RegionServer records after a restart.
        # Therefore, we don't create the region server record by its name.
        rs_record.name = rs_name

        rs_value = rs_metrics['value']
        rs_record.last_attempt_time = metric_task.last_attempt_time
        rs_record.load = int(rs_value['load'])
        rs_record.numberOfRegions = int(rs_value['numberOfRegions'])
        reset_aggregated_metrics(rs_record)

        # read out all alive regions belonging to this region server and build a map
        all_regions_in_rs = Region.objects.filter(region_server=rs_record)
        all_regions_in_rs = dbutil.get_alive_regions_by_rs(rs_record)
        all_regions_map = {}
        logger.info("%r Finish get region: %d", metric_task, len(all_regions_in_rs))
        for region in all_regions_in_rs:
          all_regions_map[region.name] = region

        regionsLoad = rs_value['regionsLoad']
        for region_metrics in regionsLoad:
          region_value = region_metrics['value']
          region_name = region_value['nameAsString']
          try:
            table_name = region_name.split(',')[0]
          except Exception as e:
            logger.warning("%r failed to get region name: %r, %s",
              metric_task, e, region_name)
            continue
          region_metrics = {}

          if table_name not in tables:
            table_record, created = Table.objects.get_or_create(cluster=cluster,
              name=table_name)
            reset_aggregated_metrics(table_record)
            tables[table_name] = table_record
          table_record = tables[table_name]

          region_record = None
          if region_name in all_regions_map:
            region_record = all_regions_map[region_name]
          else:
            # if the region record is not in the buffer, get_or_create it from the db
            begin = datetime.datetime.now()
            region_record, created = Region.objects.get_or_create(table=table_record,
              name=region_name, encodeName=Region.get_encode_name(region_name),
              defaults={"region_server": rs_record})
            logger.info("%r get_or_create region in region_server from mysql, "
              "consume=%s, region_name=%s, buffered_rs=%s, get_rs=%s",
              metric_task, str((datetime.datetime.now() - begin).total_seconds()),
              region_name, rs_record.name, region_record.region_server.name)

          logger.info("%r Finish analyze regionsLoad", metric_task)
          region_record.region_server = rs_record
          region_record.analyze_region_record(region_value,
            metric_task.last_attempt_time)
          # buffer the regions that need updating for a batch update
          region_record_need_save.append(region_record)
          aggregate_metrics(region_record, rs_record)
          aggregate_metrics(region_record, table_record)
          aggregate_metrics(region_record, hbase_cluster_record)
        rs_record.save()

      for table_record in tables.itervalues():
        table_record.last_attempt_time = metric_task.last_attempt_time
        table_record.availability = dbutil.getTableAvailability(table_record.cluster.name,
          table_record.name)
        table_record.save()
      hbase_cluster_record.save()

      # do the batch update
      begin = datetime.datetime.now()
      dbutil.update_regions_for_master_metrics(region_record_need_save)
      logger.info("%r batch save region record for master, "
        "saved regions=%d, consume=%s",
        metric_task, len(region_record_need_save),
        str((datetime.datetime.now() - begin).total_seconds()))
    except Exception as e:
      traceback.print_exc()
      logger.warning("%r failed to analyze metrics: %r", metric_task, e)
      continue