def analyze_failed_jobs(self):
    """
    Analyze the Jobs of the application.
    :return: JobFailureMetric if at least one of the jobs failed or was killed.
             Otherwise, EmptyMetric is returned.
    """
    all_jobs_count = len(self.jobs)
    if all_jobs_count == 0:
        return EmptyMetric(severity=Severity.NONE)
    failed_jobs = [j for j in self.jobs if j.status in ["FAILED", "KILLED"]]
    failed_jobs_count = len(failed_jobs)
    if failed_jobs_count > 0:
        severity = Severity.HIGH
        overall_info = f"{failed_jobs_count}/{all_jobs_count} jobs failed"
        details = MetricDetailsList(ascending=True)
        for fj in failed_jobs:
            details.add(
                MetricDetails(
                    entity_id=fj.job_id,
                    detail_string=f"Job {fj.job_id} {fj.status}"))
        return JobFailureMetric(severity, overall_info, details)
    else:
        return EmptyMetric(severity=Severity.NONE)
def analyze_failed_stages(self):
    """
    Analyze the Stages of the application.
    :return: StageFailureMetric if at least one of the stages failed or was
             killed. Otherwise, EmptyMetric is returned.
    """
    all_stages_count = len(self.stages)
    if all_stages_count == 0:
        return EmptyMetric(severity=Severity.NONE)
    failed_stages = [s for s in self.stages if s.status in ["FAILED", "KILLED"]]
    failed_stages_count = len(failed_stages)
    if failed_stages_count > 0:
        severity = Severity.HIGH
        overall_info = f"{failed_stages_count}/{all_stages_count} stages failed"
        details = MetricDetailsList(ascending=True)
        for fs in failed_stages:
            details.add(
                MetricDetails(
                    entity_id=fs.stage_id,
                    detail_string=f"Stage {fs.stage_id} {fs.status}",
                    subdetails=[fs.failure_reason]))
        return StageFailureMetric(severity, overall_info, details)
    else:
        return EmptyMetric(severity=Severity.NONE)
def analyze_executors_gc_time(self):
    """
    Analyze garbage collection time of the executors.
    :return: ExecutorGcTimeMetric object if an issue is found for at least one
             executor. Otherwise, EmptyMetric is returned.
    """
    if self.executors is None or len(self.executors) == 0:
        return EmptyMetric(severity=Severity.NONE)
    total_gc_time = sum(e.total_gc_time for e in self.executors)
    total_duration = sum(e.total_duration for e in self.executors)
    if total_duration == 0:
        return EmptyMetric(severity=Severity.NONE)
    ratio = total_gc_time / total_duration
    severity_low = EXECUTOR_TOO_LOW_GC_THRESHOLDS.severity_of(ratio)
    severity_high = EXECUTOR_TOO_HIGH_GC_THRESHOLDS.severity_of(ratio)
    if severity_low > severity_high:
        overall_info = f"Executors spent too little time on Garbage Collection: {fmt_time(total_gc_time / 1000)} " \
                       f"/ {fmt_time(total_duration / 1000)} ({(ratio * 100):.2f} %)"
        details = MetricDetailsList(ascending=True)
        severity = severity_low
    elif severity_low < severity_high:
        overall_info = f"Executors spent too much time on Garbage Collection: {fmt_time(total_gc_time / 1000)} " \
                       f"/ {fmt_time(total_duration / 1000)} ({(ratio * 100):.2f} %)"
        details = MetricDetailsList(ascending=True)
        severity = severity_high
    else:
        return EmptyMetric(severity=Severity.NONE)
    return ExecutorGcTimeMetric(severity, overall_info, details)
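# --- Illustration only ----------------------------------------------------
# A minimal sketch of the threshold mechanism used by the GC checks above,
# assuming the *_GC_THRESHOLDS objects map a GC-time ratio to a Severity via
# two cut-off values. The real Thresholds class is defined elsewhere in this
# repo; the class name, attributes, and cut-off values below are hypothetical.
class _ExampleRatioThresholds:
    def __init__(self, low_cutoff, high_cutoff):
        self.low_cutoff = low_cutoff    # ratio at which Severity.LOW starts
        self.high_cutoff = high_cutoff  # ratio at which Severity.HIGH starts

    def severity_of(self, ratio):
        # map a ratio (e.g. GC time / total duration) to a Severity
        if ratio >= self.high_cutoff:
            return Severity.HIGH
        if ratio >= self.low_cutoff:
            return Severity.LOW
        return Severity.NONE

# e.g. flagging executors that spend more than 10 % (LOW) or 20 % (HIGH) of
# their time in garbage collection:
# _ExampleRatioThresholds(0.1, 0.2).severity_of(0.15)  -> Severity.LOW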
def analyze_driver_gc_time(self):
    """
    Analyze garbage collection time of the driver.
    :return: DriverGcTimeMetric object if an issue is found.
             Otherwise, EmptyMetric is returned.
    """
    if self.driver is None or self.driver.total_duration == 0:
        return EmptyMetric(severity=Severity.NONE)
    ratio = self.driver.total_gc_time / self.driver.total_duration
    severity_low = DRIVER_TOO_LOW_GC_THRESHOLDS.severity_of(ratio)
    severity_high = DRIVER_TOO_HIGH_GC_THRESHOLDS.severity_of(ratio)
    if severity_low > severity_high:
        overall_info = f"Driver spent too little time on Garbage Collection: " \
                       f"{fmt_time(self.driver.total_gc_time / 1000)} out of " \
                       f"{fmt_time(self.driver.total_duration / 1000)} ({(ratio * 100):.2f} %)"
        details = MetricDetailsList(ascending=True)
        severity = severity_low
    elif severity_low < severity_high:
        overall_info = f"Driver spent too much time on Garbage Collection: " \
                       f"{fmt_time(self.driver.total_gc_time / 1000)} out of " \
                       f"{fmt_time(self.driver.total_duration / 1000)} ({(ratio * 100):.2f} %)"
        details = MetricDetailsList(ascending=True)
        severity = severity_high
    else:
        return EmptyMetric(severity=Severity.NONE)
    return DriverGcTimeMetric(severity, overall_info, details)
def analyze_dynamic_allocation(self):
    """
    Analyze the dynamic allocation configuration.
    When dynamic allocation is preferred (configurable, True by default) but
    not used, an issue with HIGH severity is reported.
    If dynamic allocation is used (the respective configuration is enabled),
    shuffle tracking or the external shuffle service should be enabled as
    well; if that is not the case, an issue with HIGH severity is reported.
    The purpose of enabling shuffle tracking or the shuffle service is to keep
    the respective shuffle files available even after an executor is
    decommissioned.
    :return: DynamicAllocationConfigMetric if issues are detected or
             EmptyMetric if no issues are detected.
    """
    try:
        is_dynamic_allocation_enabled = cast_to_bool(
            self.app.get_spark_property(DYNAMIC_ALLOCATION_KEY))
    except ValueError:
        # TODO add logging
        is_dynamic_allocation_enabled = False
    try:
        is_shuffle_tracking_enabled = cast_to_bool(
            self.app.get_spark_property(SHUFFLE_TRACKING_ENABLED_KEY))
    except ValueError:
        # TODO add logging
        is_shuffle_tracking_enabled = False
    try:
        is_shuffle_service_enabled = cast_to_bool(
            self.app.get_spark_property(SHUFFLE_SERVICE_ENABLED_KEY))
    except ValueError:
        # TODO add logging
        is_shuffle_service_enabled = False
    if IS_DYNAMIC_ALLOCATION_PREFERRED and not is_dynamic_allocation_enabled:
        overall_info = "Dynamic allocation was disabled for this application."
        return DynamicAllocationConfigMetric(severity=Severity.HIGH,
                                             overall_info=overall_info,
                                             details=MetricDetailsList())
    if is_dynamic_allocation_enabled and not is_shuffle_service_enabled and not is_shuffle_tracking_enabled:
        overall_info = "If dynamic allocation is enabled, an external shuffle service or " \
                       "shuffle tracking should be enabled."
        details = MetricDetailsList()
        details.add(
            MetricDetails(
                entity_id=None,
                detail_string="Current settings: ",
                sort_attr=0,
                subdetails=[
                    f"{SHUFFLE_TRACKING_ENABLED_KEY} = {is_shuffle_tracking_enabled}",
                    f"{SHUFFLE_SERVICE_ENABLED_KEY} = {is_shuffle_service_enabled}"
                ]))
        return DynamicAllocationConfigMetric(severity=Severity.HIGH,
                                             overall_info=overall_info,
                                             details=details)
    else:
        return EmptyMetric(Severity.NONE)
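# --- Illustration only ----------------------------------------------------
# A plausible sketch of the cast_to_bool helper used above, assuming it turns
# the string value of a Spark property into a bool and raises ValueError for
# anything it cannot parse (which is why the callers above catch ValueError
# and default to False). The real helper lives in this repo's utilities; this
# version, including the treatment of unset properties, is an assumption.
def _example_cast_to_bool(value):
    if value is None:
        return False  # an unset property counts as disabled (assumption)
    normalized = str(value).strip().lower()
    if normalized == "true":
        return True
    if normalized == "false":
        return False
    raise ValueError(f"Cannot cast '{value}' to bool")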
def analyze_serializer_config(self):
    """
    Analyze the configuration of the serializer.
    :return: SerializerConfigMetric if an issue with serialization is found.
             Otherwise, EmptyMetric is returned.
    """
    used_serializer = self.app.get_spark_property(
        SERIALIZER_KEY) or DEFAULT_SERIALIZER
    if used_serializer != PREFERRED_SERIALIZER:
        severity = Severity.HIGH
        overall_info = f"{used_serializer} was used, but {PREFERRED_SERIALIZER} could have better performance."
        details = MetricDetailsList()
        return SerializerConfigMetric(severity, overall_info, details)
    else:
        return EmptyMetric(Severity.NONE)
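# --- Illustration only ----------------------------------------------------
# For reference, a plausible set of constants backing the serializer check.
# The actual values live in this repo's constants module; the ones below are
# assumptions based on Spark's stock serializers (JavaSerializer is Spark's
# default, and KryoSerializer is the usual recommendation for performance):
#
#     SERIALIZER_KEY = "spark.serializer"
#     DEFAULT_SERIALIZER = "org.apache.spark.serializer.JavaSerializer"
#     PREFERRED_SERIALIZER = "org.apache.spark.serializer.KryoSerializer"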
def analyze_yarn_queue(self):
    """
    If usage of the default YARN queue is not allowed in the config file, this
    method checks whether the application was submitted to the default queue,
    and if so, reports an issue.
    :return: YarnQueueMetric if an issue is detected or EmptyMetric if no
             issues are detected.
    """
    queue_name = self.app.get_spark_property(
        SPARK_YARN_QUEUE_KEY) or SPARK_YARN_QUEUE_DEFAULT_VALUE
    if not IS_DEFAULT_QUEUE_ALLOWED and queue_name == SPARK_YARN_QUEUE_DEFAULT_VALUE:
        overall_info = f"This application was submitted to YARN queue '{SPARK_YARN_QUEUE_DEFAULT_VALUE}'. " \
                       f"Another queue should be used."
        return YarnQueueMetric(overall_info=overall_info,
                               severity=Severity.HIGH,
                               details=MetricDetailsList())
    else:
        return EmptyMetric(severity=Severity.NONE)
def analyze_core_number(self):
    """
    Analyze whether the number of cores per driver or executor was configured
    within the defined limits. If any of the values is out of limits, an issue
    is reported.
    :return: CoreNumberMetric if an issue is detected or EmptyMetric if no
             issues are detected.
    """
    executor_cores = cast_to_int(
        self.app.get_spark_property(
            EXECUTOR_CORES_KEY)) or EXECUTOR_CORES_DEFAULT_VALUE
    driver_cores = cast_to_int(
        self.app.get_spark_property(
            DRIVER_CORES_KEY)) or DRIVER_CORES_DEFAULT_VALUE
    severity_executor_cores = EXECUTOR_CORES_THRESHOLDS.severity_of(
        executor_cores)
    severity_driver_cores = DRIVER_CORES_THRESHOLDS.severity_of(driver_cores)
    severity = max(severity_driver_cores, severity_executor_cores)
    if severity > Severity.NONE:
        overall_info = "The number of cores per driver or executor might not be optimal"
        details = MetricDetailsList()
        if severity_executor_cores > Severity.NONE:
            details.add(
                MetricDetails(
                    detail_string=
                    f"{EXECUTOR_CORES_KEY} = {executor_cores}, but ideally, it should be "
                    f"between {EXECUTOR_CORES_THRESHOLDS.lower_threshold_low_severity} "
                    f"and {EXECUTOR_CORES_THRESHOLDS.upper_threshold_low_severity}."))
        if severity_driver_cores > Severity.NONE:
            details.add(
                MetricDetails(
                    detail_string=
                    f"{DRIVER_CORES_KEY} = {driver_cores}, but ideally, it should be "
                    f"between {DRIVER_CORES_THRESHOLDS.lower_threshold_low_severity} "
                    f"and {DRIVER_CORES_THRESHOLDS.upper_threshold_low_severity}."))
        return CoreNumberMetric(overall_info=overall_info,
                                severity=severity,
                                details=details)
    else:
        return EmptyMetric(severity=Severity.NONE)
def analyze_stage_skews(self):
    """
    Analyze the Stages for stage skews.
    :return: StageSkewMetric if a stage skew is found for at least one stage.
             Otherwise, EmptyMetric is returned.
    """
    # filter only relevant stages (don't bother with five-second stages,
    # focus on the large ones)
    relevant_stages = [
        s for s in self.stages
        if s.executor_run_time > STAGE_SKEW_MIN_RUNTIME_MILLIS
    ]
    severity = Severity.NONE  # just an initialization; will be updated
    details = MetricDetailsList(ascending=True)
    for stage in relevant_stages:
        if stage.ss_executor_run_time is None:
            # sometimes the stage statistics data are not available
            return EmptyMetric(severity=Severity.NONE)
        idx = [2, 4]  # indexes of the median and the maximum in the summaries
        runtime_med, runtime_max = [
            stage.ss_executor_run_time[i] / 1000 for i in idx
        ]  # seconds
        bytes_read_med, bytes_read_max = [
            stage.ss_bytes_read[i] for i in idx
        ]
        bytes_written_med, bytes_written_max = [
            stage.ss_bytes_written[i] for i in idx
        ]
        shuffle_read_bytes_med, shuffle_read_bytes_max = [
            stage.ss_shuffle_read_bytes[i] for i in idx
        ]
        shuffle_write_bytes_med, shuffle_write_bytes_max = [
            stage.ss_shuffle_write_bytes[i] for i in idx
        ]
        stage_id = stage.stage_id
        total_runtime = stage.executor_run_time / 1000  # seconds
        # severity is calculated from the runtime skew
        if runtime_med == 0:  # guard against division by zero
            stage_severity = Severity.HIGH
        else:
            stage_severity = STAGE_SKEW_THRESHOLDS.severity_of(
                runtime_max / runtime_med)
        if stage_severity > severity:
            severity = stage_severity
        if stage_severity > Severity.NONE:
            details.add(
                MetricDetails(
                    entity_id=stage_id,
                    detail_string=
                    f"Stage {stage_id}: Total runtime {fmt_time(total_runtime)}, "
                    f"Executor runtime {fmt_time(runtime_max)} (max), "
                    f"{fmt_time(runtime_med)} (median)",
                    # the details are sorted by the (max - median) difference,
                    # which approximates the potential time loss
                    sort_attr=runtime_max - runtime_med,
                    subdetails=[
                        f"Read {fmt_bytes(bytes_read_max)} (max), {fmt_bytes(bytes_read_med)} (median)",
                        f"Wrote {fmt_bytes(bytes_written_max)} (max), {fmt_bytes(bytes_written_med)} (median)",
                        f"Shuffle read {fmt_bytes(shuffle_read_bytes_max)} (max), {fmt_bytes(shuffle_read_bytes_med)} (median)",
                        f"Shuffle write {fmt_bytes(shuffle_write_bytes_max)} (max), {fmt_bytes(shuffle_write_bytes_med)} (median)"
                    ]))
    if details.length() == 0:  # no stages with significant severity
        return EmptyMetric(severity=Severity.NONE)
    else:
        overall_info = f"{details.length()} stages with significant skew found"
        return StageSkewMetric(severity=severity,
                               overall_info=overall_info,
                               details=details)
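# --- Illustration only ----------------------------------------------------
# The ss_* attributes above are assumed to hold five-number summaries ordered
# from minimum to maximum (e.g. [min, 25th percentile, median, 75th
# percentile, max]), which is why idx = [2, 4] picks the median and the
# maximum. A hypothetical helper computing the skew ratio that is fed to
# STAGE_SKEW_THRESHOLDS:
def _example_skew_ratio(summary):
    median, maximum = summary[2], summary[4]
    if median == 0:
        return float("inf")  # treated as maximal skew by the caller
    return maximum / median

# e.g. _example_skew_ratio([10, 40, 60, 80, 600]) -> 10.0
# (the slowest task ran 10x longer than the median task)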
def analyze_disk_spills(self):
    """
    Analyze the Stages for disk spills.
    :return: StageDiskSpillMetric if a disk spill is found for at least one
             stage. Otherwise, EmptyMetric is returned.
    """
    # filter only relevant stages (with a non-zero spill)
    relevant_stages = [s for s in self.stages if s.memory_bytes_spilled > 0]
    severity = Severity.NONE
    # the details are sorted by the amount of spilled data per stage
    details = MetricDetailsList(ascending=True)
    total_bytes_spilled = 0
    stages_count = 0
    for stage in relevant_stages:
        memory_bytes_spilled = stage.memory_bytes_spilled or 0
        disk_bytes_spilled = stage.disk_bytes_spilled or 0
        input_bytes = stage.input_bytes or 0
        output_bytes = stage.output_bytes or 0
        shuffle_read_bytes = stage.shuffle_read_bytes or 0
        shuffle_write_bytes = stage.shuffle_write_bytes or 0
        stage_key = stage.stage_key
        stage_id = stage.stage_id
        # also find the task that is most responsible for the spill
        worst_task = self.db.query(TaskEntity) \
            .filter(TaskEntity.stage_key == stage_key) \
            .order_by(desc(TaskEntity.memory_bytes_spilled)).first()
        max_memory_usage = max(input_bytes, output_bytes,
                               shuffle_read_bytes, shuffle_write_bytes)
        if max_memory_usage == 0:  # guard against division by zero
            stage_severity = Severity.HIGH
        else:
            stage_severity = STAGE_DISK_SPILL_THRESHOLDS.severity_of(
                memory_bytes_spilled / max_memory_usage)
        total_bytes_spilled += memory_bytes_spilled
        stages_count += 1
        if stage_severity > severity:
            severity = stage_severity
        if stage_severity > Severity.NONE:
            subdetails = [
                f"Input: {fmt_bytes(input_bytes)}, output: {fmt_bytes(output_bytes)}.",
                f"Shuffle read: {fmt_bytes(shuffle_read_bytes)}, shuffle write: {fmt_bytes(shuffle_write_bytes)}."
            ]
            if worst_task is not None:
                memory_bytes_spilled_by_worst_task = worst_task.memory_bytes_spilled
                disk_bytes_spilled_by_worst_task = worst_task.disk_bytes_spilled
                subdetails.append(
                    f"Biggest contributor: task {worst_task.task_id}, "
                    f"{fmt_bytes(memory_bytes_spilled_by_worst_task)} spilled "
                    f"({fmt_bytes(disk_bytes_spilled_by_worst_task)} on disk).")
            detail_string = f"Stage {stage_id} spilled {fmt_bytes(memory_bytes_spilled)} " \
                            f"({fmt_bytes(disk_bytes_spilled)} on disk)."
            details.add(
                MetricDetails(entity_id=stage_id,
                              detail_string=detail_string,
                              sort_attr=memory_bytes_spilled,
                              subdetails=subdetails))
    if details.length() == 0:
        return EmptyMetric(severity=Severity.NONE)
    else:
        overall_info = f"{fmt_bytes(total_bytes_spilled)} spilled in {stages_count} stages."
        return StageDiskSpillMetric(severity=severity,
                                    overall_info=overall_info,
                                    details=details)
def analyze_memory_configuration(self):
    """
    Analyze whether the application requested more memory (driver or executor,
    memory or memory overhead) than recommended. If so, an issue is reported.
    :return: MemoryConfigMetric if an issue is detected or EmptyMetric if no
             issues are detected.
    """
    executor_memory = size_in_bytes(
        self.app.get_spark_property(EXECUTOR_MEMORY_KEY),
        EXECUTOR_MEMORY_DEFAULT_VALUE)
    # when the overhead is not configured explicitly, fall back to the
    # computed default overhead
    executor_memory_overhead_prop = self.app.get_spark_property(
        EXECUTOR_MEMORY_OVERHEAD_KEY)
    executor_memory_overhead = size_in_bytes(
        executor_memory_overhead_prop, EXECUTOR_MEMORY_DEFAULT_VALUE) \
        if executor_memory_overhead_prop is not None \
        else constants.get_default_memory_overhead(executor_memory)
    driver_memory = size_in_bytes(
        self.app.get_spark_property(DRIVER_MEMORY_KEY),
        DRIVER_MEMORY_DEFAULT_VALUE)
    driver_memory_overhead_prop = self.app.get_spark_property(
        DRIVER_MEMORY_OVERHEAD_KEY)
    driver_memory_overhead = size_in_bytes(
        driver_memory_overhead_prop, DRIVER_MEMORY_DEFAULT_VALUE) \
        if driver_memory_overhead_prop is not None \
        else constants.get_default_memory_overhead(driver_memory)
    severity_executor_memory = EXECUTOR_MEMORY_CONFIG_THRESHOLDS.severity_of(
        executor_memory)
    severity_executor_memory_overhead = EXECUTOR_MEMORY_OVERHEAD_CONFIG_THRESHOLDS.severity_of(
        executor_memory_overhead)
    severity_driver_memory = DRIVER_MEMORY_CONFIG_THRESHOLDS.severity_of(
        driver_memory)
    severity_driver_memory_overhead = DRIVER_MEMORY_CONFIG_OVERHEAD_THRESHOLDS.severity_of(
        driver_memory_overhead)
    severity = max(severity_executor_memory, severity_executor_memory_overhead,
                   severity_driver_memory, severity_driver_memory_overhead)
    if severity > Severity.NONE:
        overall_info = "Memory allocation limits were exceeded."
        details = MetricDetailsList()
        if severity_executor_memory > Severity.NONE:
            details.add(
                MetricDetails(
                    detail_string=
                    f"{EXECUTOR_MEMORY_KEY} = {fmt_bytes(executor_memory)}, "
                    f"but ideally, it should be <= {fmt_bytes(EXECUTOR_MEMORY_CONFIG_LOW)}"))
        if severity_executor_memory_overhead > Severity.NONE:
            details.add(
                MetricDetails(
                    detail_string=
                    f"{EXECUTOR_MEMORY_OVERHEAD_KEY} = {fmt_bytes(executor_memory_overhead)}, "
                    f"but ideally, it should be <= {fmt_bytes(EXECUTOR_MEMORY_OVERHEAD_CONFIG_LOW)}"))
        if severity_driver_memory > Severity.NONE:
            details.add(
                MetricDetails(
                    detail_string=
                    f"{DRIVER_MEMORY_KEY} = {fmt_bytes(driver_memory)}, "
                    f"but ideally, it should be <= {fmt_bytes(DRIVER_MEMORY_CONFIG_LOW)}"))
        if severity_driver_memory_overhead > Severity.NONE:
            details.add(
                MetricDetails(
                    detail_string=
                    f"{DRIVER_MEMORY_OVERHEAD_KEY} = {fmt_bytes(driver_memory_overhead)}, "
                    f"but ideally, it should be <= {fmt_bytes(DRIVER_MEMORY_OVERHEAD_CONFIG_LOW)}"))
        return MemoryConfigMetric(overall_info=overall_info,
                                  severity=severity,
                                  details=details)
    else:
        return EmptyMetric(severity=Severity.NONE)
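# --- Illustration only ----------------------------------------------------
# Rough sketches of the helpers assumed above: size_in_bytes parses a Spark
# size string such as "4g" (falling back to a default when the property is
# unset), and get_default_memory_overhead mirrors the usual YARN rule of
# max(384 MiB, 10 % of the container memory). The real implementations live
# elsewhere in this repo; names and exact semantics here are assumptions.
def _example_size_in_bytes(value, default=None):
    units = {"k": 1024, "m": 1024**2, "g": 1024**3, "t": 1024**4}
    if value is None:
        value = default
    if not value:
        return 0
    value = str(value).strip().lower()
    if value[-1] in units:
        return int(float(value[:-1]) * units[value[-1]])
    return int(value)  # a bare number is taken as bytes (assumption)

def _example_default_memory_overhead(memory_bytes):
    return max(384 * 1024**2, int(0.1 * memory_bytes))

# e.g. _example_size_in_bytes("4g") -> 4294967296
#      _example_default_memory_overhead(_example_size_in_bytes("4g")) -> ~429 MiB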
def analyze_min_max_executors(self):
    """
    If dynamic allocation is enabled, analyze the configured minimum and
    maximum number of executors. If either value is larger than recommended,
    an issue is reported (a too high minimum may waste resources; a too high
    maximum may starve other applications). On multi-tenant clusters, where
    separate YARN queues should be used, the maximum-executors issue is not
    reported when a non-default queue is used (correctly set up queues should
    prevent eating up all the resources).
    :return: DynamicAllocationMinMaxExecutorsMetric if issues are detected or
             EmptyMetric if no issues are detected.
    """
    try:
        is_dynamic_allocation_enabled = cast_to_bool(
            self.app.get_spark_property(DYNAMIC_ALLOCATION_KEY))
    except ValueError:
        # TODO add logging
        is_dynamic_allocation_enabled = False
    if not is_dynamic_allocation_enabled:
        return EmptyMetric(Severity.NONE)
    min_executors = cast_to_int(
        self.app.get_spark_property(
            DYNAMIC_ALLOCATION_MIN_EXECUTORS_KEY)) or 0
    max_executors = cast_to_int(
        self.app.get_spark_property(
            DYNAMIC_ALLOCATION_MAX_EXECUTORS_KEY)) or math.inf
    queue_name = self.app.get_spark_property(
        SPARK_YARN_QUEUE_KEY) or SPARK_YARN_QUEUE_DEFAULT_VALUE
    severity_min = DYNAMIC_ALLOCATION_MIN_EXECUTORS_THRESHOLDS.severity_of(
        min_executors)
    severity_max = DYNAMIC_ALLOCATION_MAX_EXECUTORS_THRESHOLDS.severity_of(max_executors) \
        if queue_name == SPARK_YARN_QUEUE_DEFAULT_VALUE else Severity.NONE
    severity = max(severity_min, severity_max)
    if severity > Severity.NONE:
        overall_info = "Non-optimal configuration of the minimum or maximum number of executors"
        details = MetricDetailsList()
        if severity_min > Severity.NONE:
            details.add(
                MetricDetails(
                    entity_id=None,
                    detail_string=
                    f"{DYNAMIC_ALLOCATION_MIN_EXECUTORS_KEY} is currently set to "
                    f"{min_executors}, but ideally, it should be <= "
                    f"{DYNAMIC_ALLOCATION_MIN_EXECUTORS_LOW}",
                    sort_attr=0,
                    subdetails=[]))
        if severity_max > Severity.NONE:
            details.add(
                MetricDetails(
                    entity_id=None,
                    detail_string=
                    f"{DYNAMIC_ALLOCATION_MAX_EXECUTORS_KEY} is currently set to "
                    f"{max_executors}, but ideally, it should be <= "
                    f"{DYNAMIC_ALLOCATION_MAX_EXECUTORS_LOW}",
                    sort_attr=0,
                    subdetails=[
                        "Also consider using a non-default YARN queue to prevent "
                        "the application from eating up all the resources"
                    ]))
        return DynamicAllocationMinMaxExecutorsMetric(
            overall_info=overall_info, severity=severity, details=details)
    else:
        return EmptyMetric(Severity.NONE)
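# --- Illustration only ----------------------------------------------------
# A hypothetical driver showing how the analyze_* methods above could be
# collected into a single report; the real orchestration lives elsewhere in
# this repo, so this is a sketch, not the actual entry point.
def _example_collect_metrics(analyzer):
    metrics = []
    for name in dir(analyzer):
        if name.startswith("analyze_"):
            metric = getattr(analyzer, name)()
            # keep only the checks that actually found something
            if not isinstance(metric, EmptyMetric):
                metrics.append(metric)
    return metrics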