Code example #1
    def analyze_failed_jobs(self):
        """
        Analyze the jobs of the application.
        :return: JobFailureMetric if at least one job failed or was killed. Otherwise, EmptyMetric is returned.
        """
        all_jobs_count = len(self.jobs)
        if all_jobs_count == 0:
            return EmptyMetric(severity=Severity.NONE)
        failed_jobs = [
            j for j in self.jobs if j.status in ["FAILED", "KILLED"]
        ]
        failed_jobs_count = len(failed_jobs)

        if failed_jobs_count > 0:
            severity = Severity.HIGH
            overall_info = f"{failed_jobs_count}/{all_jobs_count} jobs failed"
            details = MetricDetailsList(ascending=True)
            for fj in failed_jobs:
                details.add(
                    MetricDetails(
                        entity_id=fj.job_id,
                        detail_string=f"Job {fj.job_id} {fj.status}"))
            return JobFailureMetric(severity, overall_info, details)
        else:
            return EmptyMetric(severity=Severity.NONE)
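
These snippets rely on a handful of supporting types that are not shown on this page. As a rough, hypothetical sketch of their shape (the definitions of Severity, MetricDetails, and MetricDetailsList below are assumptions inferred from how the examples use them):

from enum import IntEnum


class Severity(IntEnum):
    # An ordered enum, so the max() calls and > comparisons in the examples work.
    NONE = 0
    LOW = 1
    MEDIUM = 2
    HIGH = 3


class MetricDetails:
    # One finding: an optional entity id, a message, a sort key, and sub-messages.
    def __init__(self, entity_id=None, detail_string="", sort_attr=0, subdetails=None):
        self.entity_id = entity_id
        self.detail_string = detail_string
        self.sort_attr = sort_attr
        self.subdetails = subdetails or []


class MetricDetailsList:
    # Collects MetricDetails; `ascending` hints how they should be sorted for display.
    def __init__(self, ascending=False):
        self.ascending = ascending
        self._items = []

    def add(self, details):
        self._items.append(details)

    def length(self):
        return len(self._items)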
Code example #2
    def analyze_failed_stages(self):
        """
        Analyze the stages of the application.
        :return: StageFailureMetric if at least one of the stages failed or was killed. Otherwise, EmptyMetric is
        returned.
        """
        all_stages_count = len(self.stages)
        if all_stages_count == 0:
            return EmptyMetric(severity=Severity.NONE)
        failed_stages = [
            s for s in self.stages if s.status in ["FAILED", "KILLED"]
        ]
        failed_stages_count = len(failed_stages)

        if failed_stages_count > 0:
            severity = Severity.HIGH
            overall_info = f"{failed_stages_count}/{all_stages_count} stages failed"
            details = MetricDetailsList(ascending=True)
            for fs in failed_stages:
                details.add(
                    MetricDetails(
                        entity_id=fs.stage_id,
                        detail_string=f"Stage {fs.stage_id} {fs.status}",
                        subdetails=[fs.failure_reason]))

            return StageFailureMetric(severity, overall_info, details)
        else:
            return EmptyMetric(severity=Severity.NONE)
Code example #3
    def analyze_executors_gc_time(self):
        """
        Analyze garbage collection time of the executors.
        :return: ExecutorGcTimeMetric object if an issue is found for at least one executor. Otherwise, EmptyMetric is
        returned.
        """
        if self.executors is None or len(self.executors) == 0:
            return EmptyMetric(severity=Severity.NONE)

        total_gc_time = sum(e.total_gc_time for e in self.executors)
        total_duration = sum(e.total_duration for e in self.executors)

        if total_duration == 0:
            return EmptyMetric(severity=Severity.NONE)

        ratio = total_gc_time / total_duration

        severity_low = EXECUTOR_TOO_LOW_GC_THRESHOLDS.severity_of(ratio)
        severity_high = EXECUTOR_TOO_HIGH_GC_THRESHOLDS.severity_of(ratio)

        if severity_low > severity_high:
            overall_info = f"Executors spent too low time with Garbage Collection: {fmt_time(total_gc_time/1000)} " \
                           f"/ {fmt_time(total_duration/1000)} ({(ratio*100):.2f} %)"
            details = MetricDetailsList(ascending=True)
            severity = severity_low
        elif severity_low < severity_high:
            overall_info = f"Executors spent too much time with Garbage Collection: {fmt_time(total_gc_time/1000)} " \
                           f"/ {fmt_time(total_duration/1000)} ({(ratio*100):.2f} %)"
            details = MetricDetailsList(ascending=True)
            severity = severity_high
        else:
            return EmptyMetric(severity=Severity.NONE)

        return ExecutorGcTimeMetric(severity, overall_info, details)
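
The *_GC_THRESHOLDS objects are defined elsewhere; the only contract the snippet needs is a severity_of() method mapping the GC-to-duration ratio to a Severity. A minimal sketch, assuming one helper flags ratios that are too high and the other ratios that are too low (the bounds below are purely illustrative, not the project's real values):

class RatioThresholds:
    # Hypothetical stand-in for the *_GC_THRESHOLDS objects.
    def __init__(self, low, high, above=True):
        # above=True: values above the bounds are problematic (too much GC);
        # above=False: values below the bounds are problematic (too little GC).
        self.low, self.high, self.above = low, high, above

    def severity_of(self, value):
        if self.above:
            if value >= self.high:
                return Severity.HIGH
            if value >= self.low:
                return Severity.LOW
        else:
            if value <= self.high:
                return Severity.HIGH
            if value <= self.low:
                return Severity.LOW
        return Severity.NONE


# Illustrative bounds only: flag executors spending more than 10% (LOW) or
# 20% (HIGH) of their time in GC, or suspiciously little of it.
EXECUTOR_TOO_HIGH_GC_THRESHOLDS = RatioThresholds(0.10, 0.20, above=True)
EXECUTOR_TOO_LOW_GC_THRESHOLDS = RatioThresholds(0.001, 0.0001, above=False)

Because Severity is ordered, the snippet can compare the two results directly and report whichever side is more severe, or return EmptyMetric when both come back NONE.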
Code example #4
    def analyze_driver_gc_time(self):
        """
        Analyze garbage collection time of the driver
        :return: DriverGcTimeMetric object if an issue is found. Otherwise, EmptyMetric is returned.
        """
        if self.driver is None or self.driver.total_duration == 0:
            return EmptyMetric(severity=Severity.NONE)

        ratio = self.driver.total_gc_time / self.driver.total_duration

        severity_low = DRIVER_TOO_LOW_GC_THRESHOLDS.severity_of(ratio)
        severity_high = DRIVER_TOO_HIGH_GC_THRESHOLDS.severity_of(ratio)

        if severity_low > severity_high:
            overall_info = f"Driver spent too low time with Garbage Collection: " \
                           f"{fmt_time(self.driver.total_gc_time/1000)} out of " \
                           f"{fmt_time(self.driver.total_duration/1000)} ({(ratio*100):.2f} %)"
            details = MetricDetailsList(ascending=True)
            severity = severity_low
        elif severity_low < severity_high:
            overall_info = f"Driver spent too much time with Garbage Collection: " \
                           f"{fmt_time(self.driver.total_gc_time/1000)} out of " \
                           f"{fmt_time(self.driver.total_duration/1000)} ({(ratio*100):.2f} %)"
            details = MetricDetailsList(ascending=True)
            severity = severity_high
        else:
            return EmptyMetric(severity=Severity.NONE)

        return DriverGcTimeMetric(severity, overall_info, details)
Code example #5
    def analyze_dynamic_allocation(self):
        """
        Analyze the dynamic allocation configuration. When dynamic allocation is preferred (configurable; defaults to
        True) but not used, an issue with HIGH severity is reported.
        If dynamic allocation is enabled, shuffle tracking or an external shuffle service should be enabled as well;
        if neither is, an issue with HIGH severity is reported. Enabling shuffle tracking or the shuffle service keeps
        the shuffle files available even after an executor is decommissioned.
        :return: DynamicAllocationConfigMetric if issues are detected or EmptyMetric if no issues are detected.
        """
        try:
            is_dynamic_allocation_enabled = cast_to_bool(
                self.app.get_spark_property(DYNAMIC_ALLOCATION_KEY))
        except ValueError:
            # TODO add logging
            is_dynamic_allocation_enabled = False

        try:
            is_shuffle_tracking_enabled = cast_to_bool(
                self.app.get_spark_property(SHUFFLE_TRACKING_ENABLED_KEY))
        except ValueError:
            # TODO add logging
            is_shuffle_tracking_enabled = False

        try:
            is_shuffle_service_enabled = cast_to_bool(
                self.app.get_spark_property(SHUFFLE_SERVICE_ENABLED_KEY))
        except ValueError:
            # TODO add logging
            is_shuffle_service_enabled = False

        if IS_DYNAMIC_ALLOCATION_PREFERRED and not is_dynamic_allocation_enabled:
            overall_info = f"Dynamic allocation was disabled for this application."
            return DynamicAllocationConfigMetric(severity=Severity.HIGH,
                                                 overall_info=overall_info,
                                                 details=MetricDetailsList())

        if is_dynamic_allocation_enabled and not is_shuffle_service_enabled and not is_shuffle_tracking_enabled:
            overall_info = f"If Dynamic allocation is enabled, an external shuffle service or " \
                           f"shuffle tracking should be enabled."
            details = MetricDetailsList()
            details.add(
                MetricDetails(
                    entity_id=None,
                    detail_string="Current settings: ",
                    sort_attr=0,
                    subdetails=[
                        f"{SHUFFLE_TRACKING_ENABLED_KEY} = {is_shuffle_tracking_enabled}",
                        f"{SHUFFLE_SERVICE_ENABLED_KEY} = {is_shuffle_service_enabled}"
                    ]))
            return DynamicAllocationConfigMetric(severity=Severity.HIGH,
                                                 overall_info=overall_info,
                                                 details=details)
        else:
            return EmptyMetric(Severity.NONE)
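
cast_to_bool is another helper that is not shown here. A plausible sketch, assuming it raises ValueError for unrecognized values, which is why every call above is wrapped in try/except and falls back to False:

def cast_to_bool(value):
    # Hypothetical helper: parse a Spark property value into a bool.
    # Returns False for missing properties and raises ValueError for
    # unrecognized strings, matching how the callers above handle it.
    if value is None:
        return False
    normalized = str(value).strip().lower()
    if normalized in ("true", "1", "yes"):
        return True
    if normalized in ("false", "0", "no"):
        return False
    raise ValueError(f"Cannot cast {value!r} to bool")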
Code example #6
    def analyze_serializer_config(self):
        """
        Analyze configuration of the Serializer.
        :return: SerializerConfigMetric if an issue with serialization is found. Otherwise, EmptyMetric is returned.
        """
        used_serializer = self.app.get_spark_property(
            SERIALIZER_KEY) or DEFAULT_SERIALIZER

        if used_serializer != PREFERRED_SERIALIZER:
            severity = Severity.HIGH
            overall_info = f"{used_serializer} was used, but {PREFERRED_SERIALIZER} could have better performance."
            details = MetricDetailsList()
            return SerializerConfigMetric(severity, overall_info, details)
        else:
            return EmptyMetric(Severity.NONE)
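
The serializer constants are not shown in the snippet. In Spark itself, spark.serializer defaults to the Java serializer and Kryo is the usual recommendation for performance, so the constants presumably look something like this:

SERIALIZER_KEY = "spark.serializer"
DEFAULT_SERIALIZER = "org.apache.spark.serializer.JavaSerializer"    # Spark's default
PREFERRED_SERIALIZER = "org.apache.spark.serializer.KryoSerializer"  # usually faster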
Code example #7
    def analyze_yarn_queue(self):
        """
        If usage of the default YARN queue is not allowed in the config file, this method investigates whether the
        particular application was submitted to the default queue, and if so, it reports an issue.
        :return: YarnQueueMetric if an issue is detected or EmptyMetric if no issues are detected.
        """
        queue_name = self.app.get_spark_property(
            SPARK_YARN_QUEUE_KEY) or SPARK_YARN_QUEUE_DEFAULT_VALUE
        if not IS_DEFAULT_QUEUE_ALLOWED and queue_name == SPARK_YARN_QUEUE_DEFAULT_VALUE:
            overall_info = f"This application was submitted to YARN Queue '{SPARK_YARN_QUEUE_DEFAULT_VALUE}'. " \
                           f"Another queue should be used."
            return YarnQueueMetric(overall_info=overall_info,
                                   severity=Severity.HIGH,
                                   details=MetricDetailsList())
        else:
            return EmptyMetric(severity=Severity.NONE)
Code example #8
    def analyze_core_number(self):
        """
        This method analyzes whether the number of cores per driver or per executor is configured within the defined
        limits. If any of the values falls outside those limits, an issue is reported.
        :return: CoreNumberMetric if an issue is detected or EmptyMetric if no issues are detected.
        """
        executor_cores = cast_to_int(
            self.app.get_spark_property(
                EXECUTOR_CORES_KEY)) or EXECUTOR_CORES_DEFAULT_VALUE
        driver_cores = cast_to_int(
            self.app.get_spark_property(
                DRIVER_CORES_KEY)) or DRIVER_CORES_DEFAULT_VALUE

        severity_executor_cores = EXECUTOR_CORES_THRESHOLDS.severity_of(
            executor_cores)
        severity_driver_cores = DRIVER_CORES_THRESHOLDS.severity_of(
            driver_cores)

        severity = max(severity_driver_cores, severity_executor_cores)

        if severity > Severity.NONE:
            overall_info = f"Number of cores per driver or executor might be not optimal"
            details = MetricDetailsList()
            if severity_executor_cores > Severity.NONE:
                details.add(
                    MetricDetails(
                        detail_string=
                        f"{EXECUTOR_CORES_KEY} = {executor_cores}, but ideally, it should be between {EXECUTOR_CORES_THRESHOLDS.lower_threshold_low_severity} and {EXECUTOR_CORES_THRESHOLDS.upper_threshold_low_severity}."
                    ))
            if severity_driver_cores > Severity.NONE:
                details.add(
                    MetricDetails(
                        detail_string=
                        f"{DRIVER_CORES_KEY} = {driver_cores}, but ideally, it should be between {DRIVER_CORES_THRESHOLDS.lower_threshold_low_severity} and {DRIVER_CORES_THRESHOLDS.upper_threshold_low_severity}."
                    ))
            return CoreNumberMetric(overall_info=overall_info,
                                    severity=severity,
                                    details=details)
        else:
            return EmptyMetric(severity=Severity.NONE)
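
The detail strings above reference lower_threshold_low_severity and upper_threshold_low_severity, which suggests the core-count thresholds are range-based. A hypothetical sketch with illustrative bounds (the real values live in the project's configuration):

class RangeThresholds:
    # Hypothetical range-based thresholds: values outside the low-severity
    # range raise at least LOW; values outside the high-severity range raise HIGH.
    def __init__(self, low_range, high_range):
        self.lower_threshold_low_severity, self.upper_threshold_low_severity = low_range
        self.lower_threshold_high_severity, self.upper_threshold_high_severity = high_range

    def severity_of(self, value):
        if not (self.lower_threshold_high_severity <= value <= self.upper_threshold_high_severity):
            return Severity.HIGH
        if not (self.lower_threshold_low_severity <= value <= self.upper_threshold_low_severity):
            return Severity.LOW
        return Severity.NONE


# Illustrative bounds only: 2-5 cores per executor is common tuning advice.
EXECUTOR_CORES_THRESHOLDS = RangeThresholds(low_range=(2, 5), high_range=(1, 8))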
Code example #9
    def analyze_stage_skews(self):
        """
        Analyze the Stages for stage skews
        :return: StageSkewMetric if a stage skew is found for at least one stage. Otherwise, EmptyMetric is returned.
        """
        # filter only relevant stages (don't bother with five-second stages, focus on the large ones)
        relevant_stages = [
            s for s in self.stages
            if s.executor_run_time > STAGE_SKEW_MIN_RUNTIME_MILLIS
        ]

        severity = Severity.NONE  # just an initialization; will be updated
        details = MetricDetailsList(ascending=True)
        for stage in relevant_stages:
            if stage.ss_executor_run_time is None:  # sometimes the stage statistics data are not available
                return EmptyMetric(severity=Severity.NONE)

            idx = [2, 4]  # indexes for median and maximum
            runtime_med, runtime_max = [
                stage.ss_executor_run_time[i] / 1000 for i in idx
            ]  # seconds
            bytes_read_med, bytes_read_max = [
                stage.ss_bytes_read[i] for i in idx
            ]
            bytes_written_med, bytes_written_max = [
                stage.ss_bytes_written[i] for i in idx
            ]
            shuffle_read_bytes_med, shuffle_read_bytes_max = [
                stage.ss_shuffle_read_bytes[i] for i in idx
            ]
            shuffle_write_bytes_med, shuffle_write_bytes_max = [
                stage.ss_shuffle_write_bytes[i] for i in idx
            ]
            stage_id = stage.stage_id  # avoid shadowing the builtin id()
            total_runtime = stage.executor_run_time / 1000  # seconds

            # severity should be calculated from runtime skew
            if runtime_med == 0:  # check division by 0
                stage_severity = Severity.HIGH
            else:
                stage_severity = STAGE_SKEW_THRESHOLDS.severity_of(
                    runtime_max / runtime_med)

            if stage_severity > severity:
                severity = stage_severity

            if stage_severity > Severity.NONE:
                details.add(
                    MetricDetails(
                        entity_id=stage_id,
                        detail_string=
                        f"Stage {stage.stage_id}: Total runtime {fmt_time(total_runtime)}, Executor runtime {fmt_time(runtime_max)} (max), {fmt_time(runtime_med)} (median)",
                        sort_attr=runtime_max -
                        runtime_med,  # the details can be sorted by (max - median) difference, which might represent potential time loss
                        subdetails=[
                            f"Read {fmt_bytes(bytes_read_max)} (max), {fmt_bytes(bytes_read_med)} (median)",
                            f"Wrote {fmt_bytes(bytes_written_max)} (max), {fmt_bytes(bytes_written_med)} (median)",
                            f"Shuffle read {fmt_bytes(shuffle_read_bytes_max)} (max), {fmt_bytes(shuffle_read_bytes_med)} (median)",
                            f"Shuffle write {fmt_bytes(shuffle_write_bytes_max)} (max), {fmt_bytes(shuffle_write_bytes_max)} (median)"
                        ]))

        if details.length() == 0:  # no stages with significant severity
            return EmptyMetric(severity=Severity.NONE)
        else:
            overall_info = f"{details.length()} stages with significant skew found"
            return StageSkewMetric(severity=severity,
                                   overall_info=overall_info,
                                   details=details)
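
The idx = [2, 4] lookup assumes the ss_* fields hold a five-value task summary per stage. Spark's REST API can return task summaries for configurable quantiles; if the quantiles 0.0, 0.25, 0.5, 0.75, 1.0 were requested (an assumption here), index 2 is the median and index 4 the maximum, which matches the comment in the snippet:

# Hypothetical stage statistics, assuming quantiles 0.0/0.25/0.5/0.75/1.0:
ss_executor_run_time = [120, 950, 1800, 3400, 21000]  # milliseconds, min..max

runtime_med = ss_executor_run_time[2] / 1000  # 1.8 s (median task)
runtime_max = ss_executor_run_time[4] / 1000  # 21.0 s (slowest task)
skew_ratio = runtime_max / runtime_med        # ~11.7: one straggler dominates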
Code example #10
    def analyze_disk_spills(self):
        """
        Analyze the Stages for disk spills
        :return: StageDiskSpillMetric if a disk spill is found for at least one stage. Otherwise, EmptyMetric is
        returned.
        """
        # filter only relevant stages (with non-zero spill)
        relevant_stages = [
            s for s in self.stages if s.memory_bytes_spilled > 0
        ]

        severity = Severity.NONE
        details = MetricDetailsList(
            ascending=True
        )  # the details can be sortable by amount of spilled data per stage

        total_bytes_spilled = 0
        stages_count = 0

        for stage in relevant_stages:
            memory_bytes_spilled = stage.memory_bytes_spilled or 0
            disk_bytes_spilled = stage.disk_bytes_spilled or 0
            input_bytes = stage.input_bytes or 0
            output_bytes = stage.output_bytes or 0
            shuffle_read_bytes = stage.shuffle_read_bytes or 0
            shuffle_write_bytes = stage.shuffle_write_bytes or 0
            stage_key = stage.stage_key
            stage_id = stage.stage_id  # avoid shadowing the builtin id()

            # also find the task most responsible for the spill
            worst_task = self.db.query(TaskEntity).filter(TaskEntity.stage_key == stage_key)\
                .order_by(desc(TaskEntity.memory_bytes_spilled)).first()

            max_memory_usage = max(input_bytes, output_bytes,
                                   shuffle_read_bytes, shuffle_write_bytes)
            if max_memory_usage == 0:  # guard against division by zero
                stage_severity = Severity.HIGH
            else:
                stage_severity = STAGE_DISK_SPILL_THRESHOLDS.severity_of(
                    memory_bytes_spilled / max_memory_usage)

            total_bytes_spilled += memory_bytes_spilled
            stages_count += 1

            if stage_severity > severity:
                severity = stage_severity

            if stage_severity > Severity.NONE:
                subdetails = [
                    f"Input: {fmt_bytes(input_bytes)}, output: {fmt_bytes(output_bytes)}.",
                    f"Shuffle read: {fmt_bytes(shuffle_read_bytes)}, shuffle write: {fmt_bytes(shuffle_write_bytes)}."
                ]
                if worst_task is not None:
                    memory_bytes_spilled_by_worst_task = worst_task.memory_bytes_spilled
                    disk_bytes_spilled_by_worst_task = worst_task.disk_bytes_spilled
                    subdetails.append(
                        f"Biggest contributor: task {worst_task.task_id}, "
                        f"{fmt_bytes(memory_bytes_spilled_by_worst_task)} spilled "
                        f"({fmt_bytes(disk_bytes_spilled_by_worst_task)} on disk)."
                    )

                detail_string = f"Stage {id} spilled {fmt_bytes(memory_bytes_spilled)} ({fmt_bytes(disk_bytes_spilled)} on disk)."

                details.add(
                    MetricDetails(entity_id=stage_id,
                                  detail_string=detail_string,
                                  sort_attr=memory_bytes_spilled,
                                  subdetails=subdetails))

        if details.length() == 0:
            return EmptyMetric(severity=Severity.NONE)
        else:
            overall_info = f"{fmt_bytes(total_bytes_spilled)} spilled in {stages_count} stages."
            return StageDiskSpillMetric(severity=severity,
                                        overall_info=overall_info,
                                        details=details)
Code example #11
    def analyze_memory_configuration(self):
        """
        This method analyzes if the application requested more memory (driver, executor, memory+overhead) than
        recommended. If so, it will report an issue.
        :return: MemoryConfigMetric if an issue is detected or EmptyMetric if no issues are detected.
        """
        executor_memory = size_in_bytes(
            self.app.get_spark_property(EXECUTOR_MEMORY_KEY),
            EXECUTOR_MEMORY_DEFAULT_VALUE)
        executor_memory_overhead = size_in_bytes(self.app.get_spark_property(EXECUTOR_MEMORY_OVERHEAD_KEY),
                                                 EXECUTOR_MEMORY_DEFAULT_VALUE) \
            or constants.get_default_memory_overhead(executor_memory)
        driver_memory = size_in_bytes(
            self.app.get_spark_property(DRIVER_MEMORY_KEY),
            DRIVER_MEMORY_DEFAULT_VALUE)
        driver_memory_overhead = size_in_bytes(self.app.get_spark_property(DRIVER_MEMORY_OVERHEAD_KEY),
                                               DRIVER_MEMORY_DEFAULT_VALUE) \
            or constants.get_default_memory_overhead(driver_memory)

        severity_executor_memory = EXECUTOR_MEMORY_CONFIG_THRESHOLDS.severity_of(
            executor_memory)
        severity_executor_memory_overhead = EXECUTOR_MEMORY_OVERHEAD_CONFIG_THRESHOLDS.severity_of(
            executor_memory_overhead)
        severity_driver_memory = DRIVER_MEMORY_CONFIG_THRESHOLDS.severity_of(
            driver_memory)
        severity_driver_memory_overhead = DRIVER_MEMORY_CONFIG_OVERHEAD_THRESHOLDS.severity_of(
            driver_memory_overhead)

        severity = max(severity_executor_memory,
                       severity_executor_memory_overhead,
                       severity_driver_memory, severity_driver_memory_overhead)

        if severity > Severity.NONE:
            overall_info = f"Memory allocation limits were exceeded."
            details = MetricDetailsList()
            if severity_executor_memory > Severity.NONE:
                details.add(
                    MetricDetails(
                        detail_string=
                        f"{EXECUTOR_MEMORY_KEY} = {fmt_bytes(executor_memory)}, "
                        f"but ideally, it should be <= {fmt_bytes(EXECUTOR_MEMORY_CONFIG_LOW)}",
                    ))
            if severity_executor_memory_overhead > Severity.NONE:
                details.add(
                    MetricDetails(
                        detail_string=
                        f"{EXECUTOR_MEMORY_OVERHEAD_KEY} = {fmt_bytes(executor_memory_overhead)}, "
                        f"but ideally, it should be <= {fmt_bytes(EXECUTOR_MEMORY_OVERHEAD_CONFIG_LOW)}",
                    ))
            if severity_driver_memory > Severity.NONE:
                details.add(
                    MetricDetails(
                        detail_string=
                        f"{DRIVER_MEMORY_KEY} = {fmt_bytes(driver_memory)}, "
                        f"but ideally, it should be <= {fmt_bytes(DRIVER_MEMORY_CONFIG_LOW)}",
                    ))
            if severity_driver_memory_overhead > Severity.NONE:
                details.add(
                    MetricDetails(
                        detail_string=
                        f"{DRIVER_MEMORY_OVERHEAD_KEY} = {fmt_bytes(driver_memory_overhead)}, "
                        f"but ideally, it should be <= {fmt_bytes(DRIVER_MEMORY_OVERHEAD_CONFIG_LOW)}",
                    ))
            return MemoryConfigMetric(overall_info=overall_info,
                                      severity=severity,
                                      details=details)
        else:
            return EmptyMetric(severity=Severity.NONE)
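
constants.get_default_memory_overhead presumably mirrors Spark's own rule for spark.executor.memoryOverhead and spark.driver.memoryOverhead: 10% of the requested memory, with a floor of 384 MiB. A sketch under that assumption:

MIN_MEMORY_OVERHEAD_BYTES = 384 * 1024 * 1024  # Spark's 384 MiB floor
MEMORY_OVERHEAD_FACTOR = 0.10                  # Spark's default overhead factor


def get_default_memory_overhead(memory_in_bytes):
    # Default overhead when the property is not set explicitly:
    # max(10% of the requested memory, 384 MiB).
    return max(int(memory_in_bytes * MEMORY_OVERHEAD_FACTOR),
               MIN_MEMORY_OVERHEAD_BYTES)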
Code example #12
    def analyze_min_max_executors(self):
        """
        If dynamic allocation is enabled, this method analyzes the configured minimum and maximum number of executors.
        If the minimum value is larger than recommended, an issue is reported (as resources might be wasted).
        If the maximum value is larger than recommended, an issue is reported as well. On multi-tenant clusters, where
        separate YARN queues should be used, the maximum-executors issue is not reported when a non-default queue is
        used (once the queues are set up correctly, this should prevent a single application from eating up all the
        resources).
        :return: DynamicAllocationMinMaxExecutorsMetric if issues are detected or EmptyMetric if no issues are detected.
        """
        try:
            is_dynamic_allocation_enabled = cast_to_bool(
                self.app.get_spark_property(DYNAMIC_ALLOCATION_KEY))
        except ValueError:
            # TODO add logging
            is_dynamic_allocation_enabled = False

        if not is_dynamic_allocation_enabled:
            return EmptyMetric(Severity.NONE)

        min_executors = cast_to_int(
            self.app.get_spark_property(
                DYNAMIC_ALLOCATION_MIN_EXECUTORS_KEY)) or 0
        max_executors = cast_to_int(
            self.app.get_spark_property(
                DYNAMIC_ALLOCATION_MAX_EXECUTORS_KEY)) or math.inf
        queue_name = self.app.get_spark_property(
            SPARK_YARN_QUEUE_KEY) or SPARK_YARN_QUEUE_DEFAULT_VALUE

        severity_min = DYNAMIC_ALLOCATION_MIN_EXECUTORS_THRESHOLDS.severity_of(
            min_executors)

        severity_max = DYNAMIC_ALLOCATION_MAX_EXECUTORS_THRESHOLDS.severity_of(max_executors) \
            if queue_name == SPARK_YARN_QUEUE_DEFAULT_VALUE else Severity.NONE

        severity = max(severity_min, severity_max)

        if severity > Severity.NONE:
            overall_info = "Non-optimal configuration of minimum or maximum number of executors"
            details = MetricDetailsList()
            if severity_min > Severity.NONE:
                details.add(
                    MetricDetails(
                        entity_id=None,
                        detail_string=
                        f"{DYNAMIC_ALLOCATION_MIN_EXECUTORS_KEY} is currently set to "
                        f"{min_executors}, but ideally, it should be <= "
                        f"{DYNAMIC_ALLOCATION_MIN_EXECUTORS_LOW}",
                        sort_attr=0,
                        subdetails=[]))
            if severity_max > Severity.NONE:
                details.add(
                    MetricDetails(
                        entity_id=None,
                        detail_string=
                        f"{DYNAMIC_ALLOCATION_MAX_EXECUTORS_KEY} is currently set to "
                        f"{max_executors}, but ideally, it should be <= "
                        f"{DYNAMIC_ALLOCATION_MAX_EXECUTORS_LOW}",
                        sort_attr=0,
                        subdetails=[
                            "Also consider using non-default YARN queue to prevent the application to eat up all the resources"
                        ]))
            return DynamicAllocationMinMaxExecutorsMetric(
                overall_info=overall_info, severity=severity, details=details)
        else:
            return EmptyMetric(Severity.NONE)
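
All twelve analyzers share the same contract: each returns either a concrete metric or an EmptyMetric with Severity.NONE. Assuming every metric object exposes the severity it was constructed with, a hypothetical driver loop that keeps only the findings worth reporting could look like this:

def collect_findings(analyzer):
    # `analyzer` is assumed to be an instance of the class the snippets above
    # belong to; the method names are taken from the examples in this section.
    analyses = [
        analyzer.analyze_failed_jobs,
        analyzer.analyze_failed_stages,
        analyzer.analyze_executors_gc_time,
        analyzer.analyze_driver_gc_time,
        analyzer.analyze_dynamic_allocation,
        analyzer.analyze_serializer_config,
        analyzer.analyze_yarn_queue,
        analyzer.analyze_core_number,
        analyzer.analyze_stage_skews,
        analyzer.analyze_disk_spills,
        analyzer.analyze_memory_configuration,
        analyzer.analyze_min_max_executors,
    ]
    # Keep only metrics that actually report an issue.
    return [metric for metric in (run() for run in analyses)
            if metric.severity > Severity.NONE]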