def get_benchmark_result_markdown(benchmark_files: Sequence[str],
                                  query_base: bool,
                                  verbose: bool = False) -> Tuple[str, str]:
  """Gets the full/abbreviated markdown summary of all benchmarks in files."""
  all_benchmarks = aggregate_all_benchmarks(benchmark_files)

  build_url = get_required_env_var("BUILDKITE_BUILD_URL")
  pr_number = get_required_env_var("BUILDKITE_PULL_REQUEST")
  pr_commit = get_required_env_var("BUILDKITE_COMMIT")
  pr_commit = md.link(pr_commit,
                      f"{GITHUB_IREE_REPO_PREFIX}/commit/{pr_commit}")

  commit_info = f"@ commit {pr_commit}"
  if query_base:
    # Try to find base benchmark results to diff against, walking back from
    # the top of the tree. Give up once the maximum number of attempts is
    # exceeded.
    for i in range(MAX_BASE_COMMIT_QUERY_COUNT):
      base_commit = get_origin_tree_commit(i, verbose)
      base_benchmarks = query_base_benchmark_results(base_commit, verbose)
      base_commit = md.link(base_commit,
                            f"{GITHUB_IREE_REPO_PREFIX}/commit/{base_commit}")

      if len(base_benchmarks) == 0:
        commit_info = (f"@ commit {pr_commit} (no previous benchmark results to"
                       f" compare against since {base_commit})")
        continue

      # Update the aggregate benchmarks with base numbers.
      for bench in base_benchmarks:
        if bench in all_benchmarks:
          all_benchmarks[bench].base_mean_time = base_benchmarks[bench]
      commit_info = f"@ commit {pr_commit} (vs. base {base_commit})"
      break

  pr_info = md.link("Pull request",
                    f"{GITHUB_IREE_REPO_PREFIX}/pull/{pr_number}")
  buildkite_info = md.link("Buildkite build", build_url)

  # Compose the full benchmark tables.
  full_table = [md.header("Full Benchmark Summary", 2)]
  full_table.append(md.unordered_list([commit_info, pr_info, buildkite_info]))
  full_table.append(
      categorize_benchmarks_into_tables(all_benchmarks,
                                        SIMILAR_BECNHMARK_THRESHOLD))

  # Compose the abbreviated benchmark tables.
  abbr_table = [md.header(ABBR_PR_COMMENT_TITLE, 2)]
  abbr_table.append(commit_info)
  abbr_table.append(
      categorize_benchmarks_into_tables(all_benchmarks,
                                        SIMILAR_BECNHMARK_THRESHOLD,
                                        TABLE_SIZE_CUT))
  abbr_table.append("For more information:")
  # The link is not known until the Gist is actually created. Use a
  # placeholder for now and replace it later.
  full_result_info = md.link("Full benchmark result tables",
                             "<<placeholder-link>>")
  abbr_table.append(md.unordered_list([full_result_info, buildkite_info]))

  return "\n\n".join(full_table), "\n\n".join(abbr_table)
Example #2
def category_section_main(blg, stats):
    """Generates the main section of the category README.md file."""
    value_percentage = float(
        (
            (int(stats["unprocessed"]) - int(stats["processed"]))
            / int(stats["unprocessed"])
        )
        * 100
    )
    link_filter = markdown_strings.link(
        "Download",
        f"{blg.info.home}/filters/{blg.category}.txt",
    )
    main_title = (
        markdown_strings.header(f"{blg.data_json[blg.j_key.title]}", 1)
        + "\n"
        + "**"
        + link_filter
        + "**"
    )

    main_desc = markdown_strings.bold(f"{fill(blg.data_json[blg.j_key.desc])}")
    info_list = [
        f"Sources: {len(blg.data_json[blg.j_key.sources])}",
        f"""Rules before processing: {stats["unprocessed"]}""",
        f"""Rules after processing: {stats["processed"]}""",
    ]
    info_add = markdown_strings.unordered_list(info_list)
    string_bold = (
        f"aBL - {blg.data_json[blg.j_key.title]} is {value_percentage:.2f}% lighter"
    )
    sub_desc = f"The {markdown_strings.bold(string_bold)} than its combined sources"
    return [main_title, main_desc, info_add, sub_desc]
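The "lighter" figure above is simply the relative reduction in rule count between the combined sources and the processed filter; a small sketch with hypothetical numbers:

# Hypothetical stats: 120,000 source rules reduced to 45,000 after processing.
stats = {"unprocessed": "120000", "processed": "45000"}
value_percentage = (
    (int(stats["unprocessed"]) - int(stats["processed"]))
    / int(stats["unprocessed"])
    * 100
)
print(f"{value_percentage:.2f}% lighter")  # prints "62.50% lighter"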
Example #3
    def gen_diagnostics(self):
        insight_timespan_threshold = 10 * 60  # 10 min
        if self.job_timespan < insight_timespan_threshold:
            msg = "Insight will be available when more metric samples are " \
                  "collected.\n"
            self.diagnostics += msg
            return

        # Check idleness
        self.diagnostics += md.header("GPU Idleness", 2) + "\n"
        if len(self.idle_gpus) == self.num_gpus:
            msg = md.bold("All of %s GPU(s) in the job are idle. " %
                          len(self.idle_gpus))
            msg += "Please consider killing the job if you no longer need it.\n"
            self.diagnostics += msg
            return
        elif len(self.idle_gpus) > 0:
            msg = md.bold("There are %s idle GPU(s) in the job.\n" %
                          len(self.idle_gpus))
            c1 = "If you are running a job on all GPUs, please check if the process(es) on the idle GPU(s) have died/hung"
            c2 = "If you do not need all GPUs in the job, please consider killing the job and request a new job with fewer GPUs."
            msg += md.unordered_list([c1, c2]) + "\n"
            self.diagnostics += msg
        else:
            self.diagnostics += md.bold("All GPU(s) are active.") + "\n"
        self.diagnostics += "\n"

        # Check Resource usage for active GPUs
        self.diagnostics += md.header("Active GPU Utilization", 2) + "\n"
        good_gpu_util_threshold = 90
        good_gpu_mem_util_threshold = 50
        if self.active_gpu_util >= good_gpu_util_threshold:
            msg = "Average active GPU utilization over time is good at " \
                  "%.2f%%.\n" % self.active_gpu_util
            self.diagnostics += msg
        else:
            msg = "Average active GPU utilization over time is " \
                  "%.2f%% < %s%%. You can try below suggestions to boost " \
                  "GPU utilization:\n" % \
                  (self.active_gpu_util, good_gpu_util_threshold)

            suggestions = []
            if self.active_gpu_memory_util < good_gpu_mem_util_threshold:
                suggestions.append(
                    "Average active GPU memory utilization over time is below "
                    "%s%%. Try increasing the batch size to put more data "
                    "onto GPU memory and boost GPU utilization. For a "
                    "distributed job, if the model has a strict "
                    "requirement on the global effective batch size "
                    "for convergence, you can consider using a job "
                    "with fewer GPUs and a bigger batch size per GPU." %
                    good_gpu_mem_util_threshold)

            if self.max_cpu_per_gpu is not None and \
                    self.cpu_per_active_gpu < self.max_cpu_per_gpu:
                suggestions.append(
                    "The job uses %.2f CPU cores per active GPU on average "
                    "over time. The maximum number of CPU cores per GPU you "
                    "can use without interfering with other GPUs in this "
                    "cluster is %.2f. You can use more CPU cores to "
                    "perform data preprocessing and keep the GPUs from "
                    "starving. Please consider using/increasing "
                    "parallel preprocessing on your input data." %
                    (self.cpu_per_active_gpu, self.max_cpu_per_gpu))

            if self.max_memory_per_gpu is not None and \
                    self.memory_per_active_gpu < self.max_memory_per_gpu:
                suggestions.append(
                    "The job uses %.2fG of memory per active GPU on average "
                    "over time. The maximum memory per GPU you can "
                    "use without interfering with other GPUs in this "
                    "cluster is %.2fG. You can preload more input "
                    "data into memory to make sure your data pipeline "
                    "is never waiting on data loading from disk or "
                    "remote storage." % (self.memory_per_active_gpu / G,
                                         self.max_memory_per_gpu / G))

            suggestions.append(
                "Please check if your program is waiting on NFS I/O. "
                "If so, please consider using scalable storage, e.g. "
                "Azure Blob Storage.")

            suggestions.append(
                "The suggestions above are purely based on average usage "
                "over a time window. Please take a closer look at the "
                "METRICS tab to better understand the utilization pattern "
                "of GPU, GPU memory, CPU and memory over time for further "
                "optimization.")
            msg += md.unordered_list(suggestions) + "\n"
            self.diagnostics += msg + "\n"
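The method relies on fields such as self.idle_gpus, self.active_gpu_util and self.num_gpus that are computed elsewhere in the class. A rough sketch of one plausible way to derive them from per-GPU utilization samples; the threshold, function name and data shape here are assumptions for illustration, not the project's actual logic:

# Assumed helper, for illustration only: average each GPU's utilization samples,
# call a GPU idle when its average stays below a small threshold, and average
# the remaining GPUs to get the active utilization figure.
IDLE_GPU_UTIL_THRESHOLD = 2  # percent; assumed value

def summarize_gpu_usage(gpu_util_samples):
    """gpu_util_samples: dict mapping gpu_id -> list of utilization percentages."""
    averages = {gpu: sum(samples) / len(samples)
                for gpu, samples in gpu_util_samples.items() if samples}
    idle_gpus = [gpu for gpu, avg in averages.items()
                 if avg < IDLE_GPU_UTIL_THRESHOLD]
    active = [avg for gpu, avg in averages.items() if gpu not in idle_gpus]
    active_gpu_util = sum(active) / len(active) if active else 0.0
    return idle_gpus, active_gpu_util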
Example #4
    def test_gen_insights(self):
        since = 1588630427
        end = 1588634027
        node_spec = test_node_spec()
        task_gpu_percent = test_task_gpu_percent()
        task_gpu_mem_percent = test_task_gpu_mem_percent()
        task_cpu_percent = test_task_cpu_percent()
        task_mem_usage_byte = test_task_mem_usage_byte()
        running_job_ids = test_running_job_ids()

        insights = gen_insights(task_gpu_percent, task_gpu_mem_percent,
                                task_cpu_percent, task_mem_usage_byte, since,
                                end, node_spec, running_job_ids)
        self.assertEqual(len(insights), 1)

        insight = insights[0]

        expected_diagnostics = md.header("GPU Idleness", 2) + "\n"
        expected_diagnostics += md.bold("All GPU(s) are active.") + "\n\n"

        expected_diagnostics += md.header("Active GPU Utilization", 2) + "\n"
        expected_diagnostics += "Average active GPU utilization over time is 30.00% < 90%. You can try below suggestions to boost GPU utilization:\n"
        suggestions = []
        suggestions.append(
            "Average active GPU memory utilization over time is below "
            "50%. Try increasing the batch size to put more data "
            "onto GPU memory and boost GPU utilization. For a "
            "distributed job, if the model has a strict "
            "requirement on the global effective batch size "
            "for convergence, you can consider using a job "
            "with fewer GPUs and a bigger batch size per GPU.")
        suggestions.append(
            "The job uses 1.00 CPU cores per active GPU on average "
            "over time. The maximum number of CPU cores per GPU you "
            "can use without interfering with other GPUs in this "
            "cluster is 4.00. You can use more CPU cores to "
            "perform data preprocessing and keep the GPUs from "
            "starving. Please consider using/increasing "
            "parallel preprocessing on your input data.")
        suggestions.append(
            "The job uses 10.00G of memory per active GPU on average "
            "over time. The maximum memory per GPU you can "
            "use without interfering with other GPUs in this "
            "cluster is 100.00G. You can preload more input "
            "data into memory to make sure your data pipeline "
            "is never waiting on data loading from disk or "
            "remote storage.")
        suggestions.append(
            "Please check if your program is waiting on NFS I/O. "
            "If so, please consider using scalable storage, e.g. "
            "Azure Blob Storage.")
        suggestions.append(
            "The suggestions above are purely based on average usage "
            "over a time window. Please take a closer look at the "
            "METRICS tab to better understand the utilization pattern "
            "of GPU, GPU memory, CPU and memory over time for further "
            "optimization.")
        expected_diagnostics += md.unordered_list(suggestions) + "\n"
        expected_diagnostics += "\n"

        expected_insight = {
            "job_id": "job0",
            "since": since,
            "end": end,
            "diagnostics": expected_diagnostics,
        }
        self.assertEqual(expected_insight, insight)