# NOTE: these analysis functions assume the enclosing scripts' imports,
# roughly: itertools, os, warnings, jinja2, operator.itemgetter, the
# boomslang plotting classes (Line, Plot, PlotLayout), and the project's
# own plot_utils, metaprogram_utils, utils, StatQuery, and matcher helpers.
def histogram_plot(experiment_log_dir, plot_spec_string, output_filename,
                   has_legend, x_limit, verbose):
    # Build a single HIST query from the plot spec and run it over the logs.
    queries = [
        plot_utils.plot_spec_string_to_query(plot_spec_string, 0, "HIST")]

    plot_data = metaprogram_utils.process_queries(
        queries, experiment_log_dir, verbose)

    if "plot_points" not in plot_data:
        warnings.warn("No data to plot!")
        return

    histogram_data = plot_data["plot_points"][0]

    # Accumulates bin counts across every key seen so far; note that it is
    # not reset between stats, so later cumulative plots include counts from
    # earlier stats as well.
    cumulative_histogram = {}

    layout = PlotLayout()
    layout.dpi = 250

    for stat_name in histogram_data:
        plot = Plot()
        plot.setTitle(stat_name)

        if has_legend:
            plot.hasLegend(labelSize=8)

        if x_limit is not None:
            plot.setXLimits(0, x_limit)

        style_plot(plot, stat_name)

        # One step-function line per key; fold each key's counts into the
        # cumulative histogram as we go.
        for key, points in sorted(histogram_data[stat_name].items()):
            for size, count in itertools.izip(points["bin"], points["count"]):
                if size not in cumulative_histogram:
                    cumulative_histogram[size] = 0
                cumulative_histogram[size] += count

            line = Line()
            line.stepFunction("pre")
            line.label = str(key)
            line.xValues = points["bin"]
            line.yValues = points["count"]
            plot.add(line)

        layout.addPlot(plot)

        cumulative_plot = Plot()

        if x_limit is not None:
            cumulative_plot.setXLimits(0, x_limit)

        cumulative_plot.setTitle("Cumulative Histogram for " + stat_name)
        style_plot(cumulative_plot, stat_name)

        line = Line()
        line.stepFunction("pre")
        line.xValues = sorted(cumulative_histogram.keys())
        line.yValues = [cumulative_histogram[key] for key in line.xValues]

        cumulative_plot.add(line)
        layout.addPlot(cumulative_plot)

    layout.save(output_filename)

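# Example invocation (a sketch; the path and HIST plot spec below are
# hypothetical, and the spec string format is assumed rather than documented
# here):
#
#   histogram_plot(
#       "/path/to/experiment_logs",
#       "phase_one.*.*.*.write_size",   # hypothetical HIST plot spec
#       "write_size_histogram.png",
#       has_legend=True, x_limit=None, verbose=False)
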
def calculate_rate(input_directory, skip_phase_zero, skip_phase_one,
                   skip_phase_two, verbose):
    phaseTimesQuery = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("logger_name", None),
        ("stat_name", "phase_runtime"),
        ("start_time", None))
    phaseTimesQuery.match_processor_function = handleTimestampQueryMatch

    diskCountQuery = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("logger_name", "mapreduce"),
        ("stat_name", ["num_input_disks", "num_intermediate_disks"]),
        ("uint_value", None))
    diskCountQuery.match_processor_function = handleDiskCountMatch

    inputSizeQuery = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", "reader"),
        ("id", None),
        ("stat_name", "bytes_produced"),
        ("uint_value", None))
    inputSizeQuery.match_processor_function = handleReaderInputMatch

    writerOutputQuery = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", "writer"),
        ("id", None),
        ("stat_name", "bytes_consumed"),
        ("uint_value", None))
    writerOutputQuery.match_processor_function = handleWriterOutputMatch

    queries = [
        phaseTimesQuery, diskCountQuery, inputSizeQuery, writerOutputQuery]

    skipped_phases = []
    if skip_phase_zero:
        skipped_phases.append("phase_zero")
    if skip_phase_one:
        skipped_phases.append("phase_one")
    if skip_phase_two:
        skipped_phases.append("phase_two")

    output_data = utils.process_queries(
        queries, input_directory, verbose, skipped_phases)

    data_for_display = postprocess_rate_data(output_data)

    # The environment and template are constant across keys, so build them
    # once rather than once per iteration.
    env = jinja2.Environment(
        loader=jinja2.FileSystemLoader(os.path.dirname(__file__)),
        trim_blocks=True)
    template = env.get_template('rate_summary_template.jinja')

    for key in sorted(data_for_display.keys()):
        rendered_template = template.render(**data_for_display[key])
        print rendered_template.strip() + "\n"

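# Self-contained sketch of the rendering pattern above, using an inline
# template instead of rate_summary_template.jinja (whose actual fields are
# not shown in this file; the two below are illustrative only):
def _render_rate_summary_example():
    env = jinja2.Environment(trim_blocks=True)
    template = env.from_string(
        "phase: {{ phase_name }}\n"
        "rate:  {{ rate_mb_per_sec }} MB/s\n")
    return template.render(phase_name="phase_one", rate_mb_per_sec=123.4)
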
def list_time_series(
        experiment_log_dir, plot_spec_strings, output_filename, verbose):
    queries = []
    queries.extend(get_list_time_series_queries())

    time_series_data = utils.process_queries(
        queries, experiment_log_dir, verbose)

    time_series_keys = time_series_data["time_series_keys"]

    # Use a context manager so the file is closed even if writing fails.
    with open(output_filename, 'w') as output_fp:
        for time_series_key in sorted(
                time_series_keys, key=itemgetter(0, 1, 3, 2)):
            print >> output_fp, time_series_tuple_to_str(time_series_key)

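# itemgetter(0, 1, 3, 2) orders keys by their first, second, fourth, and
# only then third fields. A minimal illustration with made-up key tuples:
#
#   >>> from operator import itemgetter
#   >>> keys = [("phase_one", 0, "reader", 1), ("phase_one", 0, "writer", 0)]
#   >>> sorted(keys, key=itemgetter(0, 1, 3, 2))
#   [('phase_one', 0, 'writer', 0), ('phase_one', 0, 'reader', 1)]
#
# The writer tuple sorts first because its fourth field (0) outranks the
# third field in the comparison.
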
def time_series_plot(experiment_log_dir, plot_spec_strings, make_legend,
                     split_by_host, group_by_query, verbose):
    queries = []

    for i, plot_spec_string in enumerate(plot_spec_strings):
        queries.append(
            plot_utils.plot_spec_string_to_query(plot_spec_string, i, "COLL"))

    plot_data = metaprogram_utils.process_queries(
        queries, experiment_log_dir, verbose)

    plots = make_plots(plot_data, make_legend, split_by_host, group_by_query)

    if len(plots) == 0:
        warnings.warn("No data to plot!")
        return

    return plots

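# Example invocation (a sketch; the path and COLL plot spec are hypothetical,
# and the spec string format is assumed). Returns None, with a warning, when
# no query matches anything:
#
#   plots = time_series_plot(
#       "/path/to/experiment_logs",
#       ["phase_one.*.reader.*.bytes_produced"],  # hypothetical COLL spec
#       make_legend=True, split_by_host=False, group_by_query=True,
#       verbose=False)
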
def gather_runtime_info(experiment_directory, verbose, skipped_phases=None):
    # Avoid a mutable default argument; treat None as "skip nothing".
    if skipped_phases is None:
        skipped_phases = []

    total_runtime_query = StatQuery(
        "SUMM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "runtime"),
        ("summary_stat_name", "sum"))
    total_runtime_query.match_processor_function = \
        stat_container_append_matcher("total_runtime", "value")

    total_idle_time_query = StatQuery(
        "SUMM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "wait"),
        ("summary_stat_name", "sum"))
    total_idle_time_query.match_processor_function = \
        stat_container_append_matcher("total_idle_time", "value")

    pipeline_saturation_time_query = StatQuery(
        "SUMM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "pipeline_saturation_wait"),
        ("summary_stat_name", "sum"))
    pipeline_saturation_time_query.match_processor_function = \
        stat_container_append_matcher("pipeline_saturation_time", "value")

    num_workers_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("logger_name", None),
        ("stat_name", "num_workers"),
        ("uint_value", None))
    num_workers_query.match_processor_function = \
        stat_container_append_matcher("num_workers", "uint_value")

    teardown_time_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "teardown"),
        ("start_time", None))
    teardown_time_query.match_processor_function = \
        stat_container_append_matcher("total_teardown_time", "elapsed_time")

    stage_runtime_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("logger_name", None),
        ("stat_name", "stage_runtime"),
        ("start_time", None))
    stage_runtime_query.match_processor_function = \
        stat_container_append_matcher("stage_runtime", "elapsed_time")

    input_size_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "bytes_consumed"),
        ("uint_value", None))
    input_size_query.match_processor_function = \
        stat_container_append_matcher("total_bytes_in", "uint_value")

    output_size_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "bytes_produced"),
        ("uint_value", None))
    output_size_query.match_processor_function = \
        stat_container_append_matcher("total_bytes_out", "uint_value")

    allocation_time_query = StatQuery(
        "SUMM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "allocation_wait_time"),
        ("summary_stat_name", "sum"))
    allocation_time_query.match_processor_function = \
        stat_container_append_matcher("total_mem_wait_time", "value")

    enqueue_block_time_query = StatQuery(
        "SUMM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "queue_saturation_block_time"),
        ("summary_stat_name", "sum"))
    enqueue_block_time_query.match_processor_function = \
        stat_container_append_matcher("total_enqueue_block_time", "value")

    worker_type_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "worker_type"),
        ("str_value", None))
    worker_type_query.match_processor_function = set_stage_value_matcher

    would_have_blocked_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "would_have_blocked"),
        ("uint_value", None))
    would_have_blocked_query.match_processor_function = \
        stat_container_append_matcher("would_have_blocked", "uint_value")

    total_ios_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "total_ios"),
        ("uint_value", None))
    total_ios_query.match_processor_function = \
        stat_container_append_matcher("total_ios", "uint_value")

    worker_start_time_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "worker_start_time"),
        ("uint_value", None))
    worker_start_time_query.match_processor_function = \
        stat_container_append_matcher("worker_start_time", "uint_value")

    worker_stop_time_query = StatQuery(
        "DATM",
        ("phase_name", None),
        ("epoch", None),
        ("stage_name", None),
        ("id", None),
        ("stat_name", "worker_stop_time"),
        ("uint_value", None))
    worker_stop_time_query.match_processor_function = \
        stat_container_append_matcher("worker_stop_time", "uint_value")

    queries = [
        total_runtime_query, total_idle_time_query,
        pipeline_saturation_time_query, num_workers_query,
        teardown_time_query, stage_runtime_query, input_size_query,
        output_size_query, allocation_time_query, enqueue_block_time_query,
        worker_type_query, would_have_blocked_query, total_ios_query,
        worker_start_time_query, worker_stop_time_query]

    runtime_info = utils.process_queries(
        queries, experiment_directory, verbose, skipped_phases)

    runtime_info = postprocess(runtime_info, experiment_directory)

    return runtime_info
