def write_stage_info(self, query_id, prefix):
    """ Appends a one-line summary of the job's last substantial stage to a file.

    The line holds three tab-separated fields: query_id, the runtime of the last
    substantial stage (-1 if no stage qualifies), and self.original_runtime(). It is
    appended to the file named "<prefix>_stage_info".

    query_id: value written in the first column.
    prefix: path prefix of the output file; "_stage_info" is appended to it.
    """
    last_stage_runtime = -1
    last_stage_finish_time = 0
    for stage in self.stages.values():
        finish_time = stage.finish_time()
        runtime = finish_time - stage.start_time
        # This is a hack! Count the most recent stage with runtime > 1s as the "last".
        # Shark produces 1-2 very short stages at the end that do not seem to do anything (and
        # certainly aren't doing the output write we're trying to account for).
        if runtime > 1000 and finish_time > last_stage_finish_time:
            last_stage_finish_time = finish_time
            last_stage_runtime = runtime

    # Build the full line before touching the file, and let the context manager close the
    # handle, so that a failure while computing runtimes cannot leak an open file.
    line = "%s\t%s\t%s\n" % (query_id, last_stage_runtime, self.original_runtime())
    with open("%s_stage_info" % prefix, "a") as stage_info_file:
        stage_info_file.write(line)
def replace_stragglers_with_median_speedup(self, threshold_fn):
    """ Returns how much faster the job would have run if there were no stragglers.

    For each stage, passes the list of task runtimes into threshold_fn, which should return a
    threshold runtime. Then, replaces all task runtimes at or above the given threshold with
    the median runtime of that stage.

    For example, to replace the tasks with the longest 5% of runtimes with the median:
      self.replace_stragglers_with_median_speedup(
        lambda runtimes: numpy.percentile(runtimes, 95))

    Stages whose id is in self.stages_to_combine are pooled and simulated together using
    self.combined_stages_concurrency; every other stage is simulated on its own.

    The return value is the total simulated runtime without stragglers divided by
    self.get_simulated_runtime().
    """
    self.print_heading("Computing speedup from replacing straggler tasks with median")
    total_no_stragglers_runtime = 0
    start_and_runtimes_for_combined_stages = []
    original_start_and_runtimes_for_combined_stages = []
    num_stragglers_combined_stages = 0
    for stage_id, stage in self.stages.items():
        runtimes = [task.runtime() for task in stage.tasks]
        median_runtime = numpy.percentile(runtimes, 50)
        threshold_runtime = threshold_fn(runtimes)
        no_straggler_start_and_runtimes = []
        num_stragglers = 0
        sorted_stage_tasks = sorted(stage.tasks, key=lambda t: t.runtime())
        for task in sorted_stage_tasks:
            task_runtime = task.runtime()
            if task_runtime >= threshold_runtime:
                # A threshold below the median would make the "replacement" slow tasks down.
                assert median_runtime <= task_runtime
                no_straggler_start_and_runtimes.append((task.start_time, median_runtime))
                num_stragglers += 1
            else:
                no_straggler_start_and_runtimes.append((task.start_time, task_runtime))

        if stage_id in self.stages_to_combine:
            # Defer simulation: combined stages are simulated as one pool after the loop.
            start_and_runtimes_for_combined_stages.extend(no_straggler_start_and_runtimes)
            original_start_and_runtimes_for_combined_stages.extend(
                [(t.start_time, t.runtime()) for t in stage.tasks])
            num_stragglers_combined_stages += num_stragglers
        else:
            max_concurrency = concurrency.get_max_concurrency(stage.tasks)
            # Only the first element of simulate.simulate's result is used as the runtime.
            no_stragglers_runtime = simulate.simulate(
                [x[1] for x in no_straggler_start_and_runtimes], max_concurrency)[0]
            total_no_stragglers_runtime += no_stragglers_runtime
            original_runtime = simulate.simulate(
                [task.runtime() for task in sorted_stage_tasks], max_concurrency)[0]
            print("%s: Original: %s, Orig (sim): %s, no stragg: %s (%s stragglers)" %
                  (stage_id, stage.finish_time() - stage.start_time, original_runtime,
                   no_stragglers_runtime, num_stragglers))

    if len(start_and_runtimes_for_combined_stages) > 0:
        # The "Original" span must come from the unmodified task runtimes; deriving it from
        # the straggler-replaced list would under-report the original finish time.
        original_start_time = min(
            x[0] for x in original_start_and_runtimes_for_combined_stages)
        original_finish_time = max(
            x[0] + x[1] for x in original_start_and_runtimes_for_combined_stages)
        # Feed tasks to the simulator ordered by (start time, runtime).
        start_and_runtimes_for_combined_stages.sort()
        runtimes_for_combined_stages = [x[1] for x in start_and_runtimes_for_combined_stages]
        new_runtime = simulate.simulate(
            runtimes_for_combined_stages, self.combined_stages_concurrency)[0]
        original_runtime = simulate.simulate(
            [x[1] for x in sorted(original_start_and_runtimes_for_combined_stages)],
            self.combined_stages_concurrency)[0]
        print("Combined: Original: %s, Orig (sim): %s, no stragg: %s (%s stragglers)" %
              (original_finish_time - original_start_time, original_runtime, new_runtime,
               num_stragglers_combined_stages))
        total_no_stragglers_runtime += new_runtime

    return total_no_stragglers_runtime * 1.0 / self.get_simulated_runtime()
def no_stragglers_perfect_parallelism_speedup(self):
    """ Returns the ratio of an ideal, perfectly parallel runtime to the simulated runtime.

    The ideal runtime assumes the total task time is spread perfectly evenly across the
    available slots: each stage contributes stage.total_runtime() divided by its maximum
    observed concurrency, and the stages in self.stages_to_combine contribute their summed
    total runtime divided by self.combined_stages_concurrency.

    The result is that ideal runtime divided by self.get_simulated_runtime().
    """
    ideal_runtime = 0
    total_runtime_combined_stages = 0
    for stage_id, stage in self.stages.items():
        if stage_id in self.stages_to_combine:
            # Combined stages share one pool of slots, so only accumulate their work here.
            total_runtime_combined_stages += stage.total_runtime()
        else:
            new_runtime = (float(stage.total_runtime()) /
                           concurrency.get_max_concurrency(stage.tasks))
            print("New runtime: %s, original runtime: %s" %
                  (new_runtime, stage.finish_time() - stage.start_time))
            ideal_runtime += new_runtime

    print("Total runtime combined: %s (concurrency %d)" %
          (total_runtime_combined_stages, self.combined_stages_concurrency))
    ideal_runtime += float(total_runtime_combined_stages) / self.combined_stages_concurrency
    print("Getting simulated runtime")
    simulated_actual_runtime = self.get_simulated_runtime()
    print("Ideal runtime for all: %s, simulated: %s" % (ideal_runtime, simulated_actual_runtime))
    return ideal_runtime / simulated_actual_runtime