def get_simulated_runtime(self, waterfall_prefix=""):
    """ Returns the simulated runtime for the job.

    Each stage that is not in self.stages_to_combine is simulated by itself, with its tasks
    ordered by start time, and the simulated stage runtimes are added together. The tasks of
    all stages in self.stages_to_combine are pooled and simulated as one additional step.

    According to the original author, the result should be approximately the same as the
    original runtime of the job, except that it doesn't include scheduler delay.

    If a non-empty waterfall_prefix is passed in, the simulated (start, finish) pairs are
    passed to self.write_simulated_waterfall under the name "<waterfall_prefix>_simulated".
    """
    # NOTE(review): this source contains a second get_simulated_runtime that also passes a
    # concurrency argument to the simulator; if both live in the same class, the later
    # definition is the one that takes effect -- confirm which is intended.
    elapsed = 0
    pooled_tasks = []
    waterfall_rows = []
    for stage_id, stage in self.stages.items():
        if stage_id in self.stages_to_combine:
            # Handled after the loop, together with the other combined stages.
            pooled_tasks.extend(stage.tasks)
            continue
        ordered = sorted(stage.tasks, key=lambda task: task.start_time)
        stage_runtime, intervals = simulate.simulate([task.runtime() for task in ordered])
        # Shift the stage so it begins where the previously simulated stages ended.
        waterfall_rows.append(
            [(begin + elapsed, end + elapsed) for begin, end in intervals])
        elapsed += stage_runtime

    if pooled_tasks:
        ordered = sorted(pooled_tasks, key=lambda task: task.start_time)
        pooled_runtime, intervals = simulate.simulate([task.runtime() for task in ordered])
        # NOTE(review): unlike the per-stage case, these intervals are shifted back by the
        # pooled runtime instead of forward by the runtime accumulated so far -- presumably
        # to draw the combined stages before time zero; confirm this placement is intended.
        waterfall_rows.append(
            [(begin - pooled_runtime, end - pooled_runtime) for begin, end in intervals])
        elapsed += pooled_runtime

    if waterfall_prefix:
        self.write_simulated_waterfall(waterfall_rows, "%s_simulated" % waterfall_prefix)
    return elapsed
def replace_95_stragglers_with_median_speedup(self):
    """ Returns the simulated job runtime without stragglers, divided by the simulated
    original job runtime (self.get_simulated_runtime()).

    Within each stage, every runtime that is at or above get_percentile(runtimes, 0.95) is
    replaced with get_percentile(runtimes, 0.5) -- presumably the 95th percentile and the
    median of the sorted runtimes; get_percentile is defined elsewhere. Stages listed in
    self.stages_to_combine are pooled and simulated together; every other stage is simulated
    on its own and a summary line is printed for it.
    """
    no_straggler_total = 0
    pooled_runtimes = []
    for stage_id, stage in self.stages.items():
        ordered_runtimes = sorted(task.runtime() for task in stage.tasks)
        median_runtime = get_percentile(ordered_runtimes, 0.5)
        cutoff = get_percentile(ordered_runtimes, 0.95)
        # Anything at or above the cutoff counts as a straggler and becomes the median.
        trimmed = [median_runtime if runtime >= cutoff else runtime
                   for runtime in ordered_runtimes]
        if stage_id in self.stages_to_combine:
            pooled_runtimes.extend(trimmed)
            continue
        stage_no_straggler_runtime = simulate.simulate(trimmed)[0]
        no_straggler_total += stage_no_straggler_runtime
        stage_original_runtime = simulate.simulate(
            [task.runtime() for task in stage.tasks])[0]
        print("%s: Orig: %s, no stragg: %s" %
              (stage_id, stage_original_runtime, stage_no_straggler_runtime))

    if pooled_runtimes:
        no_straggler_total += simulate.simulate(pooled_runtimes)[0]
    return no_straggler_total * 1.0 / self.get_simulated_runtime()
def replace_stragglers_with_median_speedup(self, threshold_fn):
    """ Returns the simulated job runtime with stragglers replaced, divided by the simulated
    original job runtime (so values below 1.0 mean the job would have run faster).

    For each stage, passes the list of task runtimes into threshold_fn, which should return a
    threshold runtime. Every task whose runtime is greater than or equal to that threshold is
    treated as a straggler and its runtime is replaced with the stage's median runtime.

    For example, to replace the tasks with the longest 5% of runtimes with the median:
        self.replace_stragglers_with_median_speedup(
            lambda runtimes: numpy.percentile(runtimes, 95))

    Stages in self.stages_to_combine are pooled and simulated together using
    self.combined_stages_concurrency; all other stages are simulated one at a time using
    concurrency.get_max_concurrency(stage.tasks). A summary line is printed for each
    separately simulated stage and one for the combined stages.
    """
    self.print_heading("Computing speedup from replacing straggler tasks with median")
    total_no_stragglers_runtime = 0
    # (start_time, runtime) pairs for the stages that are simulated together: one list with
    # the stragglers replaced and one with the runtimes left untouched.
    combined_no_straggler = []
    combined_original = []
    num_stragglers_combined_stages = 0
    for stage_id, stage in self.stages.items():
        runtimes = [task.runtime() for task in stage.tasks]
        median_runtime = numpy.percentile(runtimes, 50)
        threshold_runtime = threshold_fn(runtimes)
        sorted_stage_tasks = sorted(stage.tasks, key=lambda t: t.runtime())

        no_straggler_start_and_runtimes = []
        num_stragglers = 0
        for task in sorted_stage_tasks:
            runtime = task.runtime()
            if runtime >= threshold_runtime:
                # Sanity check: the threshold is expected to sit at or above the median, so
                # replacing a straggler never makes that task slower.
                assert median_runtime <= runtime
                no_straggler_start_and_runtimes.append((task.start_time, median_runtime))
                num_stragglers += 1
            else:
                no_straggler_start_and_runtimes.append((task.start_time, runtime))

        if stage_id in self.stages_to_combine:
            combined_no_straggler.extend(no_straggler_start_and_runtimes)
            combined_original.extend([(t.start_time, t.runtime()) for t in stage.tasks])
            num_stragglers_combined_stages += num_stragglers
        else:
            max_concurrency = concurrency.get_max_concurrency(stage.tasks)
            no_stragglers_runtime = simulate.simulate(
                [x[1] for x in no_straggler_start_and_runtimes], max_concurrency)[0]
            total_no_stragglers_runtime += no_stragglers_runtime
            original_runtime = simulate.simulate(
                [task.runtime() for task in sorted_stage_tasks], max_concurrency)[0]
            print("%s: Original: %s, Orig (sim): %s, no stragg: %s (%s stragglers)" %
                  (stage_id, stage.finish_time() - stage.start_time, original_runtime,
                   no_stragglers_runtime, num_stragglers))

    if combined_no_straggler:
        # The "Original" wall time has to come from the untouched runtimes; using the
        # straggler-replaced pairs here would understate when the slowest tasks finished.
        original_start_time = min(x[0] for x in combined_original)
        original_finish_time = max(x[0] + x[1] for x in combined_original)
        # Both simulations receive the runtimes ordered by task start time.
        combined_no_straggler.sort()
        new_runtime = simulate.simulate(
            [x[1] for x in combined_no_straggler], self.combined_stages_concurrency)[0]
        original_runtime = simulate.simulate(
            [x[1] for x in sorted(combined_original)], self.combined_stages_concurrency)[0]
        print("Combined: Original: %s, Orig (sim): %s, no stragg: %s (%s stragglers)" %
              (original_finish_time - original_start_time, original_runtime, new_runtime,
               num_stragglers_combined_stages))
        total_no_stragglers_runtime += new_runtime

    return total_no_stragglers_runtime * 1.0 / self.get_simulated_runtime()
def fraction_time_waiting_on_compute(self):
    """ Returns the fraction of total task runtime attributed to compute.

    For every task in every stage, the compute share is task.runtime() minus
    task.runtime_no_compute(); the sum of those differences is divided by the sum of all task
    runtimes. Raises ZeroDivisionError if the summed runtime is zero (e.g. no tasks).
    """
    # One (compute wait, runtime) pair per task, across all stages.
    per_task = [(task.runtime() - task.runtime_no_compute(), task.runtime())
                for stage in self.stages.values()
                for task in stage.tasks]
    compute_wait_sum = sum(wait for wait, _ in per_task)
    runtime_sum = sum(runtime for _, runtime in per_task)
    return compute_wait_sum * 1.0 / runtime_sum
def fraction_time_waiting_on_disk(self):
    """ Returns the fraction of total task runtime attributed to disk use for shuffle.

    For every task in every stage, the disk share is task.runtime() minus
    task.runtime_no_disk_for_shuffle(); the sum of those differences is divided by the sum of
    all task runtimes. Raises ZeroDivisionError if the summed runtime is zero (e.g. no tasks).
    """
    # One (disk wait, runtime) pair per task, across all stages.
    per_task = [(task.runtime() - task.runtime_no_disk_for_shuffle(), task.runtime())
                for stage in self.stages.values()
                for task in stage.tasks]
    disk_wait_sum = sum(wait for wait, _ in per_task)
    runtime_sum = sum(runtime for _, runtime in per_task)
    return disk_wait_sum * 1.0 / runtime_sum
def get_simulated_runtime(self, waterfall_prefix=""):
    """ Returns the simulated runtime for the job.

    Each stage that is not in self.stages_to_combine is simulated by itself: its tasks are
    ordered by start time and their runtimes are passed to simulate.simulate together with
    concurrency.get_max_concurrency(tasks). The tasks of all stages in self.stages_to_combine
    are pooled and simulated once with self.combined_stages_concurrency. The simulated
    runtimes of all these steps are added together.

    According to the original author, the result should be approximately the same as the
    original runtime of the job, except that it doesn't include scheduler delay.

    If a non-empty waterfall_prefix is passed in, the simulated (start, finish) pairs are
    passed to self.write_simulated_waterfall under the name "<waterfall_prefix>_simulated".
    """
    elapsed = 0
    pooled_tasks = []
    waterfall_rows = []
    for stage_id, stage in self.stages.items():
        if stage_id in self.stages_to_combine:
            # Handled after the loop, together with the other combined stages.
            pooled_tasks.extend(stage.tasks)
            continue
        ordered = sorted(stage.tasks, key=lambda task: task.start_time)
        runtimes = [task.runtime() for task in ordered]
        stage_runtime, intervals = simulate.simulate(
            runtimes, concurrency.get_max_concurrency(ordered))
        # Shift the stage so it begins where the previously simulated stages ended.
        waterfall_rows.append(
            [(begin + elapsed, end + elapsed) for begin, end in intervals])
        elapsed += stage_runtime

    if pooled_tasks:
        ordered = sorted(pooled_tasks, key=lambda task: task.start_time)
        runtimes = [task.runtime() for task in ordered]
        pooled_runtime, intervals = simulate.simulate(
            runtimes, self.combined_stages_concurrency)
        # NOTE(review): unlike the per-stage case, these intervals are shifted back by the
        # pooled runtime instead of forward by the runtime accumulated so far -- presumably
        # to draw the combined stages before time zero; confirm this placement is intended.
        waterfall_rows.append(
            [(begin - pooled_runtime, end - pooled_runtime) for begin, end in intervals])
        elapsed += pooled_runtime

    if waterfall_prefix:
        self.write_simulated_waterfall(waterfall_rows, "%s_simulated" % waterfall_prefix)
    return elapsed
def fraction_time_computing(self):
    """ Returns the sum of task.compute_time() over all tasks in all stages, divided by the
    sum of task.runtime() over the same tasks.

    Raises ZeroDivisionError if the summed runtime is zero (e.g. no tasks).
    """
    # One (compute time, runtime) pair per task, across all stages.
    per_task = [(task.compute_time(), task.runtime())
                for stage in self.stages.values()
                for task in stage.tasks]
    compute_sum = sum(compute for compute, _ in per_task)
    runtime_sum = sum(runtime for _, runtime in per_task)
    return compute_sum * 1.0 / runtime_sum
def fraction_time_deserializing(self):
    """ Returns the fraction of time spent deserializing data.

    This is the sum of task.estimated_deserialization_millis over all tasks in all stages,
    divided by the sum of task.runtime() over the same tasks. Raises ZeroDivisionError if the
    summed runtime is zero (e.g. no tasks).
    """
    # One (deserialization time, runtime) pair per task, across all stages.
    per_task = [(task.estimated_deserialization_millis, task.runtime())
                for stage in self.stages.values()
                for task in stage.tasks]
    deserialize_sum = sum(deserialize for deserialize, _ in per_task)
    runtime_sum = sum(runtime for _, runtime in per_task)
    return deserialize_sum * 1.0 / runtime_sum
def fraction_time_serializing(self):
    """ Returns the fraction of time spent serializing and deserializing data.

    For every task, the estimated serialization and deserialization times are added together;
    the sum over all tasks is divided by the sum of all task runtimes. A warning is printed
    for any task whose combined (de)serialization time exceeds its compute time; such tasks
    are still counted. Raises ZeroDivisionError if the summed runtime is zero (e.g. no tasks).
    """
    serialize_sum = 0
    runtime_sum = 0
    all_tasks = (task for stage in self.stages.values() for task in stage.tasks)
    for task in all_tasks:
        task_serialize_time = (task.estimated_serialization_millis +
                               task.estimated_deserialization_millis)
        if task_serialize_time > task.compute_time():
            print("!!!! Warning: For task %s, serialize time (%s) is larger than compute time (%s)" %
                  (task, task_serialize_time, task.compute_time()))
        serialize_sum += task_serialize_time
        runtime_sum += task.runtime()
    return serialize_sum * 1.0 / runtime_sum
def fraction_time_using_disk(self):
    """ Fraction of task time spent writing shuffle outputs to disk and reading them back.

    Computed as the sum of task.disk_time() over all tasks divided by the sum of
    task.runtime() over all tasks; the per-stage sums are logged at debug level.

    Per the original author, this does not include time to spill data to disk (which is fine
    for now because that feature is turned off by default) nor the time to persist result
    data to disk (if that happens).

    Raises ZeroDivisionError if the summed runtime is zero (e.g. no tasks).
    """
    disk_sum = 0
    runtime_sum = 0
    for stage_id, stage in self.stages.items():
        # One (disk time, runtime) pair per task in this stage.
        per_task = [(task.disk_time(), task.runtime()) for task in stage.tasks]
        stage_disk = sum(disk for disk, _ in per_task)
        stage_runtime = sum(runtime for _, runtime in per_task)
        self.logger.debug("Stage %s: Disk write time: %s, total runtime: %s" %
                          (stage_id, stage_disk, stage_runtime))
        disk_sum += stage_disk
        runtime_sum += stage_runtime
    return disk_sum * 1.0 / runtime_sum
def replace_stragglers_with_median_speedup(self):
    """ Returns the simulated job runtime when every task runs for its stage's median
    runtime, divided by the simulated original job runtime (self.get_simulated_runtime()).

    Stragglers are removed by replacing all task runtimes in a stage with the median runtime
    of that stage. Stages in self.stages_to_combine are pooled and simulated together; every
    other stage is simulated on its own.
    """
    # NOTE(review): this source also contains a replace_stragglers_with_median_speedup that
    # takes a threshold_fn argument; if both live in the same class, the later definition is
    # the one that takes effect -- confirm which is intended.
    median_only_total = 0
    pooled_runtimes = []
    for stage_id, stage in self.stages.items():
        stage_median = numpy.median([task.runtime() for task in stage.tasks])
        flattened = [stage_median] * len(stage.tasks)
        if stage_id in self.stages_to_combine:
            pooled_runtimes.extend(flattened)
            continue
        median_only_total += simulate.simulate(flattened)[0]

    if pooled_runtimes:
        median_only_total += simulate.simulate(pooled_runtimes)[0]
    return median_only_total * 1.0 / self.get_simulated_runtime()
def replace_all_tasks_with_median_speedup(self):
    """ Returns the simulated job runtime when every task runs for its stage's median
    runtime, divided by the simulated original job runtime (self.get_simulated_runtime()).

    Stragglers are removed by replacing all task runtimes in a stage with the median runtime
    of that stage. Stages in self.stages_to_combine are pooled and simulated together with
    self.combined_stages_concurrency; every other stage is simulated on its own with
    concurrency.get_max_concurrency(stage.tasks).
    """
    median_only_total = 0
    pooled_runtimes = []
    for stage_id, stage in self.stages.items():
        stage_median = numpy.median([task.runtime() for task in stage.tasks])
        flattened = [stage_median] * len(stage.tasks)
        if stage_id in self.stages_to_combine:
            pooled_runtimes.extend(flattened)
            continue
        median_only_total += simulate.simulate(
            flattened, concurrency.get_max_concurrency(stage.tasks))[0]

    if pooled_runtimes:
        median_only_total += simulate.simulate(
            pooled_runtimes, self.combined_stages_concurrency)[0]
    return median_only_total * 1.0 / self.get_simulated_runtime()