Example #1
0
class MCSFinder(ControlFlow):
    def __init__(self,
                 simulation_cfg,
                 superlog_path_or_dag,
                 invariant_check_name=None,
                 transform_dag=None,
                 end_wait_seconds=0.5,
                 mcs_trace_path=None,
                 extra_log=None,
                 runtime_stats_path=None,
                 wait_on_deterministic_values=False,
                 no_violation_verification_runs=1,
                 optimized_filtering=False,
                 forker=LocalForker(),
                 replay_final_trace=True,
                 strict_assertion_checking=False,
                 delay_flow_mods=False,
                 **kwargs):
        super(MCSFinder, self).__init__(simulation_cfg)
        # number of subsequences delta debugging has examined so far, for
        # distingushing runtime stats from different intermediate runs.
        self.subsequence_id = 0
        self.mcs_log_tracker = None
        self.replay_log_tracker = None
        self.mcs_trace_path = mcs_trace_path
        self.sync_callback = None
        self._log = logging.getLogger("mcs_finder")

        if invariant_check_name is None:
            raise ValueError("Must specify invariant check")
        if invariant_check_name not in name_to_invariant_check:
            raise ValueError(
                '''Unknown invariant check %s.\n'''
                '''Invariant check name must be defined in config.invariant_checks''',
                invariant_check_name)
        self.invariant_check = name_to_invariant_check[invariant_check_name]

        if type(superlog_path_or_dag) == str:
            self.superlog_path = superlog_path_or_dag
            # The dag is codefied as a list, where each element has
            # a list of its dependents
            self.dag = EventDag(log_parser.parse_path(self.superlog_path))
        else:
            self.dag = superlog_path_or_dag

        last_invariant_violation = self.dag.get_last_invariant_violation()
        if last_invariant_violation is None:
            raise ValueError("No invariant violation found in dag...")
        violations = last_invariant_violation.violations
        if len(violations) > 1:
            self.bug_signature = None
            while self.bug_signature is None:
                msg.interactive(
                    "\n------------------------------------------\n")
                msg.interactive(
                    "Multiple violations detected! Choose one for MCS Finding:"
                )
                for i, violation in enumerate(violations):
                    msg.interactive("  [%d] %s" % (i + 1, violation))
                violation_index = msg.raw_input("> ")
                if re.match("^\d+$", violation_index) is None or\
                   int(violation_index) < 1 or\
                   int(violation_index) > len(violations):
                    msg.fail("Must provide an integer between 1 and %d!" %
                             len(violations))
                    continue
                self.bug_signature = violations[int(violation_index) - 1]
        else:
            self.bug_signature = violations[0]
        msg.success(
            "\nBug signature to match is %s. Proceeding with MCS finding!\n" %
            self.bug_signature)

        self.transform_dag = transform_dag
        # A second log with just our MCS progress log messages
        self._extra_log = extra_log
        self.kwargs = kwargs
        self.end_wait_seconds = end_wait_seconds
        self.wait_on_deterministic_values = wait_on_deterministic_values
        # `no' means "number"
        self.no_violation_verification_runs = no_violation_verification_runs
        self._runtime_stats = RuntimeStats(
            self.subsequence_id, runtime_stats_path=runtime_stats_path)
        # Whether to try alternate trace splitting techiques besides splitting by time.
        self.optimized_filtering = optimized_filtering
        self.forker = forker
        self.replay_final_trace = replay_final_trace
        self.strict_assertion_checking = strict_assertion_checking
        self.delay_flow_mods = delay_flow_mods

    def log(self, s):
        ''' Output a message to both self._log and self._extra_log '''
        msg.mcs_event(s)
        if self._extra_log is not None:
            self._extra_log.write(s + '\n')
            self._extra_log.flush()

    def log_violation(self, s):
        ''' Output a message to both self._log and self._extra_log '''
        msg.mcs_event(color.RED + s)
        if self._extra_log is not None:
            self._extra_log.write(s + '\n')
            self._extra_log.flush()

    def log_no_violation(self, s):
        ''' Output a message to both self._log and self._extra_log '''
        msg.mcs_event(color.GREEN + s)
        if self._extra_log is not None:
            self._extra_log.write(s + '\n')
            self._extra_log.flush()

    def init_results(self, results_dir):
        ''' Precondition: results_dir exists, and is clean (preferably
    initialized by experiments/setup.py).'''
        if self._extra_log is None:
            self._extra_log = open("%s/mcs_finder.log" % results_dir, "w")
        if self._runtime_stats.get_runtime_stats_path() is None:
            runtime_stats_path = "%s/runtime_stats.json" % results_dir
            self._runtime_stats.set_runtime_stats_path(runtime_stats_path)
        if self.mcs_trace_path is None:
            self.mcs_trace_path = "%s/mcs.trace" % results_dir
        # TODO(cs): assumes that transform dag is a peeker, not some other
        # transformer
        peeker_exists = self.transform_dag is not None
        self.mcs_log_tracker = MCSLogTracker(results_dir, self.mcs_trace_path,
                                             self._runtime_stats,
                                             self.simulation_cfg,
                                             peeker_exists)
        self.replay_log_tracker = ReplayLogTracker(results_dir)

    # N.B. only called in the parent process.
    def simulate(self, check_reproducability=True):
        self._runtime_stats.set_dag_stats(self.dag)

        # apply domain knowledge: treat failure/recovery pairs atomically, and
        # filter event types we don't want to include in the MCS
        # (e.g. CheckInvariants)
        self.dag.mark_invalid_input_sequences()
        # TODO(cs): invoke dag.filter_unsupported_input_types()

        if len(self.dag) == 0:
            raise RuntimeError("No supported input types?")

        if check_reproducability:
            # First, run through without pruning to verify that the violation exists
            self._runtime_stats.record_replay_start()

            for i in range(0, self.no_violation_verification_runs):
                bug_found = self.replay(self.dag,
                                        "reproducibility",
                                        ignore_runtime_stats=True)
                if bug_found:
                    break
            self._runtime_stats.set_initial_verification_runs_needed(i)
            self._runtime_stats.record_replay_end()
            if not bug_found:
                msg.fail("Unable to reproduce correctness violation!")
                sys.exit(5)
            self.log(
                "Violation reproduced successfully! Proceeding with pruning")

        self._runtime_stats.record_prune_start()

        # Run optimizations.
        # TODO(cs): Better than a boolean flag: check if
        # log(len(self.dag)) > number of input types to try
        if self.optimized_filtering:
            self._optimize_event_dag()
        precompute_cache = PrecomputeCache()

        # Invoke delta debugging
        (dag,
         total_inputs_pruned) = self._ddmin(self.dag,
                                            2,
                                            precompute_cache=precompute_cache)
        # Make sure to track the final iteration size
        self._track_iteration_size(total_inputs_pruned)
        self.dag = dag

        self.log("=== Total replays: %d ===" %
                 self._runtime_stats.total_replays)
        self._runtime_stats.record_prune_end()
        self.mcs_log_tracker.dump_runtime_stats()
        self.log("Final MCS (%d elements):" % len(self.dag.input_events))
        for i in self.dag.input_events:
            self.log(" - %s" % str(i))

        if self.replay_final_trace:
            #  Replaying the final trace achieves two goals:
            #  - verifies that the MCS indeed ends in the violation
            #  - allows us to prune internal events that time out
            bug_found = self.replay(self.dag,
                                    "final_mcs_trace",
                                    ignore_runtime_stats=True)
            if not bug_found:
                self.log(
                    '''Warning: MCS did not result in violation. Trying replay '''
                    '''again without timed out events.''')
                # TODO(cs): always replay the MCS without timeouts, since the console
                # output will be significantly cleaner?
                no_timeouts = self.dag.filter_timeouts()
                bug_found = self.replay(no_timeouts,
                                        "final_mcs_no_timed_out_events",
                                        ignore_runtime_stats=True)
                if not bug_found:
                    self.log(
                        '''Warning! Final MCS did not result in violation, even '''
                        ''' after ignoring timed out internal events. '''
                        ''' See tools/visualization/visualize1D.html for debugging'''
                    )

        # N.B. dumping the MCS trace must occur after the final replay trace,
        # since we need to infer which events will time out for events.trace.notimeouts
        if self.mcs_trace_path is not None:
            self.mcs_log_tracker.dump_mcs_trace(self.dag, self)
        return ExitCode(0)

    # N.B. always called within a child process.
    def _ddmin(self,
               dag,
               split_ways,
               precompute_cache=None,
               label_prefix=(),
               total_inputs_pruned=0):
        # This is the delta-debugging algorithm from:
        #   http://www.st.cs.uni-saarland.de/papers/tse2002/tse2002.pdf,
        # Section 3.2
        # TODO(cs): we could do much better if we leverage domain knowledge (e.g.,
        # start by pruning all LinkFailures, or splitting by nodes rather than
        # time)
        if split_ways > len(dag.input_events):
            self.log("Done")
            return (dag, total_inputs_pruned)

        local_label = lambda i, inv=False: "%s%d/%d" % ("~" if inv else "", i,
                                                        split_ways)
        subset_label = lambda label: ".".join(
            map(str, label_prefix + (label, )))
        print_subset = lambda label, s: subset_label(label) + ": " + " ".join(
            map(lambda e: e.label, s))

        subsets = split_list(dag.input_events, split_ways)
        self.log("Subsets:\n" + "\n".join(
            print_subset(local_label(i), s) for i, s in enumerate(subsets)))
        for i, subset in enumerate(subsets):
            label = local_label(i)
            new_dag = dag.input_subset(subset)
            input_sequence = tuple(new_dag.input_events)
            self.log("Current subset: %s" %
                     print_subset(label, input_sequence))
            if precompute_cache.already_done(input_sequence):
                self.log("Already computed. Skipping")
                continue
            precompute_cache.update(input_sequence)
            if input_sequence == ():
                self.log(
                    "Subset after pruning dependencies was empty. Skipping")
                continue

            self._track_iteration_size(total_inputs_pruned)
            violation = self._check_violation(new_dag, i, label)
            if violation:
                self.log_violation(
                    "Subset %s reproduced violation. Subselecting." %
                    subset_label(label))
                self.mcs_log_tracker.maybe_dump_intermediate_mcs(
                    new_dag, subset_label(label), self)

                total_inputs_pruned += len(dag.input_events) - len(
                    new_dag.input_events)
                return self._ddmin(new_dag,
                                   2,
                                   precompute_cache=precompute_cache,
                                   label_prefix=label_prefix + (label, ),
                                   total_inputs_pruned=total_inputs_pruned)

        self.log_no_violation(
            "No subsets with violations. Checking complements")
        for i, subset in enumerate(subsets):
            label = local_label(i, True)
            prefix = label_prefix + (label, )
            new_dag = dag.input_complement(subset)
            input_sequence = tuple(new_dag.input_events)
            self.log("Current complement: %s" %
                     print_subset(label, input_sequence))
            if precompute_cache.already_done(input_sequence):
                self.log("Already computed. Skipping")
                continue
            precompute_cache.update(input_sequence)

            if input_sequence == ():
                self.log(
                    "Subset %s after pruning dependencies was empty. Skipping",
                    subset_label(label))
                continue

            self._track_iteration_size(total_inputs_pruned)
            violation = self._check_violation(new_dag, i, label)
            if violation:
                self.log_violation(
                    "Subset %s reproduced violation. Subselecting." %
                    subset_label(label))
                self.mcs_log_tracker.maybe_dump_intermediate_mcs(
                    new_dag, subset_label(label), self)
                total_inputs_pruned += len(dag.input_events) - len(
                    new_dag.input_events)
                return self._ddmin(new_dag,
                                   max(split_ways - 1, 2),
                                   precompute_cache=precompute_cache,
                                   label_prefix=prefix,
                                   total_inputs_pruned=total_inputs_pruned)

        self.log_no_violation("No complements with violations.")
        if split_ways < len(dag.input_events):
            self.log("Increasing granularity.")
            return self._ddmin(dag,
                               min(len(dag.input_events), split_ways * 2),
                               precompute_cache=precompute_cache,
                               label_prefix=label_prefix,
                               total_inputs_pruned=total_inputs_pruned)
        return (dag, total_inputs_pruned)

    # N.B. always called within a child process.
    def _track_iteration_size(self, total_inputs_pruned):
        self._runtime_stats.record_iteration_size(
            len(self.dag.input_events) - total_inputs_pruned)

    # N.B. always called within a child process.
    def _check_violation(self, new_dag, subset_index, label):
        ''' Check if there were violations '''
        # Try no_violation_verification_runs times to see if the bug shows up
        for i in range(0, self.no_violation_verification_runs):
            bug_found = self.replay(new_dag, label)

            if bug_found:
                # Violation in the subset
                self.log_violation("Violation! Considering %d'th" %
                                   subset_index)
                self._runtime_stats.record_violation_found(i)
                return True

        # No violation!
        self.log_no_violation("No violation in %d'th..." % subset_index)
        return False

    def replay(self, new_dag, label, ignore_runtime_stats=False):
        # Run the simulation forward
        if self.transform_dag:
            new_dag = self.transform_dag(new_dag)

        self._runtime_stats.record_replay_stats(len(new_dag.input_events))

        # N.B. this function is run as a child process.
        def play_forward(results_dir, subsequence_id):
            # TODO(cs): need to serialize the parameters to Replayer rather than
            # wrapping them in a closure... otherwise, can't use RemoteForker
            # TODO(aw): MCSFinder needs to configure Simulation to always let DataplaneEvents pass through
            create_clean_python_dir(results_dir)

            # Copy stdout and stderr to a file "replay.out"
            tee = Tee(open(os.path.join(results_dir, "replay.out"), "w"))
            tee.tee_stdout()
            tee.tee_stderr()

            # Set up replayer.
            input_logger = InputLogger()
            replayer = Replayer(
                self.simulation_cfg,
                new_dag,
                wait_on_deterministic_values=self.wait_on_deterministic_values,
                input_logger=input_logger,
                allow_unexpected_messages=False,
                pass_through_whitelisted_messages=True,
                delay_flow_mods=self.delay_flow_mods,
                **self.kwargs)
            replayer.init_results(results_dir)
            self._runtime_stats = RuntimeStats(subsequence_id)
            violations = []
            simulation = None
            try:
                simulation = replayer.simulate()
                self._track_new_internal_events(simulation, replayer)
                # Wait a bit in case the bug takes awhile to happen
                self.log("Sleeping %d seconds after run" %
                         self.end_wait_seconds)
                time.sleep(self.end_wait_seconds)
                violations = self.invariant_check(simulation)
                if violations != []:
                    input_logger.log_input_event(
                        InvariantViolation(violations))
            except SystemExit:
                # One of the invariant checks bailed early. Oddly, this is not an
                # error for us, it just means that there were no violations...
                # [this logic is arguably broken]
                # Return no violations, and let Forker handle system exit for us.
                violations = []
            finally:
                input_logger.close(replayer,
                                   self.simulation_cfg,
                                   skip_mcs_cfg=True)
                if simulation is not None:
                    simulation.clean_up()
                tee.close()
            if self.strict_assertion_checking:
                test_serialize_response(violations,
                                        self._runtime_stats.client_dict())
            timed_out_internal = [
                e.label for e in new_dag.events if e.timed_out
            ]
            return (violations, self._runtime_stats.client_dict(),
                    timed_out_internal)

        # TODO(cs): once play_forward() is no longer a closure, register it only once
        self.forker.register_task("play_forward", play_forward)
        results_dir = self.replay_log_tracker.get_replay_logger_dir(label)
        self.subsequence_id += 1
        (violations, client_runtime_stats,
         timed_out_internal) = self.forker.fork("play_forward", results_dir,
                                                self.subsequence_id)
        new_dag.set_events_as_timed_out(timed_out_internal)

        bug_found = False
        if violations != []:
            msg.fail("Violations: %s" % str(violations))
            if self.bug_signature in violations:
                bug_found = True
            else:
                msg.fail("Bug does not match initial violation fingerprint!")
        else:
            msg.success("No correctness violations!")

        if not ignore_runtime_stats:
            self._runtime_stats.merge_client_dict(client_runtime_stats)

        return bug_found

    def _optimize_event_dag(self):
        ''' Employs domain knowledge of event classes to reduce the size of event
    dag. Currently prunes event types.'''
        # TODO(cs): Another approach for later: split by nodes
        event_types = [
            TrafficInjection, DataplaneDrop, SwitchFailure, SwitchRecovery,
            LinkFailure, LinkRecovery, HostMigration, ControllerFailure,
            ControllerRecovery, PolicyChange, ControlChannelBlock,
            ControlChannelUnblock
        ]
        for event_type in event_types:
            pruned = [
                e for e in self.dag.input_events
                if not isinstance(e, event_type)
            ]
            if len(pruned) == len(self.dag.input_events):
                self.log("\t** No events pruned for event type %s. Next!" %
                         event_type)
                continue
            pruned_dag = self.dag.input_complement(pruned)
            bug_found = self.replay(pruned_dag, "opt_%s" % event_type.__name__)
            if bug_found:
                self.log(
                    "\t** VIOLATION for pruning event type %s! Resizing original dag"
                    % event_type)
                self.dag = pruned_dag

    # N.B. always called within a child process.
    def _track_new_internal_events(self, simulation, replayer):
        ''' Pre: simulation must have been run through a replay'''
        # We always check against internal events that were buffered at the end of
        # the original run (don't want to overcount)
        prev_buffered_receives = []
        try:
            path = self.superlog_path + ".unacked"
            if not os.path.exists(path):
                log.warn(
                    "unacked internal events file from original run does not exist"
                )
                return
            prev_buffered_receives = set([
                e.pending_receive for e in [
                    f for f in EventDag(log_parser.parse_path(path)).events
                    if type(f) == ControlMessageReceive
                ]
            ])
        except ValueError as e:
            log.warn("unacked internal events is corrupt? %r" % e)
            return
        buffered_message_receipts = []
        for p in simulation.openflow_buffer.pending_receives():
            if p not in prev_buffered_receives:
                buffered_message_receipts.append(repr(p))
            else:
                prev_buffered_receives.remove(p)

        self._runtime_stats.record_buffered_message_receipts(
            buffered_message_receipts)
        new_internal_events = replayer.unexpected_state_changes + replayer.passed_unexpected_messages
        self._runtime_stats.record_new_internal_events(new_internal_events)
        self._runtime_stats.record_early_internal_events(
            replayer.early_state_changes)
        self._runtime_stats.record_timed_out_events(
            dict(replayer.event_scheduler_stats.event2timeouts))
        self._runtime_stats.record_matched_events(
            dict(replayer.event_scheduler_stats.event2matched))
Example #2
0
class MCSFinder(ControlFlow):
  def __init__(self, simulation_cfg, superlog_path_or_dag,
               invariant_check_name=None,
               transform_dag=None, end_wait_seconds=0.5,
               mcs_trace_path=None, extra_log=None, runtime_stats_path=None,
               wait_on_deterministic_values=False,
               no_violation_verification_runs=1,
               optimized_filtering=False, forker=LocalForker(),
               replay_final_trace=True, strict_assertion_checking=False,
               **kwargs):
    super(MCSFinder, self).__init__(simulation_cfg)
    # number of subsequences delta debugging has examined so far, for
    # distingushing runtime stats from different intermediate runs.
    self.subsequence_id = 0
    self.mcs_log_tracker = None
    self.replay_log_tracker = None
    self.mcs_trace_path = mcs_trace_path
    self.sync_callback = None
    self._log = logging.getLogger("mcs_finder")

    if invariant_check_name is None:
      raise ValueError("Must specify invariant check")
    if invariant_check_name not in name_to_invariant_check:
      raise ValueError('''Unknown invariant check %s.\n'''
                       '''Invariant check name must be defined in config.invariant_checks''',
                       invariant_check_name)
    self.invariant_check = name_to_invariant_check[invariant_check_name]

    if type(superlog_path_or_dag) == str:
      self.superlog_path = superlog_path_or_dag
      # The dag is codefied as a list, where each element has
      # a list of its dependents
      self.dag = EventDag(log_parser.parse_path(self.superlog_path))
    else:
      self.dag = superlog_path_or_dag

    last_invariant_violation = self.dag.get_last_invariant_violation()
    if last_invariant_violation is None:
      raise ValueError("No invariant violation found in dag...")
    violations = last_invariant_violation.violations
    if len(violations) > 1:
      self.bug_signature = None
      while self.bug_signature is None:
        msg.interactive("\n------------------------------------------\n")
        msg.interactive("Multiple violations detected! Choose one for MCS Finding:")
        for i, violation in enumerate(violations):
          msg.interactive("  [%d] %s" % (i+1, violation))
        violation_index = msg.raw_input("> ")
        if re.match("^\d+$", violation_index) is None or\
           int(violation_index) < 1 or\
           int(violation_index) > len(violations):
          msg.fail("Must provide an integer between 1 and %d!" % len(violations))
          continue
        self.bug_signature = violations[int(violation_index)-1]
    else:
      self.bug_signature = violations[0]
    msg.success("\nBug signature to match is %s. Proceeding with MCS finding!\n" % self.bug_signature)

    self.transform_dag = transform_dag
    # A second log with just our MCS progress log messages
    self._extra_log = extra_log
    self.kwargs = kwargs
    self.end_wait_seconds = end_wait_seconds
    self.wait_on_deterministic_values = wait_on_deterministic_values
    # `no' means "number"
    self.no_violation_verification_runs = no_violation_verification_runs
    self._runtime_stats = RuntimeStats(self.subsequence_id, runtime_stats_path=runtime_stats_path)
    # Whether to try alternate trace splitting techiques besides splitting by time.
    self.optimized_filtering = optimized_filtering
    self.forker = forker
    self.replay_final_trace = replay_final_trace
    self.strict_assertion_checking = strict_assertion_checking

  def log(self, s):
    ''' Output a message to both self._log and self._extra_log '''
    msg.mcs_event(s)
    if self._extra_log is not None:
      self._extra_log.write(s + '\n')
      self._extra_log.flush()

  def log_violation(self, s):
    ''' Output a message to both self._log and self._extra_log '''
    msg.mcs_event(color.RED + s)
    if self._extra_log is not None:
      self._extra_log.write(s + '\n')
      self._extra_log.flush()

  def log_no_violation(self, s):
    ''' Output a message to both self._log and self._extra_log '''
    msg.mcs_event(color.GREEN + s)
    if self._extra_log is not None:
      self._extra_log.write(s + '\n')
      self._extra_log.flush()

  def init_results(self, results_dir):
    ''' Precondition: results_dir exists, and is clean (preferably
    initialized by experiments/setup.py).'''
    if self._extra_log is None:
      self._extra_log = open("%s/mcs_finder.log" % results_dir, "w")
    if self._runtime_stats.get_runtime_stats_path() is None:
      runtime_stats_path = "%s/runtime_stats.json" % results_dir
      self._runtime_stats.set_runtime_stats_path(runtime_stats_path)
    if self.mcs_trace_path is None:
      self.mcs_trace_path = "%s/mcs.trace" % results_dir
    # TODO(cs): assumes that transform dag is a peeker, not some other
    # transformer
    peeker_exists = self.transform_dag is not None
    self.mcs_log_tracker = MCSLogTracker(results_dir, self.mcs_trace_path,
                                         self._runtime_stats,
                                         self.simulation_cfg, peeker_exists)
    self.replay_log_tracker = ReplayLogTracker(results_dir)

  # N.B. only called in the parent process.
  def simulate(self, check_reproducability=True):
    self._runtime_stats.set_dag_stats(self.dag)

    # apply domain knowledge: treat failure/recovery pairs atomically, and
    # filter event types we don't want to include in the MCS
    # (e.g. CheckInvariants)
    self.dag.mark_invalid_input_sequences()
    self.dag = self.dag.filter_unsupported_input_types()

    if len(self.dag) == 0:
      raise RuntimeError("No supported input types?")

    if check_reproducability:
      # First, run through without pruning to verify that the violation exists
      self._runtime_stats.record_replay_start()

      for i in range(0, self.no_violation_verification_runs):
        bug_found = self.replay(self.dag, "reproducibility",
                                ignore_runtime_stats=True)
        if bug_found:
          break
      self._runtime_stats.set_initial_verification_runs_needed(i)
      self._runtime_stats.record_replay_end()
      if not bug_found:
        msg.fail("Unable to reproduce correctness violation!")
        sys.exit(5)
      self.log("Violation reproduced successfully! Proceeding with pruning")

    self._runtime_stats.record_prune_start()

    # Run optimizations.
    # TODO(cs): Better than a boolean flag: check if
    # log(len(self.dag)) > number of input types to try
    if self.optimized_filtering:
      self._optimize_event_dag()
    precompute_cache = PrecomputeCache()

    # Invoke delta debugging
    (dag, total_inputs_pruned) = self._ddmin(self.dag, 2, precompute_cache=precompute_cache)
    # Make sure to track the final iteration size
    self._track_iteration_size(total_inputs_pruned)
    self.dag = dag

    self.log("=== Total replays: %d ===" % self._runtime_stats.total_replays)
    self._runtime_stats.record_prune_end()
    self.mcs_log_tracker.dump_runtime_stats()
    self.log("Final MCS (%d elements):" % len(self.dag.input_events))
    for i in self.dag.input_events:
      self.log(" - %s" % str(i))

    if self.replay_final_trace:
      #  Replaying the final trace achieves two goals:
      #  - verifies that the MCS indeed ends in the violation
      #  - allows us to prune internal events that time out
      bug_found = self.replay(self.dag, "final_mcs_trace",
                              ignore_runtime_stats=True)
      if not bug_found:
        self.log('''Warning: MCS did not result in violation. Trying replay '''
                 '''again without timed out events.''')
        # TODO(cs): always replay the MCS without timeouts, since the console
        # output will be significantly cleaner?
        no_timeouts = self.dag.filter_timeouts()
        bug_found = self.replay(no_timeouts, "final_mcs_no_timed_out_events",
                                ignore_runtime_stats=True)
        if not bug_found:
          self.log('''Warning! Final MCS did not result in violation, even '''
                   ''' after ignoring timed out internal events. '''
                   ''' See tools/visualization/visualize1D.html for debugging''')

    # N.B. dumping the MCS trace must occur after the final replay trace,
    # since we need to infer which events will time out for events.trace.notimeouts
    if self.mcs_trace_path is not None:
      self.mcs_log_tracker.dump_mcs_trace(self.dag, self)
    return ExitCode(0)

  # N.B. always called within a child process.
  def _ddmin(self, dag, split_ways, precompute_cache=None, label_prefix=(),
             total_inputs_pruned=0):
    # This is the delta-debugging algorithm from:
    #   http://www.st.cs.uni-saarland.de/papers/tse2002/tse2002.pdf,
    # Section 3.2
    # TODO(cs): we could do much better if we leverage domain knowledge (e.g.,
    # start by pruning all LinkFailures, or splitting by nodes rather than
    # time)
    if split_ways > len(dag.input_events):
      self.log("Done")
      return (dag, total_inputs_pruned)

    local_label = lambda i, inv=False: "%s%d/%d" % ("~" if inv else "", i, split_ways)
    subset_label = lambda label: ".".join(map(str, label_prefix + ( label, )))
    print_subset = lambda label, s: subset_label(label) + ": "+" ".join(map(lambda e: e.label, s))

    subsets = split_list(dag.input_events, split_ways)
    self.log("Subsets:\n"+"\n".join(print_subset(local_label(i), s) for i, s in enumerate(subsets)))
    for i, subset in enumerate(subsets):
      label = local_label(i)
      new_dag = dag.input_subset(subset)
      input_sequence = tuple(new_dag.input_events)
      self.log("Current subset: %s" % print_subset(label, input_sequence))
      if precompute_cache.already_done(input_sequence):
        self.log("Already computed. Skipping")
        continue
      precompute_cache.update(input_sequence)
      if input_sequence == ():
        self.log("Subset after pruning dependencies was empty. Skipping")
        continue

      self._track_iteration_size(total_inputs_pruned)
      violation = self._check_violation(new_dag, i, label)
      if violation:
        self.log_violation("Subset %s reproduced violation. Subselecting." % subset_label(label))
        self.mcs_log_tracker.maybe_dump_intermediate_mcs(new_dag,
                                                         subset_label(label), self)

        total_inputs_pruned += len(dag.input_events) - len(new_dag.input_events)
        return self._ddmin(new_dag, 2, precompute_cache=precompute_cache,
                           label_prefix = label_prefix + (label, ),
                           total_inputs_pruned=total_inputs_pruned)

    self.log_no_violation("No subsets with violations. Checking complements")
    for i, subset in enumerate(subsets):
      label = local_label(i, True)
      prefix = label_prefix + (label, )
      new_dag = dag.input_complement(subset)
      input_sequence = tuple(new_dag.input_events)
      self.log("Current complement: %s" % print_subset(label, input_sequence))
      if precompute_cache.already_done(input_sequence):
        self.log("Already computed. Skipping")
        continue
      precompute_cache.update(input_sequence)

      if input_sequence == ():
        self.log("Subset %s after pruning dependencies was empty. Skipping", subset_label(label))
        continue

      self._track_iteration_size(total_inputs_pruned)
      violation = self._check_violation(new_dag, i, label)
      if violation:
        self.log_violation("Subset %s reproduced violation. Subselecting." % subset_label(label))
        self.mcs_log_tracker.maybe_dump_intermediate_mcs(new_dag,
                                                         subset_label(label), self)
        total_inputs_pruned += len(dag.input_events) - len(new_dag.input_events)
        return self._ddmin(new_dag, max(split_ways - 1, 2),
                           precompute_cache=precompute_cache,
                           label_prefix=prefix,
                           total_inputs_pruned=total_inputs_pruned)

    self.log_no_violation("No complements with violations.")
    if split_ways < len(dag.input_events):
      self.log("Increasing granularity.")
      return self._ddmin(dag, min(len(dag.input_events), split_ways*2),
                         precompute_cache=precompute_cache,
                         label_prefix=label_prefix,
                         total_inputs_pruned=total_inputs_pruned)
    return (dag, total_inputs_pruned)

  # N.B. always called within a child process.
  def _track_iteration_size(self, total_inputs_pruned):
    self._runtime_stats.record_iteration_size(len(self.dag.input_events) - total_inputs_pruned)

  # N.B. always called within a child process.
  def _check_violation(self, new_dag, subset_index, label):
    ''' Check if there were violations '''
    # Try no_violation_verification_runs times to see if the bug shows up
    for i in range(0, self.no_violation_verification_runs):
      bug_found = self.replay(new_dag, label)

      if bug_found:
        # Violation in the subset
        self.log_violation("Violation! Considering %d'th" % subset_index)
        self._runtime_stats.record_violation_found(i)
        return True

    # No violation!
    self.log_no_violation("No violation in %d'th..." % subset_index)
    return False

  def replay(self, new_dag, label, ignore_runtime_stats=False):
    # Run the simulation forward
    if self.transform_dag:
      new_dag = self.transform_dag(new_dag)

    self._runtime_stats.record_replay_stats(len(new_dag.input_events))

    # N.B. this function is run as a child process.
    def play_forward(results_dir, subsequence_id):
      # TODO(cs): need to serialize the parameters to Replayer rather than
      # wrapping them in a closure... otherwise, can't use RemoteForker
      # TODO(aw): MCSFinder needs to configure Simulation to always let DataplaneEvents pass through
      create_clean_python_dir(results_dir)

      # Copy stdout and stderr to a file "replay.out"
      tee = Tee(open(os.path.join(results_dir, "replay.out"), "w"))
      tee.tee_stdout()
      tee.tee_stderr()

      # Set up replayer.
      input_logger = InputLogger()
      replayer = Replayer(self.simulation_cfg, new_dag,
                          wait_on_deterministic_values=self.wait_on_deterministic_values,
                          input_logger=input_logger,
                          allow_unexpected_messages=False,
                          pass_through_whitelisted_messages=True,
                          **self.kwargs)
      replayer.init_results(results_dir)
      self._runtime_stats = RuntimeStats(subsequence_id)
      violations = []
      simulation = None
      try:
        simulation = replayer.simulate()
        self._track_new_internal_events(simulation, replayer)
        # Wait a bit in case the bug takes awhile to happen
        self.log("Sleeping %d seconds after run"  % self.end_wait_seconds)
        time.sleep(self.end_wait_seconds)
        violations = self.invariant_check(simulation)
        if violations != []:
          input_logger.log_input_event(InvariantViolation(violations))
      except SystemExit:
        # One of the invariant checks bailed early. Oddly, this is not an
        # error for us, it just means that there were no violations...
        # [this logic is arguably broken]
        # Return no violations, and let Forker handle system exit for us.
        violations = []
      finally:
        input_logger.close(replayer, self.simulation_cfg, skip_mcs_cfg=True)
        if simulation is not None:
          simulation.clean_up()
        tee.close()
      if self.strict_assertion_checking:
        test_serialize_response(violations, self._runtime_stats.client_dict())
      timed_out_internal = [ e.label for e in new_dag.events if e.timed_out ]
      return (violations, self._runtime_stats.client_dict(), timed_out_internal)

    # TODO(cs): once play_forward() is no longer a closure, register it only once
    self.forker.register_task("play_forward", play_forward)
    results_dir = self.replay_log_tracker.get_replay_logger_dir(label)
    self.subsequence_id += 1
    (violations, client_runtime_stats,
                 timed_out_internal) = self.forker.fork("play_forward",
                                                        results_dir,
                                                        self.subsequence_id)
    new_dag.set_events_as_timed_out(timed_out_internal)

    bug_found = False
    if violations != []:
      msg.fail("Violations: %s" % str(violations))
      if self.bug_signature in violations:
        bug_found = True
      else:
        msg.fail("Bug does not match initial violation fingerprint!")
    else:
      msg.interactive("No correctness violations!")

    if not ignore_runtime_stats:
      self._runtime_stats.merge_client_dict(client_runtime_stats)

    return bug_found

  def _optimize_event_dag(self):
    ''' Employs domain knowledge of event classes to reduce the size of event
    dag. Currently prunes event types.'''
    # TODO(cs): Another approach for later: split by nodes
    event_types = [TrafficInjection, DataplaneDrop, SwitchFailure,
                   SwitchRecovery, LinkFailure, LinkRecovery, HostMigration,
                   ControllerFailure, ControllerRecovery, PolicyChange, ControlChannelBlock,
                   ControlChannelUnblock]
    for event_type in event_types:
      pruned = [e for e in self.dag.input_events if not isinstance(e, event_type)]
      if len(pruned)==len(self.dag.input_events):
        self.log("\t** No events pruned for event type %s. Next!" % event_type)
        continue
      pruned_dag = self.dag.input_complement(pruned)
      bug_found = self.replay(pruned_dag, "opt_%s" % event_type.__name__)
      if bug_found:
        self.log("\t** VIOLATION for pruning event type %s! Resizing original dag" % event_type)
        self.dag = pruned_dag

  # N.B. always called within a child process.
  def _track_new_internal_events(self, simulation, replayer):
    ''' Pre: simulation must have been run through a replay'''
    # We always check against internal events that were buffered at the end of
    # the original run (don't want to overcount)
    prev_buffered_receives = []
    try:
      path = self.superlog_path + ".unacked"
      if not os.path.exists(path):
        log.warn("unacked internal events file from original run does not exist")
        return
      prev_buffered_receives = set([ e.pending_receive for e in
                                     [ f for f in EventDag(log_parser.parse_path(path)).events
                                       if type(f) == ControlMessageReceive ] ])
    except ValueError as e:
      log.warn("unacked internal events is corrupt? %r" % e)
      return
    buffered_message_receipts = []
    for p in simulation.openflow_buffer.pending_receives():
      if p not in prev_buffered_receives:
        buffered_message_receipts.append(repr(p))
      else:
        prev_buffered_receives.remove(p)

    self._runtime_stats.record_buffered_message_receipts(buffered_message_receipts)
    new_internal_events = replayer.unexpected_state_changes + replayer.passed_unexpected_messages
    self._runtime_stats.record_new_internal_events(new_internal_events)
    self._runtime_stats.record_early_internal_events(replayer.early_state_changes)
    self._runtime_stats.record_timed_out_events(dict(replayer.event_scheduler_stats.event2timeouts))
    self._runtime_stats.record_matched_events(dict(replayer.event_scheduler_stats.event2matched))