Exemple #1
0
class Replayer(ControlFlow):
  '''
  Replay events from a trace

  To set the event scheduling parameters, pass them as keyword args to the
  constructor of this class, which will pass them on to the EventScheduler object it creates.
  '''

  # Interpolated time parameter. *not* the event scheduling epsilon:
  time_epsilon_microseconds = 500

  def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None,
               print_buffers=True, wait_on_deterministic_values=False, default_dp_permit=False,
               fail_to_interactive=False, fail_to_interactive_on_persistent_violations=False,
               end_in_interactive=False, input_logger=None,
               allow_unexpected_messages=False,
               pass_through_whitelisted_messages=True,
               delay_flow_mods=False,
               **kwargs):
    ControlFlow.__init__(self, simulation_cfg)
    if wait_on_deterministic_values:
      self.sync_callback = ReplaySyncCallback()
    else:
      self.sync_callback = ReplaySyncCallback(self.get_interpolated_time)

    if type(superlog_path_or_dag) == str:
      superlog_path = superlog_path_or_dag
      # The dag is codefied as a list, where each element has
      # a list of its dependents
      self.dag = EventDag(log_parser.parse_path(superlog_path))
    else:
      self.dag = superlog_path_or_dag

    if len(self.dag.events) == 0:
      raise ValueError("No events to replay!")

    self.default_dp_permit = default_dp_permit
    # Set DataplanePermit and DataplaneDrop to passive if permit is set
    # to default
    for event in [ e for e in self.dag.events if type(e) in dp_events ]:
      event.passive = default_dp_permit

    self.dp_checker = None
    if default_dp_permit:
      self.dp_checker = DataplaneChecker(self.dag)

    self.print_buffers_flag = print_buffers

    # compute interpolate to time to be just before first event
    self.compute_interpolated_time(self.dag.events[0])
    # String repesentations of unexpected state changes we've passed through, for
    # statistics purposes.
    self.unexpected_state_changes = []
    self.early_state_changes = []
    self.event_scheduler_stats = None
    self.end_in_interactive = end_in_interactive
    self.fail_to_interactive = fail_to_interactive
    self.fail_to_interactive_on_persistent_violations =\
      fail_to_interactive_on_persistent_violations
    self._input_logger = input_logger
    self.allow_unexpected_messages = allow_unexpected_messages
    self.pass_through_whitelisted_messages = pass_through_whitelisted_messages
    # How many logical rounds to peek ahead when deciding if a message is
    # expected or not.
    self.expected_message_round_window = 3
    # String repesentations of unexpected messages we've passed through, for
    # statistics purposes.
    self.passed_unexpected_messages = []
    self.delay_flow_mods = delay_flow_mods

    if self.pass_through_whitelisted_messages:
      for event in self.dag.events:
        if hasattr(event, "ignore_whitelisted_packets"):
          event.ignore_whitelisted_packets = True

    if create_event_scheduler:
      self.create_event_scheduler = create_event_scheduler
    else:
      self.create_event_scheduler = \
        lambda simulation: EventScheduler(simulation,
            **{ k: v for k,v in kwargs.items()
                if k in EventScheduler.kwargs })

  def _log_input_event(self, event, **kws):
    if self._input_logger is not None:
      self._input_logger.log_input_event(event, **kws)

  def get_interpolated_time(self):
    '''
    During divergence, the controller may ask for the current time more or
    less times than they did in the original run. We control the time, so we
    need to give them some answer. The answers we give them should be
    (i) monotonically increasing, and (ii) between the time of the last
    recorded ("landmark") event and the next landmark event, and (iii)
    as close to the recorded times as possible

    Our temporary solution is to always return the time right before the next
    landmark
    '''
    # TODO(cs): implement Andi's improved time heuristic
    return self.interpolated_time

  def compute_interpolated_time(self, current_event):
    next_time = current_event.time
    just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds
    just_before_micro = max(0, just_before_micro)
    self.interpolated_time = SyncTime(next_time.seconds, just_before_micro)

  def increment_round(self):
    msg.event(color.CYAN + ("Round %d" % self.logical_time) + color.WHITE)

  def init_results(self, results_dir):
    if self._input_logger:
      self._input_logger.open(results_dir)

  def simulate(self, post_bootstrap_hook=None):
    ''' Caller *must* call simulation.clean_up() '''
    self.simulation = self.simulation_cfg.bootstrap(self.sync_callback)
    assert(isinstance(self.simulation.patch_panel, BufferedPatchPanel))
    # TODO(aw): remove this hack
    self.simulation.fail_to_interactive = self.fail_to_interactive
    self.simulation.fail_to_interactive_on_persistent_violations =\
      self.fail_to_interactive_on_persistent_violations
    self.simulation.openflow_buffer.pass_through_whitelisted_messages =\
      self.pass_through_whitelisted_messages
    self.logical_time = 0
    if self.delay_flow_mods:
      for switch in self.simulation.topology.switches:
        assert(isinstance(switch, FuzzSoftwareSwitch))
        switch.use_delayed_commands()
    self.run_simulation_forward(self.dag, post_bootstrap_hook)
    if self.print_buffers_flag:
      self._print_buffers()
    return self.simulation

  def _print_buffers(self):
    log.debug("Pending Message Receives:")
    for p in self.simulation.openflow_buffer.pending_receives():
      log.debug("- %s", p)
    log.debug("Pending State Changes:")
    for p in self.sync_callback.pending_state_changes():
      log.debug("- %s", p)

  def run_simulation_forward(self, dag, post_bootstrap_hook=None):
    event_scheduler = self.create_event_scheduler(self.simulation)
    event_scheduler.set_input_logger(self._input_logger)
    self.event_scheduler_stats = event_scheduler.stats
    if post_bootstrap_hook is not None:
      post_bootstrap_hook()

    def interrupt(sgn, frame):
      msg.interactive("Interrupting replayer, dropping to console (press ^C again to terminate)")
      signal.signal(signal.SIGINT, self.old_interrupt)
      self.old_interrupt = None
      raise KeyboardInterrupt()
    self.old_interrupt = signal.signal(signal.SIGINT, interrupt)

    try:
      for i, event in enumerate(dag.events):
        try:
          self.compute_interpolated_time(event)
          if self.default_dp_permit:
            self.dp_checker.check_dataplane(i, self.simulation)
          if isinstance(event, InputEvent):
            self._check_early_state_changes(dag, i, event)
          self._check_new_state_changes(dag, i)
          self._check_unexpected_dp_messages(dag, i)
          # TODO(cs): quasi race-condition here. If unexpected state change
          # happens *while* we're waiting for event, we basically have a
          # deadlock (if controller logging is set to blocking) until the
          # timeout occurs
          # TODO(cs): we don't actually allow new internal message events
          # through.. we only let new state changes through. Should experiment
          # with whether we would get better fidelity if we let them through.
          event_scheduler.schedule(event)
          if self.logical_time != event.round:
            self.logical_time = event.round
            self.increment_round()
        except KeyboardInterrupt:
          interactive = Interactive(self.simulation_cfg,
                                    input_logger=self._input_logger)
          interactive.simulate(self.simulation, bound_objects=( ('replayer', self), ))
          self.old_interrupt = signal.signal(signal.SIGINT, interrupt)
    finally:
      if self.old_interrupt:
        signal.signal(signal.SIGINT, self.old_interrupt)
      msg.event(color.B_BLUE+"Event Stats: %s" % str(event_scheduler.stats))
      if self.default_dp_permit:
        msg.event(color.B_BLUE+"DataplaneDrop Stats: %s" % str(self.dp_checker.stats))
    if self.end_in_interactive:
      interactive = Interactive(self.simulation_cfg,
          input_logger=self._input_logger)
      interactive.simulate(self.simulation, bound_objects=( ('replayer', self), ))

  def _check_early_state_changes(self, dag, current_index, input):
    ''' Check whether any pending state change that were supposed to come
    *after* the current input have occured. If so, we have violated causality.'''
    pending_state_changes = self.sync_callback.pending_state_changes()
    if len(pending_state_changes) > 0:
      # TODO(cs): currently assumes a single controller (-> single pending state
      # change)
      state_change = pending_state_changes[0]
      next_expected = dag.next_state_change(current_index)
      original_input_index = dag.get_original_index_for_event(input)
      if (next_expected is not None and
          state_change == next_expected.pending_state_change and
          dag.get_original_index_for_event(next_expected) > original_input_index):
        # raise RuntimeError("State change happened before expected! Causality violated")
        log.warn("State change happened before expected! Causality violated")
        self.early_state_changes.append(repr(next_expected))

  def _check_new_state_changes(self, dag, current_index):
    ''' If we are blocking controllers, it's bad news bears if an unexpected
    internal state change occurs. Check if there are any, and ACK them so that
    the execution can proceed.'''
    pending_state_changes = self.sync_callback.pending_state_changes()
    if len(pending_state_changes) > 0:
      # TODO(cs): currently assumes a single controller (-> single pending state
      # change)
      state_change = pending_state_changes[0]
      next_expected = dag.next_state_change(current_index)
      if (next_expected is None or
          state_change != next_expected.pending_state_change):
        log.info("Unexpected state change. Ack'ing")
        log_event = ControllerStateChange.from_pending_state_change(state_change)
        # Monkeypatch a "new internal event" marker to be logged to the JSON trace
        # (All fields picked up by event.to_json())
        log_event.new_internal_event = True
        log_event.replay_time = SyncTime.now()
        self._log_input_event(log_event)
        self.unexpected_state_changes.append(repr(state_change))
        self.sync_callback.ack_pending_state_change(state_change)

  def _check_unexpected_dp_messages(self, dag, current_index):
    ''' If throughout replay we observe new messages that weren't in the
    original trace, we (usually) want to let them through. This is especially true
    for messages that are related to timers, such as LLDP, since timings will
    often change during replay, and we don't want to unnecessarily change the
    controller's behavior. '''
    if not self.allow_unexpected_messages:
      return
    # TODO(cs): evaluate whether _check_unexpected_dp_messages is more or less
    # effective than _check_whitelisted_dp_messages. Compare executions with
    # visualization tool. Currently it appears that this method is too
    # liberal, and ends up causing timouts as a result of letting messages
    # through.

    # First, build a set of expected ControlMessageSends/Receives fingerprints
    # within the next expected_message_round_window rounds.
    start_round = dag.events[current_index].round
    expected_receive_fingerprints = set()
    expected_send_fingerprints = set()
    for i in xrange(current_index, len(dag.events)):
      event = dag.events[i]
      if event.round - start_round > self.expected_message_round_window:
        break
      if type(event) == ControlMessageReceive:
        (_, of_fingerprint, dpid, cid) = event.fingerprint
        expected_receive_fingerprints.add((of_fingerprint, dpid, cid))
      if type(event) == ControlMessageSend:
        (_, of_fingerprint, dpid, cid) = event.fingerprint
        expected_send_fingerprints.add((of_fingerprint, dpid, cid))

    # Now check pending messages.
    for expected_fingerprints, messages in [
         (expected_receive_fingerprints, self.simulation.openflow_buffer.pending_receives()),
         (expected_send_fingerprints, self.simulation.openflow_buffer.pending_sends())]:
      for pending_message in messages:
        fingerprint = (pending_message.fingerprint,
                       pending_message.dpid,
                       pending_message.controller_id)
        if fingerprint not in expected_fingerprints:
          message = self.simulation.openflow_buffer.schedule(pending_message)
          log.debug("Sending unexpected %s" % message)
          b64_packet = base64_encode(message)
          # Monkeypatch a "new internal event" marker to be logged to the JSON trace
          # (All fields picked up by event.to_json())
          event_type = ControlMessageReceive if type(pending_message) == PendingReceive else ControlMessageSend
          log_event = event_type(pending_message.dpid, pending_message.controller_id,
                                 pending_message.fingerprint, b64_packet=b64_packet)
          log_event.new_internal_event = True
          log_event.replay_time = SyncTime.now()
          self.passed_unexpected_messages.append(repr(log_event))
          self._log_input_event(log_event)
Exemple #2
0
class Replayer(ControlFlow):
    '''
  Replay events from a trace

  To set the event scheduling parameters, pass them as keyword args to the
  constructor of this class, which will pass them on to the EventScheduler object it creates.
  '''

    # Interpolated time parameter. *not* the event scheduling epsilon:
    time_epsilon_microseconds = 500

    kwargs = EventScheduler.kwargs | set([
        'create_event_scheduler', 'print_buffers',
        'wait_on_deterministic_values', 'default_dp_permit',
        'fail_to_interactive', 'fail_to_interactive_on_persistent_violations',
        'end_in_interactive', 'input_logger', 'allow_unexpected_messages',
        'expected_message_round_window', 'pass_through_whitelisted_messages',
        'delay_flow_mods', 'invariant_check_name', 'bug_signature',
        'end_wait_seconds', 'transform_dag', 'pass_through_sends', 'fail_fast',
        'check_interval'
    ])

    def __init__(self,
                 simulation_cfg,
                 superlog_path_or_dag,
                 create_event_scheduler=None,
                 print_buffers=True,
                 wait_on_deterministic_values=False,
                 default_dp_permit=False,
                 fail_to_interactive=False,
                 fail_to_interactive_on_persistent_violations=False,
                 end_in_interactive=False,
                 input_logger=None,
                 allow_unexpected_messages=False,
                 expected_message_round_window=3,
                 pass_through_whitelisted_messages=False,
                 delay_flow_mods=False,
                 invariant_check_name="",
                 bug_signature="",
                 end_wait_seconds=0.5,
                 transform_dag=None,
                 pass_through_sends=False,
                 fail_fast=False,
                 check_interval=5,
                 **kwargs):
        '''
     - If invariant_check_name is not None, check it at the end for the
       execution
     - If bug_signature is not None, check whether this particular signature
       appears in the output of the invariant check at the end of the
       execution
    '''
        ControlFlow.__init__(self, simulation_cfg)
        # Label uniquely identifying this replay, set in init_results()
        self.replay_id = "N/A"
        self.logical_time = 0
        if wait_on_deterministic_values:
            self.sync_callback = ReplaySyncCallback()
        else:
            self.sync_callback = ReplaySyncCallback(self.get_interpolated_time)

        if type(superlog_path_or_dag) == str:
            superlog_path = superlog_path_or_dag
            # The dag is codefied as a list, where each element has
            # a list of its dependents
            self.dag = EventDag(log_parser.parse_path(superlog_path))
        else:
            self.dag = superlog_path_or_dag

        if len(self.dag.events) == 0:
            raise ValueError("No events to replay!")

        self.default_dp_permit = default_dp_permit
        self.dp_checker = self._setup_dp_checker(default_dp_permit)
        if self.default_dp_permit:
            # Set DataplanePermit and DataplaneDrop to passive if permit is set
            # to default
            # TODO(cs): rather than setting these to passive (which still causes them to
            # be scheduled as regular events) should these just be removed from the
            # event dag altogether?
            for event in [e for e in self.dag.events if type(e) in dp_events]:
                event.passive = default_dp_permit

        self.print_buffers_flag = print_buffers
        self.fail_fast = fail_fast
        self.check_interval = check_interval

        # compute interpolate to time to be just before first event
        self.compute_interpolated_time(self.dag.events[0])
        # String repesentations of unexpected state changes we've passed through, for
        # statistics purposes.
        self.unexpected_state_changes = []
        self.early_state_changes = []
        self.event_scheduler_stats = None
        self.end_in_interactive = end_in_interactive
        self.fail_to_interactive = fail_to_interactive
        self.fail_to_interactive_on_persistent_violations =\
          fail_to_interactive_on_persistent_violations
        self._input_logger = input_logger
        self.allow_unexpected_messages = allow_unexpected_messages
        self.expected_message_round_window = expected_message_round_window
        self.pass_through_whitelisted_messages = pass_through_whitelisted_messages
        self.pass_through_sends = pass_through_sends
        # How many logical rounds to peek ahead when deciding if a message is
        # expected or not.
        # String repesentations of unexpected messages we've passed through, for
        # statistics purposes.
        self.passed_unexpected_messages = []
        self.delay_flow_mods = delay_flow_mods
        self.end_wait_seconds = end_wait_seconds
        self.transform_dag = transform_dag
        self.bug_signature = bug_signature
        self.invariant_check_name = invariant_check_name
        self.invariant_check = None
        if self.invariant_check_name:
            if self.invariant_check_name not in name_to_invariant_check:
                raise ValueError(
                    '''Unknown invariant check %s.\n'''
                    '''Invariant check name must be defined in config.invariant_checks''',
                    self.invariant_check_name)
            self.invariant_check = name_to_invariant_check[
                self.invariant_check_name]

        if self.pass_through_whitelisted_messages:
            for event in self.dag.events:
                if hasattr(event, "ignore_whitelisted_packets"):
                    event.ignore_whitelisted_packets = True

        if self.simulation_cfg.ignore_interposition:
            self._ignore_interposition()

        if create_event_scheduler:
            self.create_event_scheduler = create_event_scheduler
        else:
            if self.default_dp_permit:
                # Tell EventScheduler to use call into us whenever it wants to sleep or
                # select, so that we can keep forwarding dataplane packets.
                kwargs[
                    'sleep_continuation'] = self._sleep_with_dataplane_passthrough
                kwargs[
                    'select_continuation'] = self._select_with_dataplane_passthrough

            self.create_event_scheduler = \
              lambda simulation: EventScheduler(simulation,
                  **{ k: v for k,v in kwargs.items()
                      if k in EventScheduler.kwargs })

        unknown_kwargs = [
            k for k in kwargs.keys() if k not in EventScheduler.kwargs
        ]
        if unknown_kwargs != []:
            raise ValueError("Unknown kwargs %s" % str(unknown_kwargs))

    def _log_input_event(self, event, **kws):
        if self._input_logger is not None:
            self._input_logger.log_input_event(event, **kws)

    def _setup_dp_checker(self, default_dp_permit):
        no_dp_drops = [e for e in self.dag.events
                       if type(e) == DataplaneDrop] == []
        if not default_dp_permit and no_dp_drops:
            print >> sys.stderr, (
                '''No DataplaneDrops to replay. We suggest you '''
                '''set Replayer's default_dp_permit=True ''')
        if default_dp_permit:
            if no_dp_drops:
                return AlwaysAllowDataplane(self.dag)
            else:
                return DataplaneChecker(self.dag)
        return None

    def _ignore_interposition(self):
        '''
    Configure all interposition points to immediately pass through all
    internal events
    (possibly useful for replays affected by non-determinism)
    '''
        filtered_events = [
            e for e in self.dag.events if type(e) not in all_internal_events
        ]
        self.dag = EventDag(filtered_events)
        self.default_dp_permit = True
        self.dp_checker = AlwaysAllowDataplane(self.dag)

    def _sleep_with_dataplane_passthrough(self, seconds):
        ''' Pre: simulate() has been called '''
        start = time.time()
        while not self.simulation.io_master.closed:
            elapsed = time.time() - start
            remaining = seconds - elapsed
            if remaining < 0.01:
                break
            self._select_with_dataplane_passthrough(remaining)

    def _select_with_dataplane_passthrough(self, timeout_seconds):
        ''' Pre: simulate() has been called '''
        if self.default_dp_permit:
            self.dp_checker.check_dataplane(-1, self.simulation)
        return self.simulation.io_master.select(timeout_seconds)

    def get_interpolated_time(self):
        '''
    During divergence, the controller may ask for the current time more or
    less times than they did in the original run. We control the time, so we
    need to give them some answer. The answers we give them should be
    (i) monotonically increasing, and (ii) between the time of the last
    recorded ("landmark") event and the next landmark event, and (iii)
    as close to the recorded times as possible

    Our temporary solution is to always return the time right before the next
    landmark
    '''
        # TODO(cs): implement Andi's improved time heuristic
        return self.interpolated_time

    def compute_interpolated_time(self, current_event):
        next_time = current_event.time
        just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds
        just_before_micro = max(0, just_before_micro)
        self.interpolated_time = SyncTime(next_time.seconds, just_before_micro)

    def increment_round(self):
        msg.event(color.CYAN + ("Round %d" % self.logical_time) + color.WHITE)

    def init_results(self, results_dir):
        self.replay_id = results_dir
        if self._input_logger:
            self._input_logger.open(results_dir)

    def set_pass_through_sends(self, simulation):
        simulation.openflow_buffer.pass_through_sends_only()
        for e in self.dag.events:
            if type(e) == ControlMessageSend:
                e.pass_through_sends = True

    def simulate(self, post_bootstrap_hook=None):
        ''' Caller *must* call simulation.clean_up() '''
        if self.transform_dag:
            log.info("Transforming dag")
            self.dag = self.transform_dag(self.dag)
            log.info("Proceeding with normal replay")

        self.simulation = self.simulation_cfg.bootstrap(self.sync_callback)
        assert (isinstance(self.simulation.patch_panel, BufferedPatchPanel))
        # TODO(aw): remove this hack
        self.simulation.fail_to_interactive = self.fail_to_interactive
        self.simulation.fail_to_interactive_on_persistent_violations =\
          self.fail_to_interactive_on_persistent_violations
        self.simulation.openflow_buffer.pass_through_whitelisted_packets =\
          self.pass_through_whitelisted_messages
        if self.pass_through_sends:
            self.set_pass_through_sends(self.simulation)
        if self.delay_flow_mods:
            for switch in self.simulation.topology.switches:
                assert (isinstance(switch, FuzzSoftwareSwitch))
                switch.use_delayed_commands()
        self.run_simulation_forward(post_bootstrap_hook)
        if self.print_buffers_flag:
            self._print_buffers()
        return self.simulation

    def _print_buffers(self):
        log.debug("Pending Message Receives:")
        for p in self.simulation.openflow_buffer.pending_receives:
            log.debug("- %s", p)
        log.debug("Pending State Changes:")
        for p in self.sync_callback.pending_state_changes():
            log.debug("- %s", p)

    def run_simulation_forward(self, post_bootstrap_hook=None):
        event_scheduler = self.create_event_scheduler(self.simulation)
        event_scheduler.set_input_logger(self._input_logger)
        self.event_scheduler_stats = event_scheduler.stats
        if post_bootstrap_hook is not None:
            post_bootstrap_hook()

        def interrupt(sgn, frame):
            msg.interactive(
                "Interrupting replayer, dropping to console (press ^C again to terminate)"
            )
            signal.signal(signal.SIGINT, self.old_interrupt)
            self.old_interrupt = None
            raise KeyboardInterrupt()

        self.old_interrupt = signal.signal(signal.SIGINT, interrupt)

        try:
            for i, event in enumerate(self.dag.events):
                try:
                    self.compute_interpolated_time(event)
                    if self.default_dp_permit:
                        self.dp_checker.check_dataplane(i, self.simulation)
                    if isinstance(event, InputEvent):
                        self._check_early_state_changes(self.dag, i, event)
                    self._check_new_state_changes(self.dag, i)
                    self._check_unexpected_cp_messages(self.dag, i)
                    # TODO(cs): quasi race-condition here. If unexpected state change
                    # happens *while* we're waiting for event, we basically have a
                    # deadlock (if controller logging is set to blocking) until the
                    # timeout occurs
                    # TODO(cs): we don't actually allow new internal message events
                    # through.. we only let new state changes through. Should experiment
                    # with whether we would get better fidelity if we let them through.
                    if self.fail_fast and (i % self.check_interval
                                           ) == 0 and self._check_violation():
                        return
                    event_scheduler.schedule(event)
                    if self.logical_time != event.round:
                        self.logical_time = event.round
                        self.increment_round()
                except KeyboardInterrupt:
                    interactive = Interactive(self.simulation_cfg,
                                              input_logger=self._input_logger)
                    interactive.simulate(self.simulation,
                                         bound_objects=(('replayer', self), ))
                    # If Interactive terminated due to ^D, return to our fuzzing loop,
                    # prepared again to drop into Interactive on ^C.
                    self.old_interrupt = signal.signal(signal.SIGINT,
                                                       interrupt)
                except Exception:
                    log.critical(
                        "Exception raised while scheduling event %s, replay %s"
                        % (str(event), self.replay_id))
                    raise

            if self.invariant_check:
                # Wait a bit in case the bug takes awhile to happen
                # TODO(cs): may be redundant with WaitTime events at the end of the
                # trace.
                log.debug("Sleeping %d seconds after run" %
                          self.end_wait_seconds)
                if self.default_dp_permit:
                    self._sleep_with_dataplane_passthrough(
                        self.end_wait_seconds)
                else:
                    time.sleep(self.end_wait_seconds)

                self._check_violation()
        finally:
            if self.old_interrupt:
                signal.signal(signal.SIGINT, self.old_interrupt)
            msg.event(color.B_BLUE +
                      "Event Stats: %s" % str(event_scheduler.stats))
            if self.default_dp_permit:
                msg.event(color.B_BLUE + "DataplaneDrop Stats: %s" %
                          str(self.dp_checker.stats))
        if self.end_in_interactive:
            interactive = Interactive(self.simulation_cfg,
                                      input_logger=self._input_logger)
            interactive.simulate(self.simulation,
                                 bound_objects=(('replayer', self), ))

    def _check_violation(self):
        # TODO(cs): this does not verify whether the violation is persistent
        # or transient. Perhaps it should?
        violations = self.invariant_check(self.simulation)
        self.simulation.violation_found = False
        if violations != []:
            self._log_input_event(InvariantViolation(violations))
            msg.fail("Violations at end of trace: %s" % str(violations))
            if self.bug_signature:
                if self.bug_signature in violations:
                    self.simulation.violation_found = True
                    msg.success("Violation found %s" % self.bug_signature)
                    return True
                else:
                    msg.fail("Violation does not match violation signature!")
                    return False
            else:
                self.simulation.violation_found = True
                return True
        else:
            msg.success("No correctness violations!")
            return False

    def _check_early_state_changes(self, dag, current_index, input):
        ''' Check whether any pending state change that were supposed to come
    *after* the current input have occured. If so, we have violated causality.'''
        # Note that we never violate causality due to message sends/receipts,
        # since we buffer all messages.
        pending_state_changes = self.sync_callback.pending_state_changes()
        if len(pending_state_changes) > 0:
            # TODO(cs): currently assumes a single controller (-> single pending state
            # change)
            state_change = pending_state_changes[0]
            next_expected = dag.next_state_change(current_index)
            original_input_index = dag.get_original_index_for_event(input)
            if (next_expected is not None
                    and state_change == next_expected.pending_state_change
                    and dag.get_original_index_for_event(next_expected) >
                    original_input_index):
                # raise RuntimeError("State change happened before expected! Causality violated")
                log.warn(
                    "State change happened before expected! Causality violated"
                )
                self.early_state_changes.append(repr(next_expected))

    def _check_new_state_changes(self, dag, current_index):
        ''' If we are blocking controllers, it's bad news bears if an unexpected
    internal state change occurs. Check if there are any, and ACK them so that
    the execution can proceed.'''
        pending_state_changes = self.sync_callback.pending_state_changes()
        if len(pending_state_changes) > 0:
            # TODO(cs): currently assumes a single controller (-> single pending state
            # change)
            state_change = pending_state_changes[0]
            next_expected = dag.next_state_change(current_index)
            if (next_expected is None
                    or state_change != next_expected.pending_state_change):
                log.info("Unexpected state change. Ack'ing %s" %
                         str(state_change))
                log_event = ControllerStateChange.from_pending_state_change(
                    state_change)
                # Monkeypatch a "new internal event" marker to be logged to the JSON trace
                # (All fields picked up by event.to_json())
                log_event.new_internal_event = True
                log_event.replay_time = SyncTime.now()
                self._log_input_event(log_event)
                self.unexpected_state_changes.append(repr(state_change))
                self.sync_callback.ack_pending_state_change(state_change)

    def _check_unexpected_cp_messages(self, dag, current_index):
        ''' If throughout replay we observe new messages that weren't in the
    original trace, we (usually) want to let them through. This is especially true
    for messages that are related to timers, such as LLDP, since timings will
    often change during replay, and we don't want to unnecessarily change the
    controller's behavior. '''
        if not self.allow_unexpected_messages:
            return
        # TODO(cs): evaluate whether _check_unexpected_cp_messages is more or less
        # effective than whitelisting cp message types (in OpenflowBuffer).
        # Compare executions with visualization tool.
        # Currently it appears that this method is too liberal, and ends up
        # causing timouts as a result of letting messages through.

        # First, build a set of expected ControlMessageSends/Receives fingerprints
        # within the next expected_message_round_window rounds.
        start_round = dag.events[current_index].round
        expected_receive_fingerprints = set()
        expected_send_fingerprints = set()
        for i in xrange(current_index, len(dag.events)):
            event = dag.events[i]
            if event.round - start_round > self.expected_message_round_window:
                break
            if type(event) == ControlMessageReceive:
                (_, of_fingerprint, dpid, cid) = event.fingerprint
                expected_receive_fingerprints.add((of_fingerprint, dpid, cid))
            if type(event) == ControlMessageSend:
                (_, of_fingerprint, dpid, cid) = event.fingerprint
                expected_send_fingerprints.add((of_fingerprint, dpid, cid))

        # Now check pending messages.
        for expected_fingerprints, messages in [
            (expected_receive_fingerprints,
             self.simulation.openflow_buffer.pending_receives),
            (expected_send_fingerprints,
             self.simulation.openflow_buffer.pending_sends)
        ]:
            for pending_message in messages:
                fingerprint = (pending_message.fingerprint,
                               pending_message.dpid,
                               pending_message.controller_id)
                if fingerprint not in expected_fingerprints:
                    message = self.simulation.openflow_buffer.schedule(
                        pending_message)
                    log.debug("Allowed unexpected message %s" % message)
                    b64_packet = base64_encode(message)
                    # Monkeypatch a "new internal event" marker to be logged to the JSON trace
                    # (All fields picked up by event.to_json())
                    event_type = ControlMessageReceive if type(
                        pending_message
                    ) == PendingReceive else ControlMessageSend
                    log_event = event_type(pending_message.dpid,
                                           pending_message.controller_id,
                                           pending_message.fingerprint,
                                           b64_packet=b64_packet)
                    log_event.new_internal_event = True
                    log_event.replay_time = SyncTime.now()
                    self.passed_unexpected_messages.append(repr(log_event))
                    self._log_input_event(log_event)
Exemple #3
0
class Replayer(ControlFlow):
  '''
  Replay events from a trace

  To set the event scheduling parameters, pass them as keyword args to the
  constructor of this class, which will pass them on to the EventScheduler object it creates.
  '''

  # Interpolated time parameter. *not* the event scheduling epsilon:
  time_epsilon_microseconds = 500

  kwargs = EventScheduler.kwargs | set(['create_event_scheduler', 'print_buffers',
                'wait_on_deterministic_values', 'default_dp_permit',
                'fail_to_interactive', 'fail_to_interactive_on_persistent_violations',
                'end_in_interactive', 'input_logger',
                'allow_unexpected_messages',
                'expected_message_round_window',
                'pass_through_whitelisted_messages',
                'delay_flow_mods', 'invariant_check_name',
                'bug_signature', 'end_wait_seconds',
                'transform_dag', 'pass_through_sends', 'fail_fast',
                'check_interval'])

  def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None,
               print_buffers=True, wait_on_deterministic_values=False, default_dp_permit=False,
               fail_to_interactive=False, fail_to_interactive_on_persistent_violations=False,
               end_in_interactive=False, input_logger=None,
               allow_unexpected_messages=False,
               expected_message_round_window=3,
               pass_through_whitelisted_messages=False,
               delay_flow_mods=False, invariant_check_name="",
               bug_signature="", end_wait_seconds=0.5,
               transform_dag=None, pass_through_sends=False,
               fail_fast=False, check_interval=5,
               **kwargs):
    '''
     - If invariant_check_name is not None, check it at the end for the
       execution
     - If bug_signature is not None, check whether this particular signature
       appears in the output of the invariant check at the end of the
       execution
    '''
    ControlFlow.__init__(self, simulation_cfg)
    # Label uniquely identifying this replay, set in init_results()
    self.replay_id = "N/A"
    self.logical_time = 0
    if wait_on_deterministic_values:
      self.sync_callback = ReplaySyncCallback()
    else:
      self.sync_callback = ReplaySyncCallback(self.get_interpolated_time)

    if type(superlog_path_or_dag) == str:
      superlog_path = superlog_path_or_dag
      # The dag is codefied as a list, where each element has
      # a list of its dependents
      self.dag = EventDag(log_parser.parse_path(superlog_path))
    else:
      self.dag = superlog_path_or_dag

    if len(self.dag.events) == 0:
      raise ValueError("No events to replay!")

    self.default_dp_permit = default_dp_permit
    self.dp_checker = self._setup_dp_checker(default_dp_permit)
    if self.default_dp_permit:
      # Set DataplanePermit and DataplaneDrop to passive if permit is set
      # to default
      # TODO(cs): rather than setting these to passive (which still causes them to
      # be scheduled as regular events) should these just be removed from the
      # event dag altogether?
      for event in [ e for e in self.dag.events if type(e) in dp_events ]:
        event.passive = default_dp_permit

    self.print_buffers_flag = print_buffers
    self.fail_fast = fail_fast
    self.check_interval = check_interval

    # compute interpolate to time to be just before first event
    self.compute_interpolated_time(self.dag.events[0])
    # String repesentations of unexpected state changes we've passed through, for
    # statistics purposes.
    self.unexpected_state_changes = []
    self.early_state_changes = []
    self.event_scheduler_stats = None
    self.end_in_interactive = end_in_interactive
    self.fail_to_interactive = fail_to_interactive
    self.fail_to_interactive_on_persistent_violations =\
      fail_to_interactive_on_persistent_violations
    self._input_logger = input_logger
    self.allow_unexpected_messages = allow_unexpected_messages
    self.expected_message_round_window = expected_message_round_window
    self.pass_through_whitelisted_messages = pass_through_whitelisted_messages
    self.pass_through_sends = pass_through_sends
    # How many logical rounds to peek ahead when deciding if a message is
    # expected or not.
    # String repesentations of unexpected messages we've passed through, for
    # statistics purposes.
    self.passed_unexpected_messages = []
    self.delay_flow_mods = delay_flow_mods
    self.end_wait_seconds = end_wait_seconds
    self.transform_dag = transform_dag
    self.bug_signature = bug_signature
    self.invariant_check_name = invariant_check_name
    self.invariant_check = None
    if self.invariant_check_name:
      if self.invariant_check_name not in name_to_invariant_check:
        raise ValueError('''Unknown invariant check %s.\n'''
                         '''Invariant check name must be defined in config.invariant_checks''',
                         self.invariant_check_name)
      self.invariant_check = name_to_invariant_check[self.invariant_check_name]

    if self.pass_through_whitelisted_messages:
      for event in self.dag.events:
        if hasattr(event, "ignore_whitelisted_packets"):
          event.ignore_whitelisted_packets = True

    if self.simulation_cfg.ignore_interposition:
      self._ignore_interposition()

    if create_event_scheduler:
      self.create_event_scheduler = create_event_scheduler
    else:
      if self.default_dp_permit:
        # Tell EventScheduler to use call into us whenever it wants to sleep or
        # select, so that we can keep forwarding dataplane packets.
        kwargs['sleep_continuation'] = self._sleep_with_dataplane_passthrough
        kwargs['select_continuation'] = self._select_with_dataplane_passthrough

      self.create_event_scheduler = \
        lambda simulation: EventScheduler(simulation,
            **{ k: v for k,v in kwargs.items()
                if k in EventScheduler.kwargs })

    unknown_kwargs = [ k for k in kwargs.keys() if k not in EventScheduler.kwargs ]
    if unknown_kwargs != []:
      raise ValueError("Unknown kwargs %s" % str(unknown_kwargs))

  def _log_input_event(self, event, **kws):
    if self._input_logger is not None:
      self._input_logger.log_input_event(event, **kws)

  def _setup_dp_checker(self, default_dp_permit):
    no_dp_drops = [ e for e in self.dag.events if type(e) == DataplaneDrop ] == []
    if not default_dp_permit and no_dp_drops:
      print >> sys.stderr, ('''No DataplaneDrops to replay. We suggest you '''
                            '''set Replayer's default_dp_permit=True ''')
    if default_dp_permit:
      if no_dp_drops:
        return AlwaysAllowDataplane(self.dag)
      else:
        return DataplaneChecker(self.dag)
    return None

  def _ignore_interposition(self):
    '''
    Configure all interposition points to immediately pass through all
    internal events
    (possibly useful for replays affected by non-determinism)
    '''
    filtered_events = [e for e in self.dag.events if type(e) not in all_internal_events]
    self.dag = EventDag(filtered_events)
    self.default_dp_permit = True
    self.dp_checker = AlwaysAllowDataplane(self.dag)

  def _sleep_with_dataplane_passthrough(self, seconds):
    ''' Pre: simulate() has been called '''
    start = time.time()
    while not self.simulation.io_master.closed:
      elapsed = time.time() - start
      remaining = seconds - elapsed
      if remaining < 0.01:
        break
      self._select_with_dataplane_passthrough(remaining)

  def _select_with_dataplane_passthrough(self, timeout_seconds):
    ''' Pre: simulate() has been called '''
    if self.default_dp_permit:
      self.dp_checker.check_dataplane(-1, self.simulation)
    return self.simulation.io_master.select(timeout_seconds)

  def get_interpolated_time(self):
    '''
    During divergence, the controller may ask for the current time more or
    less times than they did in the original run. We control the time, so we
    need to give them some answer. The answers we give them should be
    (i) monotonically increasing, and (ii) between the time of the last
    recorded ("landmark") event and the next landmark event, and (iii)
    as close to the recorded times as possible

    Our temporary solution is to always return the time right before the next
    landmark
    '''
    # TODO(cs): implement Andi's improved time heuristic
    return self.interpolated_time

  def compute_interpolated_time(self, current_event):
    next_time = current_event.time
    just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds
    just_before_micro = max(0, just_before_micro)
    self.interpolated_time = SyncTime(next_time.seconds, just_before_micro)

  def increment_round(self):
    msg.event(color.CYAN + ("Round %d" % self.logical_time) + color.WHITE)

  def init_results(self, results_dir):
    self.replay_id = results_dir
    if self._input_logger:
      self._input_logger.open(results_dir)

  def set_pass_through_sends(self, simulation):
    simulation.openflow_buffer.pass_through_sends_only()
    for e in self.dag.events:
      if type(e) == ControlMessageSend:
        e.pass_through_sends = True

  def simulate(self, post_bootstrap_hook=None):
    ''' Caller *must* call simulation.clean_up() '''
    if self.transform_dag:
      log.info("Transforming dag")
      self.dag = self.transform_dag(self.dag)
      log.info("Proceeding with normal replay")

    self.simulation = self.simulation_cfg.bootstrap(self.sync_callback)
    assert(isinstance(self.simulation.patch_panel, BufferedPatchPanel))
    # TODO(aw): remove this hack
    self.simulation.fail_to_interactive = self.fail_to_interactive
    self.simulation.fail_to_interactive_on_persistent_violations =\
      self.fail_to_interactive_on_persistent_violations
    self.simulation.openflow_buffer.pass_through_whitelisted_packets =\
      self.pass_through_whitelisted_messages
    if self.pass_through_sends:
      self.set_pass_through_sends(self.simulation)
    if self.delay_flow_mods:
      for switch in self.simulation.topology.switches:
        assert(isinstance(switch, FuzzSoftwareSwitch))
        switch.use_delayed_commands()
    self.run_simulation_forward(post_bootstrap_hook)
    if self.print_buffers_flag:
      self._print_buffers()
    return self.simulation

  def _print_buffers(self):
    log.debug("Pending Message Receives:")
    for p in self.simulation.openflow_buffer.pending_receives:
      log.debug("- %s", p)
    log.debug("Pending State Changes:")
    for p in self.sync_callback.pending_state_changes():
      log.debug("- %s", p)

  def run_simulation_forward(self, post_bootstrap_hook=None):
    event_scheduler = self.create_event_scheduler(self.simulation)
    event_scheduler.set_input_logger(self._input_logger)
    self.event_scheduler_stats = event_scheduler.stats
    if post_bootstrap_hook is not None:
      post_bootstrap_hook()

    def interrupt(sgn, frame):
      msg.interactive("Interrupting replayer, dropping to console (press ^C again to terminate)")
      signal.signal(signal.SIGINT, self.old_interrupt)
      self.old_interrupt = None
      raise KeyboardInterrupt()
    self.old_interrupt = signal.signal(signal.SIGINT, interrupt)

    try:
      for i, event in enumerate(self.dag.events):
        try:
          self.compute_interpolated_time(event)
          if self.default_dp_permit:
            self.dp_checker.check_dataplane(i, self.simulation)
          if isinstance(event, InputEvent):
            self._check_early_state_changes(self.dag, i, event)
          self._check_new_state_changes(self.dag, i)
          self._check_unexpected_cp_messages(self.dag, i)
          # TODO(cs): quasi race-condition here. If unexpected state change
          # happens *while* we're waiting for event, we basically have a
          # deadlock (if controller logging is set to blocking) until the
          # timeout occurs
          # TODO(cs): we don't actually allow new internal message events
          # through.. we only let new state changes through. Should experiment
          # with whether we would get better fidelity if we let them through.
          if self.fail_fast and (i % self.check_interval) == 0 and self._check_violation():
            return
          event_scheduler.schedule(event)
          if self.logical_time != event.round:
            self.logical_time = event.round
            self.increment_round()
        except KeyboardInterrupt:
          interactive = Interactive(self.simulation_cfg,
                                    input_logger=self._input_logger)
          interactive.simulate(self.simulation, bound_objects=( ('replayer', self), ))
          # If Interactive terminated due to ^D, return to our fuzzing loop,
          # prepared again to drop into Interactive on ^C.
          self.old_interrupt = signal.signal(signal.SIGINT, interrupt)
        except Exception:
          log.critical("Exception raised while scheduling event %s, replay %s"
                       % (str(event), self.replay_id))
          raise

      if self.invariant_check:
        # Wait a bit in case the bug takes awhile to happen
        # TODO(cs): may be redundant with WaitTime events at the end of the
        # trace.
        log.debug("Sleeping %d seconds after run" % self.end_wait_seconds)
        if self.default_dp_permit:
          self._sleep_with_dataplane_passthrough(self.end_wait_seconds)
        else:
          time.sleep(self.end_wait_seconds)

        self._check_violation()
    finally:
      if self.old_interrupt:
        signal.signal(signal.SIGINT, self.old_interrupt)
      msg.event(color.B_BLUE+"Event Stats: %s" % str(event_scheduler.stats))
      if self.default_dp_permit:
        msg.event(color.B_BLUE+"DataplaneDrop Stats: %s" % str(self.dp_checker.stats))
    if self.end_in_interactive:
      interactive = Interactive(self.simulation_cfg, input_logger=self._input_logger)
      interactive.simulate(self.simulation, bound_objects=( ('replayer', self), ))

  def _check_violation(self):
    # TODO(cs): this does not verify whether the violation is persistent
    # or transient. Perhaps it should?
    violations = self.invariant_check(self.simulation)
    self.simulation.violation_found = False
    if violations != []:
      self._log_input_event(InvariantViolation(violations))
      msg.fail("Violations at end of trace: %s" % str(violations))
      if self.bug_signature:
        if self.bug_signature in violations:
          self.simulation.violation_found = True
          msg.success("Violation found %s" % self.bug_signature)
          return True
        else:
          msg.fail("Violation does not match violation signature!")
          return False
      else:
        self.simulation.violation_found = True
        return True
    else:
      msg.success("No correctness violations!")
      return False

  def _check_early_state_changes(self, dag, current_index, input):
    ''' Check whether any pending state change that were supposed to come
    *after* the current input have occured. If so, we have violated causality.'''
    # Note that we never violate causality due to message sends/receipts,
    # since we buffer all messages.
    pending_state_changes = self.sync_callback.pending_state_changes()
    if len(pending_state_changes) > 0:
      # TODO(cs): currently assumes a single controller (-> single pending state
      # change)
      state_change = pending_state_changes[0]
      next_expected = dag.next_state_change(current_index)
      original_input_index = dag.get_original_index_for_event(input)
      if (next_expected is not None and
          state_change == next_expected.pending_state_change and
          dag.get_original_index_for_event(next_expected) > original_input_index):
        # raise RuntimeError("State change happened before expected! Causality violated")
        log.warn("State change happened before expected! Causality violated")
        self.early_state_changes.append(repr(next_expected))

  def _check_new_state_changes(self, dag, current_index):
    ''' If we are blocking controllers, it's bad news bears if an unexpected
    internal state change occurs. Check if there are any, and ACK them so that
    the execution can proceed.'''
    pending_state_changes = self.sync_callback.pending_state_changes()
    if len(pending_state_changes) > 0:
      # TODO(cs): currently assumes a single controller (-> single pending state
      # change)
      state_change = pending_state_changes[0]
      next_expected = dag.next_state_change(current_index)
      if (next_expected is None or
          state_change != next_expected.pending_state_change):
        log.info("Unexpected state change. Ack'ing %s" % str(state_change))
        log_event = ControllerStateChange.from_pending_state_change(state_change)
        # Monkeypatch a "new internal event" marker to be logged to the JSON trace
        # (All fields picked up by event.to_json())
        log_event.new_internal_event = True
        log_event.replay_time = SyncTime.now()
        self._log_input_event(log_event)
        self.unexpected_state_changes.append(repr(state_change))
        self.sync_callback.ack_pending_state_change(state_change)

  def _check_unexpected_cp_messages(self, dag, current_index):
    ''' If throughout replay we observe new messages that weren't in the
    original trace, we (usually) want to let them through. This is especially true
    for messages that are related to timers, such as LLDP, since timings will
    often change during replay, and we don't want to unnecessarily change the
    controller's behavior. '''
    if not self.allow_unexpected_messages:
      return
    # TODO(cs): evaluate whether _check_unexpected_cp_messages is more or less
    # effective than whitelisting cp message types (in OpenflowBuffer).
    # Compare executions with visualization tool.
    # Currently it appears that this method is too liberal, and ends up
    # causing timouts as a result of letting messages through.

    # First, build a set of expected ControlMessageSends/Receives fingerprints
    # within the next expected_message_round_window rounds.
    start_round = dag.events[current_index].round
    expected_receive_fingerprints = set()
    expected_send_fingerprints = set()
    for i in xrange(current_index, len(dag.events)):
      event = dag.events[i]
      if event.round - start_round > self.expected_message_round_window:
        break
      if type(event) == ControlMessageReceive:
        (_, of_fingerprint, dpid, cid) = event.fingerprint
        expected_receive_fingerprints.add((of_fingerprint, dpid, cid))
      if type(event) == ControlMessageSend:
        (_, of_fingerprint, dpid, cid) = event.fingerprint
        expected_send_fingerprints.add((of_fingerprint, dpid, cid))

    # Now check pending messages.
    for expected_fingerprints, messages in [
         (expected_receive_fingerprints, self.simulation.openflow_buffer.pending_receives),
         (expected_send_fingerprints, self.simulation.openflow_buffer.pending_sends)]:
      for pending_message in messages:
        fingerprint = (pending_message.fingerprint,
                       pending_message.dpid,
                       pending_message.controller_id)
        if fingerprint not in expected_fingerprints:
          message = self.simulation.openflow_buffer.schedule(pending_message)
          log.debug("Allowed unexpected message %s" % message)
          b64_packet = base64_encode(message)
          # Monkeypatch a "new internal event" marker to be logged to the JSON trace
          # (All fields picked up by event.to_json())
          event_type = ControlMessageReceive if type(pending_message) == PendingReceive else ControlMessageSend
          log_event = event_type(pending_message.dpid, pending_message.controller_id,
                                 pending_message.fingerprint, b64_packet=b64_packet)
          log_event.new_internal_event = True
          log_event.replay_time = SyncTime.now()
          self.passed_unexpected_messages.append(repr(log_event))
          self._log_input_event(log_event)
Exemple #4
0
class Replayer(ControlFlow):
  '''
  Replay events from a `superlog` with causal dependencies, pruning as we go

  To set the event scheduling paramters, pass them as keyword args to the
  constructor of this class, which will pass them on to the EventScheduler object it creates.
  '''

  # Runtime stats:
  total_replays = 0
  total_inputs_replayed = 0
  # Interpolated time parameter. *not* the event scheduling epsilon:
  time_epsilon_microseconds = 500

  def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None,
               print_buffers=True, wait_on_deterministic_values=False,
               fail_to_interactive=False, **kwargs):
    ControlFlow.__init__(self, simulation_cfg)
    if wait_on_deterministic_values:
      self.sync_callback = ReplaySyncCallback()
    else:
      self.sync_callback = ReplaySyncCallback(self.get_interpolated_time)

    if type(superlog_path_or_dag) == str:
      superlog_path = superlog_path_or_dag
      # The dag is codefied as a list, where each element has
      # a list of its dependents
      self.dag = EventDag(superlog_parser.parse_path(superlog_path))
    else:
      self.dag = superlog_path_or_dag

    self.dp_checker = DataplaneChecker(self.dag)

    self.print_buffers = print_buffers

    # compute interpolate to time to be just before first event
    self.compute_interpolated_time(self.dag.events[0])
    self.unexpected_state_changes = []
    self.early_state_changes = []
    self.event_scheduler_stats = None
    self.fail_to_interactive = fail_to_interactive

    if create_event_scheduler:
      self.create_event_scheduler = create_event_scheduler
    else:
      self.create_event_scheduler = \
        lambda simulation: EventScheduler(simulation,
            **{ k: v for k,v in kwargs.items()
                if k in EventScheduler.kwargs })

  def get_interpolated_time(self):
    '''
    During divergence, the controller may ask for the current time more or
    less times than they did in the original run. We control the time, so we
    need to give them some answer. The answers we give them should be
    (i) monotonically increasing, and (ii) between the time of the last
    recorded ("landmark") event and the next landmark event, and (iii)
    as close to the recorded times as possible

    Our temporary solution is to always return the time right before the next
    landmark
    '''
    # TODO(cs): implement Andi's improved time heuristic
    return self.interpolated_time

  def compute_interpolated_time(self, current_event):
    next_time = current_event.time
    just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds
    just_before_micro = max(0, just_before_micro)
    self.interpolated_time = SyncTime(next_time.seconds, just_before_micro)

  def increment_round(self):
    msg.event(color.CYAN + ( "Round %d" % self.logical_time) + color.WHITE)

  def simulate(self, post_bootstrap_hook=None):
    ''' Caller *must* call simulation.clean_up() '''
    Replayer.total_replays += 1
    Replayer.total_inputs_replayed += len(self.dag.input_events)
    self.simulation = self.simulation_cfg.bootstrap(self.sync_callback)
    assert(isinstance(self.simulation.patch_panel, BufferedPatchPanel))
    ### TODO aw remove this hack
    self.simulation.fail_to_interactive = self.fail_to_interactive
    self.logical_time = 0
    self.run_simulation_forward(self.dag, post_bootstrap_hook)
    if self.print_buffers:
      self._print_buffers()
    return self.simulation

  def _print_buffers(self):
    log.debug("Pending Message Receives:")
    for p in self.simulation.god_scheduler.pending_receives():
      log.debug("- %s", p)
    log.debug("Pending State Changes:")
    for p in self.sync_callback.pending_state_changes():
      log.debug("- %s", p)

  def run_simulation_forward(self, dag, post_bootstrap_hook=None):
    event_scheduler = self.create_event_scheduler(self.simulation)
    self.event_scheduler_stats = event_scheduler.stats
    if post_bootstrap_hook is not None:
      post_bootstrap_hook()

    old_interrupt = None

    def interrupt(sgn, frame):
      msg.interactive("Interrupting replayer, dropping to console (press ^C again to terminate)")
      signal.signal(signal.SIGINT, self.old_interrupt)
      self.old_interrupt = None
      raise KeyboardInterrupt()
    self.old_interrupt = signal.signal(signal.SIGINT, interrupt)

    try:
      for i, event in enumerate(dag.events):
        try:
          self.compute_interpolated_time(event)
          self.dp_checker.check_dataplane(i, self.simulation)
          if isinstance(event, InputEvent):
            self._check_early_state_changes(dag, i, event)
          self._check_new_state_changes(dag, i)
          # TODO(cs): quasi race-condition here. If unexpected state change
          # happens *while* we're waiting for event, we basically have a
          # deadlock (if controller logging is set to blocking) until the
          # timeout occurs
          event_scheduler.schedule(event)
          if self.logical_time != event.round:
            self.logical_time = event.round
            self.increment_round()
        except KeyboardInterrupt as e:
          interactive = Interactive(self.simulation_cfg)
          interactive.simulate(self.simulation, bound_objects=( ('replayer', self), ))
          self.old_interrupt = signal.signal(signal.SIGINT, interrupt)
    finally:
      if self.old_interrupt:
        signal.signal(signal.SIGINT, self.old_interrupt)
      msg.event(color.B_BLUE+"Event Stats: %s" % str(event_scheduler.stats))
      msg.event(color.B_BLUE+"DataplaneDrop Stats: %s" % str(self.dp_checker.stats))

  def _check_early_state_changes(self, dag, current_index, input):
    ''' Check whether the any pending state change were supposed to come
    *after* the current input. If so, we have violated causality.'''
    pending_state_changes = self.sync_callback.pending_state_changes()
    if len(pending_state_changes) > 0:
      # TODO(cs): currently assumes a single controller (-> single pending state
      # change)
      state_change = pending_state_changes[0]
      next_expected = dag.next_state_change(current_index)
      original_input_index = dag.get_original_index_for_event(input)
      if (next_expected is not None and
          state_change == next_expected.pending_state_change and
          dag.get_original_index_for_event(next_expected) > original_input_index):
        raise RuntimeError("State change happened before expected! Causality violated")
        self.early_state_changes.append(repr(next_expected))

  def _check_new_state_changes(self, dag, current_index):
    ''' If we are blocking controllers, it's bad news bears if an unexpected
    internal state change occurs. Check if there are any, and ACK them so that
    the execution can proceed.'''
    pending_state_changes = self.sync_callback.pending_state_changes()
    if len(pending_state_changes) > 0:
      # TODO(cs): currently assumes a single controller (-> single pending state
      # change)
      state_change = pending_state_changes[0]
      next_expected = dag.next_state_change(current_index)
      if (next_expected is None or
          state_change != next_expected.pending_state_change):
        log.info("Unexpected state change. Ack'ing")
        self.unexpected_state_changes.append(repr(state_change))
        self.sync_callback.ack_pending_state_change(state_change)
Exemple #5
0
class Replayer(ControlFlow):
  '''
  Replay events from a trace

  To set the event scheduling parameters, pass them as keyword args to the
  constructor of this class, which will pass them on to the EventScheduler object it creates.
  '''

  # Interpolated time parameter. *not* the event scheduling epsilon:
  time_epsilon_microseconds = 500

  def __init__(self, simulation_cfg, superlog_path_or_dag, create_event_scheduler=None,
               print_buffers=True, wait_on_deterministic_values=False, default_dp_permit=False,
               fail_to_interactive=False, fail_to_interactive_on_persistent_violations=False,
               end_in_interactive=False, input_logger=None,
               allow_unexpected_messages=False,
               pass_through_whitelisted_messages=True,
               delay_flow_mods=False,
               **kwargs):
    ControlFlow.__init__(self, simulation_cfg)
    if wait_on_deterministic_values:
      self.sync_callback = ReplaySyncCallback()
    else:
      self.sync_callback = ReplaySyncCallback(self.get_interpolated_time)

    if type(superlog_path_or_dag) == str:
      superlog_path = superlog_path_or_dag
      # The dag is codefied as a list, where each element has
      # a list of its dependents
      self.dag = EventDag(log_parser.parse_path(superlog_path))
    else:
      self.dag = superlog_path_or_dag

    if len(self.dag.events) == 0:
      raise ValueError("No events to replay!")

    self.default_dp_permit = default_dp_permit
    # Set DataplanePermit and DataplaneDrop to passive if permit is set
    # to default
    for event in [ e for e in self.dag.events if type(e) in dp_events ]:
      event.passive = default_dp_permit

    self.dp_checker = None
    if default_dp_permit:
      self.dp_checker = DataplaneChecker(self.dag)

    self.print_buffers_flag = print_buffers

    # compute interpolate to time to be just before first event
    self.compute_interpolated_time(self.dag.events[0])
    # String repesentations of unexpected state changes we've passed through, for
    # statistics purposes.
    self.unexpected_state_changes = []
    self.early_state_changes = []
    self.event_scheduler_stats = None
    self.end_in_interactive = end_in_interactive
    self.fail_to_interactive = fail_to_interactive
    self.fail_to_interactive_on_persistent_violations =\
      fail_to_interactive_on_persistent_violations
    self._input_logger = input_logger
    self.allow_unexpected_messages = allow_unexpected_messages
    self.pass_through_whitelisted_messages = pass_through_whitelisted_messages
    # How many logical rounds to peek ahead when deciding if a message is
    # expected or not.
    self.expected_message_round_window = 3
    # String repesentations of unexpected messages we've passed through, for
    # statistics purposes.
    self.passed_unexpected_messages = []
    self.delay_flow_mods = delay_flow_mods

    if self.pass_through_whitelisted_messages:
      for event in self.dag.events:
        if hasattr(event, "ignore_whitelisted_packets"):
          event.ignore_whitelisted_packets = True

    if create_event_scheduler:
      self.create_event_scheduler = create_event_scheduler
    else:
      self.create_event_scheduler = \
        lambda simulation: EventScheduler(simulation,
            **{ k: v for k,v in kwargs.items()
                if k in EventScheduler.kwargs })

  def _log_input_event(self, event, **kws):
    if self._input_logger is not None:
      self._input_logger.log_input_event(event, **kws)

  def get_interpolated_time(self):
    '''
    During divergence, the controller may ask for the current time more or
    less times than they did in the original run. We control the time, so we
    need to give them some answer. The answers we give them should be
    (i) monotonically increasing, and (ii) between the time of the last
    recorded ("landmark") event and the next landmark event, and (iii)
    as close to the recorded times as possible

    Our temporary solution is to always return the time right before the next
    landmark
    '''
    # TODO(cs): implement Andi's improved time heuristic
    return self.interpolated_time

  def compute_interpolated_time(self, current_event):
    next_time = current_event.time
    just_before_micro = next_time.microSeconds - self.time_epsilon_microseconds
    just_before_micro = max(0, just_before_micro)
    self.interpolated_time = SyncTime(next_time.seconds, just_before_micro)

  def increment_round(self):
    msg.event(color.CYAN + ("Round %d" % self.logical_time) + color.WHITE)

  def init_results(self, results_dir):
    if self._input_logger:
      self._input_logger.open(results_dir)

  def simulate(self, post_bootstrap_hook=None):
    ''' Caller *must* call simulation.clean_up() '''
    self.simulation = self.simulation_cfg.bootstrap(self.sync_callback)
    assert(isinstance(self.simulation.patch_panel, BufferedPatchPanel))
    # TODO(aw): remove this hack
    self.simulation.fail_to_interactive = self.fail_to_interactive
    self.simulation.fail_to_interactive_on_persistent_violations =\
      self.fail_to_interactive_on_persistent_violations
    self.simulation.openflow_buffer.pass_through_whitelisted_messages =\
      self.pass_through_whitelisted_messages
    self.logical_time = 0
    if self.delay_flow_mods:
      for switch in self.simulation.topology.switches:
        assert(isinstance(switch, FuzzSoftwareSwitch))
        switch.use_delayed_commands()
    self.run_simulation_forward(self.dag, post_bootstrap_hook)
    if self.print_buffers_flag:
      self._print_buffers()
    return self.simulation

  def _print_buffers(self):
    log.debug("Pending Message Receives:")
    for p in self.simulation.openflow_buffer.pending_receives():
      log.debug("- %s", p)
    log.debug("Pending State Changes:")
    for p in self.sync_callback.pending_state_changes():
      log.debug("- %s", p)

  def run_simulation_forward(self, dag, post_bootstrap_hook=None):
    event_scheduler = self.create_event_scheduler(self.simulation)
    event_scheduler.set_input_logger(self._input_logger)
    self.event_scheduler_stats = event_scheduler.stats
    if post_bootstrap_hook is not None:
      post_bootstrap_hook()

    def interrupt(sgn, frame):
      msg.interactive("Interrupting replayer, dropping to console (press ^C again to terminate)")
      signal.signal(signal.SIGINT, self.old_interrupt)
      self.old_interrupt = None
      raise KeyboardInterrupt()
    self.old_interrupt = signal.signal(signal.SIGINT, interrupt)

    try:
      for i, event in enumerate(dag.events):
        try:
          self.compute_interpolated_time(event)
          if self.default_dp_permit:
            self.dp_checker.check_dataplane(i, self.simulation)
          if isinstance(event, InputEvent):
            self._check_early_state_changes(dag, i, event)
          self._check_new_state_changes(dag, i)
          self._check_unexpected_cp_messages(dag, i)
          # TODO(cs): quasi race-condition here. If unexpected state change
          # happens *while* we're waiting for event, we basically have a
          # deadlock (if controller logging is set to blocking) until the
          # timeout occurs
          # TODO(cs): we don't actually allow new internal message events
          # through.. we only let new state changes through. Should experiment
          # with whether we would get better fidelity if we let them through.
          event_scheduler.schedule(event)
          if self.logical_time != event.round:
            self.logical_time = event.round
            self.increment_round()
        except KeyboardInterrupt:
          interactive = Interactive(self.simulation_cfg,
                                    input_logger=self._input_logger)
          interactive.simulate(self.simulation, bound_objects=( ('replayer', self), ))
          self.old_interrupt = signal.signal(signal.SIGINT, interrupt)
    finally:
      if self.old_interrupt:
        signal.signal(signal.SIGINT, self.old_interrupt)
      msg.event(color.B_BLUE+"Event Stats: %s" % str(event_scheduler.stats))
      if self.default_dp_permit:
        msg.event(color.B_BLUE+"DataplaneDrop Stats: %s" % str(self.dp_checker.stats))
    if self.end_in_interactive:
      interactive = Interactive(self.simulation_cfg,
          input_logger=self._input_logger)
      interactive.simulate(self.simulation, bound_objects=( ('replayer', self), ))

  def _check_early_state_changes(self, dag, current_index, input):
    ''' Check whether any pending state change that were supposed to come
    *after* the current input have occured. If so, we have violated causality.'''
    # Note that we never violate causality due to message sends/receipts,
    # since we buffer all messages.
    pending_state_changes = self.sync_callback.pending_state_changes()
    if len(pending_state_changes) > 0:
      # TODO(cs): currently assumes a single controller (-> single pending state
      # change)
      state_change = pending_state_changes[0]
      next_expected = dag.next_state_change(current_index)
      original_input_index = dag.get_original_index_for_event(input)
      if (next_expected is not None and
          state_change == next_expected.pending_state_change and
          dag.get_original_index_for_event(next_expected) > original_input_index):
        # raise RuntimeError("State change happened before expected! Causality violated")
        log.warn("State change happened before expected! Causality violated")
        self.early_state_changes.append(repr(next_expected))

  def _check_new_state_changes(self, dag, current_index):
    ''' If we are blocking controllers, it's bad news bears if an unexpected
    internal state change occurs. Check if there are any, and ACK them so that
    the execution can proceed.'''
    pending_state_changes = self.sync_callback.pending_state_changes()
    if len(pending_state_changes) > 0:
      # TODO(cs): currently assumes a single controller (-> single pending state
      # change)
      state_change = pending_state_changes[0]
      next_expected = dag.next_state_change(current_index)
      if (next_expected is None or
          state_change != next_expected.pending_state_change):
        log.info("Unexpected state change. Ack'ing")
        log_event = ControllerStateChange.from_pending_state_change(state_change)
        # Monkeypatch a "new internal event" marker to be logged to the JSON trace
        # (All fields picked up by event.to_json())
        log_event.new_internal_event = True
        log_event.replay_time = SyncTime.now()
        self._log_input_event(log_event)
        self.unexpected_state_changes.append(repr(state_change))
        self.sync_callback.ack_pending_state_change(state_change)

  def _check_unexpected_cp_messages(self, dag, current_index):
    ''' If throughout replay we observe new messages that weren't in the
    original trace, we (usually) want to let them through. This is especially true
    for messages that are related to timers, such as LLDP, since timings will
    often change during replay, and we don't want to unnecessarily change the
    controller's behavior. '''
    if not self.allow_unexpected_messages:
      return
    # TODO(cs): evaluate whether _check_unexpected_cp_messages is more or less
    # effective than whitelisting cp message types (in OpenflowBuffer).
    # Compare executions with visualization tool.
    # Currently it appears that this method is too liberal, and ends up
    # causing timouts as a result of letting messages through.

    # First, build a set of expected ControlMessageSends/Receives fingerprints
    # within the next expected_message_round_window rounds.
    start_round = dag.events[current_index].round
    expected_receive_fingerprints = set()
    expected_send_fingerprints = set()
    for i in xrange(current_index, len(dag.events)):
      event = dag.events[i]
      if event.round - start_round > self.expected_message_round_window:
        break
      if type(event) == ControlMessageReceive:
        (_, of_fingerprint, dpid, cid) = event.fingerprint
        expected_receive_fingerprints.add((of_fingerprint, dpid, cid))
      if type(event) == ControlMessageSend:
        (_, of_fingerprint, dpid, cid) = event.fingerprint
        expected_send_fingerprints.add((of_fingerprint, dpid, cid))

    # Now check pending messages.
    for expected_fingerprints, messages in [
         (expected_receive_fingerprints, self.simulation.openflow_buffer.pending_receives()),
         (expected_send_fingerprints, self.simulation.openflow_buffer.pending_sends())]:
      for pending_message in messages:
        fingerprint = (pending_message.fingerprint,
                       pending_message.dpid,
                       pending_message.controller_id)
        if fingerprint not in expected_fingerprints:
          message = self.simulation.openflow_buffer.schedule(pending_message)
          log.debug("Sending unexpected %s" % message)
          b64_packet = base64_encode(message)
          # Monkeypatch a "new internal event" marker to be logged to the JSON trace
          # (All fields picked up by event.to_json())
          event_type = ControlMessageReceive if type(pending_message) == PendingReceive else ControlMessageSend
          log_event = event_type(pending_message.dpid, pending_message.controller_id,
                                 pending_message.fingerprint, b64_packet=b64_packet)
          log_event.new_internal_event = True
          log_event.replay_time = SyncTime.now()
          self.passed_unexpected_messages.append(repr(log_event))
          self._log_input_event(log_event)