    def testStartMultipleModelRunnersAndStopThem(self):
        # Starts several ModelRunners and stops them gracefully
        # to confirm that they can all stop without conflicting with each other:
        # if ModelRunnerProxy doesn't configure subprocess.Popen with
        # `close_fds=True`, then graceful shutdown will fail because the stdin
        # of some child processes will be cloned into those that are started
        # after them, and closing stdin of an earlier ModelRunner child process
        # won't have the desired effect of terminating that process (since other
        # clones of that file descriptor will prevent it from fully closing)
        #
        # TODO send commands to models and verify output

        runners = []

        modelIDs = tuple("abcdef" + str(i) for i in xrange(5))

        with ModelSwapperInterface() as swapper:
            modelInputMQs = tuple(
                swapper._getModelInputQName(modelID=modelID)
                for modelID in modelIDs)

        with amqp_test_utils.managedQueueDeleter(modelInputMQs):
            with MessageBusConnector() as bus:
                for mq in modelInputMQs:
                    bus.createMessageQueue(mq, durable=True)

            for modelID in modelIDs:
                runners.append(
                    slot_agent.ModelRunnerProxy(modelID=modelID,
                                                onTermination=lambda: None,
                                                logger=_LOGGER))

            returnCodes = [runner.stopGracefully() for runner in runners]

        self.assertEqual(returnCodes, [0] * len(runners))
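
# A minimal sketch of the fd-inheritance hazard the test above guards against
# (POSIX-only; the child commands are hypothetical stand-ins). Python 2's
# subprocess.Popen defaults to close_fds=False, so a later child inherits a
# clone of an earlier child's stdin pipe and closing our write end never
# delivers EOF. Passing close_fds=True, as ModelRunnerProxy must, avoids this.
import subprocess

first = subprocess.Popen(["cat"], stdin=subprocess.PIPE)
# close_fds=True keeps `second` from inheriting first.stdin's write end:
second = subprocess.Popen(["sleep", "60"], close_fds=True)
first.stdin.close()  # EOF reaches `cat` because no other clones remain
first.wait()         # returns promptly
second.kill()
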
    def run(self):
        """ Poll the repository for autostack metrics pending data collection
        and dispatch collection/aggregation requests; loops until the process
        is stopped """
        with ModelSwapperInterface() as modelSwapper:
            engine = repository.engineFactory()
            while True:
                with engine.connect() as conn:
                    pendingStacks = repository.retryOnTransientErrors(
                        repository.getAutostackMetricsPendingDataCollection)(
                            conn)

                if not pendingStacks:
                    time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
                    continue

                # Build a sequence of autostack metric requests
                requests = []
                for autostack, metrics in pendingStacks:
                    refBase = len(requests)
                    requests.extend(
                        AutostackMetricRequest(refID=refBase + i,
                                               autostack=autostack,
                                               metric=metric)
                        for i, metric in enumerate(metrics))

                # Collect, aggregate, and stream metric data
                self._processAutostackMetricRequests(engine, requests,
                                                     modelSwapper)
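
# A worked sketch of the refID assignment above, with hypothetical stand-ins
# for the autostack/metric objects and plain tuples in place of
# AutostackMetricRequest: the refBase offset keeps every refID unique across
# the combined request sequence.
pendingStacksExample = [("stackA", ["m1", "m2"]),
                        ("stackB", ["m3", "m4", "m5"])]
exampleRequests = []
for autostack, metrics in pendingStacksExample:
    refBase = len(exampleRequests)
    exampleRequests.extend((refBase + i, autostack, metric)
                           for i, metric in enumerate(metrics))
assert [ref for ref, _, _ in exampleRequests] == [0, 1, 2, 3, 4]
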
  def __init__(self, modelID):
    """
    :param modelID: model ID; string
    """
    self._logger = _getLogger()

    self._modelID = modelID

    self._swapperAPI = ModelSwapperInterface()

    self._archiver = _ModelArchiver(self._modelID)

    # "deleteModel" command handler sets this flag to force our processing
    # loop to terminate
    self._done = False

    modelSwapperConfig = ModelSwapperConfig()

    self._targetMaxRequestsPerCheckpoint = modelSwapperConfig.getint(
      "model_runner", "target_requests_per_checkpoint")

    self._profiling = (
      modelSwapperConfig.getboolean("debugging", "profiling") or
      self._logger.isEnabledFor(logging.DEBUG))

    if self._profiling:
      self._logger.info("Profiling is turned on")

      self._modelLoadSec = 0
def runServer():
  # Get the current list of custom metrics
  appConfig = Config("application.conf",
                     os.environ["APPLICATION_CONFIG_PATH"])

  engine = repository.engineFactory(appConfig)
  global gCustomMetrics
  now = datetime.datetime.utcnow()

  with engine.connect() as conn:
    gCustomMetrics = dict(
      (m.name, [m, now]) for m in repository.getCustomMetrics(conn))

  queueName = appConfig.get("metric_listener", "queue_name")

  global gProfiling
  gProfiling = (appConfig.getboolean("debugging", "profiling") or
                LOGGER.isEnabledFor(logging.DEBUG))
  del appConfig

  metricStreamer = MetricStreamer()
  modelSwapper = ModelSwapperInterface()

  with MessageBusConnector() as bus:
    if not bus.isMessageQeueuePresent(queueName):
      bus.createMessageQueue(mqName=queueName, durable=True)
    LOGGER.info("Waiting for messages. To exit, press CTRL+C")
    with bus.consume(queueName) as consumer:
      messages = []
      messageRxTimes = []
      while True:
        message = consumer.pollOneMessage()
        if message is not None:
          messages.append(message)
          if gProfiling:
            messageRxTimes.append(time.time())

        if message is None or len(messages) >= MAX_MESSAGES_PER_BATCH:
          if messages:
            # Process the batch
            try:
              _handleBatch(engine,
                           messages,
                           messageRxTimes,
                           metricStreamer,
                           modelSwapper)
            except Exception:  # pylint: disable=W0703
              LOGGER.exception("Unknown failure in processing messages.")
              # Make sure that we ack messages when there is an unexpected error
              # to avoid getting hung forever on one bad record.

            # Ack all the messages
            messages[-1].ack(multiple=True)
            # Clear the message buffer
            messages = []
            messageRxTimes = []
          else:
            # Queue is empty, wait before retrying
            time.sleep(POLL_DELAY_SEC)
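
# A minimal sketch of the cumulative-ack idiom above (a hypothetical message
# stand-in; real messages come from bus.consume). With multiple=True,
# AMQP-style brokers acknowledge the given message and every earlier
# unacknowledged message on the channel in a single call, which is why acking
# only messages[-1] covers the whole batch.
class _FakeMessage(object):
    def __init__(self, deliveryTag):
        self.deliveryTag = deliveryTag

    def ack(self, multiple=False):
        print("ack deliveryTag=%s multiple=%s" % (self.deliveryTag, multiple))

batch = [_FakeMessage(1), _FakeMessage(2), _FakeMessage(3)]
batch[-1].ack(multiple=True)  # on a real channel, this would cover tags 1..3
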
def runConsumerThread(destList, numExpected):
    with ModelSwapperInterface() as swapper:
        with swapper.consumeResults() as consumer:
            for batch in consumer:
                destList.append(batch)
                batch.ack()
                _LOGGER.info("Got result batch=%r", batch)
                if len(destList) == numExpected:
                    break
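
# A hedged usage sketch for runConsumerThread above (numExpected and the join
# timeout are arbitrary): run it on a daemon thread, as the tests in this
# listing do, so an unexpectedly blocked consumer cannot hang the caller.
import threading

results = []
consumerThread = threading.Thread(target=runConsumerThread, args=(results, 3))
consumerThread.setDaemon(True)
consumerThread.start()
consumerThread.join(timeout=30)
assert len(results) == 3, "Timed out waiting for result batches"
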
def createHTMModel(modelId, params):
    """ Dispatch command to create HTM model

  :param modelId: unique identifier of the metric row

  :param modelParams: model params for creating a scalar model per ModelSwapper
    interface

  :param modelSwapper: htmengine.model_swapper.model_swapper_interface object
  """
    with ModelSwapperInterface() as modelSwapper:
        modelSwapper.defineModel(modelID=modelId,
                                 args=params,
                                 commandID=createGuid())
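
# A hedged usage sketch for createHTMModel above: the metric row ID is
# hypothetical, and the params dict is a placeholder for what helpers such as
# getScalarMetricWithTimeOfDayParams produce elsewhere in this listing.
hypotheticalParams = {"modelConfig": {}, "inferenceArgs": {}}
createHTMModel(modelId="foobar", params=hypotheticalParams)
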
  def testSubmitResults(self, messageBusConnectorClassMock):
    results = [
      ModelCommandResult(commandID="abc", method="testMethod", status=0,
        args={'key1': 4098, 'key2': 4139}),
      ModelInferenceResult(rowID="foo", status=0, anomalyScore=1),
      ModelInferenceResult(rowID="bar", status=0, anomalyScore=2)
    ]

    messageBusConnectorMock = messageBusConnectorClassMock.return_value

    interface = ModelSwapperInterface()

    modelID = "foofar"
    msg = ResultMessagePackager.marshal(
      modelID=modelID,
      batchState=BatchPackager.marshal(batch=results))

    interface.submitResults(modelID=modelID, results=results)

    mqName = interface._resultsQueueName

    messageBusConnectorMock.publish.assert_called_once_with(mqName, msg,
                                                            persistent=True)
    def testStartModelRunnerAndStopIt(self):
        # Simple test that starts a ModelRunner and stops it gracefully
        # TODO send command to model and verify output

        modelID = "abcdef"

        with ModelSwapperInterface() as swapper:
            modelInputMQ = swapper._getModelInputQName(modelID=modelID)

        with amqp_test_utils.managedQueueDeleter(modelInputMQ):
            with MessageBusConnector() as bus:
                bus.createMessageQueue(modelInputMQ, durable=True)

            runner = slot_agent.ModelRunnerProxy(modelID=modelID,
                                                 onTermination=lambda: None,
                                                 logger=_LOGGER)

            returnCode = runner.stopGracefully()

            self.assertEqual(returnCode, 0)
class SwapController(object):
  """ Orchestrate model swapping across a fixed set of SlotAgent instances based
  on incoming model commands and records.
  """


  # Event methods for the SwapController's event loop. These are handled by
  # instance methods with names constructed as follows:
  # "_handle" + method + "Event" (e.g., _handleStopEventLoopRequestEvent)
  _NEW_INPUT_NOTIFY_METHOD = "NewInputNotify"
  _MODEL_DONE_NOTIFY_METHOD = "ModelDoneNotify"
  _STOP_EVENT_LOOP_REQUEST_METHOD = "StopEventLoopRequest"


  _NOTIFICATION_READER_THREAD_START_WAIT_TIMEOUT_SEC = 5
  _NOTIFICATION_READER_THREAD_JOIN_TIMEOUT_SEC = 10

  _EXIT_CODE_ON_FAILURE_OF_NOTIFICATION_READER_THREAD = 1

  def __init__(self, concurrency):
    """
    concurrency: allowed number of model slots
    """
    self._logger = _getLogger()

    self._profiling = (
      ModelSwapperConfig().getboolean("debugging", "profiling") or
      self._logger.isEnabledFor(logging.DEBUG))

    # Allowed number of model slots
    self._concurrency = concurrency

    # Input-reader thread target function sets this when it starts running to
    # let our event loop know that things are off to a good start
    self._notificationReaderStartedEvent = threading.Event()

    self._notificationMutex = threading.Lock()
    # Mutex used to guarantee that no further model input notifications will
    # be added to the main event queue once self._stopNotificationReader is set

    # _runNotificationReaderThread will not process any more notifications
    # once it detects that this flag is true
    self._stopNotificationReader = False

    # The event loop will exit some time after an event handler sets this flag
    # to True
    self._eventLoopStopPending = False

    # (non-thread-safe) The tuple of all slot agents
    self._slotAgents = tuple(SlotAgent(slotID=i) for i in xrange(concurrency))
    assert self._slotAgents

    # Thread-safe event queue for SwapController
    self._eventQ = Queue.Queue()

    # Main event loop's ModelSwapperInterface instance. MUST NOT be used from
    # other threads because ModelSwapperInterface is not thread-safe
    self._mainSwapper = ModelSwapperInterface()

    # A (non-thread-safe) FIFO of models that are waiting to be scheduled for
    # running; there is incoming data for them that needs to be processed
    self._waitingModelsFIFO = []

    # A (non-thread-safe) map of modelIDs to _RunningModelInfo instances
    self._runningModelsMap = dict()

    # A (non-thread-safe) list of free slot indexes into the self._slotAgents
    # tuple
    self._freeSlots = list(xrange(len(self._slotAgents)))

    # (non-thread-safe) Indexes of SlotAgents pending preemption
    self._pendingPreemptSlotsSet = set()

    self._notificationReaderThread = threading.Thread(
      target=self._runNotificationReaderThread,
      name="%s-input-reader-%s" % (self.__class__.__name__, id(self)))
    # Allow process to exit even if thread is still running
    self._notificationReaderThread.setDaemon(True)


  @logEntryExit(_getLogger, logging.INFO)
  @logExceptions(_getLogger)
  def run(self):
    """ Run SwapController; blocking """
    # Start our input-reader thread
    self._logger.info("Starting Notification Reader thread")
    self._notificationReaderThread.start()
    self._notificationReaderStartedEvent.wait(
      timeout=self._NOTIFICATION_READER_THREAD_START_WAIT_TIMEOUT_SEC)
    assert self._notificationReaderStartedEvent.is_set(), \
      "Notification-reader thread failed to start in time"

    self._logger.info("Notification Reader started, now entering Event Loop")

    requestedStopOfRemainingModels = False

    while True:
      if self._eventLoopStopPending:
        if not self._runningModelsMap and not self._waitingModelsFIFO:
          # All models are idle now, so close Slot Agents and bail out
          for sa in self._slotAgents:
            sa.close()

          self._logger.info("Closed all Slot Agents; leaving event loop")
          break

        elif not self._waitingModelsFIFO and not requestedStopOfRemainingModels:
          # Only running models remain, so request to stop them gracefully
          assert self._runningModelsMap

          for modelInfo in self._runningModelsMap.itervalues():
            if modelInfo.slotIndex not in self._pendingPreemptSlotsSet:
              self._slotAgents[modelInfo.slotIndex].stopModel()
              self._pendingPreemptSlotsSet.add(modelInfo.slotIndex)

          requestedStopOfRemainingModels = True
          self._logger.info("Requested stop of remaining running models")


      # Get and handle next event
      evt = self._eventQ.get()
      method = evt["method"]
      handler = getattr(self, "_handle" + method + "Event")
      handler(**evt)


  def requestStopTS(self):
    """ [thread-safe; non-blocking] Enqueue a request to stop the
    SwapController. This will cause the instance's run() method to return
    eventually
    """
    # This will prevent notification reader thread from processing any more
    # notifications. By the time we release the lock, no new notifications
    # will be placed in our event queue, so we can proceed with graceful
    # shutdown.
    with self._notificationMutex:
      self._stopNotificationReader = True

    self._logger.info("Requested stop of notification processing")

    # Asynchronously request graceful shutdown of our event loop. This allows
    # all remaining events from the notification-reader to be handled
    # prior to commencing graceful shutdown.
    self._logger.info("Requesting event-loop shutdown")
    self._eventQ.put({"method" : self._STOP_EVENT_LOOP_REQUEST_METHOD})


  def _newInputNotifyTS(self, modelID):
    """ [thread-safe] Notify Model Swapper that new input data arrived for the
    given model

    :param modelID: ID of the model for which new data arrived
    """
    self._eventQ.put(
      {"method" : self._NEW_INPUT_NOTIFY_METHOD, "modelID" : modelID})


  def _modelDoneNotifyTS(self, modelID, exitStatus):
    """ [thread-safe] Notify Model Swapper that a ModelRunner process completed.
    This method is passed as a callback to the SlotAgent that
    runs the model.

    :param modelID: model ID of model that completed execution
    :param exitStatus: the model's ModelRunner process exit status per
      os.WEXITSTATUS
    """
    self._eventQ.put({"method" : self._MODEL_DONE_NOTIFY_METHOD,
                      "modelID" : modelID, "exitStatus" : exitStatus,
                      "endTime" : time.time()})


  def _handleStopEventLoopRequestEvent(self, method):  # pylint: disable=W0613
    """ Set a flag to signal our event loop that it's time for graceful shutdown
    of the event loop. The event is enqueued by the "stop request" handler after
    it blocks further processing of notifications by the notification reader.
    """
    self._eventLoopStopPending = True
    self._logger.info("Set _eventLoopStopPending")


  def _handleNewInputNotifyEvent(self, method,  # pylint: disable=W0613
                                 modelID):
    """ Notification that new input was queued up for a particular model """
    runningModelInfo = self._runningModelsMap.get(modelID)
    if runningModelInfo is not None:
      # This model is already running
      runningModelInfo.updateTimestamp()

    elif modelID not in self._waitingModelsFIFO:
      # This model was not running and is not awaiting execution

      # TODO: for a large _waitingModelsFIFO, the membership check may be slow.
      #  Consider employing a map of all models of interest with current state
      #  (waiting/pending/running/idle) reflected in the items for O(1) lookups,
      #  but profile first to see if it's a problem!

      # NOTE: it's possible that the model has already processed all its input
      #  and we're handling this notification belatedly, and this may result in
      #  unnecessary start-up of its ModelRunner. We should generally be pretty
      #  well caught up with notifications, and this shouldn't be an issue.
      #  However, if necessary, we can mitigate this by checking the model's
      #  input queue *just before* starting the model.

      if self._freeSlots:
        # No models should be waiting if we have a free slot
        assert not self._waitingModelsFIFO, repr(self._waitingModelsFIFO)

        # Assign the model to a free slot
        self._assignModelToFreeSlot(modelID)

      else:
        # This model needs to wait until resources become available
        self._waitingModelsFIFO.append(modelID)

        if self._profiling:
          self._logger.info("{TAG:SWAP.SC.MODEL.WAIT} model=%s; "
                            "numWaitingModels=%s; numPendingPreemptSlots=%s",
                            modelID, len(self._waitingModelsFIFO),
                            len(self._pendingPreemptSlotsSet))

        self._requestPreemptionOfRunningSlotIfNeededAndPossible()


  def _handleModelDoneNotifyEvent(self, method,  # pylint: disable=W0613
                                  modelID, exitStatus, endTime):
    """ Notification that a particular model completed execution and the
    SlotAgent that ran it is now available for a new assignment.

    exitStatus: the exit status of the ModelRunner process (per os.WEXITSTATUS)
    """
    doneModelInfo = self._runningModelsMap.pop(modelID)


    if self._profiling:
      self._logger.info(
        "{TAG:SWAP.SC.MODEL.DONE} model=%s; slot=%d; exitStatus=%d; "
        "duration=%s; numRunningModels=%s; numWaitingModels=%s", modelID,
        doneModelInfo.slotIndex, exitStatus, endTime - doneModelInfo.startTime,
        len(self._runningModelsMap), len(self._waitingModelsFIFO))

    assert doneModelInfo.slotIndex not in self._freeSlots
    assert 0 <= doneModelInfo.slotIndex < len(self._slotAgents)
    assert len(self._freeSlots) < len(self._slotAgents)

    self._freeSlots.append(doneModelInfo.slotIndex)

    # Drop the slot from pending-preempt set in case it was scheduled for
    # preemption
    self._pendingPreemptSlotsSet.discard(doneModelInfo.slotIndex)

    # Ack SlotAgent to ready it for the next model
    self._slotAgents[doneModelInfo.slotIndex].releaseSlot()

    if self._mainSwapper.modelInputPending(modelID):
      # There is more unprocessed input data for the completed model,
      # so notify ourselves asynchronously to schedule this model
      self._newInputNotifyTS(modelID)

    if self._waitingModelsFIFO:
      # Start a waiting model, now that we know there is a free slot
      newModelID = self._waitingModelsFIFO.pop(0)
      self._assignModelToFreeSlot(newModelID)

      self._requestPreemptionOfRunningSlotIfNeededAndPossible()


  def _assignModelToFreeSlot(self, modelID):
    """ Assign the given model to a free slot """
    assert modelID not in self._runningModelsMap
    assert modelID not in self._waitingModelsFIFO

    freeSlotIndex = self._freeSlots.pop()

    self._slotAgents[freeSlotIndex].startModel(
      modelID=modelID,
      modelFinishedCallback=partial(self._modelDoneNotifyTS, modelID))

    self._runningModelsMap[modelID] = _RunningModelInfo(freeSlotIndex)

    assert ((len(self._runningModelsMap) + len(self._freeSlots)) ==
            len(self._slotAgents)), (
      len(self._runningModelsMap), len(self._freeSlots), len(self._slotAgents))

    self._logger.debug(
      "{TAG:SWAP.SC.MODEL.ASSIGN} model=%s; slot=%s; numRunningModels=%s; "
      "numFreeSlots=%s; numWaitingModels=%s; numPendingPreemptSlots=%s",
      modelID, freeSlotIndex, len(self._runningModelsMap), len(self._freeSlots),
      len(self._waitingModelsFIFO), len(self._pendingPreemptSlotsSet))


  def _requestPreemptionOfRunningSlotIfNeededAndPossible(self):
    """ Schedule a single slot for preemption if needed and possible """
    # There shouldn't be any free slots when we're asked to preempt
    assert not self._freeSlots, repr(self._freeSlots)

    if (len(self._waitingModelsFIFO) <= len(self._pendingPreemptSlotsSet) or
        len(self._pendingPreemptSlotsSet) >= len(self._slotAgents)):
      # Not needed or no preemptable slots
      return

    # Find an LRU non-pending-preempt busy slot agent, and request to
    # preempt it
    lru = sorted(
      ((i.slotIndex, i.timestamp)
        for i in self._runningModelsMap.itervalues()
        if i.slotIndex not in self._pendingPreemptSlotsSet),
      key=lambda element: element[1])

    slotIndex, timestamp = lru.pop(0)

    # Request preemption of the LRU slot
    self._slotAgents[slotIndex].stopModel()
    self._pendingPreemptSlotsSet.add(slotIndex)

    if self._profiling:
      self._logger.info(
        "{TAG:SWAP.SC.SLOT.PREEMPT.REQ} slot=%d with timestamp=%s; "
        "numWaitingModels=%s; numPendingPreemptSlots=%s",
        slotIndex, timestamp, len(self._waitingModelsFIFO),
        len(self._pendingPreemptSlotsSet))


  @abortProgramOnAnyException(
    _EXIT_CODE_ON_FAILURE_OF_NOTIFICATION_READER_THREAD,
    logger=_getLogger())
  @logEntryExit(_getLogger, logging.INFO)
  @logExceptions(_getLogger)
  def _runNotificationReaderThread(self):
    """ Read model data notifications and pass them to the event loop """
    self._logger.info("Notification Reader thread is running")

    # Let the main event loop know that this thread started successfully
    self._notificationReaderStartedEvent.set()


    with ModelSwapperInterface() as swapperAPI:
      # First, make sure our notification message queue exists, so we don't
      # miss any new notifications while we're checking for models with pending
      # input
      self._logger.info("SWAPPER_API: %r", swapperAPI)
      swapperAPI.initSchedulerNotification()

      # At start, notify main event loop of each model whose input is non-empty
      self._logger.info("Checking for models with pending input")

      i = 0
      for i, modelID in enumerate(swapperAPI.getModelsWithInputPending(), 1):
        self._logger.debug("Input pending for model=%s", modelID)
        self._newInputNotifyTS(modelID=modelID)

      self._logger.info("%s model(s) had pending input", i)

      # Service the SwapController's input queue until stop is requested
      with swapperAPI.consumeModelSchedulerNotifications() as consumer:
        numHandledNotifications = 0
        try:
          for notification in consumer:

            with self._notificationMutex:

              if self._stopNotificationReader:
                self._logger.info(
                  "Notification reader exiting due to stop request")
                break

              self._newInputNotifyTS(modelID=notification.value)

              notification.ack()

              numHandledNotifications += 1
          else:
            raise Exception("Unexpected termination of consumer loop in "
                            "Notification Reader")
        finally:
          self._logger.info(
            "Control is leaving notification reader loop after processing %s "
            "notifications", numHandledNotifications)
    def testModelSwapper(self):
        """Simple end-to-end test of the model swapper system."""

        modelSchedulerSubprocess = self._startModelSchedulerSubprocess()
        self.addCleanup(lambda: modelSchedulerSubprocess.kill() if
                        modelSchedulerSubprocess.returncode is None else None)

        modelID = "foobar"
        resultBatches = []

        with ModelSwapperInterface() as swapperAPI:
            possibleModels = getScalarMetricWithTimeOfDayParams(metricData=[0],
                                                                minVal=0,
                                                                maxVal=1000)

            # Submit requests including a model creation command and two data rows.
            args = possibleModels[0]
            args["inputRecordSchema"] = (
                FieldMetaInfo("c0", FieldMetaType.datetime,
                              FieldMetaSpecial.timestamp),
                FieldMetaInfo("c1", FieldMetaType.float,
                              FieldMetaSpecial.none),
            )

            # Define the model
            _LOGGER.info("Defining the model")
            swapperAPI.defineModel(modelID=modelID,
                                   args=args,
                                   commandID="defineModelCmd1")

            # Attempt to define the same model again
            _LOGGER.info("Defining the model again")
            swapperAPI.defineModel(modelID=modelID,
                                   args=args,
                                   commandID="defineModelCmd2")

            # Send input rows to the model
            inputRows = [
                ModelInputRow(
                    rowID="rowfoo",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 00), 5.3]),
                ModelInputRow(
                    rowID="rowbar",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 15), 2.4]),
            ]
            _LOGGER.info("Submitting batch of %d input rows...",
                         len(inputRows))
            swapperAPI.submitRequests(modelID=modelID, requests=inputRows)

            _LOGGER.info("These models have pending input: %s",
                         swapperAPI.getModelsWithInputPending())

            # Retrieve all results.
            # NOTE: We collect results via background thread to avoid
            # deadlocking the test runner in the event consuming blocks unexpectedly
            _LOGGER.info("Reading all batches of results...")

            numBatchesExpected = 3
            resultBatches.extend(
                self._consumeResults(numBatchesExpected, timeout=20))

            self.assertEqual(len(resultBatches), numBatchesExpected)

            with MessageBusConnector() as bus:
                # The results message queue should be empty now
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

            # Delete the model
            _LOGGER.info("Deleting the model")
            swapperAPI.deleteModel(modelID=modelID,
                                   commandID="deleteModelCmd1")

            _LOGGER.info("Waiting for model deletion result")
            resultBatches.extend(self._consumeResults(1, timeout=20))

            self.assertEqual(len(resultBatches), 4)

            with MessageBusConnector() as bus:
                # The results message queue should be empty now
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

                # The model input queue should be deleted now
                self.assertFalse(
                    bus.isMessageQeueuePresent(
                        swapperAPI._getModelInputQName(modelID=modelID)))

            # Try deleting the model again, to make sure there are no exceptions
            _LOGGER.info("Attempting to delete the model again")
            swapperAPI.deleteModel(modelID=modelID,
                                   commandID="deleteModelCmd1")

        # Verify results

        # First result batch should be the first defineModel result
        batch = resultBatches[0]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "defineModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "defineModelCmd1")

        # The second result batch should be for the second defineModel result
        # for the same model
        batch = resultBatches[1]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "defineModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "defineModelCmd2")

        # The third batch should be for the two input rows
        batch = resultBatches[2]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), len(inputRows))

        for inputRow, result in zip(inputRows, batch.objects):
            self.assertIsInstance(result, ModelInferenceResult)
            self.assertEqual(result.status, htmengineerrno.SUCCESS)
            self.assertEqual(result.rowID, inputRow.rowID)
            self.assertIsInstance(result.anomalyScore, float)

        # The fourth batch should be for the "deleteModel"
        batch = resultBatches[3]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "deleteModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "deleteModelCmd1")

        # Signal Model Scheduler Service subprocess to shut down and wait for it
        waitResult = dict()

        def runWaiterThread():
            try:
                waitResult["returnCode"] = modelSchedulerSubprocess.wait()
            except:
                _LOGGER.exception(
                    "Waiting for modelSchedulerSubprocess failed")
                waitResult["exceptionInfo"] = traceback.format_exc()
                raise
            return

        modelSchedulerSubprocess.terminate()
        waiterThread = threading.Thread(target=runWaiterThread)
        waiterThread.setDaemon(True)
        waiterThread.start()
        waiterThread.join(timeout=30)
        self.assertFalse(waiterThread.isAlive())

        self.assertEqual(waitResult["returnCode"], 0, msg=repr(waitResult))
    def testCollectAndPublishMetrics(self):
        # Start Metric Collector, create a set of Metrics, wait for it to collect
        # some metrics and to publish them to the metric_exchange, then validate
        # attributes of the published metrics.
        #
        # TODO Add more metric types
        # TODO Deeper validation of the published metrics

        # Start our own instance of metric collector and wait for data points
        with self._startModelSchedulerSubprocess() as modelSchedulerSubprocess, \
            self._startMetricCollectorSubprocess() as metricCollectorSubprocess:
            # Create some models for metric collector to harvest
            region = "us-west-2"
            namespace = "AWS/EC2"
            resourceType = ResourceTypeNames.EC2_INSTANCE

            engine = repository.engineFactory()
            adapter = createCloudwatchDatasourceAdapter()

            ec2Instances = adapter.describeResources(region=region,
                                                     resourceType=resourceType)

            self.assertGreater(len(ec2Instances), 0)

            maxModels = 10

            ec2Instances = ec2Instances[
                :min(maxModels, Quota.getInstanceQuota())]

            metricInstances = []

            _LOGGER.info("Starting %d models", len(ec2Instances))
            self.assertGreater(len(ec2Instances), 0)
            for ec2Instance in ec2Instances:

                metricSpec = {
                    "region": region,
                    "namespace": namespace,
                    "metric": "CPUUtilization",
                    "dimensions": {
                        "InstanceId": ec2Instance["resID"]
                    }
                }

                modelSpec = {
                    "datasource": "cloudwatch",
                    "metricSpec": metricSpec
                }

                metricId = adapter.monitorMetric(modelSpec)

                with engine.connect() as conn:
                    repository.setMetricStatus(conn, metricId,
                                               MetricStatus.ACTIVE)

                metricInstances.append(metricId)

            _LOGGER.info("Waiting for results from models...")

            seenMetricIDs = set()
            allMetricIDs = set(metricInstances)

            # Register a timeout so we won't deadlock the test
            def onTimeout(resultsQueueName):
                _LOGGER.error(
                    "Timed out waiting to get results from models; numResults=%d; "
                    "expected=%d", len(seenMetricIDs), len(allMetricIDs))

                # HACK delete model swapper results queue to abort the consumer
                try:
                    with MessageBusConnector() as bus:
                        bus.deleteMessageQueue(resultsQueueName)
                except Exception:
                    _LOGGER.exception("Failed to delete results mq=%s",
                                      resultsQueueName)
                    raise

            with ModelSwapperInterface() as modelSwapper:
                with modelSwapper.consumeResults() as consumer:
                    timer = threading.Timer(
                        120, onTimeout, args=[modelSwapper._resultsQueueName])
                    timer.start()
                    try:
                        for batch in consumer:
                            seenMetricIDs.add(batch.modelID)
                            batch.ack()
                            if seenMetricIDs == allMetricIDs:
                                break
                        else:
                            self.fail(
                                "Expected %d results, but got only %d: %s" % (
                                    len(allMetricIDs),
                                    len(seenMetricIDs),
                                    seenMetricIDs,
                                ))
                        _LOGGER.info("Got %d results from models",
                                     len(seenMetricIDs))
                    finally:
                        timer.cancel()

            # Terminate metric_collector subprocess gracefully to avoid too much
            # error logging junk on the terminal
            metricCollectorSubprocess.send_signal(signal.SIGINT)

            # Terminate model scheduler subprocess gracefully to avoid too much
            # error logging junk on the terminal
            modelSchedulerSubprocess.send_signal(signal.SIGINT)
  def run(self):
    """ Collect metric data and status for active metrics
    """
    # NOTE: the process pool must be created BEFORE this main (parent) process
    # creates any global or class-level shared resources (e.g., boto
    # connection) that would have undesirable consequences when
    # replicated into and used by forked child processes (e.g., the same MySQL
    # connection socket file descriptor used by multiple processes). And we
    # can't take advantage of the process Pool's maxtasksperchild feature
    # either (for the same reason)
    self._log.info("Starting grok Metric Collector")
    resultsQueue = multiprocessing.Manager().JoinableQueue()

    recvPipe, sendPipe = multiprocessing.Pipe(False)

    processPool = multiprocessing.Pool(
      processes=self._WORKER_PROCESS_POOL_SIZE,
      maxtasksperchild=None)

    try:
      with ModelSwapperInterface() as modelSwapper:
        engine = repository.engineFactory()
        while True:
          startTime = time.time()

          if startTime > self._nextCacheGarbageCollectionTime:
            # TODO: unit-test
            self._garbageCollectInfoCache()

          # Determine which metrics are due for an update
          metricsToUpdate = self._getCandidateMetrics(engine)

          filterDuration = time.time() - startTime

          if not metricsToUpdate:
            time.sleep(self._NO_PENDING_METRICS_SLEEP_SEC)
            continue

          # Collect metric data
          collectionStartTime = time.time()

          poolResults = self._collectDataForMetrics(metricsToUpdate,
                                                    processPool,
                                                    resultsQueue)

          # Process/dispatch results in parallel in another thread as results
          # become available in resultsQueue
          dispatchThread = (
            threading.Thread(target=self._processAndDispatchThreadTarget,
                             args=(engine,
                                   metricsToUpdate,
                                   resultsQueue,
                                   modelSwapper,
                                   sendPipe)))
          dispatchStartTime = time.time()
          dispatchThread.start()

          # Synchronize with processPool
          poolResults.wait() # Wait for collection tasks to complete

          metricPollDuration = time.time() - collectionStartTime

          resultsQueue.join() # Block until all tasks completed...

          # Synchronize with dispatchThread
          resultsQueue.put(self._SENTINEL) # Signal to dispatchThread that
                                           # there are no more results to
                                           # process.
          resultsQueue.join()
          numEmpty, numErrors = recvPipe.recv() # Get dispatchThread stats

          dispatchDuration = time.time() - dispatchStartTime

          self._log.info(
            "Processed numMetrics=%d; numEmpty=%d; numErrors=%d; "
            "duration=%.4fs (filter=%.4fs; query=%.4fs; dispatch=%.4fs)",
            len(metricsToUpdate), numEmpty, numErrors,
            time.time() - startTime, filterDuration,
            metricPollDuration, dispatchDuration)
    finally:
      self._log.info("Exiting Metric Collector run-loop")
      processPool.terminate()
      processPool.join()
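
# A minimal sketch of the queue/sentinel handshake used in run() above (the
# worker is hypothetical; a plain thread stands in for the dispatch thread).
# Every get() is paired with task_done(), so join() blocks until all real
# items are processed; the sentinel then tells the consumer to exit.
import multiprocessing
import threading

_SENTINEL = None

def _dispatch(q):
    while True:
        item = q.get()
        try:
            if item is _SENTINEL:
                break
            # ... process a real result here ...
        finally:
            q.task_done()

q = multiprocessing.Manager().JoinableQueue()
t = threading.Thread(target=_dispatch, args=(q,))
t.start()
for item in xrange(3):
    q.put(item)
q.join()          # all real items processed
q.put(_SENTINEL)  # then signal end-of-stream
q.join()          # sentinel consumed
t.join()
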
    def testCloneModel(self):

        modelSchedulerSubprocess = self._startModelSchedulerSubprocess()
        self.addCleanup(lambda: modelSchedulerSubprocess.kill() if
                        modelSchedulerSubprocess.returncode is None else None)

        modelID = "abc"
        destModelID = "def"

        resultBatches = []

        with ModelSwapperInterface() as swapperAPI:
            args = getScalarMetricWithTimeOfDayAnomalyParams(metricData=[0],
                                                             minVal=0,
                                                             maxVal=1000)

            # Submit requests including a model creation command and two data rows.
            args["inputRecordSchema"] = (
                FieldMetaInfo("c0", FieldMetaType.datetime,
                              FieldMetaSpecial.timestamp),
                FieldMetaInfo("c1", FieldMetaType.float,
                              FieldMetaSpecial.none),
            )

            # Define the model
            _LOGGER.info("Defining the model")
            swapperAPI.defineModel(modelID=modelID,
                                   args=args,
                                   commandID="defineModelCmd1")

            resultBatches.extend(self._consumeResults(1, timeout=20))
            self.assertEqual(len(resultBatches), 1)

            # Clone the just-defined model
            _LOGGER.info("Cloning model")
            swapperAPI.cloneModel(modelID,
                                  destModelID,
                                  commandID="cloneModelCmd1")

            resultBatches.extend(self._consumeResults(1, timeout=20))
            self.assertEqual(len(resultBatches), 2)

            # Send input rows to the clone
            inputRows = [
                ModelInputRow(
                    rowID="rowfoo",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 00), 5.3]),
                ModelInputRow(
                    rowID="rowbar",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 15), 2.4]),
            ]
            _LOGGER.info("Submitting batch of %d input rows...",
                         len(inputRows))
            swapperAPI.submitRequests(modelID=destModelID, requests=inputRows)

            _LOGGER.info("These models have pending input: %s",
                         swapperAPI.getModelsWithInputPending())

            resultBatches.extend(self._consumeResults(1, timeout=20))
            self.assertEqual(len(resultBatches), 3)

            with MessageBusConnector() as bus:
                # The results message queue should be empty now
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

            # Delete the model
            _LOGGER.info("Deleting the model")
            swapperAPI.deleteModel(modelID=destModelID,
                                   commandID="deleteModelCmd1")

            _LOGGER.info("Waiting for model deletion result")
            resultBatches.extend(self._consumeResults(1, timeout=20))

            self.assertEqual(len(resultBatches), 4)

            with MessageBusConnector() as bus:
                # The results message queue should be empty now
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

                # The model input queue should be deleted now
                self.assertFalse(
                    bus.isMessageQeueuePresent(
                        swapperAPI._getModelInputQName(modelID=destModelID)))

        # Verify results

        # First result batch should be the defineModel result
        batch = resultBatches[0]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "defineModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "defineModelCmd1")

        # The second result batch should be for the cloneModel result
        batch = resultBatches[1]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "cloneModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "cloneModelCmd1")

        # The third batch should be for the two input rows
        batch = resultBatches[2]
        self.assertEqual(batch.modelID, destModelID)
        self.assertEqual(len(batch.objects), len(inputRows))

        for inputRow, result in zip(inputRows, batch.objects):
            self.assertIsInstance(result, ModelInferenceResult)
            self.assertEqual(result.status, htmengineerrno.SUCCESS)
            self.assertEqual(result.rowID, inputRow.rowID)
            self.assertIsInstance(result.anomalyScore, float)
            self.assertIsInstance(result.multiStepBestPredictions, dict)

        # The fourth batch should be for the "deleteModel"
        batch = resultBatches[3]
        self.assertEqual(batch.modelID, destModelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "deleteModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "deleteModelCmd1")

        # Signal Model Scheduler Service subprocess to shut down and wait for it
        waitResult = dict()

        def runWaiterThread():
            try:
                waitResult["returnCode"] = modelSchedulerSubprocess.wait()
            except:
                _LOGGER.exception(
                    "Waiting for modelSchedulerSubprocess failed")
                waitResult["exceptionInfo"] = traceback.format_exc()
                raise
            return

        modelSchedulerSubprocess.terminate()
        waiterThread = threading.Thread(target=runWaiterThread)
        waiterThread.setDaemon(True)
        waiterThread.start()
        waiterThread.join(timeout=30)
        self.assertFalse(waiterThread.isAlive())

        self.assertEqual(waitResult["returnCode"], 0, msg=repr(waitResult))
def deleteHTMModel(modelId):
    """ Dispatch command to delete the HTM model for the given metric row ID """
    with ModelSwapperInterface() as modelSwapper:
        modelSwapper.deleteModel(modelID=modelId, commandID=createGuid())
    def _auxTestRunModelWithFullThenIncrementalCheckpoints(
            self, classifierEnabled):
        modelID = "foobar"
        checkpointMgr = model_checkpoint_mgr.ModelCheckpointMgr()

        args = getScalarMetricWithTimeOfDayAnomalyParams(metricData=[0],
                                                         minVal=0,
                                                         maxVal=1000)

        args["modelConfig"]["modelParams"]["clEnable"] = classifierEnabled

        # Submit requests including a model creation command and two data rows.
        args["inputRecordSchema"] = (
            FieldMetaInfo("c0", FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo("c1", FieldMetaType.float, FieldMetaSpecial.none),
        )

        with ModelSwapperInterface() as swapperAPI:
            # Define the model
            _LOGGER.info("Defining the model")
            swapperAPI.defineModel(modelID=modelID,
                                   args=args,
                                   commandID="defineModelCmd1")
            # Send input rows to the model
            inputRows = [
                ModelInputRow(
                    rowID="rowfoo",
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 00), 5.3]),
                ModelInputRow(
                    rowID="rowbar",
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 15), 2.4]),
            ]
            _LOGGER.info(
                "Submitting batch of %d input rows with ids=[%s..%s]...",
                len(inputRows), inputRows[0].rowID, inputRows[-1].rowID)
            swapperAPI.submitRequests(modelID=modelID, requests=inputRows)
            # Run model_runner and collect results
            with self._startModelRunnerSubprocess(
                    modelID) as modelRunnerProcess:
                resultBatches = self._consumeResults(numExpectedBatches=2,
                                                     timeout=15)
                self._waitForProcessToStopAndCheck(modelRunnerProcess)
            with MessageBusConnector() as bus:
                # The results message queue should be empty now
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))
            self.assertEqual(len(resultBatches), 2, repr(resultBatches))
            # First result batch should be the first defineModel result
            batch = resultBatches[0]
            self.assertEqual(batch.modelID, modelID)
            self.assertEqual(len(batch.objects), 1)
            result = batch.objects[0]
            self.assertIsInstance(result, ModelCommandResult)
            self.assertEqual(result.method, "defineModel")
            self.assertEqual(result.status, htmengineerrno.SUCCESS)
            self.assertEqual(result.commandID, "defineModelCmd1")
            # The second result batch should be for the two input rows
            batch = resultBatches[1]
            self.assertEqual(batch.modelID, modelID)
            self.assertEqual(len(batch.objects), len(inputRows))
            for inputRow, result in zip(inputRows, batch.objects):
                self.assertIsInstance(result, ModelInferenceResult)
                self.assertEqual(result.status, htmengineerrno.SUCCESS)
                self.assertEqual(result.rowID, inputRow.rowID)
                self.assertIsInstance(result.anomalyScore, float)
                if classifierEnabled:
                    self.assertIsInstance(result.multiStepBestPredictions,
                                          dict)
                else:
                    self.assertIsNone(result.multiStepBestPredictions)

            # Verify model checkpoint
            model = checkpointMgr.load(modelID)
            del model
            attrs = checkpointMgr.loadCheckpointAttributes(modelID)
            self.assertIn(
                model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME,
                attrs,
                msg=repr(attrs))
            self.assertEqual(len(attrs[
                model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME]),
                             2,
                             msg=repr(attrs))
            self.assertNotIn(model_runner._ModelArchiver.
                             _INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME,
                             attrs,
                             msg=repr(attrs))
            # Now, check incremental checkpointing
            inputRows2 = [
                ModelInputRow(
                    rowID=2,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 20), 2.7]),
                ModelInputRow(
                    rowID=3,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 25), 3.9]),
            ]
            _LOGGER.info(
                "Submitting batch of %d input rows with ids=[%s..%s]...",
                len(inputRows2), inputRows2[0].rowID, inputRows2[-1].rowID)
            inputBatchID = swapperAPI.submitRequests(modelID=modelID,
                                                     requests=inputRows2)
            with self._startModelRunnerSubprocess(
                    modelID) as modelRunnerProcess:
                resultBatches = self._consumeResults(numExpectedBatches=1,
                                                     timeout=15)
                self._waitForProcessToStopAndCheck(modelRunnerProcess)
            with MessageBusConnector() as bus:
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))
            batch = resultBatches[0]
            self.assertEqual(batch.modelID, modelID)
            self.assertEqual(len(batch.objects), len(inputRows2))
            for inputRow, result in zip(inputRows2, batch.objects):
                self.assertIsInstance(result, ModelInferenceResult)
                self.assertEqual(result.status, htmengineerrno.SUCCESS)
                self.assertEqual(result.rowID, inputRow.rowID)
                self.assertIsInstance(result.anomalyScore, float)
                if classifierEnabled:
                    self.assertIsInstance(result.multiStepBestPredictions,
                                          dict)
                else:
                    self.assertIsNone(result.multiStepBestPredictions)

            model = checkpointMgr.load(modelID)
            del model
            attrs = checkpointMgr.loadCheckpointAttributes(modelID)
            self.assertIn(
                model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME,
                attrs,
                msg=repr(attrs))
            self.assertSequenceEqual(attrs[
                model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME],
                                     [inputBatchID],
                                     msg=repr(attrs))
            self.assertIn(model_runner._ModelArchiver.
                          _INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME,
                          attrs,
                          msg=repr(attrs))
            self.assertSequenceEqual(
                model_runner._ModelArchiver._decodeDataSamples(
                    attrs[model_runner._ModelArchiver.
                          _INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME]),
                [row.data for row in inputRows2],
                msg=repr(attrs))
            # Final run with incremental checkpointing
            inputRows3 = [
                ModelInputRow(
                    rowID=4,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 30), 4.7]),
                ModelInputRow(
                    rowID=5,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 35), 5.9]),
            ]
            _LOGGER.info(
                "Submitting batch of %d input rows with ids=[%s..%s]...",
                len(inputRows3), inputRows3[0].rowID, inputRows3[-1].rowID)
            inputBatchID = swapperAPI.submitRequests(modelID=modelID,
                                                     requests=inputRows3)
            with self._startModelRunnerSubprocess(
                    modelID) as modelRunnerProcess:
                resultBatches = self._consumeResults(numExpectedBatches=1,
                                                     timeout=15)
                self._waitForProcessToStopAndCheck(modelRunnerProcess)
            with MessageBusConnector() as bus:
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))
            batch = resultBatches[0]
            self.assertEqual(batch.modelID, modelID)
            self.assertEqual(len(batch.objects), len(inputRows3))
            for inputRow, result in zip(inputRows3, batch.objects):
                self.assertIsInstance(result, ModelInferenceResult)
                self.assertEqual(result.status, htmengineerrno.SUCCESS)
                self.assertEqual(result.rowID, inputRow.rowID)
                self.assertIsInstance(result.anomalyScore, float)
                if classifierEnabled:
                    self.assertIsInstance(result.multiStepBestPredictions,
                                          dict)
                else:
                    self.assertIsNone(result.multiStepBestPredictions)

            model = checkpointMgr.load(modelID)
            del model
            attrs = checkpointMgr.loadCheckpointAttributes(modelID)
            self.assertIn(
                model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME,
                attrs,
                msg=repr(attrs))
            self.assertSequenceEqual(attrs[
                model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME],
                                     [inputBatchID],
                                     msg=repr(attrs))
            self.assertIn(model_runner._ModelArchiver.
                          _INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME,
                          attrs,
                          msg=repr(attrs))
            self.assertSequenceEqual(
                model_runner._ModelArchiver._decodeDataSamples(
                    attrs[model_runner._ModelArchiver.
                          _INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME]),
                [row.data for row in itertools.chain(inputRows2, inputRows3)],
                msg=repr(attrs))
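            # Recap (illustrative comment, not part of the original test): after
            # each run, the batch-IDs attribute holds just that run's batch IDs
            # (used to detect redelivered batches), while the incremental
            # attribute keeps accumulating the raw input samples received since
            # the last full checkpoint (here inputRows2 + inputRows3),
            # presumably so they can be replayed when the model is next loaded.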
            # Delete the model
            _LOGGER.info("Deleting the model=%s", modelID)
            swapperAPI.deleteModel(modelID=modelID,
                                   commandID="deleteModelCmd1")
            with self._startModelRunnerSubprocess(
                    modelID) as modelRunnerProcess:
                resultBatches = self._consumeResults(numExpectedBatches=1,
                                                     timeout=15)
                self._waitForProcessToStopAndCheck(modelRunnerProcess)
            self.assertEqual(len(resultBatches), 1, repr(resultBatches))
            # The sole result batch should be the deleteModel command result
            batch = resultBatches[0]
            self.assertEqual(batch.modelID, modelID)
            self.assertEqual(len(batch.objects), 1)
            result = batch.objects[0]
            self.assertIsInstance(result, ModelCommandResult)
            self.assertEqual(result.method, "deleteModel")
            self.assertEqual(result.status, htmengineerrno.SUCCESS)
            self.assertEqual(result.commandID, "deleteModelCmd1")
            with MessageBusConnector() as bus:
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

                # The model input queue should be deleted now
                self.assertFalse(
                    bus.isMessageQeueuePresent(
                        swapperAPI._getModelInputQName(modelID=modelID)))

            # The model checkpoint should be gone too
            with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
                checkpointMgr.load(modelID)
            with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
                checkpointMgr.loadModelDefinition(modelID)
            with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
                checkpointMgr.loadCheckpointAttributes(modelID)
            with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
                checkpointMgr.remove(modelID)
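
For orientation, here is a minimal, hypothetical driver for the ModelRunner
class listed below. Only the context-manager protocol and run() are taken from
the actual code; the function name and how modelID arrives are assumptions:

def main(modelID):
    # Hedged sketch of a single model-runner process entry point; the real
    # service's wiring may differ
    with ModelRunner(modelID=modelID) as runner:
        # run() returns when input is exhausted, the model is deleted, or
        # SwapController requests preemption by closing our stdin peer
        runner.run()
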
class ModelRunner(object):

    # How many exception traceback tail characters to include in error results
    _MAX_TRACEBACK_TAIL = 400

    def __init__(self, modelID):
        """
    :param modelID: model ID; string
    """
        self._logger = _getLogger()

        self._modelID = modelID

        self._swapperAPI = ModelSwapperInterface()

        self._archiver = _ModelArchiver(self._modelID)

        # "deleteModel" command handler sets this flag to force our processing
        # loop to terminate
        self._done = False

        modelSwapperConfig = ModelSwapperConfig()

        self._targetMaxRequestsPerCheckpoint = modelSwapperConfig.getint(
            "model_runner", "target_requests_per_checkpoint")

        self._profiling = (modelSwapperConfig.getboolean(
            "debugging", "profiling")
                           or self._logger.isEnabledFor(logging.DEBUG))

        if self._profiling:
            self._logger.info("Profiling is turned on")

            self._modelLoadSec = 0

    @property
    def _model(self):
        """ An OPF Model object or None if not loaded yet """
        return self._archiver.model

    @property
    def _inputRowEncoder(self):
        """ An _InputRowEncoder object or None if model not loaded yet """
        return self._archiver.inputRowEncoder

    @property
    def _checkpointMgr(self):
        return self._archiver.checkpointMgr

    def __repr__(self):
        return "%s<modelID=%s>" % (self.__class__.__name__, self._modelID)

    def __enter__(self):
        """ Context Manager protocol method. Allows a ModelRunner instance
    to be used in a "with" statement for automatic clean-up

    Parameters:
    ------------------------------------------------------------------------
    retval:     self.
    """
        return self

    def __exit__(self, _excType, _excVal, _excTb):
        """ Context Manager protocol method. Allows a ModelRunner instance
    to be used in a "with" statement for automatic cleanup

    Returns: False so as not to suppress the exception, if any
    """
        self.close()
        return False

    def close(self):
        """ Clean up """
        self._logger.debug("%r: Closing...", self)
        self._swapperAPI.close()

    @logExceptions(_getLogger())
    def run(self):
        startTime = time.time()

        self._logger.debug("%r: Starting run", self)

        totalBatches = 0
        totalRequests = 0
        totalDupBatches = 0

        # Retrieve the IDs of the batches that went into the current checkpoint
        modelCheckpointBatchIDSet = self._archiver.modelCheckpointBatchIDSet

        try:
            while not self._done:
                currentRunBatchIDSet = set()
                currentRunInputSamples = []
                currentRunNumRequests = 0
                lastRequestBatch = None

                with self._swapperAPI.consumeRequests(
                        modelID=self._modelID, blocking=False) as consumer:

                    if self._profiling:
                        batchStartTime = time.time()

                    # Process the next run of batches until
                    # self._targetMaxRequestsPerCheckpoint is reached or exceeded
                    for candidateBatch in consumer:
                        if (candidateBatch.batchID in currentRunBatchIDSet
                                or candidateBatch.batchID
                                in modelCheckpointBatchIDSet):
                            self._logger.warn(
                                "%r: Already processed this batch=%s, skipping it (we must "
                                "have lost channel/connection before or during ack)",
                                self, candidateBatch.batchID)

                            # Make it go away for good
                            candidateBatch.ack()
                            totalDupBatches += 1
                            continue

                        # NOTE: lastRequestBatch is used after this loop to ack this and
                        # preceding batches
                        lastRequestBatch = candidateBatch

                        currentRunBatchIDSet.add(lastRequestBatch.batchID)

                        totalBatches += 1

                        inputObjects = lastRequestBatch.objects
                        numItems = len(inputObjects)
                        currentRunNumRequests += numItems
                        totalRequests += numItems

                        self._logger.debug(
                            "%r: Processing input batch #%s; batch=%s, numItems=%s...",
                            self, totalBatches, lastRequestBatch.batchID,
                            numItems)

                        if self._profiling:
                            procStartTime = time.time()
                            self._modelLoadSec = 0

                        # Process the input batch
                        results = self._processInputBatch(
                            inputObjects, currentRunInputSamples)

                        # Send results
                        if self._profiling:
                            submitStartTime = time.time()

                        self._swapperAPI.submitResults(modelID=self._modelID,
                                                       results=results)

                        if self._profiling:
                            now = time.time()

                            tailRowTimestampISO = tailRowID = None
                            # Assumption: no empty batches
                            if isinstance(inputObjects[-1], ModelInputRow):
                                # Assumption: entire batch consists of ModelInputRow objects
                                tailRowTimestampISO = inputObjects[-1].data[
                                    0].isoformat() + "Z"
                                tailRowID = inputObjects[-1].rowID

                            self._logger.info(
                                "{TAG:SWAP.MR.BATCH.DONE} model=%s; batch=%s; numItems=%s; "
                                "tailRowID=%s; tailRowTS=%s; duration=%.4fs; "
                                "loadDuration=%.4fs; procDuration=%.4fs; submitDuration=%.4fs; "
                                "totalBatches=%s; totalItems=%s",
                                self._modelID, lastRequestBatch.batchID,
                                len(results), tailRowID, tailRowTimestampISO,
                                now - batchStartTime, self._modelLoadSec,
                                submitStartTime - procStartTime -
                                self._modelLoadSec, now - submitStartTime,
                                totalBatches, totalRequests)

                        if self._done:
                            self._logger.debug(
                                "%r: command handler requested exit, leaving "
                                "consumer loop", self)
                            break

                        if currentRunNumRequests >= self._targetMaxRequestsPerCheckpoint:
                            self._logger.debug(
                                "End of current run: currentRunNumRequests=%s",
                                currentRunNumRequests)
                            break
                    else:  # for
                        self._done = True
                        self._logger.debug(
                            "%r: No more input batches, leaving consumer loop",
                            self)

                    # Checkpoint the model and ack all request batches processed in this
                    # run
                    if lastRequestBatch is not None:
                        assert len(currentRunBatchIDSet) > 0

                        modelCheckpointBatchIDSet = currentRunBatchIDSet

                        # Checkpoint the model.
                        if self._model is not None:

                            if self._profiling:
                                checkpointStartTime = time.time()

                            self._archiver.saveModel(
                                currentRunBatchIDSet=currentRunBatchIDSet,
                                currentRunInputSamples=currentRunInputSamples)

                            if self._profiling:
                                self._logger.info(
                                    "%r: {TAG:SWAP.MR.CHKPT.DONE} currentRunNumRequests=%s; "
                                    "currentRunNumBatches=%s; duration=%.4fs",
                                    self, currentRunNumRequests,
                                    len(currentRunBatchIDSet),
                                    time.time() - checkpointStartTime)

                        # Ack the last request batch and all unacked batches
                        # before it that were consumed during this run
                        lastRequestBatch.ack(multiple=True)

                    if not self._done:
                        # Check if SwapController wants to preempt us (it closes the other
                        # end of our stdin to signal the intention)
                        readReadyList = select.select((sys.stdin, ), (), (),
                                                      0)[0]
                        if readReadyList:
                            self._logger.debug(
                                "%r: SwapController wants to preempt us, "
                                "leaving", self)
                            self._done = True
        finally:
            if totalBatches == 0:
                self._logger.warn("%r: zero input batches were processed",
                                  self)

            self._logger.info(
                "%r: {TAG:SWAP.MR.FINAL.SUMMARY} totalBatches=%s; totalRequests=%s; "
                "totalDupBatches=%s; duration=%s", self, totalBatches,
                totalRequests, totalDupBatches,
                time.time() - startTime)

    def _processInputBatch(self, inputObjects, currentRunInputSamples):
        """ Process a batch of model commands and/or inference input data rows

    :param inputObjects: a sequence of ModelCommand or ModelInputRow objects

    :param currentRunInputSamples: a list; the data of input rows will be
      appended to this list for input rows that were processed successfully

    :returns: a sequence of the corresponding ModelCommandResult or
      ModelInferenceResult objects
    TODO: unit-test return value
    """
        results = []

        for request in inputObjects:
            if isinstance(request, ModelCommand):
                results.append(self._processModelCommand(request))

            elif isinstance(request, ModelInputRow):
                results.append(
                    self._processInputRow(request, currentRunInputSamples))

            else:
                raise ValueError("Unexpected request: " + repr(request))

        return results

    def _processModelCommand(self, command):
        """
    command: ModelCommand instance

    Returns: a ModelCommandResult instance
    """
        self._logger.info("%r: Processing model command: %r", self, command)
        try:
            if command.method == "defineModel":
                return self._defineModel(command)
            elif command.method == "deleteModel":
                return self._deleteModel(command)
            elif command.method == "cloneModel":
                return self._cloneModel(command)
            else:
                raise _ModelRunnerError(errno=htmengineerrno.ERR_INVALID_ARG,
                                        msg="Unknown command method=%s" %
                                        (command.method, ))

        except (Exception, _ModelRunnerError) as e:  # pylint: disable=W0703
            self._logger.exception("%r: %r failed;", self, command)
            return ModelCommandResult(
                commandID=command.commandID,
                method=command.method,
                status=(e.errno if isinstance(e, _ModelRunnerError) else
                        htmengineerrno.ERR),
                errorMessage="%s - %r failed on modelID=%s: %s. (tb: ...%s)" %
                (datetime.utcnow().isoformat(), command, self._modelID, str(e)
                 or repr(e),
                 traceback.format_exc()[-self._MAX_TRACEBACK_TAIL:]))

    def _defineModel(self, command):
        """ Handle the "defineModel" command

    command: ModelCommand instance for the "defineModel" command

    retval: ModelCommandResult instance

    raises _ModelRunnerError
    """
        newModelDefinition = dict(
            modelParams=dict(modelConfig=command.args["modelConfig"],
                             inferenceArgs=command.args["inferenceArgs"]),
            inputSchema=command.args["inputRecordSchema"])

        try:
            self._checkpointMgr.define(modelID=self._modelID,
                                       definition=newModelDefinition)
            assert self._model is None
        except model_checkpoint_mgr.ModelAlreadyExists:
            # Check if the existing model has the same definition
            existingModelDefinition = self._checkpointMgr.loadModelDefinition(
                self._modelID)

            if newModelDefinition != existingModelDefinition:
                self._logger.error(
                    "defineModel: modelID=%s already exists with different "
                    "creation meta data; existing=%r; new=%r", self._modelID,
                    existingModelDefinition, newModelDefinition)

                raise _ModelRunnerError(
                    errno=htmengineerrno.ERR_MODEL_ALREADY_EXISTS,
                    msg="defineModel: modelID=%s already exists with different "
                    "creation meta data" % (self._modelID, ))
            else:
                # newModelDefinition is the same as existingModelDefinition; might be
                # side-effect of at-least-once delivery guarantee of the reliable data
                # path, so treat it as success
                self._logger.warn(
                    "defineModel: modelID=%s already exists with same creation meta "
                    "data (side-effect of at-least-once delivery guarantee?)",
                    self._modelID)

        return ModelCommandResult(commandID=command.commandID,
                                  method=command.method,
                                  status=0)

    def _cloneModel(self, command):
        """ Handle the "cloneModel" command

    command: ModelCommand instance for the "cloneModel" command

    retval: ModelCommandResult instance
    """
        try:
            self._checkpointMgr.clone(self._modelID, command.args["modelID"])
        except model_checkpoint_mgr.ModelNotFound as e:
            errorMessage = "%r: Cloning failed - source model not found (%r)" % (
                self,
                e,
            )
            self._logger.error(errorMessage)
            return ModelCommandResult(commandID=command.commandID,
                                      method=command.method,
                                      status=htmengineerrno.ERR_NO_SUCH_MODEL,
                                      errorMessage=errorMessage)
        except model_checkpoint_mgr.ModelAlreadyExists as e:
            # Assuming it's the result of at-least-once delivery semantics
            self._logger.warn(
                "%r: Cloning destination already exists, suppressing (%r)",
                self, e)

        return ModelCommandResult(commandID=command.commandID,
                                  method=command.method,
                                  status=0)

    def _deleteModel(self, command):
        """ Handle the "deleteModel" command

    command: ModelCommand instance for the "deleteModel" command

    retval: ModelCommandResult instance

    raises _ModelRunnerError
    """
        try:
            # Delete the model's checkpoint
            try:
                self._checkpointMgr.remove(modelID=self._modelID)
            except model_checkpoint_mgr.ModelNotFound:
                # Suppress it: could be side-effect of at-least-once delivery
                # guarantee, so treat it as success
                self._logger.warn(
                    "deleteModel: modelID=%s not found (side-effect "
                    "of at-least-once delivery guarantee?); "
                    "suppressing error", self._modelID)
            # Clean up model's resources in ModelSwapperInterface
            self._swapperAPI.cleanUpAfterModelDeletion(self._modelID)
        finally:
            self._archiver = _ModelArchiver(self._modelID)
            self._done = True

        return ModelCommandResult(commandID=command.commandID,
                                  method=command.method,
                                  status=0)

    def _processInputRow(self, row, currentRunInputSamples):
        """
    :param row: ModelInputRow instance

    :param currentRunInputSamples: a list; the input row's data will be appended
      to this list if the row is processed successfully

    :returns: a ModelInferenceResult instance
    """
        try:
            if self._model is None:
                self._loadModel()

            # Convert a flat input row into a format that is consumable by an OPF
            # model
            self._inputRowEncoder.appendRecord(row.data)
            inputRecord = self._inputRowEncoder.getNextRecordDict()

            # Infer
            r = self._model.run(inputRecord)

            currentRunInputSamples.append(row.data)

            return ModelInferenceResult(
                rowID=row.rowID,
                status=0,
                anomalyScore=r.inferences["anomalyScore"],
                multiStepBestPredictions=r.inferences.get(
                    "multiStepBestPredictions"))

        except (Exception, _ModelRunnerError) as e:  # pylint: disable=W0703
            self._logger.exception("%r: Inference failed for row=%r", self,
                                   row)
            return ModelInferenceResult(
                rowID=row.rowID,
                status=(e.errno if isinstance(e, _ModelRunnerError) else
                        htmengineerrno.ERR),
                errorMessage=
                "%s - Inference failed for rowID=%s of modelID=%s (%r): "
                "(tb: ...%s)" %
                (datetime.utcnow().isoformat(), row.rowID, self._modelID, e,
                 traceback.format_exc()[-self._MAX_TRACEBACK_TAIL:]))

    def _loadModel(self):
        """ Load the model and construct the input row encoder

    Side-effect: self._model and self._inputRowEncoder are loaded;
      self._modelLoadSec is set if profiling is turned on
    """
        if self._model is None:
            if self._profiling:
                startTime = time.time()

            # Load the model
            self._archiver.loadModel()

            if self._profiling:
                self._modelLoadSec = time.time() - startTime
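
The dedup-and-ack discipline in run() above deserves isolating: under
at-least-once delivery a batch may be redelivered, but it must enter the model
only once. A self-contained sketch of that invariant (simplified names, not
the real API):

class _Batch(object):
    # Stand-in for a consumed request batch
    def __init__(self, batchID):
        self.batchID = batchID

    def ack(self, multiple=False):
        pass  # acknowledgment to the message bus elided

def processRun(batches, checkpointedBatchIDs, checkpoint):
    """batches: iterable of _Batch; checkpoint: callable taking a set of batch
    IDs; returns the batch IDs absorbed during this run"""
    runBatchIDs = set()
    lastBatch = None
    for batch in batches:
        if (batch.batchID in runBatchIDs or
                batch.batchID in checkpointedBatchIDs):
            batch.ack()  # redelivered duplicate (lost ack); discard for good
            continue
        runBatchIDs.add(batch.batchID)
        lastBatch = batch
        # ... feed the batch to the model and submit results here ...
    if lastBatch is not None:
        checkpoint(runBatchIDs)       # 1) persist which batches were absorbed
        lastBatch.ack(multiple=True)  # 2) only then ack; a crash between 1)
                                      #    and 2) redelivers batches that the
                                      #    checkpointed ID set filters out
    return runBatchIDs
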
Example #22
    def run(self):
        """
    Consumes pending results. Once a result batch arrives, it is dispatched
    to the appropriate model command or inference result handler.

    :see: `_processModelCommandResult` and `_processModelInferenceResults`
    """
        # Properties for publishing model command results on RabbitMQ exchange
        modelCommandResultProperties = MessageProperties(
            deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE,
            headers=dict(dataType="model-cmd-result"))

        # Properties for publishing model inference results on RabbitMQ exchange
        modelInferenceResultProperties = MessageProperties(
            deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE)

        # Declare an exchange for forwarding our results
        with amqp.synchronous_amqp_client.SynchronousAmqpClient(
                amqp.connection.getRabbitmqConnectionParameters()) as amqpClient:
            amqpClient.declareExchange(self._modelResultsExchange,
                                       exchangeType="fanout",
                                       durable=True)

        with ModelSwapperInterface() as modelSwapper, \
                MessageBusConnector() as bus:
            with modelSwapper.consumeResults() as consumer:
                for batch in consumer:
                    if self._profiling:
                        batchStartTime = time.time()

                    inferenceResults = []
                    for result in batch.objects:
                        try:
                            if isinstance(result, ModelCommandResult):
                                self._processModelCommandResult(
                                    batch.modelID, result)
                                # Construct model command result message for consumption by
                                # downstream processes
                                try:
                                    cmdResultMessage = self._composeModelCommandResultMessage(
                                        modelID=batch.modelID,
                                        cmdResult=result)
                                except (ObjectNotFoundError,
                                        MetricNotMonitoredError):
                                    pass
                                else:
                                    bus.publishExg(
                                        exchange=self._modelResultsExchange,
                                        routingKey="",
                                        body=self._serializeModelResult(
                                            cmdResultMessage),
                                        properties=modelCommandResultProperties
                                    )
                            elif isinstance(result, ModelInferenceResult):
                                inferenceResults.append(result)
                            else:
                                self._log.error("Unsupported ModelResult=%r",
                                                result)
                        except ObjectNotFoundError:
                            self._log.exception(
                                "Error processing result=%r "
                                "from model=%s", result, batch.modelID)

                    if inferenceResults:
                        result = self._processModelInferenceResults(
                            inferenceResults, metricID=batch.modelID)

                        if result is not None:
                            # Construct model results payload for consumption by
                            # downstream processes
                            metricRow, dataRows = result
                            resultsMessage = self._composeModelInferenceResultsMessage(
                                metricRow, dataRows)

                            payload = self._serializeModelResult(
                                resultsMessage)

                            bus.publishExg(
                                exchange=self._modelResultsExchange,
                                routingKey="",
                                body=payload,
                                properties=modelInferenceResultProperties)

                    batch.ack()

                    if self._profiling:
                        if inferenceResults:
                            if result is not None:
                                # pylint: disable=W0633
                                metricRow, rows = result
                                rowIdRange = ("%s..%s" %
                                              (rows[0].rowid, rows[-1].rowid)
                                              if len(rows) > 1 else str(
                                                  rows[0].rowid))
                                self._log.info(
                                    "{TAG:ANOM.BATCH.INF.DONE} model=%s; "
                                    "numItems=%d; rows=[%s]; tailRowTS=%s; duration=%.4fs; "
                                    "ds=%s; name=%s", batch.modelID,
                                    len(batch.objects), rowIdRange,
                                    rows[-1].timestamp.isoformat() + "Z",
                                    time.time() - batchStartTime,
                                    metricRow.datasource, metricRow.name)
                        else:
                            self._log.info(
                                "{TAG:ANOM.BATCH.CMD.DONE} model=%s; "
                                "numItems=%d; duration=%.4fs", batch.modelID,
                                len(batch.objects),
                                time.time() - batchStartTime)

        self._log.info("Stopped processing model results")
class SwapController(object):
    """ Orchestrate model swapping across a fixed set of SlotAgent instances based
  on incoming model commands and records.
  """

    # Event methods for the SwapController's event loop. These are handled by
    # instance methods with names constructed as follows:
    # "_handle" + method + "Event" (e.g., _handleStopEventLoopRequestEvent)
    _NEW_INPUT_NOTIFY_METHOD = "NewInputNotify"
    _MODEL_DONE_NOTIFY_METHOD = "ModelDoneNotify"
    _STOP_EVENT_LOOP_REQUEST_METHOD = "StopEventLoopRequest"
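    # For example, an event dict {"method": "NewInputNotify", "modelID": m}
    # dequeued by the event loop is dispatched as (illustrative):
    #   getattr(self, "_handleNewInputNotifyEvent")(method="NewInputNotify",
    #                                               modelID=m)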

    _NOTIFICATION_READER_THREAD_START_WAIT_TIMEOUT_SEC = 5
    _NOTIFICATION_READER_THREAD_JOIN_TIMEOUT_SEC = 10

    _EXIT_CODE_ON_FAILURE_OF_NOTIFICATION_READER_THREAD = 1

    def __init__(self, concurrency):
        """
    concurrency: allowed number of model slots
    """
        self._logger = _getLogger()

        self._profiling = (ModelSwapperConfig().getboolean(
            "debugging", "profiling")
                           or self._logger.isEnabledFor(logging.DEBUG))

        # Allowed number of model slots
        self._concurrency = concurrency

        # Input-reader thread target function sets this when it starts running to
        # let our event loop know that things are off to a good start
        self._notificationReaderStartedEvent = threading.Event()

        self._notificationMutex = threading.Lock()
        # Mutex used to guarantee that no further model input notifications
        # will be added to the main event queue once
        # self._stopNotificationReader is set

        # _runNotificationReaderThread will not process any more notifications
        # once it detects that this flag is true
        self._stopNotificationReader = False

        # The event loop will exit some time after an event handler sets this flag
        # to True
        self._eventLoopStopPending = False

        # (non-thread-safe) The tuple of all slot agents
        self._slotAgents = tuple(
            SlotAgent(slotID=i) for i in xrange(concurrency))
        assert self._slotAgents

        # Thread-safe event queue for SwapController
        self._eventQ = Queue.Queue()

        # Main event loop's ModelSwapperInterface instance. MUST NOT be used
        # from other threads because ModelSwapperInterface is not thread-safe
        self._mainSwapper = ModelSwapperInterface()

        # A (non-thread-safe) FIFO of models that are waiting to be scheduled for
        # running; there is incoming data for them that needs to be processed
        self._waitingModelsFIFO = []

        # A (non-thread-safe) map of modelIDs to _RunningModelInfo instances
        self._runningModelsMap = dict()

        # A (non-thread-safe) list of free slot indexes into the
        # self._slotAgents tuple
        self._freeSlots = list(xrange(len(self._slotAgents)))

        # (non-thread-safe) Indexes of SlotAgents pending preemption
        self._pendingPreemptSlotsSet = set()

        self._notificationReaderThread = threading.Thread(
            target=self._runNotificationReaderThread,
            name="%s-input-reader-%s" % (self.__class__.__name__, id(self)))
        # Allow process to exit even if thread is still running
        self._notificationReaderThread.setDaemon(True)

    @logEntryExit(_getLogger, logging.INFO)
    @logExceptions(_getLogger())
    def run(self):
        """ Run SwapController; blocking """
        # Start our input-reader thread
        self._logger.info("Starting Notification Reader thread")
        self._notificationReaderThread.start()
        self._notificationReaderStartedEvent.wait(
            timeout=self._NOTIFICATION_READER_THREAD_START_WAIT_TIMEOUT_SEC)
        assert self._notificationReaderStartedEvent.is_set(), \
          "Notification-reader thread failed to start in time"

        self._logger.info(
            "Notification Reader started, now entering Event Loop")

        requestedStopOfRemainingModels = False

        while True:
            if self._eventLoopStopPending:
                if not self._runningModelsMap and not self._waitingModelsFIFO:
                    # All models are idle now, so close Slot Agents and bail out
                    for sa in self._slotAgents:
                        sa.close()

                    self._logger.info(
                        "Closed all Slot Agents; leaving event loop")
                    break

                elif not self._waitingModelsFIFO and not requestedStopOfRemainingModels:
                    # Only running models remain, so request to stop them gracefully
                    assert self._runningModelsMap

                    for modelInfo in self._runningModelsMap.itervalues():
                        if modelInfo.slotIndex not in self._pendingPreemptSlotsSet:
                            self._slotAgents[modelInfo.slotIndex].stopModel()
                            self._pendingPreemptSlotsSet.add(
                                modelInfo.slotIndex)

                    requestedStopOfRemainingModels = True
                    self._logger.info(
                        "Requested stop of remaining running models")

            # Get and handle next event
            evt = self._eventQ.get()
            method = evt["method"]
            handler = getattr(self, "_handle" + method + "Event")
            handler(**evt)

    def requestStopTS(self):
        """ [thread-safe; non-blocking] Enqueue a request to stop the
    SwapController. This will cause the instance's run() method to return
    eventually
    """
        # This will prevent notification reader thread from processing any more
        # notifications. By the time we release the lock, no new notifications
        # will be placed in our event queue, so we can proceed with graceful
        # shutdown.
        with self._notificationMutex:
            self._stopNotificationReader = True

        self._logger.info("Requested stop of notification processing")

        # Asynchronously request graceful shutdown of our event loop. This allows
        # all remaining events from the notification-reader to be handled
        # prior to commencing graceful shutdown.
        self._logger.info("Requesting event-loop shutdown")
        self._eventQ.put({"method": self._STOP_EVENT_LOOP_REQUEST_METHOD})
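
    # Illustrative use of requestStopTS() (not from the original source): it is
    # safe to call from any thread, e.g. from a SIGTERM handler:
    #   def handleSIGTERM(signum, frame):
    #       swapController.requestStopTS()  # run() will return soon after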

    def _newInputNotifyTS(self, modelID):
        """ [thread-safe] Notify Model Swapper that new input data arrived for the
    given model

    :param modelID: ID of the model for which new data arrived
    """
        self._eventQ.put({
            "method": self._NEW_INPUT_NOTIFY_METHOD,
            "modelID": modelID
        })

    def _modelDoneNotifyTS(self, modelID, exitStatus):
        """ [thread-safe] Notify Model Swapper that a ModelRunner process completed.
    This method is passed as a callback to the SlotAgent that
    runs the model.

    :param modelID: model ID of model that completed execution
    :param exitStatus: the model's ModelRunner process exit status per
      os.WEXITSTATUS
    """
        self._eventQ.put({
            "method": self._MODEL_DONE_NOTIFY_METHOD,
            "modelID": modelID,
            "exitStatus": exitStatus,
            "endTime": time.time()
        })

    def _handleStopEventLoopRequestEvent(self, method):  # pylint: disable=W0613
        """ Set a flag to signal our event loop that it's time for graceful shutdown
    of the event loop. The event is enqueued by the "stop request" handler after
    it blocks further processing of notifications by the notification reader.
    """
        self._eventLoopStopPending = True
        self._logger.info("Set _eventLoopStopPending")

    def _handleNewInputNotifyEvent(
            self,
            method,  # pylint: disable=W0613
            modelID):
        """ Notification that new input was queued up for a particular model """
        runningModelInfo = self._runningModelsMap.get(modelID)
        if runningModelInfo is not None:
            # This model is already running
            runningModelInfo.updateTimestamp()

        elif modelID not in self._waitingModelsFIFO:
            # This model was not running and is not awaiting execution

            # TODO: for a large _waitingModelsFIFO, this membership check is O(n)
            #  in the number of waiting models. Consider employing a map of all
            #  models of interest with current state (waiting/pending/running/idle)
            #  reflected in the items for O(1) lookups (see the sketch below), but
            #  profile first to see if it's actually a problem!
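            #  A possible shape for that structure (illustrative sketch only;
            #  names are hypothetical):
            #
            #    from collections import deque
            #    waitingFIFO = deque()  # preserves arrival order
            #    waitingSet = set()     # O(1) membership mirror of waitingFIFO
            #    # enqueue: if mid not in waitingSet:
            #    #              waitingFIFO.append(mid); waitingSet.add(mid)
            #    # dequeue: mid = waitingFIFO.popleft(); waitingSet.remove(mid)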

            # NOTE: it's possible that the model has already processed all its input
            #  and we're handling this notification belatedly, and this may result in
            #  unnecessary start-up of its ModelRunner. We should generally be pretty
            #  well caught up with notifications, and this shouldn't be an issue.
            #  However, if necessary, we can mitigate this by checking the model's
            #  input queue *just before* starting the model.

            if self._freeSlots:
                # No models should be waiting if we have a free slot
                assert not self._waitingModelsFIFO, repr(
                    self._waitingModelsFIFO)

                # Assign the model to a free slot
                self._assignModelToFreeSlot(modelID)

            else:
                # This model needs to wait until resources become available
                self._waitingModelsFIFO.append(modelID)

                if self._profiling:
                    self._logger.info(
                        "{TAG:SWAP.SC.MODEL.WAIT} model=%s; "
                        "numWaitingModels=%s; numPendingPreemptSlots=%s",
                        modelID, len(self._waitingModelsFIFO),
                        len(self._pendingPreemptSlotsSet))

                self._requestPreemptionOfRunningSlotIfNeededAndPossible()

    def _handleModelDoneNotifyEvent(
            self,
            method,  # pylint: disable=W0613
            modelID,
            exitStatus,
            endTime):
        """ Notification that a particular model completed execution and the
    SlotAgent that ran it is now available for a new assignment.

    exitStatus: the exit status of the ModelRunner process (per os.WEXITSTATUS)
    """
        doneModelInfo = self._runningModelsMap.pop(modelID)

        if self._profiling:
            self._logger.info(
                "{TAG:SWAP.SC.MODEL.DONE} model=%s; slot=%d; exitStatus=%d; "
                "duration=%s; numRunningModels=%s; numWaitingModels=%s",
                modelID, doneModelInfo.slotIndex, exitStatus,
                endTime - doneModelInfo.startTime, len(self._runningModelsMap),
                len(self._waitingModelsFIFO))

        assert doneModelInfo.slotIndex not in self._freeSlots
        assert 0 <= doneModelInfo.slotIndex < len(self._slotAgents)
        assert len(self._freeSlots) < len(self._slotAgents)

        self._freeSlots.append(doneModelInfo.slotIndex)

        # Drop the slot from pending-preempt set in case it was scheduled for
        # preemption
        self._pendingPreemptSlotsSet.discard(doneModelInfo.slotIndex)

        # Ack SlotAgent to ready it for the next model
        self._slotAgents[doneModelInfo.slotIndex].releaseSlot()

        if self._mainSwapper.modelInputPending(modelID):
            # There is more unprocessed input data for the completed model,
            # so notify ourselves asynchronously to schedule this model
            self._newInputNotifyTS(modelID)
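            # Going through the event queue (rather than re-assigning the slot
            # directly) keeps scheduling fair: models already in
            # _waitingModelsFIFO get the freed slot first, below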

        if self._waitingModelsFIFO:
            # Start a waiting model, now that we know there is a free slot
            newModelID = self._waitingModelsFIFO.pop(0)
            self._assignModelToFreeSlot(newModelID)

            self._requestPreemptionOfRunningSlotIfNeededAndPossible()

    def _assignModelToFreeSlot(self, modelID):
        """ Assign the given model to a free slot """
        assert modelID not in self._runningModelsMap
        assert modelID not in self._waitingModelsFIFO

        freeSlotIndex = self._freeSlots.pop()
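
        # partial() pre-binds this modelID into the completion callback, so the
        # SlotAgent only needs to supply the exit status when the ModelRunner
        # process exits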

        self._slotAgents[freeSlotIndex].startModel(
            modelID=modelID,
            modelFinishedCallback=partial(self._modelDoneNotifyTS, modelID))

        self._runningModelsMap[modelID] = _RunningModelInfo(freeSlotIndex)
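
        # Invariant: every slot is either running a model or free; models in
        # _waitingModelsFIFO hold no slot. E.g., with concurrency=4 and three
        # running models, exactly one slot index remains in _freeSlots.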

        assert ((len(self._runningModelsMap) + len(self._freeSlots)) == len(
            self._slotAgents)), (len(self._runningModelsMap),
                                 len(self._freeSlots), len(self._slotAgents))

        self._logger.debug(
            "{TAG:SWAP.SC.MODEL.ASSIGN} model=%s; slot=%s; numRunningModels=%s; "
            "numFreeSlots=%s; numWaitingModels=%s; numPendingPreemptSlots=%s",
            modelID, freeSlotIndex, len(self._runningModelsMap),
            len(self._freeSlots), len(self._waitingModelsFIFO),
            len(self._pendingPreemptSlotsSet))

    def _requestPreemptionOfRunningSlotIfNeededAndPossible(self):
        """ Schedule a single slot for preemption if needed and possible """
        # There shouldn't be any free slots when we're asked to preempt
        assert not self._freeSlots, repr(self._freeSlots)
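
        # Preempt only when there are more waiting models than slots already
        # scheduled for preemption, and at least one busy slot is still
        # preemptable. E.g., with 4 slots all busy, 2 waiting models, and 2
        # slots already pending preemption, nothing more is requested here.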

        if (len(self._waitingModelsFIFO) <= len(self._pendingPreemptSlotsSet)
                or len(self._pendingPreemptSlotsSet) >= len(self._slotAgents)):
            # Not needed or no preemptable slots
            return

        # Find the least-recently-updated (LRU) busy slot agent that isn't
        # already pending preemption; only the minimum is needed, so min() is
        # used instead of a full sort
        slotIndex, timestamp = min(
            ((i.slotIndex, i.timestamp)
             for i in self._runningModelsMap.itervalues()
             if i.slotIndex not in self._pendingPreemptSlotsSet),
            key=lambda element: element[1])

        # Request preemption of the LRU slot
        self._slotAgents[slotIndex].stopModel()
        self._pendingPreemptSlotsSet.add(slotIndex)

        if self._profiling:
            self._logger.info(
                "{TAG:SWAP.SC.SLOT.PREEMPT.REQ} slot=%d with timestamp=%s; "
                "numWaitingModels=%s; numPendingPreemptSlots=%s", slotIndex,
                timestamp, len(self._waitingModelsFIFO),
                len(self._pendingPreemptSlotsSet))

    @abortProgramOnAnyException(
        _EXIT_CODE_ON_FAILURE_OF_NOTIFICATION_READER_THREAD,
        logger=_getLogger())
    @logEntryExit(_getLogger, logging.INFO)
    @logExceptions(_getLogger())
    def _runNotificationReaderThread(self):
        """ Read model data notifications and pass them to the event loop """
        self._logger.info("Notification Reader thread is running")

        # Let the main event loop know that this thread started successfully
        self._notificationReaderStartedEvent.set()

        with ModelSwapperInterface() as swapperAPI:
            # First, make sure our notification message queue exists, so we don't
            # miss any new notifications while we're checking for models with pending
            # input
            self._logger.info("SWAPPER_API: %r", swapperAPI)
            swapperAPI.initSchedulerNotification()

            # At start, notify main event loop of each model whose input is non-empty
            self._logger.info("Checking for models with pending input")

            i = 0
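            # i stays 0 if no models have pending input; enumerate(..., 1)
            # counts models starting at 1 for the summary log below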
            for i, modelID in enumerate(swapperAPI.getModelsWithInputPending(),
                                        1):
                self._logger.debug("Input pending for model=%s", modelID)
                self._newInputNotifyTS(modelID=modelID)

            self._logger.info("%s model(s) had pending input", i)

            # Service the SwapController's input queue until stop is requested
            with swapperAPI.consumeModelSchedulerNotifications() as consumer:
                numHandledNotifications = 0
                try:
                    for notification in consumer:

                        with self._notificationMutex:

                            if self._stopNotificationReader:
                                self._logger.info(
                                    "Notification reader exiting due to stop request"
                                )
                                break

                            self._newInputNotifyTS(modelID=notification.value)

                            notification.ack()

                            numHandledNotifications += 1
                    else:
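                        # This for/else fires only if the consumer iterator was
                        # exhausted without a break; the consumer is expected to
                        # block indefinitely, so exhaustion indicates a failure
                        # upstream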
                        raise Exception(
                            "Unexpected termination of consumer loop in "
                            "Notification Reader")
                finally:
                    self._logger.info(
                        "Control is leaving notification reader loop after processing %s "
                        "notifications", numHandledNotifications)