    def test_no_hang_when_group_id_cycled(self):

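        # Start the group counters near the assumed wrap-around point so the
        # test reaches the cycling path quickly: the expected track below
        # shows the group id advancing from 99999 to 10**5 and then wrapping
        # back to 0 instead of hanging.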
        def my_state():
            return State(next_group_id=99999,
                         active_commands=LinkedList(),
                         current_group_id=99999,
                         next_group_commands=set(),
                         is_shutdown=False)

        planner = WorkPlanner(init_state_factory=my_state)

        tracker = GroupTracker()
        thread_count = 1
        for _ in range(10):
            planner.add_command(process_event())

        for _ in range(thread_count):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received %s [group=%s]",
                            type(cmd), cmd.group)

                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        planner.notify_finished(cmd)
                        break
                    tracker.log(cmd)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [
            Thread(target=fn, args=(planner,)) for _ in range(thread_count)
        ]
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        if exc:
            raise exc
        groups_processed = tracker.get_tracks()
        self.assertEqual([99999, 10**5, 0, 1, 2, 3, 4,
                          5, 6, 7], groups_processed)

    def test_parallelism_is_possible(self):
        planner = WorkPlanner()
        for _ in range(40):
            planner.add_command(entrypoint())

        for _ in range(4):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received")
                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        break

                    sleep(0.5)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [Thread(target=fn, args=(planner,)) for _ in range(4)]
        time_1 = time.time()
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        time_2 = time.time()
        logging.info('Processing time %s', time_2 - time_1)
        if exc:
            raise exc
        self.assertTrue(planner.is_empty(), 'Not all commands were read out')
        # Every worker sleeps for 500 ms per command, so 40 commands * 0.5 s
        # gives 20 seconds if the commands were executed sequentially.
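        # With 4 workers running in parallel, the ideal time is roughly
        # (40 / 4) * 0.5 = 5 seconds, so the 19-second bound mostly guards
        # against fully sequential execution.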
        self.assertLess(time_2 - time_1, 19, 'Suspiciously slow')

    def test_groups_processed_sequentially_4_threads(self):
        planner = WorkPlanner()
        group_idx = 0

        def ret_values(cmd: BaseMessage) -> bool:
            nonlocal group_idx
            # We don't care about the group distribution logic in this
            # test. Instead, we concentrate on how different group numbers
            # are processed by the workers and on the order in which the
            # workers are allowed to process the messages.
            #
            # _assign_group is invoked with the lock acquired, so this
            # increment is thread-safe.
            values = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
            ret = bool(values[group_idx])
            group_idx += 1
            return ret

        setattr(planner, '_should_increase_group',
                Mock(side_effect=ret_values))
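        # Mock(side_effect=ret_values) delegates every call to ret_values,
        # so the planner follows the scripted 0/1 sequence above instead of
        # its real group-assignment heuristic.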
        tracker = GroupTracker()
        thread_count = 4
        for _ in range(10):
            planner.add_command(entrypoint())

        for _ in range(thread_count):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received %s [group=%s]",
                            type(cmd), cmd.group)

                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        planner.notify_finished(cmd)
                        break
                    tracker.log(cmd)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [
            Thread(target=fn, args=(planner,)) for _ in range(thread_count)
        ]
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        if exc:
            raise exc
        groups_processed = tracker.get_tracks()
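        # values[i] == 1 means that command i opens a new group, so the
        # group id advances on every second command: 0, 1, 1, 2, 2, ...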
        self.assertEqual([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], groups_processed)

    def test_workers_not_blocked_by_future_work(self):
        planner = WorkPlanner()

        tracker = TimeTracker()
        thread_count = 2
        # We add far more commands than we have workers.
        for _ in range(8):
            planner.add_command(broadcast())

        planner.add_command(entrypoint())

        for _ in range(thread_count):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received %s [group=%s]",
                            type(cmd), cmd.group)

                    if isinstance(cmd, BroadcastHAStates):
                        time.sleep(1)

                    if isinstance(cmd, EntrypointRequest):
                        planner.shutdown()

                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        planner.notify_finished(cmd)
                        break
                    tracker.log(cmd)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [
            Thread(target=fn, args=(planner,)) for _ in range(thread_count)
        ]

        t0 = time.time()
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        if exc:
            raise exc
        tracks = tracker.get_tracks()

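        # entrypoint() was queued behind 8 slow broadcasts (1 s each), yet
        # it must be picked up almost immediately, and since handling it
        # shuts the planner down, only a few commands get tracked at all.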
        idx, (cmd, ts) = self.find(
            tracks,
            lambda a: isinstance(a[0], EntrypointRequest),
            'EntrypointRequest not processed')
        self.assertLess(ts - t0, 3)
        self.assertLess(len(tracks), 4)

    def test_entrypoint_request_processed_asap(self):
        planner = WorkPlanner()

        tracker = TimeTracker()
        thread_count = 4
        planner.add_command(broadcast())
        planner.add_command(broadcast())
        planner.add_command(broadcast())
        planner.add_command(entrypoint())

        for _ in range(thread_count):
            planner.add_command(Die())

        exc = None

        def fn(planner: WorkPlanner):
            nonlocal exc
            try:
                while True:
                    LOG.log(TRACE, "Requesting for a work")
                    cmd = planner.get_next_command()
                    LOG.log(TRACE, "The command is received %s [group=%s]",
                            type(cmd), cmd.group)
                    if isinstance(cmd, BroadcastHAStates):
                        time.sleep(1.5)

                    if isinstance(cmd, Die):
                        LOG.log(TRACE,
                                "Poison pill received - exiting. Bye!")
                        planner.notify_finished(cmd)
                        break
                    tracker.log(cmd)
                    LOG.log(TRACE, "The job is done, notifying the planner")
                    planner.notify_finished(cmd)
                    LOG.log(TRACE, "Notified. ")

            except Exception as e:
                LOG.exception('*** ERROR ***')
                exc = e

        workers = [
            Thread(target=fn, args=(planner,)) for _ in range(thread_count)
        ]
        for t in workers:
            t.start()

        for t in workers:
            t.join()
        if exc:
            raise exc
        tracks = tracker.get_tracks()
        cmd, ts = tracks[0]
        self.assertIsInstance(cmd, EntrypointRequest)
Example #6
    def update_process_failure(self, planner: WorkPlanner,
                               ha_states: List[HAState]) -> List[HAState]:
        new_ha_states: List[HAState] = []
        for state in ha_states:
            # We are only concerned with process statuses.
            if state.fid.container == ObjT.PROCESS.value:
                current_status = self.consul.get_process_current_status(
                    state.status, state.fid)
                if current_status == ServiceHealth.OK:
                    if (self.consul.get_process_local_status(
                            state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                        continue
                if current_status in (ServiceHealth.FAILED,
                                      ServiceHealth.STOPPED):
                    if (self.consul.get_process_local_status(
                            state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                        # Consul may report the failure of a process
                        # multiple times, and we don't want to send
                        # duplicate failure notifications: they may delay
                        # cleanup activities.
                        continue
                if current_status == ServiceHealth.UNKNOWN:
                    # The service status is UNKNOWN: hax was notified about
                    # a process failure but could not confirm whether the
                    # process is in a failed state or has failed and
                    # restarted. So, in order not to lose the event, we try
                    # to confirm the real-time process status by enqueuing
                    # a broadcast event specific to this process.
                    # The status is expected to be eventually confirmed as
                    # either failed or passing (OK). This situation
                    # typically arises due to a delay in receiving the
                    # failure notification, during which the corresponding
                    # process might be restarting or may have already
                    # restarted. Thus it is important to confirm the
                    # real-time status of the process before broadcasting
                    # the failure.
                    planner.add_command(
                        BroadcastHAStates(
                            states=[HAState(fid=state.fid,
                                            status=ServiceHealth.FAILED)],
                            reply_to=None))
                if current_status not in (ServiceHealth.UNKNOWN,
                                          ServiceHealth.OFFLINE):
                    # We also need to account for and report the failure of
                    # remote Motr processes to this node's hax and Motr
                    # processes. When Consul reports a remote process
                    # failure, hax confirms its current status from the
                    # Consul KV, updates the list of failed services and
                    # adds the process to the broadcast list.
                    if current_status != ServiceHealth.OK:
                        event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                    else:
                        event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                    self.consul.update_process_status(
                        ConfHaProcess(
                            chp_event=event,
                            chp_type=int(
                                m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                            chp_pid=0,
                            fid=state.fid))
                new_ha_states.append(
                    HAState(fid=state.fid, status=current_status))
            else:
                new_ha_states.append(state)
        return new_ha_states
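
For reference, below is a minimal, hypothetical sketch of driving
update_process_failure() on its own. It is illustrative only: HandlerClass
stands for whichever class actually defines the method (not shown in this
excerpt), the Consul helper is replaced with a unittest.mock.Mock, and the
Fid(container, key) construction is an assumption; the remaining names are
taken from the excerpt above.

from unittest.mock import Mock

# Stand-in for the object that owns update_process_failure(): the method
# only touches self.consul, and Mock() auto-creates that attribute chain.
handler = Mock()
handler.consul.get_process_current_status.return_value = ServiceHealth.FAILED
handler.consul.get_process_local_status.return_value = \
    'M0_CONF_HA_PROCESS_STARTED'

planner = WorkPlanner()
# Fid(container, key) construction is assumed here; the container must be
# ObjT.PROCESS.value for the state to be inspected at all.
states = [HAState(fid=Fid(ObjT.PROCESS.value, 1),
                  status=ServiceHealth.FAILED)]

# With the current status FAILED and the local status STARTED, the method
# re-confirms the failure, reports M0_CONF_HA_PROCESS_STOPPED via
# consul.update_process_status() and keeps the FAILED state in the result.
# HandlerClass is hypothetical: whichever class defines the method above.
new_states = HandlerClass.update_process_failure(handler, planner, states)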