def test_no_hang_when_group_id_cycled(self):
    # Start the planner from a state where the group id is about to
    # wrap around, so the cycling logic is exercised.
    def my_state():
        return State(next_group_id=99999,
                     active_commands=LinkedList(),
                     current_group_id=99999,
                     next_group_commands=set(),
                     is_shutdown=False)

    planner = WorkPlanner(init_state_factory=my_state)
    tracker = GroupTracker()
    thread_count = 1
    for i in range(10):
        planner.add_command(process_event())
    for j in range(thread_count):
        planner.add_command(Die())

    exc = None

    def fn(planner: WorkPlanner):
        nonlocal exc
        try:
            while True:
                LOG.log(TRACE, "Requesting work")
                cmd = planner.get_next_command()
                LOG.log(TRACE, "Received command %s [group=%s]",
                        type(cmd), cmd.group)
                if isinstance(cmd, Die):
                    LOG.log(TRACE, "Poison pill received - exiting. Bye!")
                    planner.notify_finished(cmd)
                    break
                tracker.log(cmd)
                LOG.log(TRACE, "Job done, notifying the planner")
                planner.notify_finished(cmd)
                LOG.log(TRACE, "Notified.")
        except Exception as e:
            LOG.exception('*** ERROR ***')
            exc = e

    workers = [
        Thread(target=fn, args=(planner, )) for t in range(thread_count)
    ]
    for t in workers:
        t.start()
    for t in workers:
        t.join()
    if exc:
        raise exc
    groups_processed = tracker.get_tracks()
    # The group id continues past the initial 99999 and then wraps around
    # to 0 and keeps growing - the planner must not hang on the wrap-around.
    self.assertEqual([99999, 10**5, 0, 1, 2, 3, 4, 5, 6, 7],
                     groups_processed)
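# GroupTracker is a test helper used above and in the sequential-groups test
# below; its real implementation is not part of this excerpt. The following
# is a minimal sketch under the assumption that it only needs to record the
# group id of every processed command in a thread-safe way so the tests can
# assert on the order of groups afterwards.
from threading import Lock
from typing import List


class GroupTracker:
    """Hypothetical sketch: thread-safe recorder of processed group ids."""

    def __init__(self):
        self._lock = Lock()
        self._tracks: List[int] = []

    def log(self, cmd) -> None:
        # Remember the group of the command that has just been processed.
        with self._lock:
            self._tracks.append(cmd.group)

    def get_tracks(self) -> List[int]:
        with self._lock:
            return list(self._tracks)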
def test_parallelism_is_possible(self):
    planner = WorkPlanner()
    for i in range(40):
        planner.add_command(entrypoint())
    for j in range(4):
        planner.add_command(Die())

    exc = None

    def fn(planner: WorkPlanner):
        nonlocal exc
        try:
            while True:
                LOG.log(TRACE, "Requesting work")
                cmd = planner.get_next_command()
                LOG.log(TRACE, "Received command")
                if isinstance(cmd, Die):
                    LOG.log(TRACE, "Poison pill received - exiting. Bye!")
                    break
                sleep(0.5)
                LOG.log(TRACE, "Job done, notifying the planner")
                planner.notify_finished(cmd)
                LOG.log(TRACE, "Notified.")
        except Exception as e:
            LOG.exception('*** ERROR ***')
            exc = e

    workers = [Thread(target=fn, args=(planner, )) for t in range(4)]
    time_1 = time.time()
    for t in workers:
        t.start()
    for t in workers:
        t.join()
    time_2 = time.time()
    logging.info('Processing time %s', time_2 - time_1)
    if exc:
        raise exc
    self.assertTrue(planner.is_empty(), 'Not all commands were read out')
    # Every worker sleeps for 500ms per command. 40 commands * 0.5s would
    # take 20 seconds if the commands were executed sequentially.
    self.assertLess(time_2 - time_1, 19, 'Suspiciously slow')
def test_groups_processed_sequentially_4_threads(self):
    planner = WorkPlanner()
    group_idx = 0

    def ret_values(cmd: BaseMessage) -> bool:
        nonlocal group_idx
        # We don't care about the group distribution logic in this test.
        # Instead, we concentrate on how different group numbers are
        # processed by the workers and the order in which the workers are
        # allowed to process the messages.
        #
        # _assign_group is invoked with the lock acquired, so this
        # increment is thread-safe.
        values = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
        ret = bool(values[group_idx])
        group_idx += 1
        return ret

    setattr(planner, '_should_increase_group',
            Mock(side_effect=ret_values))
    tracker = GroupTracker()
    thread_count = 4
    for i in range(10):
        planner.add_command(entrypoint())
    for j in range(thread_count):
        planner.add_command(Die())

    exc = None

    def fn(planner: WorkPlanner):
        nonlocal exc
        try:
            while True:
                LOG.log(TRACE, "Requesting work")
                cmd = planner.get_next_command()
                LOG.log(TRACE, "Received command %s [group=%s]",
                        type(cmd), cmd.group)
                if isinstance(cmd, Die):
                    LOG.log(TRACE, "Poison pill received - exiting. Bye!")
                    planner.notify_finished(cmd)
                    break
                tracker.log(cmd)
                LOG.log(TRACE, "Job done, notifying the planner")
                planner.notify_finished(cmd)
                LOG.log(TRACE, "Notified.")
        except Exception as e:
            LOG.exception('*** ERROR ***')
            exc = e

    workers = [
        Thread(target=fn, args=(planner, )) for t in range(thread_count)
    ]
    for t in workers:
        t.start()
    for t in workers:
        t.join()
    if exc:
        raise exc
    groups_processed = tracker.get_tracks()
    # The mocked _should_increase_group alternates False/True, so the group
    # id grows on every second command: 0, 1, 1, 2, 2, ...
    self.assertEqual([0, 1, 1, 2, 2, 3, 3, 4, 4, 5], groups_processed)
def test_workers_not_blocked_by_future_work(self):
    planner = WorkPlanner()
    tracker = TimeTracker()
    thread_count = 2
    # We add way more commands than we have workers now.
    for i in range(8):
        planner.add_command(broadcast())
    planner.add_command(entrypoint())
    for j in range(thread_count):
        planner.add_command(Die())

    exc = None

    def fn(planner: WorkPlanner):
        nonlocal exc
        try:
            while True:
                LOG.log(TRACE, "Requesting work")
                cmd = planner.get_next_command()
                LOG.log(TRACE, "Received command %s [group=%s]",
                        type(cmd), cmd.group)
                if isinstance(cmd, BroadcastHAStates):
                    time.sleep(1)
                if isinstance(cmd, EntrypointRequest):
                    planner.shutdown()
                if isinstance(cmd, Die):
                    LOG.log(TRACE, "Poison pill received - exiting. Bye!")
                    planner.notify_finished(cmd)
                    break
                tracker.log(cmd)
                LOG.log(TRACE, "Job done, notifying the planner")
                planner.notify_finished(cmd)
                LOG.log(TRACE, "Notified.")
        except Exception as e:
            LOG.exception('*** ERROR ***')
            exc = e

    workers = [
        Thread(target=fn, args=(planner, )) for t in range(thread_count)
    ]
    t0 = time.time()
    for t in workers:
        t.start()
    for t in workers:
        t.join()
    if exc:
        raise exc
    tracks = tracker.get_tracks()
    idx, (cmd, ts) = self.find(
        tracks, lambda a: isinstance(a[0], EntrypointRequest),
        'EntrypointRequest not processed')
    # The EntrypointRequest must be picked up quickly despite the slow
    # broadcasts queued before it; since its handler calls shutdown(),
    # only a few commands should have been processed at all.
    self.assertLess(ts - t0, 3)
    self.assertLess(len(tracks), 4)
def test_entrypoint_request_processed_asap(self):
    planner = WorkPlanner()
    tracker = TimeTracker()
    thread_count = 4
    planner.add_command(broadcast())
    planner.add_command(broadcast())
    planner.add_command(broadcast())
    planner.add_command(entrypoint())
    for j in range(thread_count):
        planner.add_command(Die())

    exc = None

    def fn(planner: WorkPlanner):
        nonlocal exc
        try:
            while True:
                LOG.log(TRACE, "Requesting work")
                cmd = planner.get_next_command()
                LOG.log(TRACE, "Received command %s [group=%s]",
                        type(cmd), cmd.group)
                if isinstance(cmd, BroadcastHAStates):
                    time.sleep(1.5)
                if isinstance(cmd, Die):
                    LOG.log(TRACE, "Poison pill received - exiting. Bye!")
                    planner.notify_finished(cmd)
                    break
                tracker.log(cmd)
                LOG.log(TRACE, "Job done, notifying the planner")
                planner.notify_finished(cmd)
                LOG.log(TRACE, "Notified.")
        except Exception as e:
            LOG.exception('*** ERROR ***')
            exc = e

    workers = [
        Thread(target=fn, args=(planner, )) for t in range(thread_count)
    ]
    for t in workers:
        t.start()
    for t in workers:
        t.join()
    if exc:
        raise exc
    tracks = tracker.get_tracks()
    # The EntrypointRequest was added after three slow broadcasts, yet it
    # must be the first command that finishes processing.
    (cmd, ts) = tracks[0]
    self.assertIsInstance(cmd, EntrypointRequest)
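# TimeTracker is another helper assumed by the two tests above; its real
# implementation is outside this excerpt. A minimal sketch is given here
# under the assumption that it simply remembers each processed command
# together with the wall-clock time at which it was logged.
from threading import Lock
from typing import Any, List, Tuple
import time


class TimeTracker:
    """Hypothetical sketch: thread-safe (command, timestamp) recorder."""

    def __init__(self):
        self._lock = Lock()
        self._tracks: List[Tuple[Any, float]] = []

    def log(self, cmd) -> None:
        # Record the command along with the moment it finished processing.
        with self._lock:
            self._tracks.append((cmd, time.time()))

    def get_tracks(self) -> List[Tuple[Any, float]]:
        with self._lock:
            return list(self._tracks)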
def update_process_failure(self, planner: WorkPlanner,
                           ha_states: List[HAState]) -> List[HAState]:
    new_ha_states: List[HAState] = []
    for state in ha_states:
        # We are only concerned with process statuses.
        if state.fid.container == ObjT.PROCESS.value:
            current_status = self.consul.get_process_current_status(
                state.status, state.fid)
            if current_status == ServiceHealth.OK:
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                    continue
            if current_status in (ServiceHealth.FAILED,
                                  ServiceHealth.STOPPED):
                if (self.consul.get_process_local_status(
                        state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                    # Consul may report failure of a process multiple
                    # times, so we don't want to send duplicate failure
                    # notifications: they may delay cleanup activities.
                    continue
            if current_status == ServiceHealth.UNKNOWN:
                # The service status is UNKNOWN: hax was notified about a
                # process failure but could not confirm whether the
                # process is still in a failed state or has failed and
                # restarted. So we do not lose the event; instead we try
                # to confirm the real-time process status again by
                # enqueuing a broadcast event specific to this process.
                # It is expected that the status eventually gets confirmed
                # as either failed or passing (OK). This situation
                # typically arises due to a delay in receiving the failure
                # notification, during which the corresponding process
                # might be restarting or might have already restarted.
                # Thus it is important to confirm the real-time status of
                # the process before broadcasting the failure.
                current_status = ServiceHealth.UNKNOWN
                planner.add_command(
                    BroadcastHAStates(states=[
                        HAState(fid=state.fid,
                                status=ServiceHealth.FAILED)
                    ],
                                      reply_to=None))
            if current_status not in (ServiceHealth.UNKNOWN,
                                      ServiceHealth.OFFLINE):
                # We also need to account for and report the failure of
                # remote Motr processes to this node's hax and Motr
                # processes. When Consul reports a remote process failure,
                # hax confirms its current status from the Consul KV,
                # updates the list of failed services, and also adds it
                # to the broadcast list.
                if current_status != ServiceHealth.OK:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                else:
                    event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                self.consul.update_process_status(
                    ConfHaProcess(
                        chp_event=event,
                        chp_type=int(
                            m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                        chp_pid=0,
                        fid=state.fid))
                new_ha_states.append(
                    HAState(fid=state.fid, status=current_status))
        else:
            new_ha_states.append(state)
    return new_ha_states
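# A hypothetical usage sketch, not taken from the source: the names
# `motr_service` and `raw_states` are illustrative. update_process_failure()
# confirms raw Consul-reported HA states and returns only those that should
# actually be acted upon, so a caller would typically broadcast the
# confirmed states afterwards.
confirmed = motr_service.update_process_failure(planner, raw_states)
if confirmed:
    planner.add_command(BroadcastHAStates(states=confirmed, reply_to=None))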