def wait_for_all(self,
                 promise: HaLinkMessagePromise,
                 timeout_sec: float = 30.0):
    """
    Block the calling thread until the given promise becomes empty, i.e.
    until all of its message ids have been confirmed as delivered.

    Before the first wait, groom_unsorted(promise) is invoked under
    self.lock; if the promise is already present in recently_delivered
    at that point, the first blocking wait is skipped.

    timeout_sec is the timeout handed to every individual wait on the
    condition variable.

    Raises NotDelivered exception when timeout_sec exceeds.
    """
    waiter = Condition()
    with self.lock:
        self.groom_unsorted(promise)
        self.waiting_clients[promise] = waiter
        already_confirmed = promise in self.recently_delivered

    while not promise.is_empty():
        if not already_confirmed:
            with waiter:
                LOG.log(TRACE, 'Blocking until %s is confirmed', promise)
                waiter.wait(timeout=timeout_sec)
        else:
            LOG.log(
                TRACE, 'Promise %s has been confirmed before, '
                'no need to block', promise)
            # Only the very first iteration may bypass the wait.
            already_confirmed = False

        with self.lock:
            self._verify_delivered(promise, timeout_sec)
            if not promise.is_empty():
                # Some ids are still pending: register again and loop.
                self.waiting_clients[promise] = waiter
def wait_for_all(self,
                 promise: HaLinkMessagePromise,
                 timeout_sec: float = 30.0):
    """
    Blocks the current thread until all of the messages in
    promise._ids are reported by Motr as delivered.

    The promise is registered in self.waiting_clients together with a
    private Condition; after every wake-up the ids found in
    self.recently_delivered are excluded from the promise, and the
    promise is registered again while it is not empty.

    timeout_sec is the timeout handed to every individual wait on the
    condition variable.

    Raises NotDelivered if, after a wake-up, the promise is not present
    in self.recently_delivered. The promise is unregistered from
    self.waiting_clients before the exception is raised, so that no
    stale registration is left behind.
    """
    condition = Condition()
    with self.lock:
        self.waiting_clients[promise] = condition
        LOG.log(TRACE, 'waiting clients %s', self.waiting_clients)

    while not promise.is_empty():
        with condition:
            LOG.log(TRACE, 'Blocking until %s is confirmed', promise)
            condition.wait(timeout=timeout_sec)
        with self.lock:
            # Unregister first: this must happen on the failure path too,
            # and before exclude_ids() mutates the promise below
            # (NOTE(review): presumably the promise is looked up by its
            # current ids - confirm against HaLinkMessagePromise).
            del self.waiting_clients[promise]
            if promise not in self.recently_delivered:
                raise NotDelivered('None of message tags =' +
                                   str(promise) +
                                   ' were delivered to Motr')
            confirmed_msgs = self.recently_delivered.pop(promise)
            LOG.log(TRACE, 'Thread unblocked - %s just received',
                    confirmed_msgs)
            promise.exclude_ids(confirmed_msgs)
            if not promise.is_empty():
                # Some ids are still pending: register again and loop.
                self.waiting_clients[promise] = condition
def check_if_delivered_locked(
        self, promise: HaLinkMessagePromise) -> HaLinkMessagePromise:
    """
    Apply already recorded deliveries to the promise without blocking.

    self.lock must be in the locked state when this is called,
    otherwise RuntimeError is raised.

    If the promise is present in self.recently_delivered, the recorded
    ids are popped, the promise is removed from self.waiting_clients and
    the ids are excluded from the promise. The (possibly updated)
    promise is returned in every case.
    """
    if not self.lock.locked():
        raise RuntimeError('DeliveryHerald.lock not acquired')

    if promise not in self.recently_delivered:
        # Nothing recorded for this promise - hand it back untouched.
        return promise

    delivered = self.recently_delivered.pop(promise)
    LOG.debug('Thread unblocked - %s just received', delivered)
    del self.waiting_clients[promise]
    promise.exclude_ids(delivered)
    return promise
def _verify_delivered(self, promise: HaLinkMessagePromise,
                      timeout_sec: float):
    """
    Consume the deliveries recorded for the promise.

    The promise is first removed from self.waiting_clients. When it is
    absent from self.recently_delivered, NotDelivered is raised (the
    message mentions timeout_sec); otherwise the recorded ids are popped
    and excluded from the promise.

    Calling function should hold the self.lock.
    """
    # Unregister unconditionally, so the failure path leaves nothing
    # behind in waiting_clients.
    del self.waiting_clients[promise]

    if promise not in self.recently_delivered:
        raise NotDelivered(
            f'None of message tags ={promise!s}'
            f' were delivered to Motr within {timeout_sec!s}'
            ' seconds timeout')

    delivered = self.recently_delivered.pop(promise)
    LOG.log(TRACE, 'Thread unblocked - %s just received', delivered)
    promise.exclude_ids(delivered)
def notify_hax_stop(self):
    """
    Send the hax stop notification and wait for all resulting messages.

    The hax fid and endpoint are read from self.consul_util, passed to
    the hax_stop call of self._ffi, and the ids returned by that call
    are awaited with self.herald.wait_for_all.
    """
    LOG.debug('Notifying hax stop')
    consul = self.consul_util
    hax_fid = consul.get_hax_fid()
    hax_endpoint = consul.get_hax_endpoint()

    message_ids = self._ffi.hax_stop(self._ha_ctx,
                                     hax_fid.to_c(),
                                     make_c_str(hax_endpoint))
    promise = HaLinkMessagePromise(message_ids)
    self.herald.wait_for_all(promise)
def test_exception_raised_by_timeout(self):
    """
    wait_for_any must raise NotDelivered when none of the awaited
    message ids gets confirmed.

    The notifier thread confirms a message with halink_ctx=43, while
    every awaited id uses halink_ctx=42.
    """
    herald = DeliveryHerald()
    notified_ok = True

    def fn():
        # Without `nonlocal` the assignment below would create a local
        # variable and the final assertion could never fail.
        nonlocal notified_ok
        try:
            sleep(1.5)
            herald.notify_delivered(MessageId(halink_ctx=43, tag=3))
        except Exception:
            logging.exception('*** ERROR ***')
            notified_ok = False

    t = Thread(target=fn)
    t.start()

    m = MessageId
    try:
        with self.assertRaises(NotDelivered):
            herald.wait_for_any(HaLinkMessagePromise(
                [m(42, 1), m(42, 3), m(42, 4)]),
                timeout_sec=5)
    finally:
        # Always reap the notifier thread, even if the assertion fails.
        t.join()
    self.assertTrue(notified_ok,
                    'Unexpected exception appeared in notifier thread')
def test_works_under_load(self):
    """
    wait_for_all must return when many notifier threads confirm
    messages at roughly the same time.

    31 threads confirm tags 1..31 of halink_ctx=100 after a short sleep;
    the main thread awaits tags 5, 25, 28 and 31.
    """
    herald = DeliveryHerald()
    notified_ok = True

    def fn(msg: MessageId):
        # Without `nonlocal` the assignment below would create a local
        # variable and the final assertion could never fail.
        nonlocal notified_ok
        try:
            sleep(1.5)
            herald.notify_delivered(msg)
        except Exception:
            logging.exception('*** ERROR ***')
            notified_ok = False

    threads = [
        Thread(target=fn, args=(MessageId(100, i), ))
        for i in range(1, 32)
    ]
    for t in threads:
        t.start()

    def m(x):
        return MessageId(halink_ctx=100, tag=x)

    try:
        herald.wait_for_all(HaLinkMessagePromise(
            [m(5), m(25), m(28), m(31)]),
            timeout_sec=5)
    finally:
        # Always reap the notifier threads, even if the wait fails.
        for t in threads:
            t.join()
    self.assertTrue(notified_ok,
                    'Unexpected exception appeared in notifier thread')
def test_works_if_all_messages_confirmed(self):
    """
    wait_for_all must return once every awaited message id has been
    confirmed, regardless of the confirmation order.

    The notifier thread confirms tag 3 and then tag 1 of halink_ctx=42;
    the main thread awaits tags 1 and 3.
    """
    herald = DeliveryHerald()
    notified_ok = True

    def fn():
        # Without `nonlocal` the assignment below would create a local
        # variable and the final assertion could never fail.
        nonlocal notified_ok
        try:
            sleep(1.5)
            herald.notify_delivered(MessageId(halink_ctx=42, tag=3))
            herald.notify_delivered(MessageId(halink_ctx=42, tag=1))
        except Exception:
            logging.exception('*** ERROR ***')
            notified_ok = False

    t = Thread(target=fn)
    t.start()

    m = MessageId
    try:
        herald.wait_for_all(HaLinkMessagePromise([m(42, 1), m(42, 3)]),
                            timeout_sec=5)
    finally:
        # Always reap the notifier thread, even if the wait fails.
        t.join()
    self.assertTrue(notified_ok,
                    'Unexpected exception appeared in notifier thread')
def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
    """
    React to a stob IOQ error by broadcasting a FAILED HA state.

    The fid is parsed from payload['conf_sdev']; a null fid is skipped.
    Otherwise a BroadcastHAStates command is handed to self.planner,
    the resulting message ids are read from a one-slot reply queue and
    the call blocks in self.herald.wait_for_any on those ids.
    """
    fid = Fid.parse(payload['conf_sdev'])
    if fid.is_null():
        LOG.debug('Fid is 0:0. Skipping the message.')
        return

    reply_queue: Queue = Queue(1)
    failed_state = HAState(fid, status=ObjHealth.FAILED)
    command = BroadcastHAStates(states=[failed_state],
                                reply_to=reply_queue)
    self.planner.add_command(command)

    # Blocks until the consumer of the command puts the ids into the
    # reply queue.
    message_ids: List[MessageId] = reply_queue.get()
    self.herald.wait_for_any(HaLinkMessagePromise(message_ids))
def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
    """
    React to a stob IOQ error by broadcasting an 'offline' HA state.

    The fid is parsed from payload['conf_sdev']; a null fid is skipped.
    Otherwise a BroadcastHAStates request is put into self.queue, the
    resulting message ids are read from a one-slot reply queue and the
    call blocks in self.herald.wait_for_any on those ids.
    """
    fid = Fid.parse(payload['conf_sdev'])
    if fid.is_null():
        logging.debug('Fid is 0:0. Skipping the message.')
        return

    reply_queue: Queue = Queue(1)
    offline_state = HAState(fid, status='offline')
    request = BroadcastHAStates(states=[offline_state],
                                reply_to=reply_queue)
    self.queue.put(request)

    # Blocks until the consumer of the request puts the ids into the
    # reply queue.
    message_ids: List[MessageId] = reply_queue.get()
    self.herald.wait_for_any(HaLinkMessagePromise(message_ids))
def wait_for_all(self,
                 promise: HaLinkMessagePromise,
                 timeout_sec: float = 30.0):
    """
    Block the calling thread until the given promise becomes empty, i.e.
    until all of its message ids have been confirmed as delivered.

    The promise is registered in self.waiting_clients together with a
    private Condition. After every wake-up _verify_delivered() is run
    under self.lock, and the promise is registered again while it is
    not empty.

    timeout_sec is the timeout handed to every individual wait on the
    condition variable.

    Raises NotDelivered exception when timeout_sec exceeds.
    """
    waiter = Condition()
    with self.lock:
        self.waiting_clients[promise] = waiter
        LOG.log(TRACE, 'waiting clients %s', self.waiting_clients)

    while not promise.is_empty():
        with waiter:
            LOG.log(TRACE, 'Blocking until %s is confirmed', promise)
            waiter.wait(timeout=timeout_sec)

        with self.lock:
            self._verify_delivered(promise, timeout_sec)
            if not promise.is_empty():
                # Some ids are still pending: register again and loop.
                self.waiting_clients[promise] = waiter
def handle_device_state_set(self, payload: Dict[str, Any]) -> None:
    """
    Broadcast the HA state derived from a device state payload.

    The payload is converted with self.to_ha_state(); when that yields
    nothing, the method only logs and returns. Otherwise a
    BroadcastHAStates request is put into self.queue, the resulting
    message ids are read from a one-slot reply queue and the call blocks
    in self.herald.wait_for_any on those ids.
    """
    # TODO: add a check for multiple object entries in a payload
    # (i.e. iterate `for objinfo in payload`).
    ha_state: Optional[HAState] = self.to_ha_state(payload)
    if not ha_state:
        LOG.debug('No ha states to broadcast.')
        return

    reply_queue: Queue = Queue(1)
    LOG.debug('HA broadcast, node: %s device: %s state: %s',
              payload['node'], payload['device'], payload['state'])
    request = BroadcastHAStates(states=[ha_state], reply_to=reply_queue)
    self.queue.put(request)

    # Blocks until the consumer of the request puts the ids into the
    # reply queue.
    message_ids: List[MessageId] = reply_queue.get()
    self.herald.wait_for_any(HaLinkMessagePromise(message_ids))
def test_if_delivered_earlier_than_awaited_notified_immediately(self):
    """
    A message confirmed before wait_for_all() is called must not make
    the waiter block until its timeout.

    The notifier thread confirms tag 1 of halink_ctx=100 and only then
    releases the latch; the main thread starts waiting afterwards and
    the elapsed time is compared against the wait timeout.
    """
    herald = DeliveryHerald()
    notified_ok = True
    thread_count = 1
    timeout_sec = 2
    latch = CountDownLatch(thread_count)

    def fn(msg: MessageId):
        # Without `nonlocal` the assignment below would create a local
        # variable and the final assertion could never fail.
        nonlocal notified_ok
        try:
            LOG.debug('Thread started')
            herald.notify_delivered(msg)
            LOG.debug('Notified delivery %s', msg)
            latch.count_down()
            LOG.debug('Main thread unblocked')
        except Exception:
            logging.exception('*** ERROR ***')
            notified_ok = False

    threads = [
        Thread(target=fn, args=(MessageId(100, i + 1), ))
        for i in range(thread_count)
    ]
    for t in threads:
        t.start()

    # Block until all the threads come to latch.count_down() and thus
    # the message is notified for sure.
    # `await` is a reserved keyword since Python 3.7, so the method is
    # looked up by name to keep this module parseable there.
    getattr(latch, 'await')()

    def m(x):
        return MessageId(halink_ctx=100, tag=x)

    try:
        started = time()
        herald.wait_for_all(HaLinkMessagePromise([m(1)]),
                            timeout_sec=timeout_sec)
        finished = time()
    finally:
        # Always reap the notifier threads, even if the wait fails.
        for t in threads:
            t.join()
    self.assertTrue(notified_ok,
                    'Unexpected exception appeared in notifier thread')
    # The bound is the wait timeout itself: anything slower means the
    # waiter was woken up by the timeout rather than by the delivery.
    self.assertLess(
        finished - started, timeout_sec,
        'Awaiting thread was unblocked only by a timeout. It means '
        'that unsorted_deliveries was analyzed too late.'
    )
def test_if_delivered_earlier_than_awaited_wait_many(self):
    """
    Messages confirmed before wait_for_all() is called must satisfy the
    waiter, and the confirmations nobody waited for must remain in
    unsorted_deliveries.

    Six threads confirm tags 1..6 of halink_ctx=100 before the main
    thread awaits tags 1 and 5; four entries are then expected in
    herald.unsorted_deliveries.
    """
    herald = DeliveryHerald()
    notified_ok = True
    thread_count = 6
    latch = CountDownLatch(thread_count)

    def fn(msg: MessageId):
        # Without `nonlocal` the assignment below would create a local
        # variable and the final assertion could never fail.
        nonlocal notified_ok
        try:
            LOG.debug('Thread started')
            herald.notify_delivered(msg)
            LOG.debug('Notified delivery %s', msg)
            latch.count_down()
            LOG.debug('Main thread unblocked')
        except Exception:
            logging.exception('*** ERROR ***')
            notified_ok = False

    threads = [
        Thread(target=fn, args=(MessageId(100, i + 1), ))
        for i in range(thread_count)
    ]
    for t in threads:
        t.start()

    # Block until all the threads come to latch.count_down() and thus
    # the message is notified for sure.
    # `await` is a reserved keyword since Python 3.7, so the method is
    # looked up by name to keep this module parseable there.
    getattr(latch, 'await')()

    def m(x):
        return MessageId(halink_ctx=100, tag=x)

    try:
        herald.wait_for_all(HaLinkMessagePromise([m(1), m(5)]),
                            timeout_sec=2)
    finally:
        # Always reap the notifier threads, even if the wait fails.
        for t in threads:
            t.join()
    self.assertTrue(notified_ok,
                    'Unexpected exception appeared in notifier thread')
    self.assertEqual(4, len(herald.unsorted_deliveries.keys()))
def test_it_works(self):
    """
    wait_for_any must return as soon as one of the awaited message ids
    has been confirmed.

    The notifier thread confirms tag 1 of halink_ctx=100 after a short
    sleep; the main thread awaits tags 1, 3 and 4.
    """
    herald = DeliveryHerald()
    notified_ok = True

    def fn():
        # Without `nonlocal` the assignment below would create a local
        # variable and the final assertion could never fail.
        nonlocal notified_ok
        try:
            sleep(1.5)
            herald.notify_delivered(MessageId(halink_ctx=100, tag=1))
        except Exception:
            logging.exception('*** ERROR ***')
            notified_ok = False

    t = Thread(target=fn)
    t.start()

    m = MessageId
    try:
        herald.wait_for_any(HaLinkMessagePromise(
            [m(100, 1), m(100, 3), m(100, 4)]),
            timeout_sec=10)
    finally:
        # Always reap the notifier thread, even if the wait fails.
        t.join()
    self.assertTrue(notified_ok,
                    'Unexpected exception appeared in notifier thread')
def _do_work(self, q: Queue, motr: Motr):
    """
    Main loop of the handler thread: pull items from q and dispatch
    each one, by its type, to the corresponding call on motr.

    The loop polls the queue without blocking and sleeps 0.2 seconds
    between attempts; it ends when self.is_stopped is found to be set
    during such a pause. Any other exception raised while an item is
    processed is logged and the loop continues with the next item.
    """
    ffi = motr._ffi
    LOG.info('Handler thread has started')
    # NOTE(review): presumably registers this Python thread with Motr;
    # the counterpart shun_motr_thread() is called on shutdown below.
    ffi.adopt_motr_thread()

    def pull_msg():
        # Non-blocking read: None means "the queue is currently empty".
        try:
            return q.get(block=False)
        except Empty:
            return None

    try:
        while True:
            try:
                LOG.debug('Waiting for the next message')

                item = pull_msg()
                while item is None:
                    time.sleep(0.2)
                    # StopIteration is used as the internal "leave the
                    # main loop" signal; it is re-raised by the inner
                    # handler and consumed by the outer one.
                    if self.is_stopped:
                        raise StopIteration()
                    item = pull_msg()

                LOG.debug('Got %s message from queue', item)
                # FirstEntrypointRequest is tested before
                # EntrypointRequest; keep this order
                # (NOTE(review): presumably the two types are related
                # by inheritance - confirm).
                if isinstance(item, FirstEntrypointRequest):
                    LOG.debug('first entrypoint request, broadcast FAILED')
                    ids: List[MessageId] = motr.broadcast_ha_states([
                        HAState(fid=item.process_fid,
                                status=ServiceHealth.FAILED)
                    ])
                    LOG.debug('waiting for broadcast of %s for ep: %s',
                              ids, item.remote_rpc_endpoint)
                    # The reply is sent only after the broadcast has
                    # been confirmed as delivered.
                    self.herald.wait_for_all(HaLinkMessagePromise(ids))
                    motr.send_entrypoint_request_reply(
                        EntrypointRequest(
                            reply_context=item.reply_context,
                            req_id=item.req_id,
                            remote_rpc_endpoint=item.remote_rpc_endpoint,
                            process_fid=item.process_fid,
                            git_rev=item.git_rev,
                            pid=item.pid,
                            is_first_request=item.is_first_request))
                elif isinstance(item, EntrypointRequest):
                    # Any exception raised while replying is caught. In
                    # such a case the motr process will receive EAGAIN
                    # and hence will need to make a new attempt by
                    # itself.
                    motr.send_entrypoint_request_reply(item)
                elif isinstance(item, ProcessEvent):
                    self._update_process_status(q, item.evt)
                elif isinstance(item, HaNvecGetEvent):
                    fn = motr.ha_nvec_get_reply
                    # If a consul-related exception appears, it will
                    # be processed by repeat_if_fails.
                    #
                    # This thread will become blocked until that
                    # intermittent error gets resolved.
                    decorated = (repeat_if_fails(wait_seconds=5))(fn)
                    decorated(item)
                elif isinstance(item, BroadcastHAStates):
                    LOG.info('HA states: %s', item.states)
                    ha_states = self.update_process_failure(q, item.states)
                    result: List[MessageId] = motr.broadcast_ha_states(
                        ha_states)
                    # The sender may be blocked on the reply queue
                    # waiting for the ids of the broadcast messages.
                    if item.reply_to:
                        item.reply_to.put(result)
                elif isinstance(item, StobIoqError):
                    LOG.info('Stob IOQ: %s', item.fid)
                    payload = dump_json(item)
                    LOG.debug('Stob IOQ JSON: %s', payload)
                    offset = self.eq_publisher.publish('stob-ioq', payload)
                    LOG.debug('Written to epoch: %s', offset)
                elif isinstance(item, SnsRepairStatus):
                    LOG.info('Requesting SNS repair status')
                    status = motr.get_repair_status(item.fid)
                    LOG.info('SNS repair status is received: %s', status)
                    item.reply_to.put(status)
                elif isinstance(item, SnsRebalanceStatus):
                    LOG.info('Requesting SNS rebalance status')
                    status = motr.get_rebalance_status(item.fid)
                    LOG.info('SNS rebalance status is received: %s',
                             status)
                    item.reply_to.put(status)
                elif isinstance(item, SnsRebalanceStart):
                    LOG.info('Requesting SNS rebalance start')
                    motr.start_rebalance(item.fid)
                elif isinstance(item, SnsRebalanceStop):
                    LOG.info('Requesting SNS rebalance stop')
                    motr.stop_rebalance(item.fid)
                elif isinstance(item, SnsRebalancePause):
                    LOG.info('Requesting SNS rebalance pause')
                    motr.pause_rebalance(item.fid)
                elif isinstance(item, SnsRebalanceResume):
                    LOG.info('Requesting SNS rebalance resume')
                    motr.resume_rebalance(item.fid)
                elif isinstance(item, SnsRepairStart):
                    LOG.info('Requesting SNS repair start')
                    motr.start_repair(item.fid)
                elif isinstance(item, SnsRepairStop):
                    LOG.info('Requesting SNS repair stop')
                    motr.stop_repair(item.fid)
                elif isinstance(item, SnsRepairPause):
                    LOG.info('Requesting SNS repair pause')
                    motr.pause_repair(item.fid)
                elif isinstance(item, SnsRepairResume):
                    LOG.info('Requesting SNS repair resume')
                    motr.resume_repair(item.fid)
                else:
                    LOG.warning('Unsupported event type received: %s',
                                item)
            except StopIteration:
                # Let the stop signal reach the outer handler.
                raise
            except Exception:
                # Deliberately swallowed (after logging), so that a
                # failure on one item does not terminate the thread.
                LOG.exception('**ERROR**')
    except StopIteration:
        # Regular shutdown path.
        ffi.shun_motr_thread()
    finally:
        LOG.info('Handler thread has exited')