class FakeRpcInterface:
    """Scripted stand-in for the RPC interface.

    Each call to get_cages() advances to the next entry of the list given
    to the constructor; once the list is exhausted an empty mapping is served
    and the internal "stopped" event is set. Events reported through
    process_event() are buffered in a queue and can later be collected,
    grouped per "node.cage", with extract_events().
    """

    @typecheck
    def __init__(self, cages_list):
        # one entry per pass; presumably { cage: { node: location } } - confirm with callers
        self._cages_list = cages_list
        self._pass = 0
        self._stopped = Event()
        self._queue = InterlockedQueue()

    def get_cages(self):
        """Switch to the cages of the next pass and return their names as a list."""
        exhausted = False
        try:
            current = self._cages_list[self._pass]
        except IndexError:
            # ran out of scripted passes
            current, exhausted = {}, True
        self._cages = current
        if exhausted:
            self._stopped.set()
        self._pass += 1
        return list(current.keys())

    def get_nodes(self, cage):
        """Return whatever the current pass has recorded for the given cage."""
        return self._cages[cage]

    def process_event(self, node, cage, up_down, probe_result):
        """Buffer a single up/down event for a later extract_events() call."""
        self._queue.push((node, cage, up_down, probe_result))

    def extract_events(self):
        """Drain the buffered events into { "node.cage": [probe_result or None, ...] }.

        An "up" event contributes its probe result, any other event
        contributes None. Draining stops as soon as a pop with a 1.0
        timeout argument returns None.
        """
        collected = {}
        while True:
            item = self._queue.pop(1.0)
            if item is None:
                break
            node, cage, up_down, probe_result = item
            key = "{0:s}.{1:s}".format(node, cage)
            collected.setdefault(key, []).append(probe_result if up_down == "up" else None)
        return collected
class FakeRpcInterface:
    """Scripted stand-in for the RPC interface.

    Each call to get_cages() advances to the next entry of the list given
    to the constructor; once the list is exhausted an empty mapping is served
    and the internal "stopped" event is set. Events reported through
    process_event() are buffered in a queue and can later be collected,
    grouped per "node.cage", with extract_events().
    """

    @typecheck
    def __init__(self, cages_list):
        # one entry per pass; presumably { cage: { node: location } } - confirm with callers
        self._cages_list = cages_list
        self._pass = 0
        self._stopped = Event()
        self._queue = InterlockedQueue()

    def get_cages(self):
        """Switch to the cages of the next pass and return their names as a list."""
        exhausted = False
        try:
            current = self._cages_list[self._pass]
        except IndexError:
            # ran out of scripted passes
            current, exhausted = {}, True
        self._cages = current
        if exhausted:
            self._stopped.set()
        self._pass += 1
        return list(current.keys())

    def get_nodes(self, cage):
        """Return whatever the current pass has recorded for the given cage."""
        return self._cages[cage]

    def process_event(self, node, cage, up_down, probe_result):
        """Buffer a single up/down event for a later extract_events() call."""
        self._queue.push((node, cage, up_down, probe_result))

    def extract_events(self):
        """Drain the buffered events into { "node.cage": [probe_result or None, ...] }.

        An "up" event contributes its probe result, any other event
        contributes None. Draining stops as soon as a pop with a 1.0
        timeout argument returns None.
        """
        collected = {}
        while True:
            item = self._queue.pop(1.0)
            if item is None:
                break
            node, cage, up_down, probe_result = item
            key = "{0:s}.{1:s}".format(node, cage)
            collected.setdefault(key, []).append(probe_result if up_down == "up" else None)
        return collected
after = time() assert after - before >= 0.1 assert r.expired r = Request(timeout = 0.1, interface = "test", protocol = "test") before = time() assert not r.acquire_shared(sl) # shared acquiring times out after = time() assert after - before >= 0.1 assert r.expired ################################### ilq = InterlockedQueue() ilq.push(1) r = Request(timeout = 0.1, interface = "test", protocol = "test") before = time() assert r.pop(ilq) == 1 # popping succeeds after = time() assert after - before < 0.01 assert not r.expired r = Request(timeout = 0.1, interface = "test", protocol = "test") before = time() assert r.pop(ilq) is None # popping times out after = time() assert after - before >= 0.1 assert r.expired
after = time() assert after - before >= 0.1 ################################### ilq = InterlockedQueue() t = Timeout(0.11) before = time() assert t.pop(ilq) is None after = time() assert after - before >= 0.1 assert t.expired t.reset(0.1) ilq.push(1) before = time() assert t.pop(ilq) == 1 after = time() assert after - before < 0.01 assert not t.expired ################################### print("ok") ################################################################################ # EOF
after = time() assert after - before >= 0.1 ################################### ilq = InterlockedQueue() t = Timeout(0.1) before = time() assert t.pop(ilq) is None after = time() assert after - before >= 0.1 assert t.expired t.reset(0.1) ilq.push(1) before = time() assert t.pop(ilq) == 1 after = time() assert after - before < 0.01 assert not t.expired ################################### print("ok") ################################################################################ # EOF
class HealthMonitor:
    """Probes cages and reports their up/down transitions.

    A private probe thread repeatedly collects the cages known to the RPC
    interface, merges them with the cages previously found to be up and
    enqueues one probe call per node to a private thread pool. The probe
    calls post their outcome to an internal queue; the probe thread drains
    that queue, maintains _up_cages and schedules application notifications
    (cage_up / cage_down of the health_monitor_event module) as interface
    requests.
    """

    def __init__(self, **kwargs):
        """Bind to the rpc interface and set up the internal structures.

        Raises Exception if the rpc interface is not available. When the
        module is under self-test, kwargs must carry the "process_event"
        and "probe_cage" callables which replace the real implementations.
        """
        self._rpc_interface = pmnc.interfaces.get_interface("rpc")
        if self._rpc_interface is None:
            raise Exception("health monitor requires enabled rpc interface")

        self._probe_thread_pool = pmnc.shared_pools.get_private_thread_pool()
        self._up_cages = {}  # { cage: { node: { location: ..., probe_result: ... } } }
        self._up_down_queue = InterlockedQueue()

        self._request_timeout = pmnc.config_interfaces.get("request_timeout")  # this is now static

        if pmnc.request.self_test == __name__:  # self-test
            self._process_event = kwargs["process_event"]
            self._probe_cage = kwargs["probe_cage"]

    ###################################

    def start(self):
        """Create and start the probe thread."""
        self._probe_thread = HeavyThread(
            target=self._probe_thread_proc, name="health_monitor:probe"
        )  # always called "health_monitor"
        self._probe_thread.start()

    def stop(self):
        """Stop the probe thread."""
        self._probe_thread.stop()

    ###################################

    # this method is executed in a private thread and is scheduling probe calls
    # to cages known to the RPC interface or previously probed and found to be up

    def _probe_thread_proc(self):
        """Run probe passes until _poll_up_down_queue reports that the thread is stopped."""

        per_cage_interval = 0.0

        # calls to _poll_up_down_queue are interleaved and allow this thread
        # to maintain structures such as _up_cages in response to events
        # posted by the probe threads to the _up_down_queue

        while self._poll_up_down_queue(per_cage_interval):
            try:

                # extract all cages currently known to the rpc interface and
                # merge them with cages previously probed and found to be up,
                # except for the health_monitor cage itself should be skipped

                probe_cages = {
                    known_cage: {
                        known_node: dict(location=known_location, probe_result=None)
                        for known_node, known_location in self._rpc_interface.get_nodes(known_cage).items()
                    }
                    for known_cage in self._rpc_interface.get_cages()
                    if known_cage != "health_monitor"
                }

                self._merge_cages(probe_cages, self._up_cages)

                # spread one pass over the configured period, the extra one
                # accounts for the poll performed by the enclosing while
                probe_period = pmnc.config.get("probe_period")
                per_cage_interval = probe_period / (len(probe_cages) + 1)

                # walk through all cages to be probed and schedule calls to probe
                # to a private thread pool using fake unregistered requests

                for cage, nodes in probe_cages.items():
                    for node, cage_info in nodes.items():

                        cage_location = cage_info["location"]

                        # note that the requests created here are not registered with
                        # interfaces and enqueued to a different pool too, they are
                        # therefore entitled to termination without warning at shutdown,
                        # this is ok, because they do no useful work for the clients

                        request = Request(
                            timeout=self._request_timeout,
                            interface="__health_monitor__",
                            protocol="n/a",
                            parameters=dict(auth_tokens=dict()),
                            description="probing cage {0:s} at {1:s}".format(cage, cage_location),
                        )

                        self._probe_thread_pool.enqueue(
                            request,
                            self.wu_probe_cage,
                            (node, cage, cage_location, cage_info["probe_result"]),
                            {},
                        )

                    # then again yield to polling the queue for a while

                    if not self._poll_up_down_queue(per_cage_interval):
                        break

            except:
                pmnc.log.error(exc_string())  # log and ignore

    ###################################

    # this method merges cages known to the RPC interface with cages
    # previously probed and known to be up, such merging is necessary
    # because if a cage dies just before its next advertisement broadcast,
    # it would disappear from known, but will not be probed again and
    # hence thought to be up forever

    @staticmethod
    def _merge_cages(known_cages: dict, up_cages: dict):
        """Merge up_cages into known_cages in place.

        For a node present in both, the previous probe result is carried
        over when the location is unchanged, otherwise the node is marked
        with the probe result "restarted". A node present only in up_cages
        is added to known_cages sharing the same record object.
        """
        probe_cages = known_cages  # merging in place

        for up_cage, up_nodes in up_cages.items():
            for up_node, up_cage_info in up_nodes.items():
                probe_nodes = probe_cages.setdefault(up_cage, {})
                if up_node in probe_nodes:
                    cage_info = probe_nodes[up_node]
                    if cage_info["location"] == up_cage_info["location"]:
                        cage_info.update(probe_result=up_cage_info["probe_result"])
                    else:
                        cage_info.update(probe_result="restarted")  # note this case
                else:
                    probe_nodes[up_node] = up_cage_info

    ###################################

    # a call to this method is enqueued to a private thread pool
    # for each cage to probe on every pass of _probe_thread

    def wu_probe_cage(self, node, cage, location, prev_probe_result):
        """Probe one cage and post the outcome to the up/down queue.

        A failed probe posts a "down" event. A successful probe posts an
        "up" event, preceded by a "down" event if the cage was marked as
        "restarted". Nothing is posted if the request has already expired.
        """
        if pmnc.request.expired:  # no need to report anything for a probing request
            return

        pmnc.log.debug("sending probe")

        try:
            probe_result = self._probe_cage(node, cage, location)
        except:
            # any failure of the probe call means the cage is down
            pmnc.log.warning("probe failed: {0:s}".format(exc_string()))
            self._up_down_queue.push((node, cage, "down"))
        else:
            pmnc.log.debug("probe returned successfully")
            if prev_probe_result == "restarted":  # if the cage has restarted
                self._up_down_queue.push((node, cage, "down"))  # we push "down" event first
            self._up_down_queue.push((node, cage, "up", location, probe_result))

    ###################################

    # this method is invoked by one of the private pool threads
    # to send the actual probe call to the cage being probed

    @typecheck
    def _probe_cage(self, node, cage, location) -> dict:
        """Call health_monitor_event.probe() on the cage at the exact location and return its result."""

        # health monitor has to create rpc resources manually, not using
        # pmnc(cage) syntax, because we need to access exact cage at exact
        # node and location (i.e. host and port) and to avoid discovery

        connect_timeout = pmnc.config_resource_rpc.get("discovery_timeout")

        rpc = pmnc.protocol_rpc.Resource(
            "{0:s}.{1:s}".format(node, cage),
            broadcast_address=("n/a", 0),
            discovery_timeout=connect_timeout,
            multiple_timeout_allowance=0.0,
            flock_id="unused",
            exact_locations={cage: location},  # this prevents discovery
            pool__resource_name=cage,
        )

        rpc.connect()
        try:
            rpc.begin_transaction(
                "",
                source_module_name=__name__,
                transaction_options={},
                resource_args=(),
                resource_kwargs={},
            )
            try:
                probe_result = rpc.health_monitor_event.probe()  # there, an RPC call
            except:
                rpc.rollback()
                raise
            else:
                rpc.commit()
        finally:
            rpc.disconnect()

        return probe_result  # if the cage returns anything but a dict, it is considered a failure

    ###################################

    # this method is called by the _probe_thread during its idle times
    # to fetch up/down events posted to the _up_down_queue by the probe
    # threads and in response to maintain structures such as _up_cages

    def _poll_up_down_queue(self, timeout: float) -> bool:  # returns "should keep running"
        """Process queued up/down events for up to timeout seconds.

        Returns False as soon as the current thread is found to be stopped,
        True once the timeout has elapsed.
        """
        poll_timeout = Timeout(timeout)
        while not poll_timeout.expired:

            # wait in slices of at most a second between the stop checks
            pop_timeout = Timeout(min(poll_timeout.remain, 1.0))
            while not pop_timeout.expired:

                event = pop_timeout.pop(self._up_down_queue)
                if event is not None:
                    try:

                        node, cage, up_down, *args = event
                        if up_down == "up":
                            location, probe_result = args

                            # add the cage to cages known to be up and schedule
                            # application notification call if it was down or
                            # returned a different probe result

                            up_nodes = self._up_cages.setdefault(cage, {})

                            # the node record is looked up without inserting a
                            # placeholder and is stored only after the notification
                            # has been scheduled, otherwise a failure to schedule
                            # would leave an empty record behind and break every
                            # following probe pass on its missing "location"

                            cage_info = up_nodes.get(node) or {}
                            if not cage_info or cage_info["probe_result"] != probe_result:
                                self._schedule_up_down_event(node, cage, "up", probe_result)
                            cage_info.update(location=location, probe_result=probe_result)
                            up_nodes[node] = cage_info

                        elif up_down == "down":

                            # remove the cage from cages known to be up and schedule
                            # application notification call it was up

                            if self._up_cages.setdefault(cage, {}).pop(node, None):
                                self._schedule_up_down_event(node, cage, "down")

                    except:
                        pmnc.log.error(exc_string())  # log and ignore

            if current_thread().stopped():
                return False

        return True

    ###################################

    # this method is called by the _probe_thread in response to change
    # of some cage's state detected in _poll_up_down_queue

    def _schedule_up_down_event(self, node, cage, up_down, probe_result=None):
        """Enqueue a wu_process_event call for the given state change as an interface request."""

        # application notification invokes methods from health_monitor_event module
        # and must be executed just like a regular request from some interface

        request = pmnc.interfaces.begin_request(
            timeout=self._request_timeout,
            interface="__health_monitor__",
            protocol="n/a",
            parameters=dict(auth_tokens=dict()),
            description="cage {0:s}.{1:s} is {2:s}".format(node, cage, up_down),
        )

        # note that this request is not waited upon

        pmnc.interfaces.enqueue(request, self.wu_process_event, (node, cage, up_down, probe_result))

    ###################################

    # this method is invoked by one of the interfaces pool threads to register
    # the event of some cage going up or down by calling an appropriate method
    # from the health_monitor_event module

    @typecheck
    def wu_process_event(self, node: str, cage: str, up_down: one_of("up", "down"), probe_result: optional(dict)):
        """Deliver one up/down notification and end the request with its outcome."""
        try:

            # see for how long the request was on the execution queue up to this moment
            # and whether it has expired in the meantime, if it did there is no reason
            # to proceed and we simply bail out

            if pmnc.request.expired:
                pmnc.log.error("request has expired and will not be processed")
                success = False
                return  # goes through finally section below

            with pmnc.performance.request_processing():
                self._process_event(node, cage, up_down, probe_result)

        except:
            pmnc.log.error(exc_string())  # log and ignore
            success = False
        else:
            success = True
        finally:  # the request ends itself
            pmnc.interfaces.end_request(success)  # possibly way after deadline

    ###################################

    def _process_event(self, node, cage, up_down, probe_result):
        """Dispatch the notification to cage_up or cage_down of the health_monitor_event module."""
        if up_down == "up":
            pmnc.health_monitor_event.cage_up(node, cage, probe_result)
        elif up_down == "down":
            pmnc.health_monitor_event.cage_down(node, cage)
class HealthMonitor:
    """Probes cages and reports their up/down transitions.

    A private probe thread repeatedly collects the cages known to the RPC
    interface, merges them with the cages previously found to be up and
    enqueues one probe call per node to a private thread pool. The probe
    calls post their outcome to an internal queue; the probe thread drains
    that queue, maintains _up_cages and schedules application notifications
    (cage_up / cage_down of the health_monitor_event module) as interface
    requests.
    """

    def __init__(self, **kwargs):
        """Bind to the rpc interface and set up the internal structures.

        Raises Exception if the rpc interface is not available. When the
        module is under self-test, kwargs must carry the "process_event"
        and "probe_cage" callables which replace the real implementations.
        """
        self._rpc_interface = pmnc.interfaces.get_interface("rpc")
        if self._rpc_interface is None:
            raise Exception("health monitor requires enabled rpc interface")

        self._probe_thread_pool = pmnc.shared_pools.get_private_thread_pool()
        self._up_cages = {}  # { cage: { node: { location: ..., probe_result: ... } } }
        self._up_down_queue = InterlockedQueue()

        self._request_timeout = pmnc.config_interfaces.get("request_timeout")  # this is now static

        if pmnc.request.self_test == __name__:  # self-test
            self._process_event = kwargs["process_event"]
            self._probe_cage = kwargs["probe_cage"]

    ###################################

    def start(self):
        """Create and start the probe thread."""
        self._probe_thread = HeavyThread(
            target=self._probe_thread_proc, name="health_monitor:probe"
        )  # always called "health_monitor"
        self._probe_thread.start()

    def stop(self):
        """Stop the probe thread."""
        self._probe_thread.stop()

    ###################################

    # this method is executed in a private thread and is scheduling probe calls
    # to cages known to the RPC interface or previously probed and found to be up

    def _probe_thread_proc(self):
        """Run probe passes until _poll_up_down_queue reports that the thread is stopped."""

        per_cage_interval = 0.0

        # calls to _poll_up_down_queue are interleaved and allow this thread
        # to maintain structures such as _up_cages in response to events
        # posted by the probe threads to the _up_down_queue

        while self._poll_up_down_queue(per_cage_interval):
            try:

                # extract all cages currently known to the rpc interface and
                # merge them with cages previously probed and found to be up,
                # except for the health_monitor cage itself should be skipped

                probe_cages = {
                    known_cage: {
                        known_node: dict(location=known_location, probe_result=None)
                        for known_node, known_location in self._rpc_interface.get_nodes(known_cage).items()
                    }
                    for known_cage in self._rpc_interface.get_cages()
                    if known_cage != "health_monitor"
                }

                self._merge_cages(probe_cages, self._up_cages)

                # spread one pass over the configured period, the extra one
                # accounts for the poll performed by the enclosing while
                probe_period = pmnc.config.get("probe_period")
                per_cage_interval = probe_period / (len(probe_cages) + 1)

                # walk through all cages to be probed and schedule calls to probe
                # to a private thread pool using fake unregistered requests

                for cage, nodes in probe_cages.items():
                    for node, cage_info in nodes.items():

                        cage_location = cage_info["location"]

                        # note that the requests created here are not registered with
                        # interfaces and enqueued to a different pool too, they are
                        # therefore entitled to termination without warning at shutdown,
                        # this is ok, because they do no useful work for the clients

                        request = Request(
                            timeout=self._request_timeout,
                            interface="__health_monitor__",
                            protocol="n/a",
                            parameters=dict(auth_tokens=dict()),
                            description="probing cage {0:s} at {1:s}".format(cage, cage_location),
                        )

                        self._probe_thread_pool.enqueue(
                            request,
                            self.wu_probe_cage,
                            (node, cage, cage_location, cage_info["probe_result"]),
                            {},
                        )

                    # then again yield to polling the queue for a while

                    if not self._poll_up_down_queue(per_cage_interval):
                        break

            except:
                pmnc.log.error(exc_string())  # log and ignore

    ###################################

    # this method merges cages known to the RPC interface with cages
    # previously probed and known to be up, such merging is necessary
    # because if a cage dies just before its next advertisement broadcast,
    # it would disappear from known, but will not be probed again and
    # hence thought to be up forever

    @staticmethod
    def _merge_cages(known_cages: dict, up_cages: dict):
        """Merge up_cages into known_cages in place.

        For a node present in both, the previous probe result is carried
        over when the location is unchanged, otherwise the node is marked
        with the probe result "restarted". A node present only in up_cages
        is added to known_cages sharing the same record object.
        """
        probe_cages = known_cages  # merging in place

        for up_cage, up_nodes in up_cages.items():
            for up_node, up_cage_info in up_nodes.items():
                probe_nodes = probe_cages.setdefault(up_cage, {})
                if up_node in probe_nodes:
                    cage_info = probe_nodes[up_node]
                    if cage_info["location"] == up_cage_info["location"]:
                        cage_info.update(probe_result=up_cage_info["probe_result"])
                    else:
                        cage_info.update(probe_result="restarted")  # note this case
                else:
                    probe_nodes[up_node] = up_cage_info

    ###################################

    # a call to this method is enqueued to a private thread pool
    # for each cage to probe on every pass of _probe_thread

    def wu_probe_cage(self, node, cage, location, prev_probe_result):
        """Probe one cage and post the outcome to the up/down queue.

        A failed probe posts a "down" event. A successful probe posts an
        "up" event, preceded by a "down" event if the cage was marked as
        "restarted". Nothing is posted if the request has already expired.
        """
        if pmnc.request.expired:  # no need to report anything for a probing request
            return

        if pmnc.log.debug:
            pmnc.log.debug("sending probe")

        try:
            probe_result = self._probe_cage(node, cage, location)
        except:
            # any failure of the probe call means the cage is down
            pmnc.log.warning("probe failed: {0:s}".format(exc_string()))
            self._up_down_queue.push((node, cage, "down"))
        else:
            if pmnc.log.debug:
                pmnc.log.debug("probe returned successfully")
            if prev_probe_result == "restarted":  # if the cage has restarted
                self._up_down_queue.push((node, cage, "down"))  # we push "down" event first
            self._up_down_queue.push((node, cage, "up", location, probe_result))

    ###################################

    # this method is invoked by one of the private pool threads
    # to send the actual probe call to the cage being probed

    @typecheck
    def _probe_cage(self, node, cage, location) -> dict:
        """Call health_monitor_event.probe() on the cage at the exact location and return its result."""

        # health monitor has to create rpc resources manually, not using
        # pmnc(cage) syntax, because we need to access exact cage at exact
        # node and location (i.e. host and port) and to avoid discovery

        connect_timeout = pmnc.config_resource_rpc.get("discovery_timeout")

        rpc = pmnc.protocol_rpc.Resource(
            "{0:s}.{1:s}".format(node, cage),
            broadcast_address=("n/a", 0),
            discovery_timeout=connect_timeout,
            multiple_timeout_allowance=0.0,
            flock_id="unused",
            exact_locations={cage: location},  # this prevents discovery
            pool__resource_name=cage,
        )

        rpc.connect()
        try:
            rpc.begin_transaction(
                "",
                source_module_name=__name__,
                transaction_options={},
                resource_args=(),
                resource_kwargs={},
            )
            try:
                probe_result = rpc.health_monitor_event.probe()  # there, an RPC call
            except:
                rpc.rollback()
                raise
            else:
                rpc.commit()
        finally:
            rpc.disconnect()

        return probe_result  # if the cage returns anything but a dict, it is considered a failure

    ###################################

    # this method is called by the _probe_thread during its idle times
    # to fetch up/down events posted to the _up_down_queue by the probe
    # threads and in response to maintain structures such as _up_cages

    def _poll_up_down_queue(self, timeout: float) -> bool:  # returns "should keep running"
        """Process queued up/down events for up to timeout seconds.

        Returns False as soon as the current thread is found to be stopped,
        True once the timeout has elapsed.
        """
        poll_timeout = Timeout(timeout)
        while not poll_timeout.expired:

            # wait in slices of at most a second between the stop checks
            pop_timeout = Timeout(min(poll_timeout.remain, 1.0))
            while not pop_timeout.expired:

                event = pop_timeout.pop(self._up_down_queue)
                if event is not None:
                    try:

                        node, cage, up_down, *args = event
                        if up_down == "up":
                            location, probe_result = args

                            # add the cage to cages known to be up and schedule
                            # application notification call if it was down or
                            # returned a different probe result

                            up_nodes = self._up_cages.setdefault(cage, {})

                            # the node record is looked up without inserting a
                            # placeholder and is stored only after the notification
                            # has been scheduled, otherwise a failure to schedule
                            # would leave an empty record behind and break every
                            # following probe pass on its missing "location"

                            cage_info = up_nodes.get(node) or {}
                            if not cage_info or cage_info["probe_result"] != probe_result:
                                self._schedule_up_down_event(node, cage, "up", probe_result)
                            cage_info.update(location=location, probe_result=probe_result)
                            up_nodes[node] = cage_info

                        elif up_down == "down":

                            # remove the cage from cages known to be up and schedule
                            # application notification call it was up

                            if self._up_cages.setdefault(cage, {}).pop(node, None):
                                self._schedule_up_down_event(node, cage, "down")

                    except:
                        pmnc.log.error(exc_string())  # log and ignore

            if current_thread().stopped():
                return False

        return True

    ###################################

    # this method is called by the _probe_thread in response to change
    # of some cage's state detected in _poll_up_down_queue

    def _schedule_up_down_event(self, node, cage, up_down, probe_result=None):
        """Enqueue a wu_process_event call for the given state change as an interface request."""

        # application notification invokes methods from health_monitor_event module
        # and must be executed just like a regular request from some interface

        request = pmnc.interfaces.begin_request(
            timeout=self._request_timeout,
            interface="__health_monitor__",
            protocol="n/a",
            parameters=dict(auth_tokens=dict()),
            description="cage {0:s}.{1:s} is {2:s}".format(node, cage, up_down),
        )

        # note that this request is not waited upon

        pmnc.interfaces.enqueue(request, self.wu_process_event, (node, cage, up_down, probe_result))

    ###################################

    # this method is invoked by one of the interfaces pool threads to register
    # the event of some cage going up or down by calling an appropriate method
    # from the health_monitor_event module

    @typecheck
    def wu_process_event(self, node: str, cage: str, up_down: one_of("up", "down"), probe_result: optional(dict)):
        """Deliver one up/down notification and end the request with its outcome."""
        try:

            # see for how long the request was on the execution queue up to this moment
            # and whether it has expired in the meantime, if it did there is no reason
            # to proceed and we simply bail out

            if pmnc.request.expired:
                pmnc.log.error("request has expired and will not be processed")
                success = False
                return  # goes through finally section below

            with pmnc.performance.request_processing():
                self._process_event(node, cage, up_down, probe_result)

        except:
            pmnc.log.error(exc_string())  # log and ignore
            success = False
        else:
            success = True
        finally:  # the request ends itself
            pmnc.interfaces.end_request(success)  # possibly way after deadline

    ###################################

    def _process_event(self, node, cage, up_down, probe_result):
        """Dispatch the notification to cage_up or cage_down of the health_monitor_event module."""
        if up_down == "up":
            pmnc.health_monitor_event.cage_up(node, cage, probe_result)
        elif up_down == "down":
            pmnc.health_monitor_event.cage_down(node, cage)
after = time() assert after - before > 0.1 assert r.expired r = Request(timeout=0.2, interface="test", protocol="test") before = time() assert not r.acquire_shared(sl) # shared acquiring times out after = time() assert after - before > 0.1 assert r.expired ################################### ilq = InterlockedQueue() ilq.push(1) r = Request(timeout=0.1, interface="test", protocol="test") before = time() assert r.pop(ilq) == 1 # popping succeeds after = time() assert after - before < 0.01 assert not r.expired r = Request(timeout=0.2, interface="test", protocol="test") before = time() assert r.pop(ilq) is None # popping times out after = time() assert after - before > 0.1 assert r.expired
class Interface:  # SMPP interface
    """SMPP interface.

    A maintainer thread keeps a connection to the SMSC alive, turns the
    request PDUs arriving on the inbound queue into interface requests
    and, if a ping interval is configured, periodically sends enquire
    link PDUs and checks that they are answered.
    """

    @typecheck
    def __init__(self, name: str, *,
                 server_address: (str, int),
                 connect_timeout: float,
                 response_timeout: float,
                 ping_interval: optional(float),
                 system_id: str,
                 password: str,
                 system_type: str,
                 esme_ton: byte,
                 esme_npi: byte,
                 esme_addr: str,
                 esme_type: one_of("rcvr", "xmit", "xcvr"),
                 request_timeout: optional(float) = None,
                 **kwargs):  # this kwargs allows for extra application-specific
                             # settings in config_interface_smpp_X.py
        """Remember the settings; the connection itself is created later by the maintainer thread."""

        self._name = name
        self._response_timeout = response_timeout

        # pinging is optional and is switched off by a false ping_interval
        if not ping_interval:
            self._ping_timeout = self._ping_response_timeout = None
        else:
            self._ping_timeout = Timeout(ping_interval)
            self._ping_response_timeout = Timeout(response_timeout)
        self._ping_request = None

        self._in_q = InterlockedQueue()
        self._out_q = InterlockedQueue()
        self._inflight = InflightRequests()
        self._ceased = Event()

        # the kind of the bind PDU depends on the declared ESME type
        if esme_type == "rcvr":
            bind_pdu = BindReceiverPDU
        elif esme_type == "xmit":
            bind_pdu = BindTransmitterPDU
        elif esme_type == "xcvr":
            bind_pdu = BindTransceiverPDU

        # connection factory, invoked by the maintainer on every (re)connect
        def create_connection():
            return _SMPPConnection(
                name, self._in_q, self._out_q, self._inflight,
                server_address=server_address,
                connect_timeout=connect_timeout,
                response_timeout=response_timeout,
                system_id=system_id,
                password=password,
                system_type=system_type,
                esme_ton=esme_ton,
                esme_npi=esme_npi,
                esme_addr=esme_addr,
                bind_pdu=bind_pdu,
            )

        self._create_connection = create_connection

        # an explicit per-interface timeout overrides the configured one
        if request_timeout:
            self._request_timeout = request_timeout
        else:
            self._request_timeout = pmnc.config_interfaces.get("request_timeout")  # this is now static

        if pmnc.request.self_test == __name__:  # self-test
            self._process_request = kwargs["process_request"]

    @property
    def name(self):
        return self._name

    @property
    def ceased(self):
        return self._ceased.is_set()

    ###################################

    def start(self):
        """Create and start the maintainer thread."""
        thread_name = "{0:s}/mnt".format(self.name)
        self._maintainer = HeavyThread(target=self._maintainer_proc, name=thread_name)
        self._maintainer.start()

    def cease(self):
        """Mark the interface as ceased and stop the maintainer thread."""
        self._ceased.set()
        self._maintainer.stop()

    def stop(self):
        """Nothing is left to do here, see cease()."""
        pass

    ###################################

    def _maintainer_proc(self):
        """Body of the maintainer thread: connect, serve the connection, repeat until stopped."""

        while not current_thread().stopped():
            try:

                # keep trying to connect until it works out or the thread is stopped

                while True:
                    try:
                        self._connection = self._create_connection()
                        self._connection.start()
                    except:
                        pmnc.log.error(exc_string())
                        retry_delay = max(self._request_timeout, 30.0)
                        if current_thread().stopped(retry_delay):
                            return
                    else:
                        break  # connected

                try:

                    while not current_thread().stopped() and not self._connection.failed:

                        # incoming PDUs first

                        pdu = self._in_q.pop(1.0)
                        if pdu is not None:
                            self._handle_pdu(pdu)

                        # the pending ping, if any, is checked for a response
                        # once its response timeout is over

                        if self._ping_request and self._ping_response_timeout.expired:
                            pending_ping = self._ping_request
                            self._ping_request = None
                            _wait_response(pending_ping, 0.001)

                        # the next ping is sent when its time comes

                        if self._ping_timeout and self._ping_timeout.expired:
                            try:
                                self._ping_request = EnquireLinkPDU.create()
                                self._out_q.push(self._ping_request)
                                self._ping_response_timeout.reset()
                            finally:
                                self._ping_timeout.reset()

                finally:
                    self._connection.stop()

            except:
                pmnc.log.error(exc_string())  # log and ignore

    ###################################

    # this method processes the request PDUs received by this interface from SMSC

    def _handle_pdu(self, req):
        """Answer pings right away, enqueue everything else as an interface request."""

        if isinstance(req, EnquireLinkPDU):  # respond to pings automatically
            self._out_q.push(req.create_response())
            return

        # note that this interface does not wait for its requests to complete

        request = pmnc.interfaces.begin_request(
            timeout=self._request_timeout,
            interface=self._name,
            protocol="smpp",
            parameters=dict(auth_tokens=dict()),
            description="incoming {0:s}".format(req),
        )

        pmnc.interfaces.enqueue(request, self.wu_handle_pdu, (req, ), {})

    ###################################

    @typecheck
    def wu_handle_pdu(self, req: RequestPDU):
        """Process one request PDU, push the response PDU out and end the request."""
        try:

            # the request may have expired while waiting on the execution
            # queue, in which case processing it makes no sense any more

            if pmnc.request.expired:
                pmnc.log.error("request has expired and will not be processed")
                success = False
                return  # goes through finally section below

            with pmnc.performance.request_processing():

                # the handler is expected to replace the default nack in response["pdu"]
                request = dict(pdu=req)
                response = dict(pdu=req.create_nack(error_codes.ESME_RUNKNOWNERR))
                try:
                    self._process_request(request, response)
                except:
                    response["pdu"] = req.create_nack(error_codes.ESME_RSYSERR)
                    raise
                finally:
                    # whatever has happened, some response is sent
                    self._out_q.push(response["pdu"])

        except:
            pmnc.log.error(exc_string())  # log and ignore
            success = False
        else:
            success = True
        finally:  # the request ends itself
            pmnc.interfaces.end_request(success)  # possibly way after deadline

    ###################################

    def _process_request(self, request, response):
        """Pass the request to process_request of the module named interface_<name>."""
        module_name = "interface_{0:s}".format(self._name)
        handler_module = pmnc.__getattr__(module_name)
        handler_module.process_request(request, response)

    ###################################

    # this method is called by the coupled resources to send a request PDU to SMSC

    @typecheck
    def send(self, req: RequestPDU, timeout: optional(float) = None) -> optional(ResponsePDU):
        """Queue the PDU for sending.

        With a timeout given, wait for the response no longer than the
        smaller of that timeout and the interface response timeout and
        return the outcome of the wait; without one return None at once.
        """
        self._out_q.push(req)
        if timeout is None:
            return None
        return _wait_response(req, min(timeout, self._response_timeout))