def start(): # thread pools are started before anything else thread_count = pmnc.config.get("thread_count") sweep_period = pmnc.config.get("sweep_period") _start_thread_pools(thread_count, sweep_period) # now the interfaces can be started as they have worker threads to delegate requests to interface_names = () if not pmnc.request.self_test: try: interface_names = pmnc.config.get("interfaces") except: pmnc.log.error(exc_string()) # log and ignore, empty tuple remains try: _start_interfaces(interface_names) except: pmnc.log.error(exc_string()) # log and ignore # finally, if we are the health monitor _start_health_monitor()
def _poller_proc(self, source_cage): while not current_thread().stopped(): try: # attach a new fake request to this thread so that # network access in RPC call times out properly fake_request(timeout = self._request_timeout, interface = self._name) pmnc.request.describe("polling cage {0:s}".format(source_cage)) # fetch a request from the remote cage # and extract the call information from it try: revrpc_request = self._poll(source_cage) except: pmnc.log.error(exc_string()) # failure to fetch a request current_thread().stopped(self._request_timeout) # results in idle delay continue if revrpc_request is None: continue elif current_thread().stopped(): break assert revrpc_request.pop("target_cage") == __cage__, "expected call to this cage" assert revrpc_request.pop("source_cage") == source_cage, "expected call from {0:s}".format(source_cage) self._process_request(source_cage, revrpc_request) except: pmnc.log.error(exc_string())
def wu_process_request(self, source_cage: valid_cage_name, module: str, method: str, args: tuple, kwargs: dict, request_dict: dict): try: # see for how long the request was on the execution queue up to this moment # and whether it has expired in the meantime, if it did there is no reason # to proceed and we simply bail out if pmnc.request.expired: pmnc.log.error("request has expired and will not be processed") success = False return # goes through finally section below # create a copy of and impersonate the original request, note that # the timeout is adjusted if current request has less time left request = pmnc.request.from_dict(request_dict, timeout=pmnc.request.remain) try: if pmnc.log.debug: pmnc.log.debug("impersonating request {0:s}".format( request.description)) original_request = current_thread()._request current_thread()._request = request try: with pmnc.performance.request_processing(): result = self._process_revrpc_request( module, method, args, kwargs) finally: current_thread()._request = original_request except: error = exc_string() pmnc.log.error( "incoming reverse RPC call failed: {0:s}".format(error)) revrpc_response = dict(exception=error) deliver_message = "reverse RPC error" else: revrpc_response = dict(result=result) deliver_message = "reverse RPC response" # submit the response back to the source cage using the request's unique id, # note that the delivery is done on behalf and within the local request if pmnc.log.debug: pmnc.log.debug("delivering {0:s}".format(deliver_message)) self._post(source_cage, request_dict["unique_id"], revrpc_response) except: pmnc.log.error(exc_string()) # log and ignore success = False else: if pmnc.log.debug: pmnc.log.debug( "{0:s} has been delivered".format(deliver_message)) success = True finally: # the request ends itself pmnc.interfaces.end_request(success) # possibly way after deadline
def close(self): with self._db_cache_lock: for db in self._db_cache.values(): try: db.close() except: pmnc.log.error(exc_string()) # log and ignore self._db_cache.clear() try: self._env.close() except: pmnc.log.error(exc_string()) # log and ignore
def wu_process_request(self, source_cage: valid_cage_name, module: str, method: str, args: tuple, kwargs: dict, request_dict: dict): try: # see for how long the request was on the execution queue up to this moment # and whether it has expired in the meantime, if it did there is no reason # to proceed and we simply bail out if pmnc.request.expired: pmnc.log.error("request has expired and will not be processed") success = False return # goes through finally section below # create a copy of and impersonate the original request, note that # the timeout is adjusted if current request has less time left request = pmnc.request.from_dict(request_dict, timeout = pmnc.request.remain) try: pmnc.log.debug("impersonating request {0:s}".format(request.description)) original_request = current_thread()._request current_thread()._request = request try: with pmnc.performance.request_processing(): result = self._process_revrpc_request(module, method, args, kwargs) finally: current_thread()._request = original_request except: error = exc_string() pmnc.log.error("incoming reverse RPC call failed: {0:s}".format(error)) revrpc_response = dict(exception = error) deliver_message = "reverse RPC error" else: revrpc_response = dict(result = result) deliver_message = "reverse RPC response" # submit the response back to the source cage using the request's unique id, # note that the delivery is done on behalf and within the local request pmnc.log.debug("delivering {0:s}".format(deliver_message)) self._post(source_cage, request_dict["unique_id"], revrpc_response) except: pmnc.log.error(exc_string()) # log and ignore success = False else: pmnc.log.debug("{0:s} has been delivered".format(deliver_message)) success = True finally: # the request ends itself pmnc.interfaces.end_request(success) # possibly way after deadline
def _maintainer_proc(self): while not current_thread().stopped(): try: # try to establish a connection, do it infinitely or until the interface is stopped while True: try: self._connection = self._create_connection() self._connection.start() except: pmnc.log.error(exc_string()) failure_timeout = max(self._request_timeout, 30.0) if current_thread().stopped(failure_timeout): return else: break # while True try: while not current_thread().stopped() and not self._connection.failed: # process incoming PDUs req = self._in_q.pop(1.0) if req is not None: self._handle_pdu(req) # if there is an outstanding ping request, check for response if self._ping_request and self._ping_response_timeout.expired: ping_request, self._ping_request = self._ping_request, None _wait_response(ping_request, 0.001) # if it's time to send another ping request, do so if self._ping_timeout and self._ping_timeout.expired: try: self._ping_request = EnquireLinkPDU.create() self._out_q.push(self._ping_request) self._ping_response_timeout.reset() finally: self._ping_timeout.reset() finally: self._connection.stop() except: pmnc.log.error(exc_string()) # log and ignore
def _stop_interfaces(): # first, we make sure the interfaces cease to accept new # incoming requests but may keep processing already accepted for interface_name, interface in _interfaces.items(): pmnc.log.message("ceasing interface {0:s}".format(interface_name)) try: interface.cease() except: pmnc.log.message("interface {0:s} failed to cease: {1:s}".\ format(interface_name, exc_string())) # log and ignore else: pmnc.log.message("interface {0:s} has ceased".format(interface_name)) # then we stop the request factory to stress the fact # that no more requests can be created any more _request_factory.stop() request_count = _request_factory.count pmnc.log.message("no more requests can be created{0:s}".\ format(request_count > 0 and ", {0:d} request(s) are still active".\ format(request_count) or "")) # then we wait for all existing requests to complete for # at most one full timeout, after which they are dead anyway request_timeout = pmnc.config.get("request_timeout") if not _request_factory.wait(request_timeout): request_count = _request_factory.count if request_count > 0: pmnc.log.warning("proceeding with shutdown although {0:d} request(s) " "are still active".format(request_count)) # finally the interfaces are stopped completely # and no further processing is performed for interface_name, interface in _interfaces.items(): pmnc.log.message("stopping interface {0:s}".format(interface_name)) try: interface.stop() except: pmnc.log.message("interface {0:s} failed to stop: {1:s}".\ format(interface_name, exc_string())) # log and ignore else: pmnc.log.message("interface {0:s} has stopped".format(interface_name))
def _process_message(self, message_id, pkt, request_timeout): message_text = pkt.pop("XPmncMessageText") message_description = "JMS message {0:s}{1:s}".format(message_id, ", corr.id {0:s}".format(pkt["JMSCorrelationID"]) if pkt.get("JMSCorrelationID") else "") request = pmnc.interfaces.begin_request( timeout = min(request_timeout, self._request_timeout), interface = self._name, protocol = "jms", parameters = dict(auth_tokens = dict()), description = message_description) try: pmnc.interfaces.enqueue(request, self.wu_process_request, (message_id, message_text, pkt.copy())).wait() except WorkUnitTimedOut: pmnc.log.error("{0:s} processing timed out".format(message_description)) success = None except: pmnc.log.error("{0:s} processing failed: {1:s}".\ format(message_description, exc_string())) success = False else: if pmnc.log.debug: pmnc.log.debug("{0:s} processing succeeded".format(message_description)) self._processed_messages.add(message_id) success = True finally: pmnc.interfaces.end_request(success, request) return success == True
def send(self, packet_b: bytes): packet_length = len(packet_b) pmnc.log.info("sending UDP packet to {0[0]:s}:{0[1]:d}, {1:d} byte(s)".\ format(self._server_address, packet_length)) try: if not select([], [self._socket], [], pmnc.request.remain)[1]: raise Exception("request deadline writing data to {0:s}".format(self._server_info)) if self._broadcast: sent = self._socket.sendto(packet_b, self._server_address) else: sent = self._socket.send(packet_b) if sent != packet_length: raise Exception("packet truncated, sent {0:d} byte(s)") except: pmnc.log.warning("sending UDP packet to {0[0]:s}:{0[1]:d}, {1:d} byte(s) failed: {2:s}".\ format(self._server_address, packet_length, exc_string())) raise else: pmnc.log.info("UDP packet to {0[0]:s}:{0[1]:d}, {1:d} byte(s) has been sent".\ format(self._server_address, packet_length))
def send(self, message_text: str, **kwargs) -> str: try: header_fields = " and {0:d} custom header field(s): {1:s}".\ format(len(kwargs), ", ".join(kwargs.keys())) \ if kwargs else "" correlation_id = kwargs.get("JMSCorrelationID") message_description = \ "JMS message{0:s} with {1:d} content byte(s){2:s}".\ format(" {0:s}".format(correlation_id) if correlation_id else "", len(message_text), header_fields) pmnc.log.info("sending {0:s}".format(message_description)) try: pkt = self._sync_adapter_command("SEND", XPmncMessageText = message_text, **kwargs) except: pmnc.log.warning("sending {0:s} failed: {1:s}".\ format(message_description, exc_string())) raise else: self._message_id = pkt["XPmncMessageID"] except: ResourceError.rethrow(recoverable = True) # no irreversible changes else: return self._message_id
def _remove_file(self, file_name): try: _retry_remove_file(file_name, 2.0) except: pmnc.log.warning(exc_string()) else: self._processed_files.remove(file_name)
def wu_handle_pdu(self, req: RequestPDU): try: # see for how long the request was on the execution queue up to this moment # and whether it has expired in the meantime, if it did there is no reason # to proceed and we simply bail out if pmnc.request.expired: pmnc.log.error("request has expired and will not be processed") success = False return # goes through finally section below with pmnc.performance.request_processing(): request = dict(pdu = req) response = dict(pdu = req.create_nack(error_codes.ESME_RUNKNOWNERR)) try: self._process_request(request, response) except: response["pdu"] = req.create_nack(error_codes.ESME_RSYSERR) raise finally: self._out_q.push(response["pdu"]) except: pmnc.log.error(exc_string()) # log and ignore success = False else: success = True finally: # the request ends itself pmnc.interfaces.end_request(success) # possibly way after deadline
def _reader_proc(self): self._read_data = b"" while True: try: try: pdu = PDU.read(self) except SMPPPDUReadError as e: # mask out the exception from if isinstance(e.__cause__, _SMPPConnection.ThreadStopped): # the thread being stopped break # while else: raise if isinstance(pdu, RequestPDU): if pmnc.log.debug: pmnc.log.debug("<< [REQ] {0:s}".format(str(pdu))) self._in_q.push(pdu) elif isinstance(pdu, ResponsePDU): req = self._inflight.remove(pdu) if req is not None: if pmnc.log.debug: pmnc.log.debug("<< [RSP] {0:s}".format(str(pdu))) req.set_response(pdu) else: pmnc.log.warning("<< [RSP?!] {0:s}".format(str(pdu))) else: pmnc.log.warning("<< [???] {0:s}".format(str(pdu))) except: pmnc.log.error(exc_string()) self._failed.set() break
def disconnect(self): try: self._disconnect(self) except: pmnc.log.error(exc_string()) # log and ignore finally: TransactionalResource.disconnect(self)
def _start_interfaces(interfaces: tuple_of(str)): # prepare the global request factory shared across all interfaces # see begin_request method below global _request_factory _request_factory = InterlockedFactory(Request) pmnc.log.message("as of this moment requests can be created") # the configured interfaces are started one by one and kept on file, # technically they can be restarted (presumably with modified configuration) # at runtime by stopping and recreating, but I don't see it big an issue # to restart a cage once its interface settings have changed for interface_name in interfaces: pmnc.log.message("starting interface {0:s}".format(interface_name)) try: config_module_name = "config_interface_{0:s}".format(interface_name) config = pmnc.__getattr__(config_module_name).copy() interface = pmnc.interface.create(interface_name, **config) interface.start() except: pmnc.log.message("interface {0:s} failed to start: {1:s}".\ format(interface_name, exc_string())) # log and ignore else: pmnc.log.message("interface {0:s} has started".format(interface_name)) _interfaces[interface_name] = interface
def run(*, required_dirs: optional(tuple_of(str)) = ()): if getenv("PYTHONHASHSEED") != "0": raise SystemExit( "Running self-tests requires turning off hash randomization, " "set environment variable PYTHONHASHSEED to 0.") current_thread().name = "self_test" _log("***** STARTING SELF-TEST FOR MODULE {0:s} *****".format( main_module_name.upper())) test_cages_dir = _create_temp_cage_copy(required_dirs=required_dirs) try: pmnc = _start_pmnc(test_cages_dir) try: try: current_thread()._pmnc = pmnc # to be used in active_interface assert pmnc.request.self_test == main_module_name pmnc.__getattr__(main_module_name).self_test() except: _log("***** FAILURE: {1:s}".format(main_module_name, exc_string())) else: _log( "***** SUCCESS, BUT EXAMINE THE LOG FOR UNEXPECTED ERRORS *****" ) finally: _stop_pmnc(pmnc) finally: _remove_temp_cage_copy(test_cages_dir)
def _writer_proc(self): try: # perform synchronous bind, using cumulative timeout if not self._write_pdu(self._bind_pdu, False, self._bind_timeout): return _wait_response(self._bind_pdu, self._bind_timeout.remain) self._bound.set() while not current_thread().stopped(): # lifetime loop pdu = self._out_q.pop(1.0) if pdu and not self._write_pdu(pdu, False, Timeout(self._response_timeout)): break # perform synchronous unbind unbind_pdu = UnbindPDU.create() unbind_timeout = Timeout(self._response_timeout) self._write_pdu(unbind_pdu, True, unbind_timeout) _wait_response(unbind_pdu, unbind_timeout.remain) except: pmnc.log.error(exc_string()) self._failed.set()
def read(self, target_filename: str, *, remove: optional(bool) = None, frag_size: optional(int) = None) -> bytes: try: self._remove = remove or False self._target_filename, _ = self._normalize_filename(target_filename) file_size = os_path.getsize(self._target_filename) frag_size = frag_size or 65536 pmnc.log.info("reading {0:d} byte(s) from file {1:s}".\ format(file_size, self._target_filename)) try: data = BytesIO() with open(self._target_filename, "rb") as f: while not pmnc.request.expired: portion = f.read(frag_size) if not portion: break data.write(portion) else: raise Exception("request deadline reading data from file {0:s}".\ format(self._target_filename)) except: pmnc.log.info("reading {0:d} byte(s) from file {1:s} failed: {2:s}".\ format(file_size, self._target_filename, exc_string())) raise else: return data.getvalue() except: ResourceError.rethrow(recoverable = True) # no irreversible changes
def _process_file(self, file_name): request = pmnc.interfaces.begin_request( timeout = self._request_timeout, interface = self._name, protocol = "file", parameters = dict(auth_tokens = dict()), description = "file {0:s}".format(file_name)) # enqueue the request and wait for its completion try: pmnc.interfaces.enqueue(request, self.wu_process_request, (file_name, )).wait() except WorkUnitTimedOut: pmnc.log.error("file processing timed out") success = None except: pmnc.log.error("file processing failed: {0:s}".format(exc_string())) success = False else: if pmnc.log.debug: pmnc.log.debug("file processing succeeded") self._processed_files.add(file_name) success = True finally: pmnc.interfaces.end_request(success, request) return success == True
def disconnect(self): try: self._connection.close() except: pmnc.log.error(exc_string()) # log and ignore finally: SQLResource.disconnect(self)
def _state_thread_proc(): _check_free_disk_space() checkpoint_interval = pmnc.config.get("checkpoint_interval") while not current_thread().stopped(checkpoint_interval): try: _checkpoint_state() _check_free_disk_space() except: pmnc.log.error(exc_string()) # log and ignore try: _shutdown_state() except: pmnc.log.error(exc_string()) # log and ignore
def _checkpoint_state(): with _state_cache_lock: for module_state in _state_cache.values(): try: module_state.checkpoint() except: pmnc.log.error(exc_string()) # log and ignore
def _stderr_reader_proc(self, stderr): try: try: while stderr.read(512): # this light thread exits only upon exception pass finally: stderr.close() except: pmnc.log.error(exc_string()) # log and ignore
def _shutdown_state(): with _state_cache_lock: for module_state in _state_cache.values(): try: module_state.close() except: pmnc.log.error(exc_string()) # log and ignore _state_cache.clear()
def reload(): # start new/stop missing interfaces try: interface_names = pmnc.config.get("interfaces") except: pmnc.log.error(exc_string()) # log and ignore else: try: _update_interfaces(interface_names) except: pmnc.log.error(exc_string()) # log and ignore # restart reconfigured interfaces for interface in _interfaces.values(): interface.reload()
def _listener_proc(self): while not current_thread().stopped(): try: s = self._create_server_socket() try: pmnc.log.message("started listening for packets at {0[0]:s}:{0[1]:d}".\ format(self._listener_address)) while not current_thread().stopped(): if not select([s], [], [], 1.0)[0]: continue try: packet_b, (client_addr, client_port) = s.recvfrom(57344) except socket_timeout: continue except socket_error as e: if e.args[0] == 10054: # workaround for this issue: http://support.microsoft.com/kb/263823 continue else: raise if pmnc.log.debug: pmnc.log.debug("incoming UDP packet from {0:s}:{1:d}, {2:d} byte(s)".\ format(client_addr, client_port, len(packet_b))) # the listener thread initiates the received packet processing try: self._enqueue_packet(packet_b, client_addr, client_port) except: pmnc.log.error(exc_string()) # log and ignore finally: s.close() pmnc.log.message("stopped listening") except: pmnc.log.error(exc_string()) # log and ignore
def disconnect(self): try: if self._process and self._process.poll() is None: pmnc.log.warning("killing runaway process {0:s} (pid {1:d})".\ format(self._executable, self._process.pid)) self._process.kill() except: pmnc.log.error(exc_string()) # log and ignore finally: TransactionalResource.disconnect(self)
def disconnect(self): try: try: self._sync_adapter_command("EXIT") finally: self._stop_adapter(Timeout(min(5.0, pmnc.request.remain))) except: pmnc.log.error(exc_string()) # log and ignore finally: TransactionalResource.disconnect(self)
def disconnect(self): try: try: self._connection.Close() except com_error: self._rethrow_adodb_error() except: pmnc.log.error(exc_string()) # log and ignore finally: SQLResource.disconnect(self)
def _scheduler_proc(self): while not current_thread().stopped(1.1 - modf(time())[0]): try: current_tick = Interface._get_current_tick() try: for tick in range(self._last_tick + 1, current_tick + 1): invocation_time = datetime.fromtimestamp(tick) if self._match(invocation_time.strftime(self._format)): try: self._fire_request(invocation_time) except: pmnc.log.error(exc_string()) # log and ignore finally: self._last_tick = tick except: pmnc.log.error(exc_string()) # log and ignore
def _probe_thread_proc(self): per_cage_interval = 0.0 # calls to _poll_up_down_queue are interleaved and allow this thread # to maintain structures such as _up_cages in response to events # posted by the probe threads to the _up_down_queue while self._poll_up_down_queue(per_cage_interval): try: # extract all cages currently known to the rpc interface and # merge them with cages previously probed and found to be up, # except for the health_monitor cage itself should be skipped probe_cages = \ { known_cage: { known_node: dict(location = known_location, probe_result = None) for known_node, known_location in self._rpc_interface.get_nodes(known_cage).items() } for known_cage in self._rpc_interface.get_cages() if known_cage != "health_monitor" } self._merge_cages(probe_cages, self._up_cages) probe_period = pmnc.config.get("probe_period") per_cage_interval = probe_period / (len(probe_cages) + 1) # walk through all cages to be probed and schedule calls to probe # to a private thread pool using fake unregistered requests for cage, nodes in probe_cages.items(): for node, cage_info in nodes.items(): cage_location = cage_info["location"] # note that the requests created here are not registered with # interfaces and enqueued to a different pool too, they are # therefore entitled to termination without warning at shutdown, # this is ok, because they do no useful work for the clients request = Request(timeout = self._request_timeout, interface = "__health_monitor__", protocol = "n/a", parameters = dict(auth_tokens = dict()), description = "probing cage {0:s} at {1:s}".format(cage, cage_location)) self._probe_thread_pool.enqueue(request, self.wu_probe_cage, (node, cage, cage_location, cage_info["probe_result"]), {}) # then again yield to polling the queue for a while if not self._poll_up_down_queue(per_cage_interval): break except: pmnc.log.error(exc_string()) # log and ignore
def _config(self): module_properties = {} config_module_name = "config_interface_{0:s}".format(self._name) try: config = pmnc.__getattr__(config_module_name).\ copy(__module_properties = module_properties) except: pmnc.log.error(exc_string()) return None, None else: return config, module_properties["version"]
def wu_restart_state(): try: pmnc.log.warning( "the state is being restarted due to database recovery") pmnc.state.stop() pmnc.state.start() pmnc.log.warning("the state has been restarted") except: pmnc.log.error(exc_string()) # log and ignore finally: with _restart_pending_lock: _restart_pending.clear()
def _sync_request(self, rq: MongoDB_Request, check: optional(callable) = None) -> MongoDB_Response: pmnc.log.info(">> {0:s}".format(rq)) try: rs = self._connection.sync_request(rq, Timeout(pmnc.request.remain)) if rs.cursor_not_found: raise MongoDB_Error("cursor not found") if check: check(rs) except MongoDB_Error as e: pmnc.log.warning("<< {0:s} !! {1:s}".format(rq, exc_string())) ResourceError.rethrow(code=e.code, description=str(e), terminal=e.code is not None) except Exception as e: pmnc.log.warning("<< {0:s} !! {1:s}".format(rq, exc_string())) ResourceError.rethrow(description=str(e)) else: pmnc.log.info("<< OK, {0:s}".format(rs)) return rs
def _stop_health_monitor(): # stop health monitor if it has been started if _health_monitor: pmnc.log.message("stopping health monitor") try: _health_monitor.stop() except: pmnc.log.error("health monitor failed to stop: {0:s}".\ format(exc_string())) else: pmnc.log.message("health monitor has stopped")
def wu_probe_cage(self, node, cage, location, prev_probe_result): if pmnc.request.expired: # no need to report anything for a probing request return pmnc.log.debug("sending probe") try: probe_result = self._probe_cage(node, cage, location) except: pmnc.log.warning("probe failed: {0:s}".format(exc_string())) self._up_down_queue.push((node, cage, "down")) else: pmnc.log.debug("probe returned successfully") if prev_probe_result == "restarted": # if the cage has restarted self._up_down_queue.push((node, cage, "down")) # we push "down" event first self._up_down_queue.push((node, cage, "up", location, probe_result))
def _start_health_monitor(): # if this cage is called health_monitor, start health monitor if __cage__ == "health_monitor": pmnc.log.message("starting health monitor") try: health_monitor = \ pmnc.health_monitor.HealthMonitor() health_monitor.start() except: pmnc.log.error("health monitor failed to start: {0:s}".\ format(exc_string())) else: global _health_monitor _health_monitor = health_monitor pmnc.log.message("health monitor has started")
def _poll_up_down_queue(self, timeout: float) -> bool: # returns "should keep running" poll_timeout = Timeout(timeout) while not poll_timeout.expired: pop_timeout = Timeout(min(poll_timeout.remain, 1.0)) while not pop_timeout.expired: event = pop_timeout.pop(self._up_down_queue) if event is not None: try: node, cage, up_down, *args = event if up_down == "up": location, probe_result = args # add the cage to cages known to be up and schedule # application notification call if it was down or # returned a different probe result cage_info = self._up_cages.setdefault(cage, {}).setdefault(node, {}) if not cage_info or cage_info["probe_result"] != probe_result: self._schedule_up_down_event(node, cage, "up", probe_result) cage_info.update(location=location, probe_result=probe_result) elif up_down == "down": # remove the cage from cages known to be up and schedule # application notification call it was up if self._up_cages.setdefault(cage, {}).pop(node, None): self._schedule_up_down_event(node, cage, "down") except: pmnc.log.error(exc_string()) # log and ignore if current_thread().stopped(): return False return True
def run(*, required_dirs: optional(tuple_of(str)) = ()): current_thread().name = "self_test" _log("***** STARTING SELF-TEST FOR MODULE {0:s} *****".format(main_module_name.upper())) test_cages_dir = _create_temp_cage_copy(required_dirs = required_dirs) try: pmnc = _start_pmnc(test_cages_dir) try: try: current_thread()._pmnc = pmnc # to be used in active_interface assert pmnc.request.self_test == main_module_name pmnc.__getattr__(main_module_name).self_test() except: _log("***** FAILURE: {1:s}".format(main_module_name, exc_string())) else: _log("***** SUCCESS, BUT EXAMINE THE LOG FOR UNEXPECTED ERRORS *****") finally: _stop_pmnc(pmnc) finally: _remove_temp_cage_copy(test_cages_dir)
def wu_process_event(self, node: str, cage: str, up_down: one_of("up", "down"), probe_result: optional(dict)): try: # see for how long the request was on the execution queue up to this moment # and whether it has expired in the meantime, if it did there is no reason # to proceed and we simply bail out if pmnc.request.expired: pmnc.log.error("request has expired and will not be processed") success = False return # goes through finally section below with pmnc.performance.request_processing(): self._process_event(node, cage, up_down, probe_result) except: pmnc.log.error(exc_string()) # log and ignore success = False else: success = True finally: # the request ends itself pmnc.interfaces.end_request(success) # possibly way after deadline
def wu_participate(self, transaction_start, participant_index, resource_name, attrs, args, kwargs, res_args, res_kwargs): # see whether the request by which this transaction was created # has expired in the meantime, and if it has, simply bail out # because the transaction should have long been perished # no attempt to execute the request is taken and no result # is delivered, simply because the transaction is assumed # to already be aborted, nowhere to report the result if pmnc.request.expired: pmnc.log.error("execution of resource {0:s} in transaction " "{1:s} was late".format(resource_name, self)) return try: pmnc.log.debug("resource {0:s} joins transaction {1:s}".\ format(resource_name, self)) resource_instance = None # no instance has been allocated yet resource_in_transaction = False # no transaction has been started on the instance resource_failed = True # (preventive) request execution has been a failure while True: # breaks when the result is obtained, either value or exception # any failure prior to actual resource allocation results # in a recoverable ResourceError, pointlessly terminal try: # the pending interval is measured from the beginning of the # transaction, not from the beginning of the request pending_ms = int((time() - transaction_start) * 1000) pmnc.performance.sample("resource.{0:s}.pending_time".\ format(resource_name), pending_ms) # allocate a resource instance from a specific resource pool resource_pool = pmnc.shared_pools.get_resource_pool(resource_name) resource_instance = resource_pool.allocate() except: # tested result = ResourceError.snap_exception( participant_index = participant_index, recoverable = True, terminal = True) # but not really terminal, break # while True # no instance to terminate # some resource instance has been allocated, beginning a transaction, # a failure would result in a ResourceError, recoverable yet terminal # unless explicitly specified otherwise try: # see if a transaction should be started in as much time as the request has left if pmnc.request.remain < resource_instance.min_time: raise ResourceError(description = "transaction {0:s} is declined by resource instance " # tested "{1:s}".format(self, resource_instance.name), recoverable = True, terminal = False) # the instance stays alive pmnc.log.debug("resource instance {0:s} is used in transaction {1:s}, {2:s}".\ format(resource_instance.name, self, self._resource_ttl(resource_instance))) # begin a new transaction, this is presumably reversible operation resource_instance.begin_transaction(self._xid, source_module_name = self._source_module_name, transaction_options = self._options, resource_args = res_args, resource_kwargs = res_kwargs) except ResourceError as e: result = self._apply_error(participant_index, resource_instance, e) break # while True except: # tested result = ResourceError.snap_exception( participant_index = participant_index, recoverable = True, terminal = True) resource_instance.expire() break # while True else: resource_in_transaction = True # resource instance is now in transaction, executing the request, # a failure would result in a ResourceError, unrecoverable and # terminal unless explicitly specified otherwise try: # replay attribute accesses to obtain the actual target method target_method = resource_instance for attr in attrs: target_method = getattr(target_method, attr) # execute the request, registering the execution time with pmnc.performance.timing("resource.{0:s}.processing_time".format(resource_name)): result = target_method(*args, **kwargs) except ResourceError as e: result = self._apply_error(participant_index, resource_instance, e) break # while True except Exception: # tested result = ResourceError.snap_exception( participant_index = participant_index, recoverable = False, terminal = True) resource_instance.expire() break # while True else: resource_instance.reset_idle_timeout() resource_failed = False break # while True # we got an intermediate result, possibly an exception try: self._results.push((participant_index, result)) # deliver the result to the pending transaction # register the actual result of this participant pmnc.performance.event("resource.{0:s}.transaction_rate.{1:s}".\ format(resource_name, resource_failed and "failure" or "success")) if not resource_in_transaction: # as we couldn't begin a transaction, return "failure" # we are not interested in the decision pmnc.log.debug("resource instance {0:s} is waiting for decision in " "transaction {1:s}".format(resource_instance.name, self)) # figure out whether the resource has to commit or rollback commit_transaction = False if pmnc.request.wait(self._decision): # wait for transaction's decision if self._commit.is_set(): if not resource_failed: commit_transaction = True pmnc.log.debug("resource instance {0:s} decided to commit in transaction " "{1:s}".format(resource_instance.name, self)) else: pmnc.log.warning("resource instance {0:s} had to rollback despite decision to commit " "in transaction {1:s}".format(resource_instance.name, self)) else: pmnc.log.debug("resource instance {0:s} decided to rollback in transaction " "{1:s}".format(resource_instance.name, self)) else: pmnc.log.warning("resource instance {0:s} had to abandon waiting for decision and " "rollback in transaction {1:s}".format(resource_instance.name, self)) # complete the transaction and return the final outcome if commit_transaction: try: resource_instance.commit() except: pmnc.log.error("resource instance {0:s} failed to commit in transaction {1:s}: " "{2:s}".format(resource_instance.name, self, exc_string())) # this is a severe problem resource_instance.expire() return "failure" else: pmnc.log.debug("resource instance {0:s} committed in transaction " "{1:s}".format(resource_instance.name, self)) return "commit" else: try: resource_instance.rollback() except: pmnc.log.warning("resource instance {0:s} failed to rollback in transaction {1:s}: " "{2:s}".format(resource_instance.name, self, exc_string())) # this is not a big deal resource_instance.expire() return "failure" else: pmnc.log.debug("resource instance {0:s} rolled back in transaction " "{1:s}".format(resource_instance.name, self)) return "rollback" finally: if resource_instance: pmnc.log.debug("resource instance {0:s} is being released, {1:s}".\ format(resource_instance.name, self._resource_ttl(resource_instance))) resource_pool.release(resource_instance) except: pmnc.log.error(exc_string()) # this should not normally happen, but do raise # not allow such exception to be silenced
def _perf_thread_proc(): slice10s = TimeSlice(10) sampler60s = CumulativeSampler(60) sampler10s = CurrentSampler(10) while not current_thread().stopped(): try: # drain the queue and append all the data to the current 10s sampler perf_info = _perf_queue.pop(1.0) # this causes a delay of up to 1 sec while perf_info: what, key, value = perf_info if what == "event": sampler10s.tick(key) if key.endswith(".success") or key.endswith(".failure"): sampler10s.tick(key[:-8]) elif what == "sample": sampler10s.sample(key, value) if key.endswith(".success") or key.endswith(".failure"): sampler10s.sample(key[:-8], value) perf_info = _perf_queue.pop(0.0) # see if another 10s have passed next, slice = slice10s.next if next: # start of a 10s slice with _perf_lock: # update the shared 10s stats completed_slice = slice - 1 _perf_dump_10s[completed_slice % 6] = (completed_slice * 10, sampler10s.dump()) sampler60s.collect(sampler10s) # append 10s stats to the 60s sampler sampler10s = CurrentSampler(10) # create a new empty 10s sampler if slice % 6 == 0: # start of a new minute # update the shared 60s stats with the current 60s summary # and reset the shared 10s stats with _perf_lock: completed_minute = slice - 6 _perf_dump_60s.append((completed_minute * 10, sampler60s.dump())) if len(_perf_dump_60s) == 60: _perf_dump_60s.pop(0) _perf_dump_10s[:] = [None] * 6 sampler60s = CumulativeSampler(60) # create a new empty 60s sampler with _perf_lock: # update global statistics wss = get_working_set_size() if wss is not None: _perf_stats.update(wss = wss) cpu_ut_cpu_kt = get_cpu_times() if cpu_ut_cpu_kt is not None: cpu_ut, cpu_kt = cpu_ut_cpu_kt _perf_stats.update(cpu_ut_percent = (cpu_ut - _perf_stats.get("cpu_ut", 0.0)) * 10.0, cpu_kt_percent = (cpu_kt - _perf_stats.get("cpu_kt", 0.0)) * 10.0, cpu_ut = cpu_ut, cpu_kt = cpu_kt) except: pmnc.log.error(exc_string()) # log and ignore
def _probe_thread_proc(self): per_cage_interval = 0.0 # calls to _poll_up_down_queue are interleaved and allow this thread # to maintain structures such as _up_cages in response to events # posted by the probe threads to the _up_down_queue while self._poll_up_down_queue(per_cage_interval): try: # extract all cages currently known to the rpc interface and # merge them with cages previously probed and found to be up, # except for the health_monitor cage itself should be skipped probe_cages = { known_cage: { known_node: dict(location=known_location, probe_result=None) for known_node, known_location in self._rpc_interface.get_nodes(known_cage).items() } for known_cage in self._rpc_interface.get_cages() if known_cage != "health_monitor" } self._merge_cages(probe_cages, self._up_cages) probe_period = pmnc.config.get("probe_period") per_cage_interval = probe_period / (len(probe_cages) + 1) # walk through all cages to be probed and schedule calls to probe # to a private thread pool using fake unregistered requests for cage, nodes in probe_cages.items(): for node, cage_info in nodes.items(): cage_location = cage_info["location"] # note that the requests created here are not registered with # interfaces and enqueued to a different pool too, they are # therefore entitled to termination without warning at shutdown, # this is ok, because they do no useful work for the clients request = Request( timeout=self._request_timeout, interface="__health_monitor__", protocol="n/a", parameters=dict(auth_tokens=dict()), description="probing cage {0:s} at {1:s}".format(cage, cage_location), ) self._probe_thread_pool.enqueue( request, self.wu_probe_cage, (node, cage, cage_location, cage_info["probe_result"]), {} ) # then again yield to polling the queue for a while if not self._poll_up_down_queue(per_cage_interval): break except: pmnc.log.error(exc_string()) # log and ignore
def execute_reverse(target_cage: valid_cage_name, module: valid_module_name, method: valid_method_name, args: tuple, kwargs: dict): # wrap up an RPC call identical to how it's done in protocol_rpc.py request_dict = pmnc.request.to_dict() # remove request parameters that must not cross the RPC border request_dict["parameters"].pop("retry", None) # wrap all the call parameters in a plain dict request = dict(source_cage = __cage__, target_cage = target_cage, module = module, method = method, args = args, kwargs = kwargs, request = request_dict) request_description = "reverse RPC request {0:s}.{1:s} to {2:s}". \ format(module, method, target_cage) # create a one-time response queue just for this request rs_queue = InterlockedQueue() request_id = pmnc.request.unique_id with _rs_queues_lock: _rs_queues[request_id] = rs_queue # register the call as being active try: pmnc.log.info("sending {0:s}".format(request_description)) try: # enqueue the call and wait for response rq_queue = _get_rq_queue(target_cage) rq_queue.push((request_id, request)) response = pmnc.request.pop(rs_queue) if response is None: raise Exception("request deadline waiting for response") try: result = response["result"] except KeyError: raise RPCError(description = response["exception"], terminal = False) except RPCError as e: pmnc.log.warning("{0:s} returned error: {1:s}".\ format(request_description, e.description)) raise except: pmnc.log.warning("{0:s} failed: {1:s}".\ format(request_description, exc_string())) ResourceError.rethrow(recoverable = False) else: pmnc.log.info("reverse RPC request returned successfully") return result finally: with _rs_queues_lock: del _rs_queues[request_id] # unregister the call