def run(self, halt_event):
    """Poll for I/O ready objects.

    This is a task for the time queue. When halt_event is set (either
    on entry or via an interrupted system call during polling), all
    active sockets are unregistered from the poller and dropped, and
    the task returns.
    """
    if halt_event.is_set():
        self._log.info("halt_event set: exiting")
        for active_socket in self._active_sockets.keys():
            self._poller.unregister(active_socket)
        self._active_sockets.clear()
        return

    next_tasks = list()
    next_interval = time.time() + self._polling_interval

    try:
        result_list = self._poller.poll(timeout=self._poll_timeout)
    # bug fix: original used the Python 2 form
    # 'except zmq.ZMQError, zmq_error:', which is a SyntaxError under
    # Python 3; every other handler in this file uses the 'as' form.
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            self._log.info("Interrupted with halt_event set: exiting")
            for active_socket in self._active_sockets.keys():
                self._poller.unregister(active_socket)
            self._active_sockets.clear()
            return
        raise
    # NOTE(review): next_tasks and result_list appear unused here --
    # the remainder of this task may be truncated in this view
def main():
    """main entry point

    Run the ping loop until the halt event is set, reporting each
    result over a PUSH socket. Returns 0 on normal termination, 1 if
    an unhandled error forced shutdown.
    """
    args = _parse_command_line()

    halt_event = Event()
    set_signal_handler(halt_event)

    context = zmq.Context()

    push_socket = context.socket(zmq.PUSH)
    push_socket.setsockopt(zmq.LINGER, 5000)
    push_socket.setsockopt(zmq.SNDHWM, args.hwm)
    push_socket.connect(args.reporting_url)

    ping_message = {
        "message-type" : "ping",
        "message-id" : uuid.uuid1().hex,
    }
    result_message = {
        "message-type" : "ping-result",
        "url" : args.ping_url,
        "result" : None,
        "socket-reconnection-number" : 0,
        "check-number" : 0,
    }

    return_code = 0
    req_socket = None
    while not halt_event.is_set():
        try:
            req_socket = _process_one_ping(halt_event,
                                           args,
                                           context,
                                           ping_message,
                                           result_message,
                                           req_socket,
                                           push_socket)
        except zmq.ZMQError as zmq_error:
            # an interrupted system call during shutdown is expected
            if not (is_interrupted_system_call(zmq_error)
                    and halt_event.is_set()):
                logging.exception(str(zmq_error))
                halt_event.set()
                return_code = 1
        except Exception as instance:
            logging.exception(str(instance))
            halt_event.set()
            return_code = 1
        else:
            # pause between pings
            halt_event.wait(args.interval)

    if req_socket is not None:
        req_socket.close()
    push_socket.close()
    context.term()

    return return_code
def main():
    """main entry point

    returns 0 for normal termination (usually SIGTERM)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    memcached_client = memcache.Client(_memcached_nodes)

    context = zmq.Context()
    sub_socket = _create_sub_socket(context)

    expected_sequence = {_cache_update_channel : None, }

    return_value = 0
    while not halt_event.is_set():
        try:
            # multipart message: topic, meta, optional data frame
            topic = sub_socket.recv()
            assert sub_socket.rcvmore
            meta = sub_socket.recv()
            data = sub_socket.recv() if sub_socket.rcvmore else ""
            _process_one_event(memcached_client, expected_sequence,
                               topic, meta, data)
        except KeyboardInterrupt:  # convenience for testing
            log.info("keyboard interrupt: terminating normally")
            halt_event.set()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call - ok at shutdown")
            else:
                log.exception("zeromq error processing request")
                return_value = 1
            halt_event.set()
        except Exception:
            log.exception("error processing request")
            return_value = 1
            halt_event.set()

    sub_socket.close()
    context.term()

    log.info("program teminates: return value = {0}".format(return_value))
    return return_value
def _process_handoff(zeromq_context, halt_event, node_dict, pull_socket,
                     pull_socket_uri, client_tag, source_node_name,
                     dest_node_name, segment_row):
    # Carry out one handoff of segment_row from source_node_name to
    # dest_node_name by driving forwarder_coroutine with the messages
    # that arrive on pull_socket.
    #
    # Raises HaltEvent if halt_event is set when the message loop exits.
    log = logging.getLogger("_process_handoff")
    log.info("start ({0}, {1}) from {2}".format(segment_row["unified_id"],
                                                segment_row["conjoined_part"],
                                                source_node_name))

    # REQ sockets to the source node's reader and the destination
    # node's writer; pull_socket_uri tells them where to reply
    reader_socket = ReqSocket(zeromq_context,
                              _reader_address_dict[source_node_name],
                              client_tag,
                              pull_socket_uri,
                              halt_event)
    writer_socket = ReqSocket(zeromq_context,
                              _writer_address_dict[dest_node_name],
                              client_tag,
                              pull_socket_uri,
                              halt_event)

    forwarder = forwarder_coroutine(node_dict,
                                    segment_row,
                                    writer_socket,
                                    reader_socket)
    # prime the coroutine so it runs up to its first yield
    next(forwarder)

    # loop, waiting for messages to the pull socket
    # until the forwarder tells us it is done
    while not halt_event.is_set():
        try:
            message = pull_socket.recv_json()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call with halt_event_set")
                break
            log.exception(str(zmq_error))
            sys.exit(1)

        # gather any additional frames of the multipart message;
        # the coroutine expects None when there is no body data
        data = list()
        while pull_socket.rcvmore:
            data.append(pull_socket.recv())
        if len(data) == 0:
            data = None

        result = forwarder.send(( message, data, ))

        # the forwarder will yield the string 'done' when it is done
        if result is not None:
            assert result == "done", result
            break

    reader_socket.close()
    writer_socket.close()

    # a halt during the loop means the handoff did not complete
    if halt_event.is_set():
        raise HaltEvent()

    log.info("done ({0}, {1}) from {2}".format(segment_row["unified_id"],
                                               segment_row["conjoined_part"],
                                               source_node_name))
def main():
    """main entry point

    Loop until halt_event is set, pinging args.ping_url via
    _process_one_ping and pushing each result to args.reporting_url.
    Returns 0 on normal termination, 1 after an unhandled error.
    """
    return_code = 0
    args = _parse_command_line()
    halt_event = Event()
    set_signal_handler(halt_event)
    zeromq_context = zmq.Context()

    # PUSH socket carrying ping results to the reporting service
    reporting_socket = zeromq_context.socket(zmq.PUSH)
    reporting_socket.setsockopt(zmq.LINGER, 5000)
    reporting_socket.setsockopt(zmq.SNDHWM, args.hwm)
    reporting_socket.connect(args.reporting_url)

    # the REQ socket is created (and re-created on reconnect) by
    # _process_one_ping, which hands it back each iteration
    req_socket = None

    ping_message = {"message-type": "ping", "message-id": uuid.uuid1().hex}
    # template for results; _process_one_ping fills in the details
    result_message = {
        "message-type": "ping-result",
        "url": args.ping_url,
        "result": None,
        "socket-reconnection-number": 0,
        "check-number": 0,
    }
    while not halt_event.is_set():
        try:
            req_socket = _process_one_ping(
                halt_event, args, zeromq_context, ping_message,
                result_message, req_socket, reporting_socket
            )
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                # interrupted system call is expected at shutdown
                pass
            else:
                logging.exception(str(zmq_error))
                halt_event.set()
                return_code = 1
        except Exception as instance:
            logging.exception(str(instance))
            halt_event.set()
            return_code = 1
        else:
            # pause between pings
            halt_event.wait(args.interval)
    if req_socket is not None:
        req_socket.close()
    reporting_socket.close()
    zeromq_context.term()
    return return_code
def main():
    """main entry point

    returns 0 for normal termination (usually SIGTERM)

    Subscribe to cache-update events and apply each one to memcached
    via _process_one_event.
    """
    return_value = 0
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    memcached_client = memcache.Client(_memcached_nodes)

    zeromq_context = zmq.Context()
    sub_socket = _create_sub_socket(zeromq_context)

    # per-channel sequence tracking; None means no event seen yet
    expected_sequence = {
        _cache_update_channel: None,
    }

    while not halt_event.is_set():
        try:
            # multipart message: topic, meta, and an optional data frame
            topic = sub_socket.recv()
            assert sub_socket.rcvmore
            meta = sub_socket.recv()
            if sub_socket.rcvmore:
                data = sub_socket.recv()
            else:
                data = ""
            _process_one_event(memcached_client, expected_sequence,
                               topic, meta, data)
        except KeyboardInterrupt:  # convenience for testing
            log.info("keyboard interrupt: terminating normally")
            halt_event.set()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call - ok at shutdown")
            else:
                log.exception("zeromq error processing request")
                return_value = 1
            halt_event.set()
        except Exception:
            log.exception("error processing request")
            return_value = 1
            halt_event.set()

    sub_socket.close()
    zeromq_context.term()

    log.info("program teminates: return value = {0}".format(return_value))
    return return_value
def _process_request(resources):
    """Handle one retrieve request arriving on the dealer socket.

    Receives a three-part message (request, control, sequence_row),
    reads the requested slice of the value file, and pushes a
    'retrieve-key-reply' followed by the encoded data blocks back to
    the client. On any failure an error reply is sent instead.

    Raises InterruptedSystemCall when the blocking recv is interrupted.
    """
    # NOTE(review): logger name appears to predate a rename of this
    # function -- confirm before changing, as it affects log routing
    log = logging.getLogger("_process_one_transaction")
    log.debug("waiting work request")
    try:
        request = resources.dealer_socket.recv_pyobj()
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error):
            raise InterruptedSystemCall()
        raise
    assert resources.dealer_socket.rcvmore
    control = resources.dealer_socket.recv_pyobj()
    log.debug("user_request_id = {0}; control = {1}".format(
        request["user-request-id"], control))
    assert resources.dealer_socket.rcvmore
    sequence_row = resources.dealer_socket.recv_pyobj()

    value_file_path = compute_value_file_path(_repository_path,
                                              sequence_row["space_id"],
                                              sequence_row["value_file_id"])

    # assume success; each error path below overwrites these fields
    control["result"] = "success"
    control["error-message"] = ""

    # reuse a cached open file handle when available (removed from the
    # cache while in use; returned after a successful read)
    if value_file_path in resources.file_cache:
        value_file, _ = resources.file_cache[value_file_path]
        del resources.file_cache[value_file_path]
    else:
        try:
            value_file = open(value_file_path, "rb")
        except Exception as instance:
            log.exception("user_request_id = {0}, " \
                          "read {1}".format(request["user-request-id"],
                                            value_file_path))
            resources.event_push_client.exception("error_opening_value_file",
                                                  str(instance))
            control["result"] = "error_opening_value_file"
            control["error-message"] = str(instance)

    if control["result"] != "success":
        _send_error_reply(resources, request, control)
        return

    # the client may request a sub-range, expressed as whole encoded
    # blocks trimmed from the left and right ends of the sequence
    read_offset = \
        sequence_row["value_file_offset"] + \
        (control["left-offset"] * encoded_block_slice_size)
    read_size = \
        sequence_row["size"] - \
        (control["left-offset"] * encoded_block_slice_size) - \
        (control["right-offset"] * encoded_block_slice_size)

    # Ticket #84 handle a short block
    # the last block in the file may be smaller than
    # encoded_block_slice_size so we might have subtracted too much
    # for the right offset
    if control["right-offset"] > 0:
        block_modulus = sequence_row["size"] % encoded_block_slice_size
        last_block_size = (encoded_block_slice_size if block_modulus == 0
                           else block_modulus)
        last_block_delta = encoded_block_slice_size - last_block_size
        read_size += last_block_delta

    try:
        value_file.seek(read_offset)
        encoded_data = value_file.read(read_size)
    except Exception as instance:
        log.exception("user_request_id = {0}, " \
                      "read {1}".format(request["user-request-id"],
                                        value_file_path))
        resources.event_push_client.exception("error_reading_value_file",
                                              str(instance))
        # bug fix: result code was misspelled 'error_reading_vqalue_file'
        control["result"] = "error_reading_value_file"
        control["error-message"] = str(instance)

    if control["result"] != "success":
        value_file.close()
        _send_error_reply(resources, request, control)
        return

    # return the open handle to the cache, stamped with the time
    resources.file_cache[value_file_path] = value_file, time.time()

    if len(encoded_data) != read_size:
        error_message = "{0} size mismatch {1} {2}".format(
            request["retrieve-id"], len(encoded_data), read_size)
        log.error("user_request_id = {0}, {1}".format(
            request["user-request-id"], error_message))
        resources.event_push_client.error("size_mismatch", error_message)
        control["result"] = "size_mismatch"
        control["error-message"] = error_message

    if control["result"] != "success":
        _send_error_reply(resources, request, control)
        return

    encoded_block_list = list(encoded_block_generator(encoded_data))

    # size and checksums are computed over the blocks we will send
    segment_size = 0
    segment_adler32 = 0
    segment_md5 = hashlib.md5()
    for encoded_block in encoded_block_list:
        segment_size += len(encoded_block)
        segment_adler32 = zlib.adler32(encoded_block, segment_adler32)
        segment_md5.update(encoded_block)
    segment_md5_digest = segment_md5.digest()

    reply = {
        "message-type": "retrieve-key-reply",
        "user-request-id": request["user-request-id"],
        "client-tag": request["client-tag"],
        "message-id": request["message-id"],
        "retrieve-id": request["retrieve-id"],
        "segment-unified-id": request["segment-unified-id"],
        "segment-num": request["segment-num"],
        "segment-size": segment_size,
        "zfec-padding-size": sequence_row["zfec_padding_size"],
        "segment-adler32": segment_adler32,
        "segment-md5-digest": b64encode(segment_md5_digest).decode("utf-8"),
        "sequence-num": None,
        "completed": control["completed"],
        "result": "success",
        "error-message": "",
    }

    push_socket = _get_reply_push_socket(resources, request["client-address"])
    # robustness fix: an empty block list previously raised IndexError
    # on encoded_block_list[-1]; send a bare reply in that case
    if encoded_block_list:
        push_socket.send_json(reply, zmq.SNDMORE)
        for encoded_block in encoded_block_list[:-1]:
            push_socket.send(encoded_block, zmq.SNDMORE)
        push_socket.send(encoded_block_list[-1])
    else:
        push_socket.send_json(reply)
def _process_handoff(zeromq_context, halt_event, node_dict, pull_socket,
                     pull_socket_uri, client_tag, source_node_name,
                     dest_node_name, segment_row):
    """Run one handoff of segment_row from source to destination node.

    Feeds forwarder_coroutine with each message (and any extra frames)
    received on pull_socket until the coroutine reports completion.
    Raises HaltEvent if halt_event is set when the loop ends.
    """
    log = logging.getLogger("_process_handoff")
    log.info("start ({0}, {1}) from {2}".format(segment_row["unified_id"],
                                                segment_row["conjoined_part"],
                                                source_node_name))

    reader_socket = ReqSocket(zeromq_context,
                              _reader_address_dict[source_node_name],
                              client_tag, pull_socket_uri, halt_event)
    writer_socket = ReqSocket(zeromq_context,
                              _writer_address_dict[dest_node_name],
                              client_tag, pull_socket_uri, halt_event)

    forwarder = forwarder_coroutine(node_dict, segment_row,
                                    writer_socket, reader_socket)
    next(forwarder)

    # pump pull-socket messages into the coroutine until it says 'done'
    while not halt_event.is_set():
        try:
            msg = pull_socket.recv_json()
        except zmq.ZMQError as error:
            if is_interrupted_system_call(error) and halt_event.is_set():
                log.info("interrupted system call with halt_event_set")
                break
            log.exception(str(error))
            sys.exit(1)

        # collect any trailing frames; the coroutine expects None
        # when there is no body data
        extra_frames = []
        while pull_socket.rcvmore:
            extra_frames.append(pull_socket.recv())

        status = forwarder.send((msg, extra_frames if extra_frames else None))
        if status is not None:
            assert status == "done", status
            break

    reader_socket.close()
    writer_socket.close()

    if halt_event.is_set():
        raise HaltEvent()

    log.info("done ({0}, {1}) from {2}".format(segment_row["unified_id"],
                                               segment_row["conjoined_part"],
                                               source_node_name))
def main(worker_id, host_name, base_port, dest_node_name, rep_socket_uri):
    """main entry point

    return 0 on normal termination (exit code)
    """
    log = logging.getLogger("main")
    client_tag = _client_tag_template.format(worker_id)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    log.debug("creating pull socket")
    pull_socket_uri = "tcp://{0}:{1}".format(socket.gethostbyname(host_name),
                                             base_port + worker_id)
    pull_socket = zeromq_context.socket(zmq.PULL)
    pull_socket.setsockopt(zmq.RCVHWM, _socket_high_water_mark)
    log.info("binding pull socket to {0}".format(pull_socket_uri))
    pull_socket.bind(pull_socket_uri)

    req_socket = zeromq_context.socket(zmq.REQ)
    req_socket.setsockopt(zmq.RCVHWM, _socket_high_water_mark)
    req_socket.setsockopt(zmq.SNDHWM, _socket_high_water_mark)
    req_socket.connect(rep_socket_uri)

    log.info("sending 'start' message")
    # notify our parent that we are ready to receive work
    req_socket.send_pyobj({"message-type" : "start",
                           "worker-id" : worker_id})

    log.info("starting message loop")
    node_dict = None
    while not halt_event.is_set():
        try:
            parent_message = req_socket.recv_pyobj()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call wiht halt_event_set")
                break
            log.exception(str(zmq_error))
            sys.exit(1)
        assert not req_socket.rcvmore

        if parent_message["message-type"] == "stop":
            log.info("'stop' message received")
            break

        assert parent_message["message-type"] == "work", \
            parent_message["message-type"]

        # our parent sends the node dict with our first work message
        if "node-dict" in parent_message:
            node_dict = parent_message["node-dict"]
        assert node_dict is not None

        segment_row = parent_message["segment-row"]
        source_node_names = parent_message["source-node-names"]

        # reply template: enough information to purge the handoffs
        # if successful
        reply = {"message-type" : "handoff-complete",
                 "worker-id" : worker_id,
                 "handoff-successful" : False,
                 "unified-id" : segment_row["unified_id"],
                 "collection-id" : segment_row["collection_id"],
                 "key" : segment_row["key"],
                 "conjoined-part" : segment_row["conjoined_part"],
                 "handoff-node-id" : segment_row["handoff_node_id"],
                 "source-node-names" : source_node_names,
                 "error-message" : ""}

        # try each source node until one handoff succeeds
        for source_node_name in source_node_names:
            try:
                _process_handoff(zeromq_context, halt_event, node_dict,
                                 pull_socket, pull_socket_uri, client_tag,
                                 source_node_name, dest_node_name,
                                 segment_row)
            except Exception as instance:
                log.exception(instance)
                reply["error-message"] = "".join([reply["error-message"],
                                                  str(instance)])
            else:
                reply["handoff-successful"] = True
                break

        req_socket.send_pyobj(reply)

    log.info("end message loop")

    pull_socket.close()
    req_socket.close()
    zeromq_context.term()
    log.info("program terminates")
    return 0
def main():
    """main entry point

    returns 0 for normal termination (usually SIGTERM)

    Database pool controller: polls a PULL socket and a ROUTER socket,
    dispatching work to a pool of database worker subprocesses.
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    # all shared state for this controller is bundled in one namedtuple
    resources = \
        _resources_tuple(halt_event=Event(),
                         zeromq_context=zeromq_context,
                         reply_push_sockets=dict(),
                         pull_socket=zeromq_context.socket(zmq.PULL),
                         io_controller_push_socket=\
                            zeromq_context.socket(zmq.PUSH),
                         router_socket=zeromq_context.socket(zmq.ROUTER),
                         event_push_client=\
                            EventPushClient(zeromq_context,
                                            "rs_db_pool_controller"),
                         active_retrieves=dict(),
                         pending_work_queue=deque(),
                         available_ident_queue=deque())

    log.debug("binding to {0}".format(db_controller_pull_socket_uri))
    resources.pull_socket.bind(db_controller_pull_socket_uri)

    log.debug("connecting to {0}".format(io_controller_pull_socket_uri))
    resources.io_controller_push_socket.connect(io_controller_pull_socket_uri)

    resources.router_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("binding to {0}".format(db_controller_router_socket_uri))
    resources.router_socket.bind(db_controller_router_socket_uri)

    # we poll the sockets for readability, we assume we can always
    # write to the router socket
    poller = zmq.Poller()
    poller.register(resources.pull_socket, zmq.POLLIN | zmq.POLLERR)
    poller.register(resources.router_socket, zmq.POLLIN| zmq.POLLERR)

    worker_processes = list()
    for index in range(_worker_count):
        worker_processes.append(_launch_database_pool_worker(index+1))

    last_report_time = 0.0
    try:
        while not halt_event.is_set():
            # check on worker subprocesses each pass
            # (poll_subprocess semantics defined elsewhere)
            for worker_process in worker_processes:
                poll_subprocess(worker_process)
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                if active_socket is resources.pull_socket:
                    _read_pull_socket(resources)
                elif active_socket is resources.router_socket:
                    _read_router_socket(resources)
                else:
                    log.error("unknown socket {0}".format(active_socket))

            # periodically report queue sizes
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = \
                    "{0:,} active_retrives, " \
                    "{1:,} pending_work_queue entries, " \
                    "{2:,} available_ident_queue entries" \
                    "".format(len(resources.active_retrieves),
                              len(resources.pending_work_queue),
                              len(resources.available_ident_queue))
                log.info(report_message)
                resources.event_push_client.info(
                    "queue_sizes",
                    report_message,
                    active_retrieves=len(resources.active_retrieves),
                    pending_work_queue=len(resources.pending_work_queue),
                    available_ident_queue=len(resources.available_ident_queue))
                last_report_time = current_time
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(unhandled_exception_topic,
                                                  "zeromq_error",
                                                  exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # cleanup always runs: terminate workers, close every socket
        for worker_process in worker_processes:
            terminate_subprocess(worker_process)
        resources.pull_socket.close()
        resources.io_controller_push_socket.close()
        resources.router_socket.close()
        for push_socket in resources.reply_push_sockets.values():
            push_socket.close()
        resources.event_push_client.close()
        zeromq_context.term()

    return return_value
def process_segment_rows(halt_event, zeromq_context, args, node_dict,
                         node_databases, raw_segment_rows):
    """process handoffs of segment rows

    Dispatch final-status segment rows to worker subprocesses over a
    REP socket; apply tombstone rows directly to the node databases.
    Runs until all handoffs are dispatched and acknowledged, or until
    halt_event is set.
    """
    log = logging.getLogger("process_segment_rows")

    rep_socket_uri = ipc_socket_uri(_socket_dir,
                                    args.node_name,
                                    "handoff_client")
    prepare_ipc_path(rep_socket_uri)

    rep_socket = zeromq_context.socket(zmq.REP)
    rep_socket.setsockopt(zmq.SNDHWM, _socket_high_water_mark)
    rep_socket.setsockopt(zmq.RCVHWM, _socket_high_water_mark)
    log.info("binding rep socket to {0}".format(rep_socket_uri))
    rep_socket.bind(rep_socket_uri)

    log.debug("starting workers")
    workers = list()
    for index in range(args.worker_count):
        worker_id = str(index + 1)
        workers.append(_start_worker_process(worker_id, args, rep_socket_uri))

    # loop until all handoffs have been accomplished
    log.debug("start handoffs")
    work_generator = _generate_segment_rows(raw_segment_rows)
    pending_handoff_count = 0
    while not halt_event.is_set():

        # get a segment row to process. If we are at EOF, segment_row = None
        try:
            source_node_names, segment_row = next(work_generator)
        except StopIteration:
            # at EOF, keep looping (with no row) until every
            # dispatched handoff has been acknowledged by a worker
            if pending_handoff_count == 0:
                break
            else:
                source_node_names, segment_row = None, None

        # if we have a segment row, and it is a tombstone, we can act
        # directly on the node database(s) without sending it to a worker
        if segment_row is not None:
            if segment_row["status"] == segment_status_tombstone:
                _process_tombstone(node_databases, source_node_names,
                                   segment_row)
                _purge_handoff_from_source_nodes(
                    node_databases,
                    source_node_names,
                    segment_row["collection_id"],
                    segment_row["key"],
                    segment_row["unified_id"],
                    segment_row["conjoined_part"],
                    segment_row["handoff_node_id"],
                    segment_status_tombstone)
                continue
            assert segment_row["status"] == segment_status_final, \
                segment_row["status"]

        # at this point we either have a segment row in final status, or
        # None, indicating no more data

        # block until we have a ready worker
        try:
            request = rep_socket.recv_pyobj()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.warn("breaking due to halt_event")
                break
            raise
        assert not rep_socket.rcvmore

        # see how the worker handled the previous segment (if any)
        initial_request = False
        if request["message-type"] == "start":
            log.info("{0} initial request".format(request["worker-id"]))
            initial_request = True
        elif request["handoff-successful"]:
            log.info("{0} handoff ({1}, {2}) successful".format(
                request["worker-id"],
                request["unified-id"],
                request["conjoined-part"]))
            assert pending_handoff_count > 0
            pending_handoff_count -= 1
            # the handoff succeeded: purge it from the source nodes
            _purge_handoff_from_source_nodes(
                node_databases,
                request["source-node-names"],
                request["collection-id"],
                request["key"],
                request["unified-id"],
                request["conjoined-part"],
                request["handoff-node-id"],
                segment_status_final)
        else:
            log.error("{0} handoff ({1}, {2}) failed: {3}".format(
                request["worker-id"],
                request["unified-id"],
                request["conjoined-part"],
                request["error-message"]))
            assert pending_handoff_count > 0
            pending_handoff_count -= 1

        if segment_row is None:
            # if we have no more work, tell the worker to stop
            work_message = {"message-type": "stop"}
        else:
            # otherwise, send the segment to the worker
            work_message = {
                "message-type": "work",
                "source-node-names": source_node_names,
                "segment-row": segment_row
            }
            # if this is the worker's first request, send him the node_dict
            if initial_request:
                work_message["node-dict"] = node_dict
            pending_handoff_count += 1

        rep_socket.send_pyobj(work_message)

    log.debug("end of handoffs")

    for worker in workers:
        terminate_subprocess(worker)

    rep_socket.close()
def main():
    """main entry point

    returns 0 for normal termination (usually SIGTERM)

    Service availability monitor: launches ping subprocesses and
    processes their reports arriving on a PULL socket.
    """
    return_value = 0
    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    prepare_ipc_path(_pull_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()
    pull_socket = _bind_pull_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context, "service_availability")
    event_push_client.info("program-starts",
                           "service availability monitor starts")

    message_count = 0
    # bug fix: ping_process_dict was only assigned inside the try
    # block, so a failure in _start_ping_processes made the
    # _terminate_ping_processes call below raise NameError.
    # Initialize it first so cleanup is always safe.
    ping_process_dict = dict()
    try:
        ping_process_dict = _start_ping_processes(halt_event)
        while not halt_event.is_set():
            # NOTE(review): assumes _start_ping_processes returns a
            # non-empty dict; an empty one would make this modulus
            # raise ZeroDivisionError -- TODO confirm
            if message_count % len(ping_process_dict) == 0:
                for ping_process in ping_process_dict.values():
                    poll_subprocess(ping_process.process)
            message = pull_socket.recv_pyobj()
            assert not pull_socket.rcvmore
            _process_one_message(message, ping_process_dict,
                                 event_push_client)
            message_count += 1
    except KeyboardInterrupt:  # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminating normally; interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminating normally")

    log.debug("terminating subprocesses")
    _terminate_ping_processes(ping_process_dict)
    pull_socket.close()
    event_push_client.close()
    zeromq_context.term()

    return return_value
def process_segment_rows(halt_event, zeromq_context, args, node_dict,
                         node_databases, raw_segment_rows):
    """process handoffs of segment rows

    Dispatches final-status segment rows to worker subprocesses over a
    REP socket and applies tombstones directly to the node databases.
    Runs until all handoffs are dispatched and acknowledged, or until
    halt_event is set.
    """
    log = logging.getLogger("process_segment_rows")

    rep_socket_uri = ipc_socket_uri(_socket_dir, args.node_name,
                                    "handoff_client")
    prepare_ipc_path(rep_socket_uri)

    rep_socket = zeromq_context.socket(zmq.REP)
    rep_socket.setsockopt(zmq.SNDHWM, _socket_high_water_mark)
    rep_socket.setsockopt(zmq.RCVHWM, _socket_high_water_mark)
    log.info("binding rep socket to {0}".format(rep_socket_uri))
    rep_socket.bind(rep_socket_uri)

    log.debug("starting workers")
    workers = [_start_worker_process(str(n + 1), args, rep_socket_uri)
               for n in range(args.worker_count)]

    log.debug("start handoffs")
    row_source = _generate_segment_rows(raw_segment_rows)
    outstanding = 0
    while not halt_event.is_set():
        # pull the next row; at EOF keep looping (with no row) until
        # every outstanding handoff has been acknowledged
        try:
            source_node_names, segment_row = next(row_source)
        except StopIteration:
            if outstanding == 0:
                break
            source_node_names, segment_row = None, None

        if segment_row is not None:
            # tombstones are applied to the node databases directly,
            # without involving a worker
            if segment_row["status"] == segment_status_tombstone:
                _process_tombstone(node_databases, source_node_names,
                                   segment_row)
                _purge_handoff_from_source_nodes(node_databases,
                                                 source_node_names,
                                                 segment_row["collection_id"],
                                                 segment_row["key"],
                                                 segment_row["unified_id"],
                                                 segment_row["conjoined_part"],
                                                 segment_row["handoff_node_id"],
                                                 segment_status_tombstone)
                continue
            assert segment_row["status"] == segment_status_final, \
                segment_row["status"]

        # block until a worker asks for work
        try:
            request = rep_socket.recv_pyobj()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.warn("breaking due to halt_event")
                break
            raise
        assert not rep_socket.rcvmore

        # account for the worker's previous assignment, if any
        initial_request = request["message-type"] == "start"
        if initial_request:
            log.info("{0} initial request".format(request["worker-id"]))
        elif request["handoff-successful"]:
            log.info("{0} handoff ({1}, {2}) successful".format(
                     request["worker-id"],
                     request["unified-id"],
                     request["conjoined-part"]))
            assert outstanding > 0
            outstanding -= 1
            _purge_handoff_from_source_nodes(node_databases,
                                             request["source-node-names"],
                                             request["collection-id"],
                                             request["key"],
                                             request["unified-id"],
                                             request["conjoined-part"],
                                             request["handoff-node-id"],
                                             segment_status_final)
        else:
            log.error("{0} handoff ({1}, {2}) failed: {3}".format(
                      request["worker-id"],
                      request["unified-id"],
                      request["conjoined-part"],
                      request["error-message"]))
            assert outstanding > 0
            outstanding -= 1

        if segment_row is None:
            # no more work: tell the worker to stop
            work_message = {"message-type" : "stop"}
        else:
            work_message = {"message-type" : "work",
                            "source-node-names" : source_node_names,
                            "segment-row" : segment_row}
            # a worker's first assignment also carries the node dict
            if initial_request:
                work_message["node-dict"] = node_dict
            outstanding += 1

        rep_socket.send_pyobj(work_message)

    log.debug("end of handoffs")

    for worker in workers:
        terminate_subprocess(worker)

    rep_socket.close()
def main(worker_id, host_name, base_port, dest_node_name, rep_socket_uri):
    """main entry point

    return 0 on normal termination (exit code)

    Handoff worker: requests work from the parent over a REQ socket
    and processes each handoff via _process_handoff.
    """
    log = logging.getLogger("main")
    client_tag = _client_tag_template.format(worker_id)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    log.debug("creating pull socket")
    # each worker binds its own port (base_port + worker_id); replies
    # to the requests made during a handoff arrive here
    pull_socket_uri = "tcp://{0}:{1}".format(socket.gethostbyname(host_name),
                                             base_port + worker_id)
    pull_socket = zeromq_context.socket(zmq.PULL)
    pull_socket.setsockopt(zmq.RCVHWM, _socket_high_water_mark)
    log.info("binding pull socket to {0}".format(pull_socket_uri))
    pull_socket.bind(pull_socket_uri)

    # REQ socket back to the parent's REP socket for the work protocol
    req_socket = zeromq_context.socket(zmq.REQ)
    req_socket.setsockopt(zmq.RCVHWM, _socket_high_water_mark)
    req_socket.setsockopt(zmq.SNDHWM, _socket_high_water_mark)
    req_socket.connect(rep_socket_uri)

    log.info("sending 'start' message")
    # notify our parent that we are ready to receive work
    request = {"message-type": "start", "worker-id": worker_id}
    req_socket.send_pyobj(request)

    log.info("starting message loop")
    node_dict = None
    while not halt_event.is_set():
        try:
            message = req_socket.recv_pyobj()
        except zmq.ZMQError as zmq_error:
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call wiht halt_event_set")
                break
            log.exception(str(zmq_error))
            sys.exit(1)
        assert not req_socket.rcvmore

        if message["message-type"] == "stop":
            log.info("'stop' message received")
            break
        assert message["message-type"] == "work", message["message-type"]

        # we expect our parent to send us the node dict in our first message
        if "node-dict" in message:
            node_dict = message["node-dict"]
        assert node_dict is not None

        # aliases for brevity
        segment_row = message["segment-row"]
        source_node_names = message["source-node-names"]

        # send back enough information to purge the handoffs, if successful
        request = {
            "message-type": "handoff-complete",
            "worker-id": worker_id,
            "handoff-successful": False,
            "unified-id": segment_row["unified_id"],
            "collection-id": segment_row["collection_id"],
            "key": segment_row["key"],
            "conjoined-part": segment_row["conjoined_part"],
            "handoff-node-id": segment_row["handoff_node_id"],
            "source-node-names": source_node_names,
            "error-message": ""
        }

        # try each source node in turn until one handoff succeeds
        for source_node_name in source_node_names:
            try:
                _process_handoff(zeromq_context, halt_event, node_dict,
                                 pull_socket, pull_socket_uri, client_tag,
                                 source_node_name, dest_node_name,
                                 segment_row)
            except Exception as instance:
                log.exception(instance)
                # accumulate error text from each failed attempt
                request["error-message"] = "".join(
                    [request["error-message"], str(instance)])
            else:
                request["handoff-successful"] = True
                break

        req_socket.send_pyobj(request)

    log.info("end message loop")
    pull_socket.close()
    req_socket.close()
    zeromq_context.term()
    log.info("program terminates")
    return 0
def main():
    """
    main entry point for the I/O controller

    Launches one pool of I/O worker subprocesses per volume, binds the
    pull and router sockets, and dispatches readable sockets until the
    halt event is set.

    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0
    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    resources = \
        _resources_tuple(halt_event=Event(),
                         volume_by_space_id=_volume_name_by_space_id(),
                         pull_socket=zeromq_context.socket(zmq.PULL),
                         router_socket=zeromq_context.socket(zmq.ROUTER),
                         event_push_client=\
                            EventPushClient(zeromq_context,
                                            "rs_io_controller"),
                         pending_work_by_volume=defaultdict(deque),
                         available_ident_by_volume=defaultdict(deque))

    log.debug("binding to {0}".format(io_controller_pull_socket_uri))
    resources.pull_socket.bind(io_controller_pull_socket_uri)

    resources.router_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("binding to {0}".format(io_controller_router_socket_uri))
    resources.router_socket.bind(io_controller_router_socket_uri)

    # we poll the sockets for readability, we assume we can always
    # write to the router socket
    poller = zmq.Poller()
    poller.register(resources.pull_socket, zmq.POLLIN | zmq.POLLERR)
    poller.register(resources.router_socket, zmq.POLLIN | zmq.POLLERR)

    # one group of _worker_count workers per distinct volume
    worker_processes = list()
    for volume_name in set(resources.volume_by_space_id.values()):
        for index in range(_worker_count):
            worker_processes.append(_launch_io_worker(volume_name,
                                                      index + 1))

    last_report_time = 0.0
    try:
        while not halt_event.is_set():
            for worker_process in worker_processes:
                poll_subprocess(worker_process)
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                if active_socket is resources.pull_socket:
                    _read_pull_socket(resources)
                elif active_socket is resources.router_socket:
                    _read_router_socket(resources)
                else:
                    log.error("unknown socket {0}".format(active_socket))

            # periodically report the total queued work across volumes
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                pending_work = sum(
                    len(volume_queue)
                    for volume_queue in \
                        resources.pending_work_by_volume.values())
                report_message = \
                    "{0:,} pending_work entries".format(pending_work)
                log.info(report_message)
                resources.event_push_client.info("queue_sizes",
                                                 report_message,
                                                 pending_work=pending_work)
                last_report_time = current_time
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            # fixed typo: was "program teminates ..."
            log.info("program terminates normally with "
                     "interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(unhandled_exception_topic,
                                                  "zeromq_error",
                                                  exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        # fixed typo: was "program teminates normally"
        log.info("program terminates normally")
    finally:
        for worker_process in worker_processes:
            terminate_subprocess(worker_process)
        resources.pull_socket.close()
        resources.router_socket.close()
        resources.event_push_client.close()
        zeromq_context.term()

    return return_value
def run(self, halt_event):
    """
    poll for I/O ready objects

    This is a task for the time queue: it returns a list of
    (callable, time) pairs, re-scheduling itself at the next
    polling interval.
    """
    # at shutdown, unregister every socket and drop out of the time queue
    # by returning None
    if halt_event.is_set():
        self._log.info("halt_event set: exiting")
        for active_socket in self._active_sockets.keys():
            self._poller.unregister(active_socket)
        self._active_sockets.clear()
        return

    next_tasks = list()
    next_interval = time.time() + self._polling_interval

    # modernized from the old sys.exc_info() compatibility idiom to the
    # 'except ... as ...' form used elsewhere in this file
    try:
        result_list = self._poller.poll(timeout=self._poll_timeout)
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            self._log.info("Interrupted with halt_event set: exiting")
            for active_socket in self._active_sockets.keys():
                self._poller.unregister(active_socket)
            self._active_sockets.clear()
            return
        raise

    for active_socket, event_flags in result_list:
        # a socket we never registered (or already discarded): drop it
        if active_socket not in self._active_sockets:
            # Logger.warn is a deprecated alias; use warning()
            self._log.warning("Ignoring unknown active_socket %s" % (
                active_socket, ))
            self._poller.unregister(active_socket)
            continue

        if event_flags & zmq.POLLERR:
            message = ("Error flag from poll() %s" % (
                active_socket, ))
            self._log.error(message)
            raise ZeroMQPollsterError(message)

        callback = self._active_sockets[active_socket]

        readable = (True if event_flags & zmq.POLLIN else False)
        writable = (True if event_flags & zmq.POLLOUT else False)

        # distinct name: 'result_list' above holds the poll results;
        # the original shadowed it here
        callback_tasks = callback(
            active_socket,
            readable=readable,
            writable=writable,
        )
        if callback_tasks is not None:
            next_tasks.extend(callback_tasks)

        # 2010-10-25 dougfort -- If the socket was readable, we probably
        # picked up new work. So go back and poll again right away.
        # These zmq sockets seem to pretty well always be writable,
        # so don't assume we actually did anything.
        if readable:
            next_interval = time.time()

    # put ourselves into the time queue to run at the next polling interval
    next_tasks.append((self.run, next_interval, ))

    return next_tasks
def main():
    """
    main entry point for the retrieve source

    Launches the database pool controller and I/O controller
    subprocesses, then services requests on the REP socket until the
    halt event is set.

    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0
    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # make sure the ipc paths exist before any socket binds to them
    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = \
        _connect_db_controller_push_socket(zeromq_context)
    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability, we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we've only registered one socket, so we could use an 'if' here,
            # but this 'for' works ok and it has the same form as the other
            # places where we use poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                assert active_socket is rep_socket
                _process_one_request(rep_socket, db_controller_push_socket)
                request_count += 1

            # periodically report (and reset) the request count
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0
    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            # fixed typo: was "program teminates ..."
            log.info("program terminates normally with "
                     "interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        # fixed typo: was "program teminates normally"
        log.info("program terminates normally")
    finally:
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def run(self, halt_event):
    """
    poll for I/O ready objects

    This is a task for the time queue
    """
    def _unregister_all():
        # drop every registered socket from the poller and forget it
        for registered_socket in self._active_sockets.keys():
            self._poller.unregister(registered_socket)
        self._active_sockets.clear()

    if halt_event.is_set():
        self._log.info("halt_event set: exiting")
        _unregister_all()
        return

    next_tasks = []
    next_interval = time.time() + self._polling_interval

    try:
        poll_results = self._poller.poll(timeout=self._poll_timeout)
    except zmq.ZMQError:
        zmq_error = sys.exc_info()[1]
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            self._log.info("Interrupted with halt_event set: exiting")
            _unregister_all()
            return
        raise

    for polled_socket, event_flags in poll_results:
        if polled_socket not in self._active_sockets:
            self._log.warn("Ignoring unknown active_socket %s" % (polled_socket, ))
            self._poller.unregister(polled_socket)
            continue

        if event_flags & zmq.POLLERR:
            message = ("Error flag from poll() %s" % (polled_socket, ))
            self._log.error(message)
            raise ZeroMQPollsterError(message)

        callback = self._active_sockets[polled_socket]

        readable = bool(event_flags & zmq.POLLIN)
        writable = bool(event_flags & zmq.POLLOUT)

        callback_tasks = callback(polled_socket,
                                  readable=readable,
                                  writable=writable)
        if callback_tasks is not None:
            next_tasks.extend(callback_tasks)

        # 2010-10-25 dougfort -- If the socket was readable, we probably
        # picked up new work. So go back and poll again right away.
        # These zmq sockets seem to pretty well always be writable,
        # so don't assume we actually did anything.
        if readable:
            next_interval = time.time()

    # put ourselves into the time queue to run at the next polling interval
    next_tasks.append((self.run, next_interval, ))

    return next_tasks
def _process_one_transaction(dealer_socket,
                             database_connection,
                             event_push_client):
    """
    Wait for a reply to our last message from the controller.
    This will be a query request.
    We send the results to the controller and repeat the cycle,
    waiting for a reply

    Raises InterruptedSystemCall when the receive is interrupted by a
    signal; re-raises any other ZMQError.
    """
    log = logging.getLogger("_process_one_transaction")
    log.debug("waiting work request")
    try:
        request = dealer_socket.recv_pyobj()
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error):
            raise InterruptedSystemCall()
        raise
    # the request is a two-part message: request dict, then control dict
    assert dealer_socket.rcvmore
    control = dealer_socket.recv_pyobj()

    fields = _define_seq_val_fields()
    # a handoff request uses a different query than a normal segment
    if request["handoff-node-id"] is None:
        query = _all_sequence_rows_for_segment_query.format(fields)
    else:
        query = _all_sequence_rows_for_handoff_query.format(fields)

    control["result"] = "success"
    control["error-message"] = ""

    try:
        result = database_connection.fetch_all_rows(query, request)
    except psycopg2.OperationalError as instance:
        error_message = "database error {0}".format(instance)
        event_push_client.error("database_error", error_message)
        control["result"] = "database_error"
        control["error-message"] = error_message
    else:
        if len(result) == 0:
            control["result"] = "no_sequence_rows_found"
            control["error-message"] = "no sequence rows found"

    # on any failure, reply with just request + control (no result part)
    if control["result"] != "success":
        log.error("user_request_id = {0}, " \
                  "{1} {2}".format(request["user-request-id"],
                                   control["result"],
                                   control["error-message"]))
        dealer_socket.send_pyobj(request, zmq.SNDMORE)
        dealer_socket.send_pyobj(control)
        return

    result_list = list()
    for row in result:
        row_list = list(row)
        # the last column is space_id; the rest fill the namedtuple
        segment_sequence_row = segment_sequence_template._make(row_list[:-1])
        space_id = row_list[-1]
        # simplified: dict(x._asdict().items()) is equivalent to
        # dict(x._asdict())
        row_dict = dict(segment_sequence_row._asdict())
        # presumably a memoryview/buffer from the driver; make it bytes
        # so it survives pickling -- TODO confirm against the db layer
        row_dict["hash"] = bytes(row_dict["hash"])
        row_dict["space_id"] = space_id
        result_list.append(row_dict)

    log.debug("user_request_id = {0}, " \
              " sending request back to controller".format(
                  request["user-request-id"]))
    dealer_socket.send_pyobj(request, zmq.SNDMORE)
    dealer_socket.send_pyobj(control, zmq.SNDMORE)
    dealer_socket.send_pyobj(result_list)
def _process_request(resources):
    """
    Wait for a reply to our last message from the controller.

    Receives a three-part message (request, control, sequence_row),
    reads the requested slice of the value file, and pushes a
    'retrieve-key-reply' followed by the encoded data blocks back to
    the client's reply address. Errors are reported via
    _send_error_reply.
    """
    log = logging.getLogger("_process_one_transaction")
    log.debug("waiting work request")
    try:
        request = resources.dealer_socket.recv_pyobj()
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error):
            raise InterruptedSystemCall()
        raise
    assert resources.dealer_socket.rcvmore
    control = resources.dealer_socket.recv_pyobj()
    log.debug("user_request_id = {0}; control = {1}".format(
        request["user-request-id"], control))
    assert resources.dealer_socket.rcvmore
    sequence_row = resources.dealer_socket.recv_pyobj()

    value_file_path = compute_value_file_path(_repository_path,
                                              sequence_row["space_id"],
                                              sequence_row["value_file_id"])

    control["result"] = "success"
    control["error-message"] = ""

    # reuse a cached open file handle if we have one; it goes back into
    # the cache after a successful read
    if value_file_path in resources.file_cache:
        value_file, _ = resources.file_cache[value_file_path]
        del resources.file_cache[value_file_path]
    else:
        try:
            value_file = open(value_file_path, "rb")
        except Exception as instance:
            log.exception("user_request_id = {0}, " \
                          "read {1}".format(request["user-request-id"],
                                            value_file_path))
            resources.event_push_client.exception("error_opening_value_file",
                                                  str(instance))
            control["result"] = "error_opening_value_file"
            control["error-message"] = str(instance)

    if control["result"] != "success":
        _send_error_reply(resources, request, control)
        return

    # left-offset and right-offset are counts of encoded blocks to trim
    # from the ends of the sequence
    read_offset = \
        sequence_row["value_file_offset"] + \
        (control["left-offset"] * encoded_block_slice_size)
    read_size = \
        sequence_row["size"] - \
        (control["left-offset"] * encoded_block_slice_size) - \
        (control["right-offset"] * encoded_block_slice_size)

    # Ticket #84 handle a short block
    # the last block in the file may be smaller than encoded_block_slice_size
    # so we might have subtracted too much for the right offset
    if control["right-offset"] > 0:
        block_modulus = sequence_row["size"] % encoded_block_slice_size
        last_block_size = (encoded_block_slice_size if block_modulus == 0 \
                           else block_modulus)
        last_block_delta = encoded_block_slice_size - last_block_size
        read_size += last_block_delta

    try:
        value_file.seek(read_offset)
        encoded_data = value_file.read(read_size)
    except Exception as instance:
        log.exception("user_request_id = {0}, " \
                      "read {1}".format(request["user-request-id"],
                                        value_file_path))
        resources.event_push_client.exception("error_reading_value_file",
                                              str(instance))
        # fixed typo: was "error_reading_vqalue_file" -- now matches the
        # event topic above and the "error_opening_value_file" code.
        # NOTE(review): verify no consumer matches the old misspelled code
        control["result"] = "error_reading_value_file"
        control["error-message"] = str(instance)

    if control["result"] != "success":
        value_file.close()
        _send_error_reply(resources, request, control)
        return

    # the read succeeded: return the open handle to the cache
    resources.file_cache[value_file_path] = value_file, time.time()

    if len(encoded_data) != read_size:
        error_message = "{0} size mismatch {1} {2}".format(
            request["retrieve-id"],
            len(encoded_data),
            read_size)
        log.error("user_request_id = {0}, {1}".format(
            request["user-request-id"], error_message))
        resources.event_push_client.error("size_mismatch", error_message)
        control["result"] = "size_mismatch"
        control["error-message"] = error_message

    if control["result"] != "success":
        _send_error_reply(resources, request, control)
        return

    encoded_block_list = list(encoded_block_generator(encoded_data))

    # compute size, adler32 and md5 over the encoded blocks for the reply
    segment_size = 0
    segment_adler32 = 0
    segment_md5 = hashlib.md5()
    for encoded_block in encoded_block_list:
        segment_size += len(encoded_block)
        segment_adler32 = zlib.adler32(encoded_block, segment_adler32)
        segment_md5.update(encoded_block)
    segment_md5_digest = segment_md5.digest()

    reply = {
        "message-type"        : "retrieve-key-reply",
        "user-request-id"     : request["user-request-id"],
        "client-tag"          : request["client-tag"],
        "message-id"          : request["message-id"],
        "retrieve-id"         : request["retrieve-id"],
        "segment-unified-id"  : request["segment-unified-id"],
        "segment-num"         : request["segment-num"],
        "segment-size"        : segment_size,
        "zfec-padding-size"   : sequence_row["zfec_padding_size"],
        "segment-adler32"     : segment_adler32,
        "segment-md5-digest"  : b64encode(segment_md5_digest).decode("utf-8"),
        "sequence-num"        : None,
        "completed"           : control["completed"],
        "result"              : "success",
        "error-message"       : "",
    }

    # reply header as json, then the data blocks as additional frames
    push_socket = _get_reply_push_socket(resources, request["client-address"])
    push_socket.send_json(reply, zmq.SNDMORE)
    for encoded_block in encoded_block_list[:-1]:
        push_socket.send(encoded_block, zmq.SNDMORE)
    push_socket.send(encoded_block_list[-1])
def main():
    """
    main entry point for the retrieve source

    Launches the database pool controller and I/O controller
    subprocesses, then services requests on the REP socket until the
    halt event is set.

    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0
    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # make sure the ipc paths exist before any socket binds to them
    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = \
        _connect_db_controller_push_socket(zeromq_context)
    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability, we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we've only registered one socket, so we could use an 'if' here,
            # but this 'for' works ok and it has the same form as the other
            # places where we use poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                assert active_socket is rep_socket
                _process_one_request(rep_socket, db_controller_push_socket)
                request_count += 1

            # periodically report (and reset) the request count
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0
    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            # fixed typo: was "program teminates ..."
            log.info("program terminates normally with "
                     "interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        # fixed typo: was "program teminates normally"
        log.info("program terminates normally")
    finally:
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value