def main():
    """
    main entry point

    Garbage-collect segments: walk candidate partitions, evaluate each for
    collectable segment rows, and archive the collectable segment ids.

    return 0 for success (exit code), -1 if the database connection fails,
    -2 if collection raises
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0
    # buffer accumulating the ids of segments found collectable
    collectable_segment_ids = io.StringIO()

    partition_count = 0
    collectable_count = 0
    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            # a partition is treated as versioned when its collection id
            # appears in the versioned-collections set
            versioned_collection = \
                partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids,
                                        partition,
                                        versioned_collection)
            collectable_count += count
        archive_collectable_segment_rows(connection,
                                         collectable_segment_ids,
                                         options.max_node_offline_time)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info(
            "found {0:,} candidates, collected {1:,} segments".format(
                partition_count, collectable_count
            )
        )
        log.info("program terminates normally")
    finally:
        # BUGFIX: close the StringIO buffer on every path; the original
        # only closed it on success, leaking it when collection raised
        collectable_segment_ids.close()

    connection.close()
    event_push_client.close()
    zmq_context.term()

    return return_code
def main():
    """
    main entry point

    Process pending conjoined and segment handoffs for the node named on
    the command line.

    return 0 for success (exit code), 1 on unhandled exception
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    args = parse_commandline()

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "handoff_client")
    event_push_client.info("program-start", "handoff_client starts")

    return_code = 0
    # set before the try so the cleanup below can test it even when
    # get_node_databases itself raises
    node_databases = None
    try:
        node_dict = get_node_ids(args.node_name)
        node_databases = get_node_databases()
        conjoined_rows, segment_rows = \
            get_handoff_rows(node_databases, node_dict[args.node_name])
        log.info("found {0} conjoined and {1} segment handoffs".format(
            len(conjoined_rows), len(segment_rows)))
        if len(conjoined_rows) > 0:
            process_conjoined_rows(halt_event,
                                   args,
                                   node_databases,
                                   conjoined_rows)
        if len(segment_rows) > 0:
            process_segment_rows(halt_event,
                                 zeromq_context,
                                 args,
                                 node_dict,
                                 node_databases,
                                 segment_rows)
    except Exception as instance:
        # BUGFIX: corrected 'Uhandled' typo in the log message
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = 1

    if node_databases is not None:
        for connection in node_databases.values():
            connection.close()

    event_push_client.close()
    zeromq_context.term()
    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
def main():
    """
    main entry point

    Value-file garbage collection: unlink totally-unused and unreachable
    value files, then rewrite partially-used value files to reclaim space.

    return 0 for success (exit code), -1 if the database connection fails,
    -2 if collection raises
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception:
        # FIX: dropped the unused 'as instance' binding; log.exception
        # already records the full traceback
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0
    try:
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(options,
                                      connection,
                                      _repository_path,
                                      ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")
        # report the bytes reclaimed by each phase
        event_push_client.info(
            "rewrite complete",
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings)

    connection.close()
    event_push_client.close()
    zmq_context.term()

    return return_code
def main():
    """
    main entry point

    Process pending conjoined and segment handoffs for the node named on
    the command line.

    return 0 for success (exit code), 1 on unhandled exception
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    args = parse_commandline()

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "handoff_client")
    event_push_client.info("program-start", "handoff_client starts")

    return_code = 0
    # set before the try so the cleanup below can test it even when
    # get_node_databases itself raises
    node_databases = None
    try:
        node_dict = get_node_ids(args.node_name)
        node_databases = get_node_databases()
        conjoined_rows, segment_rows = \
            get_handoff_rows(node_databases, node_dict[args.node_name])
        log.info("found {0} conjoined and {1} segment handoffs".format(
            len(conjoined_rows), len(segment_rows)))
        if len(conjoined_rows) > 0:
            process_conjoined_rows(halt_event, args, node_databases,
                                   conjoined_rows)
        if len(segment_rows) > 0:
            process_segment_rows(halt_event, zeromq_context, args, node_dict,
                                 node_databases, segment_rows)
    except Exception as instance:
        # BUGFIX: corrected 'Uhandled' typo in the log message
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_code = 1

    if node_databases is not None:
        for connection in node_databases.values():
            connection.close()

    event_push_client.close()
    zeromq_context.term()
    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
def main():
    """
    main entry point

    Pull segment data from all nodes into a scratch directory, then audit it.

    return 0 for success (exit code), -1 if halted during the pull,
    -3 on unhandled exception
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_inspector")
    event_push_client.info("program-start", "cluster_inspector starts")

    # if there's any wreckage from a previous run, clear it out
    if os.path.exists(_work_dir):
        log.info("removing old {0}".format(_work_dir))
        shutil.rmtree(_work_dir)
    os.mkdir(_work_dir)

    return_code = 0
    try:
        pull_segments_from_nodes(halt_event, _work_dir)

        if halt_event.is_set():
            log.info("halt_event set (1): exiting")
            return_code = -1
        else:
            audit_segments(halt_event, _work_dir)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_code = -3

    # BUGFIX: close the push client and terminate the zeromq context on
    # every exit path; the original returned -1/-3 without cleanup,
    # leaking the socket and context
    event_push_client.close()
    zmq_context.term()

    if return_code == 0:
        log.info("program terminates normally")
    return return_code
def main():
    """
    main entry point

    Value-file garbage collection: unlink totally-unused and unreachable
    value files, then rewrite partially-used value files to reclaim space.

    return 0 for success (exit code), -1 if the database connection fails,
    -2 if collection raises
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception:
        # FIX: dropped the unused 'as instance' binding; log.exception
        # already records the full traceback
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0
    try:
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(
            options, connection, _repository_path, ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")
        # report the bytes reclaimed by each phase
        event_push_client.info(
            "rewrite complete",
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings
        )

    connection.close()
    event_push_client.close()
    zmq_context.term()

    return return_code
def main():
    """
    main entry point

    Drive a cluster repair pass: connect to the zfec server, start the
    read and write subprocesses, and run _repair_cluster until halted.

    return 0 for success (exit code), -3 on unhandled exception
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    context = zmq.Context()

    event_push_client = EventPushClient(context, "cluster_repair")
    event_push_client.info("program-start", "cluster_repair starts")

    # REQ socket used to talk to the zfec server
    req_socket = context.socket(zmq.REQ)
    req_socket.setsockopt(zmq.LINGER, 1000)
    log.info("connecting req socket to {0}".format(_zfec_server_address))
    req_socket.connect(_zfec_server_address)

    reader = _start_read_subprocess()
    writer = _start_write_subprocess()

    try:
        _repair_cluster(halt_event, req_socket, reader, writer)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return -3
    finally:
        # subprocesses and sockets are torn down on every exit path
        reader.terminate()
        writer.terminate()
        event_push_client.close()
        req_socket.close()
        context.term()

    log.info("program terminates normally")
    return 0
def main():
    """
    main entry point

    Garbage-collect segments: walk candidate partitions, evaluate each for
    collectable segment rows, and archive the collectable segment ids.

    return 0 for success (exit code), -1 if the database connection fails,
    -2 if collection raises
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0
    # buffer accumulating the ids of segments found collectable
    collectable_segment_ids = io.StringIO()

    partition_count = 0
    collectable_count = 0
    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            # a partition is treated as versioned when its collection id
            # appears in the versioned-collections set
            versioned_collection = \
                partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids,
                                        partition,
                                        versioned_collection)
            collectable_count += count
        archive_collectable_segment_rows(connection,
                                         collectable_segment_ids,
                                         options.max_node_offline_time)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("found {0:,} candidates, collected {1:,} segments".format(
            partition_count, collectable_count))
        log.info("program terminates normally")
    finally:
        # BUGFIX: close the StringIO buffer on every path; the original
        # only closed it on success, leaking it when collection raised
        collectable_segment_ids.close()

    connection.close()
    event_push_client.close()
    zmq_context.term()

    return return_code
def main():
    """
    main entry point

    Drive a cluster repair pass: connect to the zfec server, start the
    read and write subprocesses, and run _repair_cluster until halted.

    return 0 for success (exit code), -3 on unhandled exception
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "cluster_repair")
    event_push_client.info("program-start", "cluster_repair starts")

    # REQ socket used to talk to the zfec server
    zfec_socket = zeromq_context.socket(zmq.REQ)
    zfec_socket.setsockopt(zmq.LINGER, 1000)
    log.info("connecting req socket to {0}".format(_zfec_server_address))
    zfec_socket.connect(_zfec_server_address)

    read_proc = _start_read_subprocess()
    write_proc = _start_write_subprocess()

    try:
        _repair_cluster(halt_event, zfec_socket, read_proc, write_proc)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -3
    finally:
        # subprocesses and sockets are torn down on every exit path
        read_proc.terminate()
        write_proc.terminate()
        event_push_client.close()
        zfec_socket.close()
        zeromq_context.term()

    log.info("program terminates normally")
    return 0
def main():
    """
    main entry point

    Pull segment data from all nodes into a scratch directory, then audit it.

    return 0 for success (exit code), -1 if halted during the pull,
    -3 on unhandled exception
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_inspector")
    event_push_client.info("program-start", "cluster_inspector starts")

    # if there's any wreckage from a previous run, clear it out
    if os.path.exists(_work_dir):
        log.info("removing old {0}".format(_work_dir))
        shutil.rmtree(_work_dir)
    os.mkdir(_work_dir)

    return_code = 0
    try:
        pull_segments_from_nodes(halt_event, _work_dir)

        if halt_event.is_set():
            log.info("halt_event set (1): exiting")
            return_code = -1
        else:
            audit_segments(halt_event, _work_dir)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = -3

    # BUGFIX: close the push client and terminate the zeromq context on
    # every exit path; the original returned -1/-3 without cleanup,
    # leaking the socket and context
    event_push_client.close()
    zmq_context.term()

    if return_code == 0:
        log.info("program terminates normally")
    return return_code
def _setup(_halt_event, state):
    """Wire up the space accounting server's sockets and dispatchers.

    Populates the shared state dict and returns the initial
    (callback, next-run-time) pairs for the time queue.
    """
    log = logging.getLogger("_setup")

    zmq_context = state["zmq-context"]
    receive_queue = state["receive-queue"]
    pollster = state["pollster"]

    # create the event push client first, because we may need to
    # push an exception event from setup
    event_push_client = EventPushClient(zmq_context,
                                        "space_accounting_server")
    state["event-push-client"] = event_push_client

    log.info("binding router-server to %s" % (
        _space_accounting_server_address, ))
    router_server = RouterServer(zmq_context,
                                 _space_accounting_server_address,
                                 receive_queue)
    state["router-server"] = router_server
    router_server.register(pollster)

    log.info("binding pull-server to %s" % (
        _space_accounting_pipeline_address, ))
    pull_server = PULLServer(zmq_context,
                             _space_accounting_pipeline_address,
                             receive_queue)
    state["pull-server"] = pull_server
    pull_server.register(pollster)

    state["queue-dispatcher"] = DequeDispatcher(state,
                                                receive_queue,
                                                _dispatch_table)
    state["state-cleaner"] = StateCleaner(state)

    event_push_client.info("program-start", "space_accounting_server starts")

    # hand the pollster, queue-dispatcher and state-cleaner to the
    # time-queue
    return [(pollster.run, time.time()),
            (state["queue-dispatcher"].run, time.time()),
            (state["state-cleaner"].run, state["state-cleaner"].next_run())]
def _setup(_halt_event, state):
    """Wire up the event aggregator: PUB server, SUB clients, dispatcher.

    Populates the shared state dict and returns the initial
    (callback, next-run-time) pairs for the time queue.
    """
    log = logging.getLogger("_setup")
    log.info("starting up")

    zmq_context = state["zmq-context"]

    # create the event push client first, because we may need to
    # push an exception event from setup
    state["event-push-client"] = EventPushClient(zmq_context,
                                                 "event_aggregator")

    state["pub-server"] = PUBServer(zmq_context,
                                    _event_aggregator_pub_address)

    # subscribe to every configured event publisher, on all topics
    sub_clients = list()
    for pub_address in _event_publisher_pub_addresses:
        sub_client = SUBClient(zmq_context,
                               pub_address,
                               _subscribe_to_all_topics,
                               state["receive-queue"])
        sub_client.register(state["pollster"])
        sub_clients.append(sub_client)
    state["sub-clients"] = sub_clients

    state["callback-dispatcher"] = CallbackDispatcher(state,
                                                      state["receive-queue"],
                                                      _publish_event)

    state["event-push-client"].info("program-start", "event_aggregator starts")

    return [(state["pollster"].run, time.time()),
            (state["callback-dispatcher"].run, time.time())]
def __init__(self):
    """Assemble the web server: database connections, a unified-id factory,
    resilient zeromq clients for every data reader and writer node, space
    accounting clients, the id translator, and the WSGI application/server.
    """
    self._log = logging.getLogger("WebServer")
    authenticator = SqlAuthenticator()

    # database connections: central cluster metadata plus this node's
    # local database
    self._central_connection = get_central_connection()
    self._cluster_row = get_cluster_row(self._central_connection)
    self._node_local_connection = get_node_local_connection()

    # unified ids are sharded by cluster; the shard id comes from the
    # central database
    self._unified_id_factory = UnifiedIDFactory(
        self._central_connection,
        _get_shard_id(self._central_connection, self._cluster_row.id)
    )

    self._deliverator = Deliverator()

    self._zeromq_context = zmq.Context()

    # PULL server receiving replies on the web server pipeline address;
    # incoming messages are routed through the deliverator
    self._pull_server = GreenletPULLServer(
        self._zeromq_context,
        _web_server_pipeline_address,
        self._deliverator
    )
    self._pull_server.link_exception(self._unhandled_greenlet_exception)

    # message sent to data readers and writers telling them the server
    # is (re)starting, thereby invalidating any archives or retrieves
    # that are in progress for this node
    timestamp = create_timestamp()
    start_message = {
        "message-type"      : "web-server-start",
        "priority"          : create_priority(),
        "unified-id"        : self._unified_id_factory.next(),
        "timestamp-repr"    : repr(timestamp),
        "source-node-name"  : _local_node_name,
    }

    # one resilient client per data writer node; each one announces the
    # restart via start_message when it connects
    self._data_writer_clients = list()
    for node_name, address in zip(_node_names, _data_writer_addresses):
        resilient_client = GreenletResilientClient(
            self._zeromq_context,
            node_name,
            address,
            _client_tag,
            _web_server_pipeline_address,
            self._deliverator,
            connect_messages=[start_message, ]
        )
        resilient_client.link_exception(self._unhandled_greenlet_exception)
        self._data_writer_clients.append(resilient_client)

    # one resilient client per data reader node, each wrapped in a
    # DataReader
    self._data_reader_clients = list()
    self._data_readers = list()
    for node_name, address in zip(_node_names, _data_reader_addresses):
        resilient_client = GreenletResilientClient(
            self._zeromq_context,
            node_name,
            address,
            _client_tag,
            _web_server_pipeline_address,
            self._deliverator,
            connect_messages=[start_message, ]
        )
        resilient_client.link_exception(self._unhandled_greenlet_exception)
        self._data_reader_clients.append(resilient_client)
        data_reader = DataReader(
            node_name, resilient_client
        )
        self._data_readers.append(data_reader)

    # space accounting: DEALER for request/reply, PUSH for fire-and-forget
    self._space_accounting_dealer_client = GreenletDealerClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_server_address
    )
    self._space_accounting_dealer_client.link_exception(
        self._unhandled_greenlet_exception
    )

    push_client = GreenletPUSHClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_pipeline_address,
    )

    self._accounting_client = SpaceAccountingClient(
        _local_node_name,
        self._space_accounting_dealer_client,
        push_client
    )

    self._event_push_client = EventPushClient(
        self._zeromq_context,
        "web-server"
    )

    # watches the reader/writer clients and reports stats/events
    self._watcher = Watcher(
        _stats,
        self._data_reader_clients,
        self._data_writer_clients,
        self._event_push_client
    )

    # load the pickled keys used to translate internal ids
    # NOTE(review): text-mode "r" with pickle.load only works on
    # python 2 -- confirm the runtime, python 3 would require "rb"
    id_translator_keys_path = os.path.join(
        _repository_path, "id_translator_keys.pkl"
    )
    with open(id_translator_keys_path, "r") as input_file:
        id_translator_keys = pickle.load(input_file)

    self._id_translator = InternalIDTranslator(
        id_translator_keys["key"],
        id_translator_keys["hmac_key"],
        id_translator_keys["iv_key"],
        id_translator_keys["hmac_size"]
    )

    self.application = Application(
        self._central_connection,
        self._node_local_connection,
        self._cluster_row,
        self._unified_id_factory,
        self._id_translator,
        self._data_writer_clients,
        self._data_readers,
        authenticator,
        self._accounting_client,
        self._event_push_client,
        _stats
    )
    self.wsgi_server = WSGIServer(
        (_web_server_host, _web_server_port),
        application=self.application,
        backlog=_wsgi_backlog
    )
def main():
    """
    main entry point
    return 0 for success (exit code)

    Loop until halted: maintain a node-local database connection, and run
    one defrag pass per iteration inside a transaction.  The connection is
    dropped (and re-established later) whenever a pass defrags nothing.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    # connection and file_space_info are (re)built together each time the
    # connection is (re)opened
    connection = None
    file_space_info = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                # back off, then retry the connection on the next iteration
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection,
                                           file_space_info,
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:
            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()
            # close the database connection while idle; it will be
            # reopened at the top of the loop
            if connection is not None:
                connection.close()
                connection = None
            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()
    log.info("program terminates normally")
    return 0
def __init__(self, halt_event):
    """Assemble the public-reader web server: database interaction pools,
    space accounting clients, the id translator, a redis stats sink, and
    the WSGI application/server.

    halt_event -- shared Event handed to the redis sink greenlet so it can
    stop when the program shuts down
    """
    self._log = logging.getLogger("WebServer")
    memcached_client = create_memcached_client()

    # central database pool, plus an additional pool for this node's
    # local database
    self._interaction_pool = \
        gdbpool.interaction_pool.DBInteractionPool(
            get_central_database_dsn(),
            pool_name=_central_pool_name,
            pool_size=_central_database_pool_size,
            do_log=True)

    self._interaction_pool.add_pool(
        dsn=get_node_local_database_dsn(),
        pool_name=_local_node_name,
        pool_size=_local_database_pool_size)

    # Ticket #25: must run database operation in a greenlet
    greenlet = gevent.Greenlet.spawn(_get_cluster_row,
                                     self._interaction_pool)
    greenlet.join()
    self._cluster_row = greenlet.get()

    authenticator = \
        InteractionPoolAuthenticator(memcached_client,
                                     self._interaction_pool)

    self._zeromq_context = zmq.Context()

    # space accounting: DEALER for request/reply, PUSH for fire-and-forget
    self._space_accounting_dealer_client = GreenletDealerClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_server_address)
    self._space_accounting_dealer_client.link_exception(
        self._unhandled_greenlet_exception)

    push_client = GreenletPUSHClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_pipeline_address, )

    self._accounting_client = SpaceAccountingClient(
        _local_node_name,
        self._space_accounting_dealer_client,
        push_client)

    self._event_push_client = EventPushClient(self._zeromq_context,
                                              "web-server")

    # load the pickled keys used to translate internal ids; path may be
    # overridden through the environment
    # NOTE(review): text-mode "r" with pickle.load only works on
    # python 2 -- confirm the runtime, python 3 would require "rb"
    id_translator_keys_path = os.environ.get(
        "NIMBUS_IO_ID_TRANSLATION_KEYS",
        os.path.join(_repository_path, "id_translator_keys.pkl"))
    with open(id_translator_keys_path, "r") as input_file:
        id_translator_keys = pickle.load(input_file)

    self._id_translator = InternalIDTranslator(
        id_translator_keys["key"],
        id_translator_keys["hmac_key"],
        id_translator_keys["iv_key"],
        id_translator_keys["hmac_size"])

    # operational stats flow through this queue into redis
    redis_queue = gevent.queue.Queue()

    self._redis_sink = OperationalStatsRedisSink(halt_event,
                                                 redis_queue,
                                                 _local_node_name)
    self._redis_sink.link_exception(self._unhandled_greenlet_exception)

    self.application = Application(self._interaction_pool,
                                   self._cluster_row,
                                   self._id_translator,
                                   authenticator,
                                   self._accounting_client,
                                   self._event_push_client,
                                   redis_queue)
    self.wsgi_server = WSGIServer(
        (_web_public_reader_host, _web_public_reader_port),
        application=self.application,
        backlog=_wsgi_backlog,
        log=sys.stdout)
def main():
    """
    main entry point

    Monitor service availability: start ping subprocesses, then loop
    receiving their reports on a PULL socket until halted.

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    prepare_ipc_path(_pull_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()
    pull_socket = _bind_pull_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context,
                                        "service_availability")
    event_push_client.info("program-starts",
                           "service availability monitor starts")

    message_count = 0
    # BUGFIX: initialize before the try block; if _start_ping_processes
    # raises, the cleanup below would otherwise hit a NameError on
    # ping_process_dict, masking the original error
    ping_process_dict = dict()
    try:
        ping_process_dict = _start_ping_processes(halt_event)
        while not halt_event.is_set():
            # periodically poll the ping subprocesses so crashes are noticed
            if message_count % len(ping_process_dict) == 0:
                for ping_process in ping_process_dict.values():
                    poll_subprocess(ping_process.process)
            message = pull_socket.recv_pyobj()
            assert not pull_socket.rcvmore
            _process_one_message(message,
                                 ping_process_dict,
                                 event_push_client)
            message_count += 1
    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminating normally; interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        # BUGFIX: corrected 'teminating' typo in the log message
        log.info("program terminating normally")

    log.debug("terminating subprocesses")
    _terminate_ping_processes(ping_process_dict)
    pull_socket.close()
    event_push_client.close()
    zeromq_context.term()
    return return_value
def main():
    """
    main entry point
    return 0 for success (exit code)

    Loop until halted: maintain a node-local database connection, and run
    one defrag pass per iteration inside a transaction.  The connection is
    dropped (and re-established later) whenever a pass defrags nothing.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    # connection and file_space_info are (re)built together each time the
    # connection is (re)opened
    connection = None
    file_space_info = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception("database exception",
                                            str(value),
                                            exctype=exctype.__name__)
                log.exception("Exception connecting to database")
                # back off, then retry the connection on the next iteration
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection,
                                           file_space_info,
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(unhandled_exception_topic,
                                        str(instance),
                                        exctype=instance.__class__.__name__)
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:
            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()
            # close the database connection while idle; it will be
            # reopened at the top of the loop
            if connection is not None:
                connection.close()
                connection = None
            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()
    log.info("program terminates normally")
    return 0
def main():
    """
    main entry point

    Flush operational stats from every node's Redis instance into the
    central database, deduplicating against previously flushed keys.

    return 0 for success (exit code), 1 on unhandled exception
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context,
                                        "redis_stats_collector")
    event_push_client.info("program-start", "flush_stats_from_redis starts")

    # don't flush anything newer than 1 minute ago
    current_time = datetime.utcnow()
    timestamp_cutoff = current_time - timedelta(minutes=1)

    return_code = 0
    central_db_connection = None

    collection_ops_accounting_rows = list()

    # values to be added to the dedupe table
    new_dedupes = list()

    # keys to be deleted (a list for each node)
    node_keys_processed = [list() for _ in _node_names]

    try:
        central_db_connection = get_central_connection()

        # On startup, the program connects to the central database and tries
        # to acquire a pg_advisory_lock appropriate for this program and the
        # data center it is running in using the pg_try_advisory_lock
        # function. If it cannot acquire the lock, it notes the status of
        # the lock and exits. This central locking mechanism lets us avoid
        # single points of failure by configuring the program to run on
        # multiple nodes.
        with advisory_lock(central_db_connection, "redis_stats_collector"):
            node_dict = _retrieve_node_dict(central_db_connection)
            for node_name, keys_processed in \
                zip(_node_names, node_keys_processed):

                node_id = node_dict[node_name]
                log.debug("processing node {0} node_id={1}".format(node_name,
                                                                   node_id))

                # The program then selects into memory all recently collected
                # keys from the central database table
                # collection_ops_accounting_flush_dedupe and stores them in a
                # dedupe set. This set allows runs of the collection/flush
                # program to be idempotent across some time period (
                # but we won't keep the list of old keys forever.)
                dedupe_set = _retrieve_dedupe_set(central_db_connection,
                                                  node_id)

                # The program then visits the Redis instance on every storage
                # node in the local data center, collecting the data from all
                # past stats keys -- aggregating it into the program's
                # memory. The aggregation should involve buckets for each
                # storage_node_id and redis key, corresponding to the columns
                # in the database.
                _process_one_node(node_name,
                                  node_dict[node_name],
                                  timestamp_cutoff,
                                  dedupe_set,
                                  collection_ops_accounting_rows,
                                  new_dedupes,
                                  keys_processed)

            # After collecting past keys from every storage node,
            # inside a central database transaction:
            # 1. Insert the collected stats into the central database
            #    collection_ops_accounting
            # 2. Insert collected keys into recently collected keys
            #    collection_ops_accounting_flush_dedupe.
            # 3. commit transaction
            log.debug("updating central database")
            central_db_connection.begin_transaction()
            try:
                _insert_accounting_rows(central_db_connection,
                                        collection_ops_accounting_rows)
                _insert_dedupe_rows(central_db_connection,
                                    timestamp_cutoff,
                                    new_dedupes)
            except Exception:
                central_db_connection.rollback()
                raise
            else:
                central_db_connection.commit()

            # Then revisit the Redis nodes, and delete the keys we flushed
            # into the database, and any keys we skipped because they were
            # found in the dedupe set.
            for node_name, keys_processed in zip(_node_names,
                                                 node_keys_processed):
                _remove_processed_keys(node_name, keys_processed)

    except Exception as instance:
        # BUGFIX: corrected 'Uhandled' typo in the log message
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = 1

    if central_db_connection is not None:
        central_db_connection.close()

    event_push_client.close()
    zeromq_context.term()
    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
def main():
    """
    main entry point
    return 0 for success (exit code)

    Loop until SIGTERM: maintain a node-local database connection and run
    one defrag pass per iteration inside a transaction.  The connection is
    dropped (and re-established later) whenever a pass defrags nothing.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    signal.signal(signal.SIGTERM, _create_signal_handler(halt_event))

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    connection = None
    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                # back off, then retry the connection on the next iteration
                halt_event.wait(_database_retry_interval)
                continue

        # start a transaction
        connection.execute("begin")

        # try one defrag pass
        bytes_defragged = 0
        try:
            bytes_defragged = _defrag_pass(connection, event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:
            # close the database connection while idle; it will be
            # reopened at the top of the loop
            if connection is not None:
                connection.close()
                connection = None
            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()
    log.info("program terminates normally")
    return 0
def main():
    """
    Main entry point for an rs_io_worker subprocess.

    Takes the volume name and worker number from argv, connects a DEALER
    socket to the io controller, and services work requests until the
    halt event is set.  Returns 0 for normal termination (usually SIGTERM).
    """
    return_value = 0

    volume_name = sys.argv[1]
    worker_number = int(sys.argv[2])

    # "/" in the volume name would create subdirectories in the log path
    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         volume_name.replace("/", "_"),
                                         worker_number,
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_io_worker_{0}_{1}".format(volume_name,
                                                      worker_number)

    # bundle all shared state into one named tuple that gets passed to
    # the helper functions
    resources = \
        _resources_tuple(halt_event=halt_event,
                         zeromq_context=zeromq_context,
                         reply_push_sockets=dict(),
                         event_push_client=EventPushClient(zeromq_context,
                                                           event_source_name),
                         dealer_socket=zeromq_context.socket(zmq.DEALER),
                         file_cache=LRUCache(_max_file_cache_size))

    resources.dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(io_controller_router_socket_uri))
    resources.dealer_socket.connect(io_controller_router_socket_uri)

    last_close_pass_time = time.time()

    try:
        while not halt_event.is_set():

            # an occasional pass that closes any open files that haven't
            # been used
            current_time = time.time()
            elapsed_time = current_time - last_close_pass_time
            if elapsed_time > _unused_file_close_interval:
                _make_close_pass(resources, current_time)
                last_close_pass_time = current_time

            _send_work_request(resources, volume_name)
            _process_request(resources)

    except InterruptedSystemCall:
        # an interrupted system call is the normal way the blocking zmq
        # calls get unblocked at shutdown; only an error if not halting
        if halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(
                unhandled_exception_topic,
                "Interrupted zeromq system call",
                exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # close every socket before terminating the context, or term()
        # would block
        resources.dealer_socket.close()
        for push_socket in resources.reply_push_sockets.values():
            push_socket.close()
        resources.event_push_client.close()
        resources.zeromq_context.term()

    return return_value
def __init__(self):
    """
    Wire up the web internal reader: database connections, zeromq
    greenlet clients for every data reader node, space accounting
    clients, and the WSGI server.
    """
    self._log = logging.getLogger("WebInternalReader")

    memcached_client = memcache.Client(_memcached_nodes)

    self._central_connection = get_central_connection()
    self._cluster_row = get_cluster_row(self._central_connection)
    self._node_local_connection = get_node_local_connection()
    self._deliverator = Deliverator()

    self._zeromq_context = zmq.Context()

    # PULL server receiving replies routed through the deliverator
    self._pull_server = GreenletPULLServer(
        self._zeromq_context,
        _web_internal_reader_pipeline_address,
        self._deliverator)
    self._pull_server.link_exception(self._unhandled_greenlet_exception)

    # one resilient client + DataReader wrapper per data reader node
    self._data_reader_clients = list()
    self._data_readers = list()
    for node_name, address in zip(_node_names, _data_reader_addresses):
        resilient_client = GreenletResilientClient(
            self._zeromq_context,
            node_name,
            address,
            _client_tag,
            _web_internal_reader_pipeline_address,
            self._deliverator,
            connect_messages=[])
        resilient_client.link_exception(self._unhandled_greenlet_exception)
        self._data_reader_clients.append(resilient_client)
        data_reader = DataReader(node_name, resilient_client)
        self._data_readers.append(data_reader)

    self._space_accounting_dealer_client = GreenletDealerClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_server_address)
    self._space_accounting_dealer_client.link_exception(
        self._unhandled_greenlet_exception)

    push_client = GreenletPUSHClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_pipeline_address,
    )

    self._accounting_client = SpaceAccountingClient(
        _local_node_name,
        self._space_accounting_dealer_client,
        push_client)

    self._event_push_client = EventPushClient(self._zeromq_context,
                                              "web-internal-reader")

    # message sent to data readers telling them the server
    # is (re)starting, thereby invalidating any archives or retrieves
    # that are in progress for this node
    timestamp = create_timestamp()
    self._event_push_client.info("web-reader-start",
                                 "web reader (re)start",
                                 timestamp_repr=repr(timestamp),
                                 source_node_name=_local_node_name)

    self._watcher = Watcher(_stats,
                            self._data_reader_clients,
                            self._event_push_client)

    self.application = Application(memcached_client,
                                   self._central_connection,
                                   self._node_local_connection,
                                   self._cluster_row,
                                   self._data_readers,
                                   self._accounting_client,
                                   self._event_push_client,
                                   _stats)
    self.wsgi_server = WSGIServer(
        (_web_internal_reader_host, _web_internal_reader_port),
        application=self.application,
        backlog=_wsgi_backlog)
def main():
    """
    Main entry point for an rs_dbpool_worker subprocess.

    Takes the worker number from argv, connects a DEALER socket to the
    db controller, opens a node-local database connection, and processes
    transactions until the halt event is set.  Returns 0 for normal
    termination (usually SIGTERM).
    """
    return_value = 0

    worker_number = int(sys.argv[1])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         worker_number,
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_dbpool_worker_{0}".format(worker_number)
    event_push_client = EventPushClient(zeromq_context, event_source_name)

    dealer_socket = zeromq_context.socket(zmq.DEALER)
    dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(db_controller_router_socket_uri))
    dealer_socket.connect(db_controller_router_socket_uri)

    log.debug("opening local database connection")
    database_connection = get_node_local_connection()

    try:
        # announce ourselves to the controller, then service one
        # transaction at a time until told to halt
        _send_initial_work_request(dealer_socket)
        while not halt_event.is_set():
            _process_one_transaction(dealer_socket,
                                     database_connection,
                                     event_push_client)
    except InterruptedSystemCall:
        # interrupted system call is the expected unblock at shutdown;
        # only report it if we were not halting
        if halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "Interrupted zeromq system call",
                                        exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        database_connection.close()
        dealer_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def main():
    """
    Main entry point for the node inspector.

    Parses the configured maximum value-file age, then walks work
    batches from the node-local database inside a single transaction.

    Returns 0 for success, -1 on failure (exit code).
    """
    global _max_value_file_time
    initialize_logging(_log_path)
    log = logging.getLogger("main")

    # a bad max-value-file-time config string is fatal
    try:
        _max_value_file_time = parse_timedelta_str(_max_value_file_time_str)
    except Exception as instance:
        log.exception("Unable to parse '{0}' {1}".format(
            _max_value_file_time_str, instance))
        return -1

    log.info("program starts; max_value_file_time = {0}".format(
        _max_value_file_time))

    zmq_context = zmq.Context()
    event_push_client = EventPushClient(zmq_context, "node_inspector")
    event_push_client.info("program-start", "node_inspector starts")

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        # fix: release zeromq resources on this early return; the
        # original leaked the push client and the context here
        event_push_client.close()
        zmq_context.term()
        return -1

    known_value_files = dict()

    connection.begin_transaction()
    # fix: pre-bind 'batch' so the except clause below cannot raise a
    # NameError (masking the real error) if generate_work fails before
    # yielding its first batch
    batch = None
    try:
        for batch in generate_work(connection):
            _process_work_batch(connection, known_value_files, batch)
    except Exception as instance:
        connection.rollback()
        log.exception("Exception processing batch {0} {1}".format(
            batch, instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        # fix: release zeromq resources on this early return as well
        event_push_client.close()
        zmq_context.term()
        return -1
    else:
        connection.commit()
    finally:
        connection.close()

    event_push_client.close()
    zmq_context.term()
    log.info("program terminates normally")
    return 0
def __init__(self, halt_event):
    """
    Wire up the web writer: central database interaction pool,
    authenticator, zeromq greenlet clients for every data writer node,
    space accounting clients, id translator, redis stats sink, and the
    WSGI server.

    halt_event -- Event used to stop the redis sink greenlet
    """
    self._log = logging.getLogger("WebWriter")
    memcached_client = memcache.Client(_memcached_nodes)

    self._interaction_pool = gdbpool.interaction_pool.DBInteractionPool(
        get_central_database_dsn(),
        pool_name=_central_pool_name,
        pool_size=_database_pool_size,
        do_log=True)

    authenticator = InteractionPoolAuthenticator(memcached_client,
                                                 self._interaction_pool)

    # Ticket #25: must run database operation in a greenlet
    greenlet = gevent.Greenlet.spawn(_get_cluster_row_and_node_row,
                                     self._interaction_pool)
    greenlet.join()
    self._cluster_row, node_row = greenlet.get()

    self._unified_id_factory = UnifiedIDFactory(node_row.id)

    self._deliverator = Deliverator()

    self._zeromq_context = zmq.Context()

    self._pull_server = GreenletPULLServer(self._zeromq_context,
                                           _web_writer_pipeliner_address,
                                           self._deliverator)
    self._pull_server.link_exception(self._unhandled_greenlet_exception)

    # one resilient client per data writer node
    self._data_writer_clients = list()
    for node_name, address in zip(_node_names, _data_writer_addresses):
        resilient_client = GreenletResilientClient(
            self._zeromq_context,
            node_name,
            address,
            _client_tag,
            _web_writer_pipeliner_address,
            self._deliverator,
            connect_messages=[])
        resilient_client.link_exception(self._unhandled_greenlet_exception)
        self._data_writer_clients.append(resilient_client)

    self._space_accounting_dealer_client = GreenletDealerClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_server_address)
    self._space_accounting_dealer_client.link_exception(
        self._unhandled_greenlet_exception)

    push_client = GreenletPUSHClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_pipeline_address,
    )

    self._accounting_client = SpaceAccountingClient(
        _local_node_name,
        self._space_accounting_dealer_client,
        push_client)

    self._event_push_client = EventPushClient(self._zeromq_context,
                                              "web-server")

    # message sent to data writers telling them the server
    # is (re)starting, thereby invalidating any archives
    # that are in progress for this node
    unified_id = self._unified_id_factory.next()
    timestamp = create_timestamp()
    self._event_push_client.info("web-writer-start",
                                 "web writer (re)start",
                                 unified_id=unified_id,
                                 timestamp_repr=repr(timestamp),
                                 source_node_name=_local_node_name)

    id_translator_keys_path = os.environ.get(
        "NIMBUS_IO_ID_TRANSLATION_KEYS",
        os.path.join(_repository_path, "id_translator_keys.pkl"))
    # NOTE(review): pickle.load from a text-mode ("r") file only works
    # on Python 2; Python 3 requires "rb" -- confirm target interpreter
    with open(id_translator_keys_path, "r") as input_file:
        id_translator_keys = pickle.load(input_file)

    self._id_translator = InternalIDTranslator(
        id_translator_keys["key"],
        id_translator_keys["hmac_key"],
        id_translator_keys["iv_key"],
        id_translator_keys["hmac_size"])

    redis_queue = gevent.queue.Queue()
    self._redis_sink = OperationalStatsRedisSink(halt_event,
                                                 redis_queue,
                                                 _local_node_name)
    self._redis_sink.link_exception(self._unhandled_greenlet_exception)

    self.application = Application(self._cluster_row,
                                   self._unified_id_factory,
                                   self._id_translator,
                                   self._data_writer_clients,
                                   authenticator,
                                   self._accounting_client,
                                   self._event_push_client,
                                   redis_queue)
    self.wsgi_server = WSGIServer((_web_writer_host, _web_writer_port),
                                  application=self.application,
                                  backlog=_wsgi_backlog)
def main():
    """
    Main entry point for the retrieve source server.

    Launches the database pool controller and io controller
    subprocesses, binds a REP socket, and services requests in a poll
    loop until the halt event is set.  Returns 0 for normal termination
    (usually SIGTERM).
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # make sure the ipc socket directories exist before binding
    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = _connect_db_controller_push_socket(zeromq_context)
    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability, we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            # keep the subprocesses alive (restart/report as needed)
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we've only registered one socket, so we could use an 'if' here,
            # but this 'for' works ok and it has the same form as the other
            # places where we use poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                assert active_socket is rep_socket
                _process_one_request(rep_socket, db_controller_push_socket)
                request_count += 1

            # periodically report the request count, then reset it
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0

    except KeyboardInterrupt:  # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        # an interrupted system call is the normal unblock at shutdown
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def main():
    """
    Main entry point for the retrieve source server.

    Launches the database pool controller and io controller
    subprocesses, binds a REP socket, and services requests in a poll
    loop until the halt event is set.  Returns 0 for normal termination
    (usually SIGTERM).
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # make sure the ipc socket directories exist before binding
    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = \
        _connect_db_controller_push_socket(zeromq_context)
    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability, we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            # keep the subprocesses alive (restart/report as needed)
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we've only registered one socket, so we could use an 'if' here,
            # but this 'for' works ok and it has the same form as the other
            # places where we use poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                assert active_socket is rep_socket
                _process_one_request(rep_socket, db_controller_push_socket)
                request_count += 1

            # periodically report the request count, then reset it
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0

    except KeyboardInterrupt:  # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        # an interrupted system call is the normal unblock at shutdown
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def __init__(self, halt_event):
    """
    Wire up the web public reader server: central and node-local
    database pools, authenticator, space accounting clients, id
    translator, redis stats sink, and the WSGI server.

    halt_event -- Event used to stop the redis sink greenlet
    """
    self._log = logging.getLogger("WebServer")
    memcached_client = create_memcached_client()

    self._interaction_pool = \
        gdbpool.interaction_pool.DBInteractionPool(
            get_central_database_dsn(),
            pool_name=_central_pool_name,
            pool_size=_central_database_pool_size,
            do_log=True)

    # second pool for the node-local database, keyed by node name
    self._interaction_pool.add_pool(
        dsn=get_node_local_database_dsn(),
        pool_name=_local_node_name,
        pool_size=_local_database_pool_size)

    # Ticket #25: must run database operation in a greenlet
    greenlet = gevent.Greenlet.spawn(_get_cluster_row, self._interaction_pool)
    greenlet.join()
    self._cluster_row = greenlet.get()

    authenticator = \
        InteractionPoolAuthenticator(memcached_client, self._interaction_pool)

    self._zeromq_context = zmq.Context()

    self._space_accounting_dealer_client = GreenletDealerClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_server_address)
    self._space_accounting_dealer_client.link_exception(
        self._unhandled_greenlet_exception)

    push_client = GreenletPUSHClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_pipeline_address,
    )

    self._accounting_client = SpaceAccountingClient(
        _local_node_name,
        self._space_accounting_dealer_client,
        push_client)

    self._event_push_client = EventPushClient(
        self._zeromq_context,
        "web-server")

    id_translator_keys_path = os.environ.get(
        "NIMBUS_IO_ID_TRANSLATION_KEYS",
        os.path.join(_repository_path, "id_translator_keys.pkl"))
    # NOTE(review): pickle.load from a text-mode ("r") file only works
    # on Python 2; Python 3 requires "rb" -- confirm target interpreter
    with open(id_translator_keys_path, "r") as input_file:
        id_translator_keys = pickle.load(input_file)

    self._id_translator = InternalIDTranslator(
        id_translator_keys["key"],
        id_translator_keys["hmac_key"],
        id_translator_keys["iv_key"],
        id_translator_keys["hmac_size"])

    redis_queue = gevent.queue.Queue()
    self._redis_sink = OperationalStatsRedisSink(halt_event,
                                                 redis_queue,
                                                 _local_node_name)
    self._redis_sink.link_exception(self._unhandled_greenlet_exception)

    self.application = Application(
        self._interaction_pool,
        self._cluster_row,
        self._id_translator,
        authenticator,
        self._accounting_client,
        self._event_push_client,
        redis_queue)

    self.wsgi_server = WSGIServer(
        (_web_public_reader_host, _web_public_reader_port),
        application=self.application,
        backlog=_wsgi_backlog,
        log=sys.stdout)
def __init__(self):
    """
    Wire up the web internal reader: database connections, zeromq
    greenlet clients for every data reader node, space accounting
    clients, and the WSGI server.
    """
    self._log = logging.getLogger("WebInternalReader")

    memcached_client = memcache.Client(_memcached_nodes)

    self._central_connection = get_central_connection()
    self._cluster_row = get_cluster_row(self._central_connection)
    self._node_local_connection = get_node_local_connection()
    self._deliverator = Deliverator()

    self._zeromq_context = zmq.Context()

    # PULL server receiving replies routed through the deliverator
    self._pull_server = GreenletPULLServer(
        self._zeromq_context,
        _web_internal_reader_pipeline_address,
        self._deliverator)
    self._pull_server.link_exception(self._unhandled_greenlet_exception)

    # one resilient client + DataReader wrapper per data reader node
    self._data_reader_clients = list()
    self._data_readers = list()
    for node_name, address in zip(_node_names, _data_reader_addresses):
        resilient_client = GreenletResilientClient(
            self._zeromq_context,
            node_name,
            address,
            _client_tag,
            _web_internal_reader_pipeline_address,
            self._deliverator,
            connect_messages=[])
        resilient_client.link_exception(self._unhandled_greenlet_exception)
        self._data_reader_clients.append(resilient_client)
        data_reader = DataReader(
            node_name, resilient_client)
        self._data_readers.append(data_reader)

    self._space_accounting_dealer_client = GreenletDealerClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_server_address)
    self._space_accounting_dealer_client.link_exception(
        self._unhandled_greenlet_exception)

    push_client = GreenletPUSHClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_pipeline_address,
    )

    self._accounting_client = SpaceAccountingClient(
        _local_node_name,
        self._space_accounting_dealer_client,
        push_client)

    self._event_push_client = EventPushClient(
        self._zeromq_context,
        "web-internal-reader")

    # message sent to data readers telling them the server
    # is (re)starting, thereby invalidating any archives or retrieves
    # that are in progress for this node
    timestamp = create_timestamp()
    self._event_push_client.info("web-reader-start",
                                 "web reader (re)start",
                                 timestamp_repr=repr(timestamp),
                                 source_node_name=_local_node_name)

    self._watcher = Watcher(
        _stats,
        self._data_reader_clients,
        self._event_push_client)

    self.application = Application(
        memcached_client,
        self._central_connection,
        self._node_local_connection,
        self._cluster_row,
        self._data_readers,
        self._accounting_client,
        self._event_push_client,
        _stats)
    self.wsgi_server = WSGIServer(
        (_web_internal_reader_host, _web_internal_reader_port),
        application=self.application,
        backlog=_wsgi_backlog)
def main():
    """
    Main entry point for the rs_io_controller.

    Launches io worker subprocesses per volume, binds PULL and ROUTER
    sockets, and dispatches work in a poll loop until the halt event is
    set.  Returns 0 for normal termination (usually SIGTERM).
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    # fix: share the signal-handled halt_event with the resources tuple;
    # the original passed a brand-new Event() that the SIGTERM handler
    # never sets (the sibling io worker passes the shared halt_event)
    resources = \
        _resources_tuple(halt_event=halt_event,
                         volume_by_space_id=_volume_name_by_space_id(),
                         pull_socket=zeromq_context.socket(zmq.PULL),
                         router_socket=zeromq_context.socket(zmq.ROUTER),
                         event_push_client=\
                            EventPushClient(zeromq_context,
                                            "rs_io_controller"),
                         pending_work_by_volume=defaultdict(deque),
                         available_ident_by_volume=defaultdict(deque))

    log.debug("binding to {0}".format(io_controller_pull_socket_uri))
    resources.pull_socket.bind(io_controller_pull_socket_uri)

    resources.router_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("binding to {0}".format(io_controller_router_socket_uri))
    resources.router_socket.bind(io_controller_router_socket_uri)

    # we poll the sockets for readability, we assume we can always
    # write to the router socket
    poller = zmq.Poller()
    poller.register(resources.pull_socket, zmq.POLLIN | zmq.POLLERR)
    poller.register(resources.router_socket, zmq.POLLIN | zmq.POLLERR)

    # launch _worker_count workers for each distinct volume
    worker_processes = list()
    for volume_name in set(resources.volume_by_space_id.values()):
        for index in range(_worker_count):
            worker_processes.append(_launch_io_worker(volume_name, index + 1))

    last_report_time = 0.0
    try:
        while not halt_event.is_set():
            for worker_process in worker_processes:
                poll_subprocess(worker_process)
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)

                if active_socket is resources.pull_socket:
                    _read_pull_socket(resources)
                elif active_socket is resources.router_socket:
                    _read_router_socket(resources)
                else:
                    log.error("unknown socket {0}".format(active_socket))

            # periodically report the size of the pending-work queues
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                pending_work = 0
                for volume_queue in resources.pending_work_by_volume.values():
                    pending_work += len(volume_queue)
                report_message = \
                    "{0:,} pending_work entries".format(pending_work)
                log.info(report_message)
                resources.event_push_client.info("queue_sizes",
                                                 report_message,
                                                 pending_work=pending_work)
                last_report_time = current_time

    except zmq.ZMQError as zmq_error:
        # an interrupted system call is the normal unblock at shutdown
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(unhandled_exception_topic,
                                                  "zeromq_error",
                                                  exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        for worker_process in worker_processes:
            terminate_subprocess(worker_process)
        resources.pull_socket.close()
        resources.router_socket.close()
        resources.event_push_client.close()
        zeromq_context.term()

    return return_value
def __init__(self, halt_event):
    """
    Wire up the web writer: central database interaction pool,
    authenticator, zeromq greenlet clients for every data writer node,
    space accounting clients, id translator, redis stats sink, and the
    WSGI server.

    halt_event -- Event used to stop the redis sink greenlet
    """
    self._log = logging.getLogger("WebWriter")
    memcached_client = memcache.Client(_memcached_nodes)

    self._interaction_pool = gdbpool.interaction_pool.DBInteractionPool(
        get_central_database_dsn(),
        pool_name=_central_pool_name,
        pool_size=_database_pool_size,
        do_log=True)

    authenticator = InteractionPoolAuthenticator(memcached_client,
                                                 self._interaction_pool)

    # Ticket #25: must run database operation in a greenlet
    greenlet = gevent.Greenlet.spawn(_get_cluster_row_and_node_row,
                                     self._interaction_pool)
    greenlet.join()
    self._cluster_row, node_row = greenlet.get()

    self._unified_id_factory = UnifiedIDFactory(node_row.id)

    self._deliverator = Deliverator()

    self._zeromq_context = zmq.Context()

    self._pull_server = GreenletPULLServer(self._zeromq_context,
                                           _web_writer_pipeliner_address,
                                           self._deliverator)
    self._pull_server.link_exception(self._unhandled_greenlet_exception)

    # one resilient client per data writer node
    self._data_writer_clients = list()
    for node_name, address in zip(_node_names, _data_writer_addresses):
        resilient_client = GreenletResilientClient(
            self._zeromq_context,
            node_name,
            address,
            _client_tag,
            _web_writer_pipeliner_address,
            self._deliverator,
            connect_messages=[])
        resilient_client.link_exception(self._unhandled_greenlet_exception)
        self._data_writer_clients.append(resilient_client)

    self._space_accounting_dealer_client = GreenletDealerClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_server_address)
    self._space_accounting_dealer_client.link_exception(
        self._unhandled_greenlet_exception)

    push_client = GreenletPUSHClient(
        self._zeromq_context,
        _local_node_name,
        _space_accounting_pipeline_address,
    )

    self._accounting_client = SpaceAccountingClient(
        _local_node_name,
        self._space_accounting_dealer_client,
        push_client)

    self._event_push_client = EventPushClient(self._zeromq_context,
                                              "web-server")

    # message sent to data writers telling them the server
    # is (re)starting, thereby invalidating any archives
    # that are in progress for this node
    unified_id = self._unified_id_factory.next()
    timestamp = create_timestamp()
    self._event_push_client.info("web-writer-start",
                                 "web writer (re)start",
                                 unified_id=unified_id,
                                 timestamp_repr=repr(timestamp),
                                 source_node_name=_local_node_name)

    id_translator_keys_path = os.environ.get(
        "NIMBUS_IO_ID_TRANSLATION_KEYS",
        os.path.join(_repository_path, "id_translator_keys.pkl"))
    # NOTE(review): pickle.load from a text-mode ("r") file only works
    # on Python 2; Python 3 requires "rb" -- confirm target interpreter
    with open(id_translator_keys_path, "r") as input_file:
        id_translator_keys = pickle.load(input_file)

    self._id_translator = InternalIDTranslator(
        id_translator_keys["key"],
        id_translator_keys["hmac_key"],
        id_translator_keys["iv_key"],
        id_translator_keys["hmac_size"])

    redis_queue = gevent.queue.Queue()
    self._redis_sink = OperationalStatsRedisSink(halt_event,
                                                 redis_queue,
                                                 _local_node_name)
    self._redis_sink.link_exception(self._unhandled_greenlet_exception)

    self.application = Application(self._cluster_row,
                                   self._unified_id_factory,
                                   self._id_translator,
                                   self._data_writer_clients,
                                   authenticator,
                                   self._accounting_client,
                                   self._event_push_client,
                                   redis_queue)
    self.wsgi_server = WSGIServer((_web_writer_host, _web_writer_port),
                                  application=self.application,
                                  backlog=_wsgi_backlog)