Example #1
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")  

    return_code = 0

    collectable_segment_ids = io.StringIO()

    partition_count = 0
    collectable_count = 0

    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            versioned_collection = \
                    partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids, 
                                        partition,
                                        versioned_collection)
            collectable_count += count
        archive_collectable_segment_rows(connection, 
                                         collectable_segment_ids,
                                         options.max_node_offline_time)
        collectable_segment_ids.close()
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info(
            "found {0:,} candidates, collected {1:,} segments".format(
                partition_count, collectable_count
            )
        )
        log.info("program terminates normally")

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
Example #2
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    args = parse_commandline()

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "handoff_client")
    event_push_client.info("program-start", "handoff_client starts")  

    return_code = 0
    node_databases = None
    try:
        node_dict = get_node_ids(args.node_name)
        node_databases = get_node_databases()
        conjoined_rows, segment_rows = \
            get_handoff_rows(node_databases, node_dict[args.node_name])
        log.info("found {0} conjoined and {1} segment handoffs".format(
            len(conjoined_rows), len(segment_rows)))
        if len(conjoined_rows) > 0:
            process_conjoined_rows(halt_event, 
                                   args, 
                                   node_databases, 
                                   conjoined_rows)
        if len(segment_rows) > 0:
            process_segment_rows(halt_event, 
                                 zeromq_context, 
                                 args, 
                                 node_dict,
                                 node_databases,
                                 segment_rows)
    except Exception as instance:
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = 1

    if node_databases is not None:
        for connection in node_databases.values():
            connection.close()
    event_push_client.close()
    zeromq_context.term()

    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
Example #3
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0

    try:
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(options, connection, _repository_path,
                                      ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")

        event_push_client.info(
            "rewrite complete",
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings)

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
Example #4
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    args = parse_commandline()

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "handoff_client")
    event_push_client.info("program-start", "handoff_client starts")

    return_code = 0
    node_databases = None
    try:
        node_dict = get_node_ids(args.node_name)
        node_databases = get_node_databases()
        conjoined_rows, segment_rows = \
            get_handoff_rows(node_databases, node_dict[args.node_name])
        log.info("found {0} conjoined and {1} segment handoffs".format(
            len(conjoined_rows), len(segment_rows)))
        if len(conjoined_rows) > 0:
            process_conjoined_rows(halt_event, args, node_databases,
                                   conjoined_rows)
        if len(segment_rows) > 0:
            process_segment_rows(halt_event, zeromq_context, args, node_dict,
                                 node_databases, segment_rows)
    except Exception as instance:
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_code = 1

    if node_databases is not None:
        for connection in node_databases.values():
            connection.close()
    event_push_client.close()
    zeromq_context.term()

    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
Example #5
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_inspector")
    event_push_client.info("program-start", "cluster_inspector starts")

    # if there's any wreckage from a previous run, clear it out
    if os.path.exists(_work_dir):
        log.info("removing old {0}".format(_work_dir))
        shutil.rmtree(_work_dir)
    os.mkdir(_work_dir)

    try:
        pull_segments_from_nodes(halt_event, _work_dir)

        if halt_event.is_set():
            log.info("halt_event set (1): exiting")
            return -1

        audit_segments(halt_event, _work_dir)

    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return -3

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #6
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")  

    return_code = 0

    try:
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(
            options, connection, _repository_path, ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")

        event_push_client.info(
            "rewrite complete", 
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings
        )  

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
Example #7
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_repair")
    event_push_client.info("program-start", "cluster_repair starts")

    zfec_server_req_socket = zmq_context.socket(zmq.REQ)
    zfec_server_req_socket.setsockopt(zmq.LINGER, 1000)
    log.info("connecting req socket to {0}".format(_zfec_server_address))
    zfec_server_req_socket.connect(_zfec_server_address)

    read_subprocess = _start_read_subprocess()
    write_subprocess = _start_write_subprocess()

    try:
        _repair_cluster(halt_event, zfec_server_req_socket, read_subprocess,
                        write_subprocess)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return -3
    finally:
        read_subprocess.terminate()
        write_subprocess.terminate()
        event_push_client.close()
        zfec_server_req_socket.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #8
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0

    collectable_segment_ids = io.StringIO()

    partition_count = 0
    collectable_count = 0

    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            versioned_collection = \
                    partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids, partition,
                                        versioned_collection)
            collectable_count += count
        archive_collectable_segment_rows(connection, collectable_segment_ids,
                                         options.max_node_offline_time)
        collectable_segment_ids.close()
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("found {0:,} candidates, collected {1:,} segments".format(
            partition_count, collectable_count))
        log.info("program terminates normally")

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
Example #9
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_repair")
    event_push_client.info("program-start", "cluster_repair starts")  

    zfec_server_req_socket = zmq_context.socket(zmq.REQ)
    zfec_server_req_socket.setsockopt(zmq.LINGER, 1000)
    log.info("connecting req socket to {0}".format(_zfec_server_address))
    zfec_server_req_socket.connect(_zfec_server_address)

    read_subprocess = _start_read_subprocess()
    write_subprocess = _start_write_subprocess()

    try:
        _repair_cluster(halt_event, 
                        zfec_server_req_socket, 
                        read_subprocess, 
                        write_subprocess)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -3
    finally:
        read_subprocess.terminate()
        write_subprocess.terminate()
        event_push_client.close()
        zfec_server_req_socket.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #10
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_inspector")
    event_push_client.info("program-start", "cluster_inspector starts")  

    # if there's any wreckage from a previous run, clear it out
    if os.path.exists(_work_dir):
        log.info("removing old {0}".format(_work_dir))
        shutil.rmtree(_work_dir)
    os.mkdir(_work_dir)

    try:
        pull_segments_from_nodes(halt_event, _work_dir)

        if halt_event.is_set():
            log.info("halt_event set (1): exiting")
            return -1

        audit_segments(halt_event, _work_dir)

    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -3

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #11
def _setup(_halt_event, state):
    log = logging.getLogger("_setup")

    # do the event push client first, because we may need to
    # push an exception event from setup
    state["event-push-client"] = EventPushClient(
        state["zmq-context"],
        "space_accounting_server"
    )

    log.info("binding router-server to %s" % (_space_accounting_server_address, ))
    state["router-server"] = RouterServer(
        state["zmq-context"],
        _space_accounting_server_address,
        state["receive-queue"]
    )
    state["router-server"].register(state["pollster"])

    log.info("binding pull-server to %s" % (
        _space_accounting_pipeline_address, 
    ))
    state["pull-server"] = PULLServer(
        state["zmq-context"],
        _space_accounting_pipeline_address,
        state["receive-queue"]
    )
    state["pull-server"].register(state["pollster"])

    state["queue-dispatcher"] = DequeDispatcher(
        state,
        state["receive-queue"],
        _dispatch_table
    )

    state["state-cleaner"] = StateCleaner(state)

    state["event-push-client"].info(
        "program-start", "space_accounting_server starts"
    )  

    # hand the pollster and the queue-dispatcher to the time-queue 
    return [
        (state["pollster"].run, time.time(), ), 
        (state["queue-dispatcher"].run, time.time(), ), 
        (state["state-cleaner"].run, state["state-cleaner"].next_run(), ), 
    ] 
Example #12
def _setup(_halt_event, state):
    log = logging.getLogger("_setup")
    log.info("starting up")

    # do the event push client first, because we may need to
    # push an exception event from setup
    state["event-push-client"] = EventPushClient(state["zmq-context"],
                                                 "event_aggregator")

    state["pub-server"] = PUBServer(state["zmq-context"],
                                    _event_aggregator_pub_address)

    state["sub-clients"] = list()
    for event_publisher_pub_address in _event_publisher_pub_addresses:
        sub_client = SUBClient(state["zmq-context"],
                               event_publisher_pub_address,
                               _subscribe_to_all_topics,
                               state["receive-queue"])
        sub_client.register(state["pollster"])
        state["sub-clients"].append(sub_client)

    state["callback-dispatcher"] = CallbackDispatcher(
        state,
        state["receive-queue"],
        _publish_event,
    )

    state["event-push-client"].info("program-start", "event_aggregator starts")

    timer_driven_callbacks = [
        (
            state["pollster"].run,
            time.time(),
        ),
        (
            state["callback-dispatcher"].run,
            time.time(),
        ),
    ]
    return timer_driven_callbacks
Example #13
    def __init__(self):
        self._log = logging.getLogger("WebServer")
        authenticator = SqlAuthenticator()

        self._central_connection = get_central_connection()
        self._cluster_row = get_cluster_row(self._central_connection)
        self._node_local_connection = get_node_local_connection()
        self._unified_id_factory = UnifiedIDFactory(
            self._central_connection,
            _get_shard_id(self._central_connection, self._cluster_row.id)
        )
        self._deliverator = Deliverator()

        self._zeromq_context = zmq.Context()

        self._pull_server = GreenletPULLServer(
            self._zeromq_context, 
            _web_server_pipeline_address,
            self._deliverator
        )
        self._pull_server.link_exception(self._unhandled_greenlet_exception)

        # message sent to data readers and writers telling them the server
        # is (re)starting, thereby invalidating any archives or retrievals
        # that are in progress for this node
        timestamp = create_timestamp()
        start_message = {
            "message-type"              : "web-server-start",
            "priority"                  : create_priority(),
            "unified-id"                : self._unified_id_factory.next(),
            "timestamp-repr"            : repr(timestamp),
            "source-node-name"          : _local_node_name,
        }

        self._data_writer_clients = list()
        for node_name, address in zip(_node_names, _data_writer_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context, 
                node_name,
                address,
                _client_tag,
                _web_server_pipeline_address,
                self._deliverator,
                connect_messages=[start_message, ]
            )
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_writer_clients.append(resilient_client)

        self._data_reader_clients = list()
        self._data_readers = list()
        for node_name, address in zip(_node_names, _data_reader_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context, 
                node_name,
                address,
                _client_tag,
                _web_server_pipeline_address,
                self._deliverator,
                connect_messages=[start_message, ]
            )
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_reader_clients.append(resilient_client)
            data_reader = DataReader(
                node_name, resilient_client
            )
            self._data_readers.append(data_reader)

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_server_address
        )
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception
        )

        push_client = GreenletPUSHClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name,
            self._space_accounting_dealer_client,
            push_client
        )

        self._event_push_client = EventPushClient(
            self._zeromq_context,
            "web-server"
        )

        self._watcher = Watcher(
            _stats, 
            self._data_reader_clients,
            self._data_writer_clients,
            self._event_push_client
        )

        id_translator_keys_path = os.path.join(
            _repository_path, "id_translator_keys.pkl"
        )
        with open(id_translator_keys_path, "rb") as input_file:
            id_translator_keys = pickle.load(input_file)

        self._id_translator = InternalIDTranslator(
            id_translator_keys["key"],
            id_translator_keys["hmac_key"], 
            id_translator_keys["iv_key"],
            id_translator_keys["hmac_size"]
        )
        self.application = Application(
            self._central_connection,
            self._node_local_connection,
            self._cluster_row,
            self._unified_id_factory,
            self._id_translator,
            self._data_writer_clients,
            self._data_readers,
            authenticator,
            self._accounting_client,
            self._event_push_client,
            _stats
        )
        self.wsgi_server = WSGIServer(
            (_web_server_host, _web_server_port), 
            application=self.application,
            backlog=_wsgi_backlog
        )
Example #14
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")  

    connection = None
    file_space_info = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection, 
                                           file_space_info, 
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()
                
    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #15
    def __init__(self, halt_event):
        self._log = logging.getLogger("WebServer")
        memcached_client = create_memcached_client()

        self._interaction_pool = \
            gdbpool.interaction_pool.DBInteractionPool(
                get_central_database_dsn(),
                pool_name=_central_pool_name,
                pool_size=_central_database_pool_size,
                do_log=True)

        self._interaction_pool.add_pool(dsn=get_node_local_database_dsn(),
                                        pool_name=_local_node_name,
                                        pool_size=_local_database_pool_size)

        # Ticket #25: must run database operation in a greenlet
        greenlet = gevent.Greenlet.spawn(_get_cluster_row,
                                         self._interaction_pool)
        greenlet.join()
        self._cluster_row = greenlet.get()

        authenticator = \
            InteractionPoolAuthenticator(memcached_client,
                                         self._interaction_pool)

        self._zeromq_context = zmq.Context()

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, _local_node_name,
            _space_accounting_server_address)
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception)

        push_client = GreenletPUSHClient(
            self._zeromq_context,
            _local_node_name,
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name, self._space_accounting_dealer_client,
            push_client)

        self._event_push_client = EventPushClient(self._zeromq_context,
                                                  "web-server")

        id_translator_keys_path = os.environ.get(
            "NIMBUS_IO_ID_TRANSLATION_KEYS",
            os.path.join(_repository_path, "id_translator_keys.pkl"))
        with open(id_translator_keys_path, "rb") as input_file:
            id_translator_keys = pickle.load(input_file)

        self._id_translator = InternalIDTranslator(
            id_translator_keys["key"], id_translator_keys["hmac_key"],
            id_translator_keys["iv_key"], id_translator_keys["hmac_size"])

        redis_queue = gevent.queue.Queue()

        self._redis_sink = OperationalStatsRedisSink(halt_event, redis_queue,
                                                     _local_node_name)
        self._redis_sink.link_exception(self._unhandled_greenlet_exception)

        self.application = Application(self._interaction_pool,
                                       self._cluster_row, self._id_translator,
                                       authenticator, self._accounting_client,
                                       self._event_push_client, redis_queue)
        self.wsgi_server = WSGIServer(
            (_web_public_reader_host, _web_public_reader_port),
            application=self.application,
            backlog=_wsgi_backlog,
            log=sys.stdout)
Example #16
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"], 
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    prepare_ipc_path(_pull_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    pull_socket = _bind_pull_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context, "service_availability")
    event_push_client.info("program-starts", 
                           "service availability monitor starts")

    message_count = 0
    try:
        ping_process_dict = _start_ping_processes(halt_event)

        while not halt_event.is_set():

            if message_count % len(ping_process_dict) == 0:
                for ping_process in ping_process_dict.values():
                    poll_subprocess(ping_process.process)

            message = pull_socket.recv_pyobj()
            assert not pull_socket.rcvmore

            _process_one_message(message, ping_process_dict, event_push_client)

            message_count += 1

    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminating normally; interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminating normally")

    log.debug("terminating subprocesses")
    _terminate_ping_processes(ping_process_dict)
    pull_socket.close()
    event_push_client.close()
    zeromq_context.term()

    return return_value
Example #17
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    connection = None
    file_space_info = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception("database exception",
                                            str(value),
                                            exctype=exctype.__name__)
                log.exception("Exception connecting to database")
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection, file_space_info,
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(unhandled_exception_topic,
                                        str(instance),
                                        exctype=instance.__class__.__name__)
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #18
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, 
                                        "redis_stats_collector")
    event_push_client.info("program-start", "flush_stats_from_redis starts")  

    # don't flush anything newer than 1 minute ago
    current_time = datetime.utcnow()
    timestamp_cutoff = current_time - timedelta(minutes=1)

    return_code = 0
    central_db_connection = None

    collection_ops_accounting_rows = list()

    # values to be added to the dedupe table
    new_dedupes = list()

    # keys to be deleted (a list for each node)
    node_keys_processed = [list() for _ in _node_names]

    try:
        central_db_connection = get_central_connection()

        # On startup, the program connects to the central database and tries 
        # to acquire a pg_advisory_lock appropriate for this program and the 
        # data center it is running in using the pg_try_advisory_lock function.
        # If it cannot acquire the lock, it notes the status of the lock 
        # and exits. This central locking mechanism lets us avoid single points
        # of failure by configuring the program to run on multiple nodes.

        with advisory_lock(central_db_connection, "redis_stats_collector"):
            node_dict = _retrieve_node_dict(central_db_connection)
            for node_name, keys_processed in \
                zip(_node_names, node_keys_processed):
                node_id = node_dict[node_name]
                log.debug("processing node {0} node_id={1}".format(node_name,
                                                                   node_id))

                # The program then selects into memory all recently collected 
                # keys from the central database table 
                # collection_ops_accounting_flush_dedupe and stores them in a 
                # dedupe set. This set allows runs of the collection/flush 
                # program to be idempotent across some time period (but we
                # won't keep the list of old keys forever).

                dedupe_set = _retrieve_dedupe_set(central_db_connection, 
                                                  node_id)

                # The program then visits the Redis instance on every storage 
                # node in the local data center, collecting the data from all 
                # past stats keys -- aggregating it into the program's memory.  
                # The aggregation should involve buckets for each 
                # storage_node_id and redis key, corresponding to the columns 
                # in the database.
                _process_one_node(node_name,
                                  node_dict[node_name],
                                  timestamp_cutoff,
                                  dedupe_set,
                                  collection_ops_accounting_rows,
                                  new_dedupes,
                                  keys_processed)

            # After collecting past keys from every storage node, 
            # inside a central database transaction:
            # 1. Insert the collected stats into the central database 
            #    collection_ops_accounting
            # 2. Insert collected keys into recently collected keys 
            #    collection_ops_accounting_flush_dedupe.
            # 3. commit transaction
            log.debug("updating central database")
            central_db_connection.begin_transaction()
            try:
                _insert_accounting_rows(central_db_connection,
                                        collection_ops_accounting_rows)
                _insert_dedupe_rows(central_db_connection, 
                                    timestamp_cutoff, 
                                    new_dedupes)
            except Exception:
                central_db_connection.rollback()
                raise
            else:
                central_db_connection.commit()

            # Then revisit the Redis nodes, and delete the keys we flushed 
            # into the database, and any keys we skipped because they were 
            # found in the dedupe set.
            for node_name, keys_processed in zip(_node_names, 
                                                 node_keys_processed):
                _remove_processed_keys(node_name, keys_processed)

    except Exception as instance:
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = 1

    if central_db_connection is not None:
        central_db_connection.close()

    event_push_client.close()
    zeromq_context.term()

    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
Example #19
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    signal.signal(signal.SIGTERM, _create_signal_handler(halt_event))

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")  

    connection = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                halt_event.wait(_database_retry_interval)
                continue

        # start a transaction
        connection.execute("begin")

        # try one defrag pass
        bytes_defragged = 0
        try:
            bytes_defragged = _defrag_pass(connection, event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()
                
    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #20
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    volume_name = sys.argv[1]
    worker_number = int(sys.argv[2])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         volume_name.replace("/", "_"),
                                         worker_number, _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_io_worker_{0}_{1}".format(volume_name,
                                                      worker_number)
    resources = \
        _resources_tuple(halt_event=halt_event,
                         zeromq_context=zeromq_context,
                         reply_push_sockets=dict(),
                         event_push_client=EventPushClient(zeromq_context,
                                                           event_source_name),
                         dealer_socket=zeromq_context.socket(zmq.DEALER),
                         file_cache=LRUCache(_max_file_cache_size))

    resources.dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(io_controller_router_socket_uri))
    resources.dealer_socket.connect(io_controller_router_socket_uri)

    last_close_pass_time = time.time()
    try:
        while not halt_event.is_set():
            # an occasional pass that closes any open files that haven't
            # been used
            current_time = time.time()
            elapsed_time = current_time - last_close_pass_time
            if elapsed_time > _unused_file_close_interval:
                _make_close_pass(resources, current_time)
                last_close_pass_time = current_time

            _send_work_request(resources, volume_name)
            _process_request(resources)

    except InterruptedSystemCall:
        if halt_event.is_set():
            log.info("program terminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(
                unhandled_exception_topic,
                "Interrupted zeromq system call",
                exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminates normally")
    finally:
        resources.dealer_socket.close()
        for push_socket in resources.reply_push_sockets.values():
            push_socket.close()
        resources.event_push_client.close()
        resources.zeromq_context.term()

    return return_value
Example #21
    def __init__(self):
        self._log = logging.getLogger("WebInternalReader")

        memcached_client = memcache.Client(_memcached_nodes)

        self._central_connection = get_central_connection()
        self._cluster_row = get_cluster_row(self._central_connection)
        self._node_local_connection = get_node_local_connection()
        self._deliverator = Deliverator()

        self._zeromq_context = zmq.Context()

        self._pull_server = GreenletPULLServer(
            self._zeromq_context, _web_internal_reader_pipeline_address,
            self._deliverator)
        self._pull_server.link_exception(self._unhandled_greenlet_exception)

        self._data_reader_clients = list()
        self._data_readers = list()
        for node_name, address in zip(_node_names, _data_reader_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context,
                node_name,
                address,
                _client_tag,
                _web_internal_reader_pipeline_address,
                self._deliverator,
                connect_messages=[])
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_reader_clients.append(resilient_client)
            data_reader = DataReader(node_name, resilient_client)
            self._data_readers.append(data_reader)

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, _local_node_name,
            _space_accounting_server_address)
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception)

        push_client = GreenletPUSHClient(
            self._zeromq_context,
            _local_node_name,
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name, self._space_accounting_dealer_client,
            push_client)

        self._event_push_client = EventPushClient(self._zeromq_context,
                                                  "web-internal-reader")

        # message sent to data readers telling them the server
        # is (re)starting, thereby invalidating any archives or retrievals
        # that are in progress for this node
        timestamp = create_timestamp()
        self._event_push_client.info("web-reader-start",
                                     "web reader (re)start",
                                     timestamp_repr=repr(timestamp),
                                     source_node_name=_local_node_name)

        self._watcher = Watcher(_stats, self._data_reader_clients,
                                self._event_push_client)

        self.application = Application(memcached_client,
                                       self._central_connection,
                                       self._node_local_connection,
                                       self._cluster_row, self._data_readers,
                                       self._accounting_client,
                                       self._event_push_client, _stats)
        self.wsgi_server = WSGIServer(
            (_web_internal_reader_host, _web_internal_reader_port),
            application=self.application,
            backlog=_wsgi_backlog)
Example #22
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"], 
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    prepare_ipc_path(_pull_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    pull_socket = _bind_pull_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context, "service_availability")
    event_push_client.info("program-starts", 
                           "service availability monitor starts")

    message_count = 0
    try:
        ping_process_dict = _start_ping_processes(halt_event)

        while not halt_event.is_set():

            if message_count % len(ping_process_dict) == 0:
                for ping_process in ping_process_dict.values():
                    poll_subprocess(ping_process.process)

            message = pull_socket.recv_pyobj()
            assert not pull_socket.rcvmore

            _process_one_message(message, ping_process_dict, event_push_client)

            message_count += 1

    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminating normally; interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminating normally")

    log.debug("terminating subprocesses")
    _terminate_ping_processes(ping_process_dict)
    pull_socket.close()
    event_push_client.close()
    zeromq_context.term()

    return return_value
Example #23
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    worker_number = int(sys.argv[1])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"], 
                                         worker_number,
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_dbpool_worker_{0}".format(worker_number)
    event_push_client = EventPushClient(zeromq_context, event_source_name)

    dealer_socket = zeromq_context.socket(zmq.DEALER)
    dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(db_controller_router_socket_uri))
    dealer_socket.connect(db_controller_router_socket_uri)

    log.debug("opening local database connection")
    database_connection = get_node_local_connection()

    try:
        _send_initial_work_request(dealer_socket)
        while not halt_event.is_set():
            _process_one_transaction(dealer_socket, 
                                     database_connection,
                                     event_push_client)
    except InterruptedSystemCall:
        if halt_event.is_set():
            log.info("program terminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "Interrupted zeromq system call",
                                        exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminates normally")
    finally:
        database_connection.close()
        dealer_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
Example #24
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    global _max_value_file_time

    initialize_logging(_log_path)
    log = logging.getLogger("main")

    try:
        _max_value_file_time = parse_timedelta_str(_max_value_file_time_str)
    except Exception as instance:
        log.exception("Unable to parse '{0}' {1}".format(
            _max_value_file_time_str, instance))
        return -1

    log.info("program starts; max_value_file_time = {0}".format(
        _max_value_file_time))

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "node_inspector")
    event_push_client.info("program-start", "node_inspector starts")  

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -1

    known_value_files = dict()

    connection.begin_transaction()
    try:
        for batch in generate_work(connection):
            _process_work_batch(connection, known_value_files, batch)
    except Exception as instance:
        connection.rollback()
        log.exception("Exception processing batch {0} {1}".format(
            batch, instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -1
    else:
        connection.commit()
    finally:
        connection.close()
        event_push_client.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #25
    def __init__(self, halt_event):
        self._log = logging.getLogger("WebWriter")
        memcached_client = memcache.Client(_memcached_nodes)

        self._interaction_pool = gdbpool.interaction_pool.DBInteractionPool(
            get_central_database_dsn(), 
            pool_name=_central_pool_name,
            pool_size=_database_pool_size, 
            do_log=True)

        authenticator = InteractionPoolAuthenticator(memcached_client, 
                                                     self._interaction_pool)

        # Ticket #25: must run database operation in a greenlet
        greenlet = gevent.Greenlet.spawn(_get_cluster_row_and_node_row,
                                         self._interaction_pool)
        greenlet.join()
        self._cluster_row, node_row = greenlet.get()

        self._unified_id_factory = UnifiedIDFactory(node_row.id)

        self._deliverator = Deliverator()

        self._zeromq_context = zmq.Context()

        self._pull_server = GreenletPULLServer(
            self._zeromq_context, 
            _web_writer_pipeliner_address,
            self._deliverator
        )
        self._pull_server.link_exception(self._unhandled_greenlet_exception)

        self._data_writer_clients = list()
        for node_name, address in zip(_node_names, _data_writer_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context, 
                node_name,
                address,
                _client_tag,
                _web_writer_pipeliner_address,
                self._deliverator,
                connect_messages=[]
            )
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_writer_clients.append(resilient_client)

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_server_address
        )
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception
        )

        push_client = GreenletPUSHClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name,
            self._space_accounting_dealer_client,
            push_client
        )

        self._event_push_client = EventPushClient(
            self._zeromq_context,
            "web-server"
        )

        # message sent to data writers telling them the server
        # is (re)starting, thereby invalidating any archives
        # that are in progress for this node
        unified_id = self._unified_id_factory.next()
        timestamp = create_timestamp()
        self._event_push_client.info("web-writer-start",
                                     "web writer (re)start",
                                     unified_id=unified_id,
                                     timestamp_repr=repr(timestamp),
                                     source_node_name=_local_node_name)

        id_translator_keys_path = os.environ.get(
            "NIMBUS_IO_ID_TRANSLATION_KEYS", 
            os.path.join(_repository_path, "id_translator_keys.pkl"))
        # pickle data must be read in binary mode
        with open(id_translator_keys_path, "rb") as input_file:
            id_translator_keys = pickle.load(input_file)

        self._id_translator = InternalIDTranslator(
            id_translator_keys["key"],
            id_translator_keys["hmac_key"], 
            id_translator_keys["iv_key"],
            id_translator_keys["hmac_size"]
        )

        redis_queue = gevent.queue.Queue()

        self._redis_sink = OperationalStatsRedisSink(halt_event, 
                                                     redis_queue,
                                                     _local_node_name)
        self._redis_sink.link_exception(self._unhandled_greenlet_exception)

        self.application = Application(
            self._cluster_row,
            self._unified_id_factory,
            self._id_translator,
            self._data_writer_clients,
            authenticator,
            self._accounting_client,
            self._event_push_client,
            redis_queue
        )
        self.wsgi_server = WSGIServer((_web_writer_host, _web_writer_port), 
                                      application=self.application,
                                      backlog=_wsgi_backlog
        )
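Every greenlet created above is linked to self._unhandled_greenlet_exception,
which this excerpt does not include. Below is a minimal sketch of what such a
callback might look like, written as a method of the same class; the body is
an assumption (the real handler may also set a halt event or re-raise).

    # hypothetical sketch of the linked exception callback; the real
    # handler is not shown in this excerpt and may behave differently
    def _unhandled_greenlet_exception(self, greenlet):
        error_message = "unhandled greenlet exception {0} {1}".format(
            str(greenlet), str(greenlet.exception))
        self._log.error(error_message)
        self._event_push_client.exception(
            "unhandled_greenlet_exception",
            error_message,
            exctype=greenlet.exception.__class__.__name__)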
Example #27
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = \
        _connect_db_controller_push_socket(zeromq_context)
    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability; we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we have only registered one socket, so we could use an 'if'
            # here, but this 'for' works fine and has the same form as the
            # other places where we use a poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)

                assert active_socket is rep_socket

                _process_one_request(rep_socket, db_controller_push_socket)

                request_count += 1

            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0

    except KeyboardInterrupt:  # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
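Both ZMQError handlers above call is_interrupted_system_call() to distinguish
a signal-interrupted poll (normal shutdown) from a real failure. The helper is
not part of this excerpt; a minimal sketch, assuming it is a plain errno
comparison against EINTR:

# hypothetical sketch of is_interrupted_system_call(); assumes the
# check is a simple errno comparison, as the source is not shown
import errno

def is_interrupted_system_call(zmq_error):
    """True if the ZMQError was caused by an interrupted system call"""
    return zmq_error.errno == errno.EINTR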
Example #28
    def __init__(self, halt_event):
        self._log = logging.getLogger("WebServer")
        memcached_client = create_memcached_client()

        self._interaction_pool = \
            gdbpool.interaction_pool.DBInteractionPool(
                get_central_database_dsn(), 
                pool_name=_central_pool_name,
                pool_size=_central_database_pool_size, 
                do_log=True)

        self._interaction_pool.add_pool(
            dsn=get_node_local_database_dsn(), 
            pool_name=_local_node_name,
            pool_size=_local_database_pool_size) 

        # Ticket #25: must run database operation in a greenlet
        greenlet = gevent.Greenlet.spawn(_get_cluster_row,
                                         self._interaction_pool)
        greenlet.join()
        self._cluster_row = greenlet.get()

        authenticator = \
            InteractionPoolAuthenticator(memcached_client, 
                                         self._interaction_pool)

        self._zeromq_context = zmq.Context()

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_server_address
        )
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception
        )

        push_client = GreenletPUSHClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name,
            self._space_accounting_dealer_client,
            push_client
        )

        self._event_push_client = EventPushClient(
            self._zeromq_context,
            "web-server"
        )

        id_translator_keys_path = os.environ.get(
            "NIMBUS_IO_ID_TRANSLATION_KEYS", 
            os.path.join(_repository_path, "id_translator_keys.pkl"))
        # pickle data must be read in binary mode
        with open(id_translator_keys_path, "rb") as input_file:
            id_translator_keys = pickle.load(input_file)

        self._id_translator = InternalIDTranslator(
            id_translator_keys["key"],
            id_translator_keys["hmac_key"], 
            id_translator_keys["iv_key"],
            id_translator_keys["hmac_size"]
        )

        redis_queue = gevent.queue.Queue()

        self._redis_sink = OperationalStatsRedisSink(halt_event, 
                                                     redis_queue,
                                                     _local_node_name)
        self._redis_sink.link_exception(self._unhandled_greenlet_exception)

        self.application = Application(
            self._interaction_pool,
            self._cluster_row,
            self._id_translator,
            authenticator,
            self._accounting_client,
            self._event_push_client,
            redis_queue
        )
        self.wsgi_server = WSGIServer(
            (_web_public_reader_host, _web_public_reader_port), 
            application=self.application,
            backlog=_wsgi_backlog,
            log=sys.stdout
        )
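For context, a hedged usage sketch of how a caller might drive this class. The
class name WebServer matches the logger name above, but the start/stop
scaffolding around it is an assumption, not code from the source.

# hypothetical driver; assumes the class is named WebServer and that
# the caller owns the halt_event used to stop the redis sink
from gevent.event import Event

halt_event = Event()
web_server = WebServer(halt_event)
web_server.wsgi_server.start()    # begin accepting connections
halt_event.wait()                 # block until shutdown is requested
web_server.wsgi_server.stop()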
Example #29
    def __init__(self):
        self._log = logging.getLogger("WebInternalReader")

        memcached_client = memcache.Client(_memcached_nodes)

        self._central_connection = get_central_connection()
        self._cluster_row = get_cluster_row(self._central_connection)
        self._node_local_connection = get_node_local_connection()
        self._deliverator = Deliverator()

        self._zeromq_context = zmq.Context()

        self._pull_server = GreenletPULLServer(
            self._zeromq_context, 
            _web_internal_reader_pipeline_address,
            self._deliverator
        )
        self._pull_server.link_exception(self._unhandled_greenlet_exception)

        self._data_reader_clients = list()
        self._data_readers = list()
        for node_name, address in zip(_node_names, _data_reader_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context, 
                node_name,
                address,
                _client_tag,
                _web_internal_reader_pipeline_address,
                self._deliverator,
                connect_messages=[]
            )
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_reader_clients.append(resilient_client)
            data_reader = DataReader(
                node_name, resilient_client
            )
            self._data_readers.append(data_reader)

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_server_address
        )
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception
        )

        push_client = GreenletPUSHClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name,
            self._space_accounting_dealer_client,
            push_client
        )

        self._event_push_client = EventPushClient(
            self._zeromq_context,
            "web-internal-reader"
        )

        # message sent to data readers telling them the server
        # is (re)starting, thereby invalidating any archives or
        # retrievals that are in progress for this node
        timestamp = create_timestamp()
        self._event_push_client.info("web-reader-start",
                                     "web reader (re)start",
                                     timestamp_repr=repr(timestamp),
                                     source_node_name=_local_node_name)

        self._watcher = Watcher(
            _stats, 
            self._data_reader_clients,
            self._event_push_client
        )

        self.application = Application(
            memcached_client,
            self._central_connection,
            self._node_local_connection,
            self._cluster_row,
            self._data_readers,
            self._accounting_client,
            self._event_push_client,
            _stats
        )
        self.wsgi_server = WSGIServer(
            (_web_internal_reader_host, _web_internal_reader_port), 
            application=self.application,
            backlog=_wsgi_backlog
        )
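The (re)start notification above embeds repr(create_timestamp()), but
create_timestamp() is not defined in this excerpt. A minimal sketch, assuming
it returns the current UTC time as a naive datetime:

# hypothetical sketch of create_timestamp(); assumes a naive UTC
# datetime, which fits the repr(...) usage above
import datetime

def create_timestamp():
    return datetime.datetime.utcnow()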
Example #30
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    resources = \
        _resources_tuple(halt_event=Event(),
                         volume_by_space_id=_volume_name_by_space_id(),
                         pull_socket=zeromq_context.socket(zmq.PULL),
                         router_socket=zeromq_context.socket(zmq.ROUTER),
                         event_push_client=\
                            EventPushClient(zeromq_context,
                                            "rs_io_controller"),
                         pending_work_by_volume=defaultdict(deque),
                         available_ident_by_volume=defaultdict(deque))

    log.debug("binding to {0}".format(io_controller_pull_socket_uri))
    resources.pull_socket.bind(io_controller_pull_socket_uri)

    resources.router_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("binding to {0}".format(io_controller_router_socket_uri))
    resources.router_socket.bind(io_controller_router_socket_uri)

    # we poll the sockets for readability; we assume we can always
    # write to the router socket
    poller = zmq.Poller()
    poller.register(resources.pull_socket, zmq.POLLIN | zmq.POLLERR)
    poller.register(resources.router_socket, zmq.POLLIN | zmq.POLLERR)

    worker_processes = list()
    for volume_name in set(resources.volume_by_space_id.values()):
        for index in range(_worker_count):
            worker_processes.append(_launch_io_worker(volume_name, index + 1))

    last_report_time = 0.0
    try:
        while not halt_event.is_set():
            for worker_process in worker_processes:
                poll_subprocess(worker_process)
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                if active_socket is resources.pull_socket:
                    _read_pull_socket(resources)
                elif active_socket is resources.router_socket:
                    _read_router_socket(resources)
                else:
                    log.error("unknown socket {0}".format(active_socket))

            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                pending_work = 0
                for volume_queue in resources.pending_work_by_volume.values():
                    pending_work += len(volume_queue)
                report_message = \
                    "{0:,} pending_work entries".format(pending_work)
                log.info(report_message)
                resources.event_push_client.info("queue_sizes",
                                                 report_message,
                                                 pending_work=pending_work)

                last_report_time = current_time

    except zmq.ZMQError as zmq_error:
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(unhandled_exception_topic,
                                                  "zeromq_error",
                                                  exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        for worker_process in worker_processes:
            terminate_subprocess(worker_process)
        resources.pull_socket.close()
        resources.router_socket.close()
        resources.event_push_client.close()
        zeromq_context.term()

    return return_value
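PollError is raised above whenever zmq reports POLLERR on a registered socket,
but its definition is outside this excerpt. A minimal sketch, assuming it is a
plain Exception subclass:

# hypothetical definition; assumed to be a bare Exception subclass
class PollError(Exception):
    """raised when zmq.Poller reports POLLERR on a socket"""
    pass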