def setUp(self):
        self.tearDown()
        os.makedirs(_repository_path)
        self._key_generator = generate_key()

        self._database_connection = get_node_local_connection()

        self._event_publisher_process = start_event_publisher(
            _local_node_name, 
            _event_publisher_pull_address,
            _event_publisher_pub_address
        )
        poll_result = poll_process(self._event_publisher_process)
        self.assertEqual(poll_result, None)

        self._data_writer_process = start_data_writer(
            _cluster_name,
            _local_node_name, 
            _data_writer_address,
            _event_publisher_pull_address,
            _repository_path
        )
        poll_result = poll_process(self._data_writer_process)
        self.assertEqual(poll_result, None)

        self._data_reader_process = start_data_reader(
            _local_node_name, 
            _data_reader_address,
            _event_publisher_pull_address,
            _repository_path
        )
        poll_result = poll_process(self._data_reader_process)
        self.assertEqual(poll_result, None)
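
The matching tearDown is not shown in this example. A minimal sketch, assuming the start_* helpers return subprocess.Popen-like objects and that os and shutil are imported at module level; the attribute names mirror those set in setUp, everything else is an assumption rather than the project's code:

    def tearDown(self):
        # hypothetical cleanup mirroring setUp above
        for attribute_name in ["_data_reader_process",
                               "_data_writer_process",
                               "_event_publisher_process"]:
            process = getattr(self, attribute_name, None)
            if process is not None:
                process.terminate()
                process.wait()
                setattr(self, attribute_name, None)

        connection = getattr(self, "_database_connection", None)
        if connection is not None:
            connection.close()
            self._database_connection = None

        # setUp calls tearDown first, so the directory may not exist yet
        if os.path.exists(_repository_path):
            shutil.rmtree(_repository_path)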
Example #2
    def __init__(self, halt_event, node_id_dict, message_queue, push_client):
        Thread.__init__(self, name="WriterThread")
        self._halt_event = halt_event
        self._node_id_dict = node_id_dict
        self._message_queue = message_queue
        self._database_connection = get_node_local_connection()
        self._active_segments = dict()
        self._completions = list()
        self._writer = None
        self._reply_pusher = push_client


        self._dispatch_table = {
            "archive-key-entire"        : self._handle_archive_key_entire,
            "archive-key-start"         : self._handle_archive_key_start,
            "archive-key-next"          : self._handle_archive_key_next,
            "archive-key-final"         : self._handle_archive_key_final,
            "archive-key-cancel"        : self._handle_archive_key_cancel,
            "destroy-key"               : self._handle_destroy_key,
            "start-conjoined-archive"   : self._handle_start_conjoined_archive,
            "abort-conjoined-archive"   : self._handle_abort_conjoined_archive,
            "finish-conjoined-archive"  : self._handle_finish_conjoined_archive,
            "web-writer-start"          : self._handle_web_writer_start,
            "sync-value-file"           : self._handle_sync_value_file,
        }
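
A dispatch table like the one above is typically drained by the thread's run loop, which pops messages off the queue and routes each to its handler by message type. The loop below is only a sketch of that pattern; the queue semantics (a standard queue.Queue with a timeout), the "message-type" key, and the reply handling are assumptions, not the original WriterThread code:

    def run(self):
        # hypothetical run loop; the queue API and message layout are assumed
        while not self._halt_event.is_set():
            try:
                message, data = self._message_queue.get(timeout=1.0)
            except queue.Empty:
                continue

            handler = self._dispatch_table.get(message["message-type"])
            if handler is None:
                logging.getLogger("WriterThread").error(
                    "unknown message-type %r", message["message-type"])
                continue

            reply = handler(message, data)
            if reply is not None:
                self._reply_pusher.send(reply)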
Example #3
def _volume_name_by_space_id():
    """
    The control process creates a pool of worker processes of configurable size
    (default 2) for each distinct file space. However, if multiple file spaces 
    have the same "volume name" value, then one worker process pool handles 
    read requests to all of the file spaces with that same volume name. 
    In other words, there will be a pool of workers for each non null volume 
    name. Null values are never the same as other null values, so if no volume 
    names are specified for the table spaces, there will be one read worker 
    pool per file space.

    So we assign a volume name to each space_id, creating a 'null-nn' name
    if volume is null
    """
    connection = get_node_local_connection()
    file_space_info = load_file_space_info(connection)
    connection.close()
    file_space_sanity_check(file_space_info, _repository_path)

    volume_name_by_space_id = dict() 
    null_count = 0
    for file_space_row_list in file_space_info.values():
        for file_space_row in file_space_row_list:
            if file_space_row.volume is None:
                null_count += 1
                volume_name = "null-{0}".format(null_count)
            else:
                volume_name = file_space_row.volume

            volume_name_by_space_id[file_space_row.space_id] = volume_name

    return volume_name_by_space_id
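
Per the docstring, the mapping returned above presumably drives the creation of one read-worker pool per distinct volume name. The grouping step might look roughly like the sketch below; the pool factory and the default pool size of 2 are illustrative, not part of the original module:

def _build_reader_pools(worker_pool_factory, workers_per_volume=2):
    # hypothetical consumer of _volume_name_by_space_id(); the factory and
    # the pool-size parameter are illustrative only
    volume_name_by_space_id = _volume_name_by_space_id()

    space_ids_by_volume_name = dict()
    for space_id, volume_name in volume_name_by_space_id.items():
        space_ids_by_volume_name.setdefault(volume_name, list()).append(
            space_id)

    pool_by_volume_name = dict()
    for volume_name, space_ids in space_ids_by_volume_name.items():
        pool_by_volume_name[volume_name] = worker_pool_factory(
            volume_name, space_ids, workers_per_volume)

    return pool_by_volume_name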
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")  

    return_code = 0

    collectable_segment_ids = io.StringIO()

    partition_count = 0
    collectable_count = 0

    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            versioned_collection = \
                    partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids, 
                                        partition,
                                        versioned_collection)
            collectable_count += count
        archive_collectable_segment_rows(connection, 
                                         collectable_segment_ids,
                                         options.max_node_offline_time)
        collectable_segment_ids.close()
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info(
            "found {0:,} candidates, collected {1:,} segments".format(
                partition_count, collectable_count
            )
        )
        log.info("program terminates normally")

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
Example #5
def _volume_name_by_space_id():
    """
    The control process creates a pool of worker processes of configurable size
    (default 2) for each distinct file space. However, if multiple file spaces 
    have the same "volume name" value, then one worker process pool handles 
    read requests to all of the file spaces with that same volume name. 
    In other words, there will be a pool of workers for each non null volume 
    name. Null values are never the same as other null values, so if no volume 
    names are specified for the table spaces, there will be one read worker 
    pool per file space.

    So we assign a volume name to each space_id, creating a 'null-nn' name
    if volume is null
    """
    connection = get_node_local_connection()
    file_space_info = load_file_space_info(connection)
    connection.close()
    file_space_sanity_check(file_space_info, _repository_path)

    volume_name_by_space_id = dict()
    null_count = 0
    for file_space_row_list in file_space_info.values():
        for file_space_row in file_space_row_list:
            if file_space_row.volume is None:
                null_count += 1
                volume_name = "null-{0}".format(null_count)
            else:
                volume_name = file_space_row.volume

            volume_name_by_space_id[file_space_row.space_id] = volume_name

    return volume_name_by_space_id
Example #6
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0

    collectable_segment_ids = io.StringIO()

    partition_count = 0
    collectable_count = 0

    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            versioned_collection = \
                    partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids, partition,
                                        versioned_collection)
            collectable_count += count
        archive_collectable_segment_rows(connection, collectable_segment_ids,
                                         options.max_node_offline_time)
        collectable_segment_ids.close()
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("found {0:,} candidates, collected {1:,} segments".format(
            partition_count, collectable_count))
        log.info("program terminates normally")

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")  

    return_code = 0

    try:
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(
            options, connection, _repository_path, ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")

        event_push_client.info(
            "rewrite complete", 
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings
        )  

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
Example #8
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0

    try:
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(options, connection, _repository_path,
                                      ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")

        event_push_client.info(
            "rewrite complete",
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings)

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
def delete_all_motoboto_test_segments():
    central_conn = get_central_connection()
    local_conn = get_node_local_connection()
    collection_id_rows = central_conn.fetch_all_rows(_test_collections_query, [])
    central_conn.close()

    local_conn.begin_transaction()
    local_conn.execute("create temp table tmp_motoboto_collection_ids (id int4 not null)", [])
    for row in collection_id_rows:
        local_conn.execute("insert into tmp_motoboto_collection_ids values (%s)", row)

    for query in _delete_test_collections_data:
        rowcount = local_conn.execute(query, [])
        if rowcount:
            print "Deleted %s via %s" % (rowcount, query.split("\n", 1)[0])

    local_conn.commit()
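
The queries in _delete_test_collections_data are not shown here; presumably each one joins against the temp table populated above. A purely hypothetical example of one such entry (the table name is illustrative):

# hypothetical entry for _delete_test_collections_data; the real queries
# and table names are not shown in this example
_example_delete_query = """delete from some_node_table
where collection_id in (select id from tmp_motoboto_collection_ids)"""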
def _setup(_halt_event, state):
    log = logging.getLogger("_setup")

    # do the event push client first, because we may need to
    # push an exception event from setup
    state["event-push-client"] = EventPushClient(
        state["zmq-context"],
        "data_reader"
    )

    log.info("binding resilient-server to %s" % (_data_reader_address, ))
    state["resilient-server"] = ResilientServer(
        state["zmq-context"],
        _data_reader_address,
        state["receive-queue"]
    )
    state["resilient-server"].register(state["pollster"])

    state["queue-dispatcher"] = DequeDispatcher(
        state,
        state["receive-queue"],
        _dispatch_table
    )

    state["state-cleaner"] = StateCleaner(state)

    state["database-connection"] = get_node_local_connection()

    state["reader"] = Reader(
        state["database-connection"],
        _repository_path
    )

    state["stats-reporter"] = StatsReporter(state)

    state["event-push-client"].info("program-start", "data_reader starts")  

    return [
        (state["pollster"].run, time.time(), ), 
        (state["queue-dispatcher"].run, time.time(), ), 
        (state["state-cleaner"].run, state["state-cleaner"].next_run(), ), 
        (state["stats-reporter"].run, state["stats-reporter"].next_run(), ), 
    ] 
Example #11
def delete_all_motoboto_test_segments():
    central_conn = get_central_connection()
    local_conn = get_node_local_connection()
    collection_id_rows = central_conn.fetch_all_rows(_test_collections_query,
                                                     [])
    central_conn.close()

    local_conn.begin_transaction()
    local_conn.execute(
        "create temp table tmp_motoboto_collection_ids (id int4 not null)", [])
    for row in collection_id_rows:
        local_conn.execute(
            "insert into tmp_motoboto_collection_ids values (%s)", row)

    for query in _delete_test_collections_data:
        rowcount = local_conn.execute(query, [])
        if rowcount:
            print "Deleted %s via %s" % (
                rowcount,
                query.split("\n", 1)[0],
            )

    local_conn.commit()
def _setup(_halt_event, state):
    log = logging.getLogger("_setup")

    # do the event push client first, because we may need to
    # push an exception event from setup
    state["event-push-client"] = EventPushClient(state["zmq-context"], "data_writer")

    log.info("binding resilient-server to %s" % (_data_writer_address,))
    state["resilient-server"] = ResilientServer(state["zmq-context"], _data_writer_address, state["receive-queue"])
    state["resilient-server"].register(state["pollster"])

    state["queue-dispatcher"] = DequeDispatcher(state, state["receive-queue"], _dispatch_table)

    central_connection = get_central_connection()
    state["cluster-row"] = get_cluster_row(central_connection)
    state["node-rows"] = get_node_rows(central_connection, state["cluster-row"].id)
    central_connection.close()

    state["node-id-dict"] = dict([(node_row.name, node_row.id) for node_row in state["node-rows"]])

    state["database-connection"] = get_node_local_connection()

    # Ticket #1646 mark output value files as closed at startup
    mark_value_files_as_closed(state["database-connection"])

    state["writer"] = Writer(state["database-connection"], _repository_path)

    state["stats-reporter"] = StatsReporter(state)

    state["event-push-client"].info("program-start", "data_writer starts")

    return [
        (state["pollster"].run, time.time()),
        (state["queue-dispatcher"].run, time.time()),
        (state["stats-reporter"].run, state["stats-reporter"].next_run()),
    ]
Example #13
    def __init__(self, halt_event, node_id_dict, message_queue, push_client):
        Thread.__init__(self, name="WriterThread")
        self._halt_event = halt_event
        self._node_id_dict = node_id_dict
        self._message_queue = message_queue
        self._database_connection = get_node_local_connection()
        self._active_segments = dict()
        self._completions = list()
        self._writer = None
        self._reply_pusher = push_client

        self._dispatch_table = {
            "archive-key-entire": self._handle_archive_key_entire,
            "archive-key-start": self._handle_archive_key_start,
            "archive-key-next": self._handle_archive_key_next,
            "archive-key-final": self._handle_archive_key_final,
            "archive-key-cancel": self._handle_archive_key_cancel,
            "destroy-key": self._handle_destroy_key,
            "start-conjoined-archive": self._handle_start_conjoined_archive,
            "abort-conjoined-archive": self._handle_abort_conjoined_archive,
            "finish-conjoined-archive": self._handle_finish_conjoined_archive,
            "web-writer-start": self._handle_web_writer_start,
            "sync-value-file": self._handle_sync_value_file,
        }
Example #14
    def setUp(self):
        self.tearDown()
        os.makedirs(_repository_path)
        self._key_generator = generate_key()

        self._database_connection = get_node_local_connection()

        self._event_publisher_process = start_event_publisher(
            _local_node_name, _event_publisher_pull_address,
            _event_publisher_pub_address)
        poll_result = poll_process(self._event_publisher_process)
        self.assertEqual(poll_result, None)

        self._data_writer_process = start_data_writer(
            _cluster_name, _local_node_name, _data_writer_address,
            _event_publisher_pull_address, _repository_path)
        poll_result = poll_process(self._data_writer_process)
        self.assertEqual(poll_result, None)

        self._data_reader_process = start_data_reader(
            _local_node_name, _data_reader_address,
            _event_publisher_pull_address, _repository_path)
        poll_result = poll_process(self._data_reader_process)
        self.assertEqual(poll_result, None)
Example #15
def main():
    """
    main entry point
    return 0 for success (exit code)
    """
    global _max_value_file_time

    initialize_logging(_log_path)
    log = logging.getLogger("main")

    try:
        _max_value_file_time = parse_timedelta_str(_max_value_file_time_str)
    except Exception as instance:
        log.exception("Unable to parse '{0}' {1}".format(
            _max_value_file_time_str, instance))
        return -1

    log.info("program starts; max_value_file_time = {0}".format(
        _max_value_file_time))

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "node_inspector")
    event_push_client.info("program-start", "node_inspector starts")  

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -1

    known_value_files = dict()

    connection.begin_transaction()
    try:
        for batch in generate_work(connection):
            _process_work_batch(connection, known_value_files, batch)
    except Exception as instance:
        connection.rollback()
        log.exception("Exception processing batch {0} {1}".format(
            batch, instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -1
    else:
        connection.commit()
    finally:
        connection.close()
        event_push_client.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #16
def generate_work(connection):
    """
    generate batches for inspection
    """
    prev_key = None
    batch = list()
    for raw_entry in connection.generate_all_rows(_work_query, []):
        entry = _entry_template._make(raw_entry)
        batch_key = make_batch_key(entry)
        if prev_key is None:
            prev_key = batch_key
        if batch_key != prev_key:
            yield batch
            batch = list()
            prev_key = batch_key
        batch.append(entry)

    if batch:
        yield batch


if __name__ == "__main__":
    """
    test the generator independently
    """
    from tools.database_connection import get_node_local_connection
    connection = get_node_local_connection()
    for entry in generate_work(connection):
        print(entry)
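
Because the batching above relies on _work_query returning rows ordered by the batch key (consecutive rows with the same key form one batch), the same grouping could be expressed with itertools.groupby. This is an equivalent sketch, not the project's code:

import itertools

def generate_work_with_groupby(connection):
    # equivalent batching sketch; assumes, as above, that rows arrive
    # ordered by the batch key
    entries = (_entry_template._make(raw_entry)
               for raw_entry in connection.generate_all_rows(_work_query, []))
    for _batch_key, group in itertools.groupby(entries, key=make_batch_key):
        yield list(group)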
Example #17
    def setUp(self):
        self._connection = get_node_local_connection()
        _clear_test_data(self._connection)
    def __init__(self):
        self._log = logging.getLogger("WebInternalReader")

        memcached_client = memcache.Client(_memcached_nodes)

        self._central_connection = get_central_connection()
        self._cluster_row = get_cluster_row(self._central_connection)
        self._node_local_connection = get_node_local_connection()
        self._deliverator = Deliverator()

        self._zeromq_context = zmq.Context()

        self._pull_server = GreenletPULLServer(
            self._zeromq_context, 
            _web_internal_reader_pipeline_address,
            self._deliverator
        )
        self._pull_server.link_exception(self._unhandled_greenlet_exception)

        self._data_reader_clients = list()
        self._data_readers = list()
        for node_name, address in zip(_node_names, _data_reader_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context, 
                node_name,
                address,
                _client_tag,
                _web_internal_reader_pipeline_address,
                self._deliverator,
                connect_messages=[]
            )
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_reader_clients.append(resilient_client)
            data_reader = DataReader(
                node_name, resilient_client
            )
            self._data_readers.append(data_reader)

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_server_address
        )
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception
        )

        push_client = GreenletPUSHClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name,
            self._space_accounting_dealer_client,
            push_client
        )

        self._event_push_client = EventPushClient(
            self._zeromq_context,
            "web-internal-reader"
        )

        # message sent to data readers telling them the server
        # is (re)starting, thereby invalidating any archives or retrieves
        # that are in progress for this node
        timestamp = create_timestamp()
        self._event_push_client.info("web-reader-start",
                                     "web reader (re)start",
                                     timestamp_repr=repr(timestamp),
                                     source_node_name=_local_node_name)

        self._watcher = Watcher(
            _stats, 
            self._data_reader_clients,
            self._event_push_client
        )

        self.application = Application(
            memcached_client,
            self._central_connection,
            self._node_local_connection,
            self._cluster_row,
            self._data_readers,
            self._accounting_client,
            self._event_push_client,
            _stats
        )
        self.wsgi_server = WSGIServer(
            (_web_internal_reader_host, _web_internal_reader_port), 
            application=self.application,
            backlog=_wsgi_backlog
        )
    def __init__(self):
        self._log = logging.getLogger("WebServer")
        authenticator = SqlAuthenticator()

        self._central_connection = get_central_connection()
        self._cluster_row = get_cluster_row(self._central_connection)
        self._node_local_connection = get_node_local_connection()
        self._unified_id_factory = UnifiedIDFactory(
            self._central_connection,
            _get_shard_id(self._central_connection, self._cluster_row.id)
        )
        self._deliverator = Deliverator()

        self._zeromq_context = zmq.Context()

        self._pull_server = GreenletPULLServer(
            self._zeromq_context, 
            _web_server_pipeline_address,
            self._deliverator
        )
        self._pull_server.link_exception(self._unhandled_greenlet_exception)

        # message sent to data readers and writers telling them the server
        # is (re)starting, thereby invalidating any archives or retrieves
        # that are in progress for this node
        timestamp = create_timestamp()
        start_message = {
            "message-type"              : "web-server-start",
            "priority"                  : create_priority(),
            "unified-id"                : self._unified_id_factory.next(),
            "timestamp-repr"            : repr(timestamp),
            "source-node-name"          : _local_node_name,
        }

        self._data_writer_clients = list()
        for node_name, address in zip(_node_names, _data_writer_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context, 
                node_name,
                address,
                _client_tag,
                _web_server_pipeline_address,
                self._deliverator,
                connect_messages=[start_message, ]
            )
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_writer_clients.append(resilient_client)

        self._data_reader_clients = list()
        self._data_readers = list()
        for node_name, address in zip(_node_names, _data_reader_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context, 
                node_name,
                address,
                _client_tag,
                _web_server_pipeline_address,
                self._deliverator,
                connect_messages=[start_message, ]
            )
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_reader_clients.append(resilient_client)
            data_reader = DataReader(
                node_name, resilient_client
            )
            self._data_readers.append(data_reader)

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_server_address
        )
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception
        )

        push_client = GreenletPUSHClient(
            self._zeromq_context, 
            _local_node_name, 
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name,
            self._space_accounting_dealer_client,
            push_client
        )

        self._event_push_client = EventPushClient(
            self._zeromq_context,
            "web-server"
        )

        self._watcher = Watcher(
            _stats, 
            self._data_reader_clients,
            self._data_writer_clients,
            self._event_push_client
        )

        id_translator_keys_path = os.path.join(
            _repository_path, "id_translator_keys.pkl"
        )
        with open(id_translator_keys_path, "r") as input_file:
            id_translator_keys = pickle.load(input_file)

        self._id_translator = InternalIDTranslator(
            id_translator_keys["key"],
            id_translator_keys["hmac_key"], 
            id_translator_keys["iv_key"],
            id_translator_keys["hmac_size"]
        )
        self.application = Application(
            self._central_connection,
            self._node_local_connection,
            self._cluster_row,
            self._unified_id_factory,
            self._id_translator,
            self._data_writer_clients,
            self._data_readers,
            authenticator,
            self._accounting_client,
            self._event_push_client,
            _stats
        )
        self.wsgi_server = WSGIServer(
            (_web_server_host, _web_server_port), 
            application=self.application,
            backlog=_wsgi_backlog
        )
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    signal.signal(signal.SIGTERM, _create_signal_handler(halt_event))

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")  

    connection = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                halt_event.wait(_database_retry_interval)
                continue

        # start a transaction
        connection.execute("begin")

        # try one defrag pass
        bytes_defragged = 0
        try:
            bytes_defragged = _defrag_pass(connection, event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()
                
    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
Example #21
    def __init__(self):
        self._log = logging.getLogger("WebInternalReader")

        memcached_client = memcache.Client(_memcached_nodes)

        self._central_connection = get_central_connection()
        self._cluster_row = get_cluster_row(self._central_connection)
        self._node_local_connection = get_node_local_connection()
        self._deliverator = Deliverator()

        self._zeromq_context = zmq.Context()

        self._pull_server = GreenletPULLServer(
            self._zeromq_context, _web_internal_reader_pipeline_address,
            self._deliverator)
        self._pull_server.link_exception(self._unhandled_greenlet_exception)

        self._data_reader_clients = list()
        self._data_readers = list()
        for node_name, address in zip(_node_names, _data_reader_addresses):
            resilient_client = GreenletResilientClient(
                self._zeromq_context,
                node_name,
                address,
                _client_tag,
                _web_internal_reader_pipeline_address,
                self._deliverator,
                connect_messages=[])
            resilient_client.link_exception(self._unhandled_greenlet_exception)
            self._data_reader_clients.append(resilient_client)
            data_reader = DataReader(node_name, resilient_client)
            self._data_readers.append(data_reader)

        self._space_accounting_dealer_client = GreenletDealerClient(
            self._zeromq_context, _local_node_name,
            _space_accounting_server_address)
        self._space_accounting_dealer_client.link_exception(
            self._unhandled_greenlet_exception)

        push_client = GreenletPUSHClient(
            self._zeromq_context,
            _local_node_name,
            _space_accounting_pipeline_address,
        )

        self._accounting_client = SpaceAccountingClient(
            _local_node_name, self._space_accounting_dealer_client,
            push_client)

        self._event_push_client = EventPushClient(self._zeromq_context,
                                                  "web-internal-reader")

        # message sent to data readers telling them the server
        # is (re)starting, thereby invalidating any archives or retrieves
        # that are in progress for this node
        timestamp = create_timestamp()
        self._event_push_client.info("web-reader-start",
                                     "web reader (re)start",
                                     timestamp_repr=repr(timestamp),
                                     source_node_name=_local_node_name)

        self._watcher = Watcher(_stats, self._data_reader_clients,
                                self._event_push_client)

        self.application = Application(memcached_client,
                                       self._central_connection,
                                       self._node_local_connection,
                                       self._cluster_row, self._data_readers,
                                       self._accounting_client,
                                       self._event_push_client, _stats)
        self.wsgi_server = WSGIServer(
            (_web_internal_reader_host, _web_internal_reader_port),
            application=self.application,
            backlog=_wsgi_backlog)
Example #22
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")  

    connection = None
    file_space_info = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection, 
                                           file_space_info, 
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()
                
    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
    def setUp(self):
        self.tearDown()
        self._connection = get_node_local_connection()
Example #24
def make_batch_key(entry):
    return (entry.unified_id, entry.conjoined_part, entry.segment_num, )

def generate_work(connection):
    """
    generate batches for inspection
    """
    prev_key = None
    batch = list()
    for raw_entry in connection.generate_all_rows(_work_query, []):
        entry = _entry_template._make(raw_entry)
        batch_key = make_batch_key(entry)
        if prev_key is None:
            prev_key = batch_key
        if batch_key != prev_key:
            yield batch
            batch = list()
            prev_key = batch_key
        batch.append(entry)

    if batch:
        yield batch

if __name__ == "__main__":
    """
    test the generator independently
    """
    from tools.database_connection import get_node_local_connection
    connection = get_node_local_connection()
    for entry in generate_work(connection):
        print(entry)
    def setUp(self):
        self.tearDown()
        os.makedirs(_test_dir)

        self._database_connection = get_node_local_connection()
def _setup(_halt_event, state):
    log = logging.getLogger("_setup")
    status_checkers = list()

    # do the event push client first, because we may need to
    # push an exception event from setup
    state["event-push-client"] = EventPushClient(
        state["zmq-context"],
        "handoff_server"
    )

    central_connection = get_central_connection()
    state["cluster-row"] = get_cluster_row(central_connection)
    state["node-rows"] = get_node_rows(
        central_connection, state["cluster-row"].id
    )
    central_connection.close()

    state["node-id-dict"] = dict(
        [(node_row.name, node_row.id, ) for node_row in state["node-rows"]]
    )
    state["node-name-dict"] = dict(
        [(node_row.id, node_row.name, ) for node_row in state["node-rows"]]
    )

    state["database-connection"] = get_node_local_connection()
    for node_row, handoff_server_address in zip(
        state["node-rows"], _handoff_server_addresses
    ):
        if node_row.name == _local_node_name:
            log.info("binding resilient-server to %s" % (
                handoff_server_address, 
            ))
            state["resilient-server"] = ResilientServer(
                state["zmq-context"],
                handoff_server_address,
                state["receive-queue"]
            )
            state["resilient-server"].register(state["pollster"])
        else:
            handoff_server_client = ResilientClient(
                state["zmq-context"],
                state["pollster"],
                node_row.name,
                handoff_server_address,
                _client_tag,
                _handoff_server_pipeline_address
            )
            state["handoff-server-clients"].append(handoff_server_client)
            # don't run all the status checkers at the same time
            status_checkers.append(
                (handoff_server_client.run, 
                 time.time() + random.random() * 60.0, )
            )        

    log.info("binding pull-server to %s" % (_handoff_server_pipeline_address, ))
    state["pull-server"] = PULLServer(
        state["zmq-context"],
        _handoff_server_pipeline_address,
        state["receive-queue"]
    )
    state["pull-server"].register(state["pollster"])

    for node_row, data_reader_address in zip(
        state["node-rows"], _data_reader_addresses
    ):
        data_reader_client = ResilientClient(
            state["zmq-context"],
            state["pollster"],
            node_row.name,
            data_reader_address,
            _client_tag,
            _handoff_server_pipeline_address
        )
        state["reader-client-dict"][data_reader_client.server_node_name] = \
                data_reader_client
        # don't run all the status checkers at the same time
        status_checkers.append(
            (data_reader_client.run, time.time() + random.random() * 60.0, )
        )        

    for node_row, data_writer_address in zip(
        state["node-rows"], _data_writer_addresses
    ):
        data_writer_client = ResilientClient(
            state["zmq-context"],
            state["pollster"],
            node_row.name,
            data_writer_address,
            _client_tag,
            _handoff_server_pipeline_address
        )
        state["writer-client-dict"][data_writer_client.server_node_name] = \
                data_writer_client
        # don't run all the status checkers at the same time
        status_checkers.append(
            (data_writer_client.run, time.time() + random.random() * 60.0, )
        )        

    state["queue-dispatcher"] = DequeDispatcher(
        state,
        state["receive-queue"],
        _dispatch_table
    )

    state["handoff-requestor"] = HandoffRequestor(state, _local_node_name)
    state["handoff-starter"] = HandoffStarter(
        state, _local_node_name, state["event-push-client"]
    )

    state["event-push-client"].info("program-start", "handoff_server starts")  

    timer_driven_callbacks = [
        (state["handoff-starter"].run, state["handoff-starter"].next_run(), ),
        (state["pollster"].run, time.time(), ), 
        (state["queue-dispatcher"].run, time.time(), ), 
        # try to spread out handoff polling, if all nodes start together
        (state["handoff-requestor"].run,
            time.time() + random.random() * handoff_polling_interval)
    ] 
    timer_driven_callbacks.extend(status_checkers)
    return timer_driven_callbacks
Example #27
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    worker_number = int(sys.argv[1])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"], 
                                         worker_number,
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_dbpool_worker_{0}".format(worker_number)
    event_push_client = EventPushClient(zeromq_context, event_source_name)

    dealer_socket = zeromq_context.socket(zmq.DEALER)
    dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(db_controller_router_socket_uri))
    dealer_socket.connect(db_controller_router_socket_uri)

    log.debug("opening local database connection")
    database_connection = get_node_local_connection()

    try:
        _send_initial_work_request(dealer_socket)
        while not halt_event.is_set():
            _process_one_transaction(dealer_socket, 
                                     database_connection,
                                     event_push_client)
    except InterruptedSystemCall:
        if halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "Interrupted zeromq system call",
                                        exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        database_connection.close()
        dealer_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
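
_send_initial_work_request and _process_one_transaction are not shown in this example. As a rough, hypothetical sketch of the former, assuming the controller only needs a simple "ready" message from each worker (the actual message format is not confirmed here):

def _send_initial_work_request(dealer_socket):
    # hypothetical sketch; the real message exchanged with the controller
    # is not shown in this example
    dealer_socket.send_json({"message-type": "ready-for-work"})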
def _setup(_halt_event, state):
    log = logging.getLogger("_setup")
    status_checkers = list()

    # do the event push client first, because we may need to
    # push an exception event from setup
    state["event-push-client"] = EventPushClient(
        state["zmq-context"],
        "anti_entropy_server"
    )

    state["central-database-connection"] = get_central_connection()
    state["local-database-connection"] = get_node_local_connection()

    state["cluster-row"] = get_cluster_row(
        state["central-database-connection"] 
    )

    local_anti_entropy_server_address = None
    for node_name, address in zip(_node_names, _anti_entropy_server_addresses):
        if node_name == _local_node_name:
            local_anti_entropy_server_address = address
            break
    assert local_anti_entropy_server_address is not None

    log.info("binding resilient-server to %s" % (
        local_anti_entropy_server_address, 
    ))
    state["resilient-server"] = ResilientServer(
        state["zmq-context"],
        local_anti_entropy_server_address,
        state["receive-queue"]
    )
    state["resilient-server"].register(state["pollster"])

    log.info("binding pull-server to %s" % (
        _anti_entropy_server_pipeline_address, 
    ))
    state["pull-server"] = PULLServer(
        state["zmq-context"],
        _anti_entropy_server_pipeline_address,
        state["receive-queue"]
    )
    state["pull-server"].register(state["pollster"])

    state["anti-entropy-clients"] = list()
    for node_name, anti_entropy_server_address in zip(
        _node_names, _anti_entropy_server_addresses
    ):
        resilient_client = ResilientClient(
                state["zmq-context"],
                state["pollster"],
                node_name,
                anti_entropy_server_address,
                _client_tag,
                _anti_entropy_server_pipeline_address
            )
        state["anti-entropy-clients"].append(resilient_client)
        status_checkers.append(
            (resilient_client.run, time.time() + random.random() * 60.0, )
        )        

    state["queue-dispatcher"] = DequeDispatcher(
        state,
        state["receive-queue"],
        _dispatch_table
    )

    state["collection-list-requestor"] = CollectionListRequestor(state)
    state["consistency-check-starter"] = ConsistencyCheckStarter(
        state, _start_consistency_check
    )
    state["retry-manager"] = RetryManager(
        state, _start_consistency_check
    )
    state["state-cleaner"] = StateCleaner(state)

    state["event-push-client"].info(
        "program-start", "anti_entropy_server starts"
    )  

    # start the collection list requestor right away
    # start the consistency check starter a little later, when
    # we presumably have some collection ids
    timer_driven_callbacks = [
        (state["pollster"].run, time.time(), ), 
        (state["queue-dispatcher"].run, time.time(), ), 
        (state["collection-list-requestor"].run, time.time(), ), 
        (state["consistency-check-starter"].run, time.time()+60.0, ), 
        (state["retry-manager"].run, state["retry-manager"].next_run(), ), 
        (state["state-cleaner"].run, state["state-cleaner"].next_run(), ), 
    ] 
    timer_driven_callbacks.extend(status_checkers)
    return timer_driven_callbacks
Example #29
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    connection = None
    file_space_info = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception("database exception",
                                            str(value),
                                            exctype=exctype.__name__)
                log.exception("Exception connecting to database")
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection, file_space_info,
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(unhandled_exception_topic,
                                        str(instance),
                                        exctype=instance.__class__.__name__)
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0