def main():
    """
    Program entry point.

    Starts the node subprocesses, merges their (sorted) generator output
    with heapq.merge, and manages them until completion or failure.

    Returns 0 on success, 1 on unhandled error (used as the exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # the signal handler sets this event to request shutdown
    shutdown_event = Event()
    set_signal_handler(shutdown_event)

    # interleave every node generator's output into one sorted stream
    merged_stream = heapq.merge(*_start_subprocesses(shutdown_event))

    try:
        _manage_subprocesses(shutdown_event, merged_stream)
    except Exception as error:
        # top-level boundary: log the traceback, report failure
        log.exception(error)
        return 1

    return 0
# Example #2
def main():
    """
    Entry point: run the test suite against the zfec server over a
    REQ socket.

    Returns 0 on success, 1 if the test run raised (used as exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    context = zmq.Context()

    request_socket = context.socket(zmq.REQ)
    # bounded linger so close() cannot block forever on unsent messages
    request_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting req socket to {0}".format(_zfec_server_address))
    request_socket.connect(_zfec_server_address)

    exit_code = 0
    try:
        successes, failures = _run_tests(request_socket)
    except Exception as error:
        log.exception(error)
        exit_code = 1
    else:
        log.info("terminates normally {0} successes {1} failures".format(
            successes, failures))
    finally:
        # always release the socket and context, even on failure
        request_socket.close()
        context.term()

    return exit_code
def main():
    """
    Entry point: launch node subprocesses and sleep until a shutdown
    signal arrives.

    Returns 0 on normal termination, 1 on unhandled error (exit code).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    shutdown = Event()
    set_signal_handler(shutdown)

    children = _start_subprocesses(shutdown)

    exit_code = 0
    try:
        # block here until a signal handler sets the event
        shutdown.wait()
    except Exception as error:
        log.exception(error)
        exit_code = 1

    # terminate every child regardless of how we woke up
    for child in children:
        child.terminate()

    return exit_code
# Example #4
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    return_value = 0

    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # halt_event is set by the signal handler to request shutdown
    halt_event = Event()
    set_signal_handler(halt_event)

    # each subprocess is exposed as a generator; heapq.merge interleaves
    # their (presumably sorted) output streams into one sorted stream
    node_generators = _start_subprocesses(halt_event)
    merge_manager = heapq.merge(*node_generators)

    try:
        _manage_subprocesses(halt_event, merge_manager)
    except Exception as instance:
        # top-level boundary: log traceback and report failure via exit code
        log.exception(instance)
        return_value = 1

    return return_value
def main():
    """
    main entry point
    return 0 for success (exit code)

    Scans candidate partitions for collectable segments, archives the
    collectable segment rows, and reports counts.  Returns -1 on
    database connection failure, -2 on collection failure.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context =  zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")  

    return_code = 0

    # in-memory buffer accumulating ids of collectable segments,
    # later handed to archive_collectable_segment_rows
    collectable_segment_ids = io.StringIO()

    partition_count = 0
    collectable_count = 0

    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            # flag: does this partition belong to a versioned collection?
            versioned_collection = \
                    partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids, 
                                        partition,
                                        versioned_collection)
            collectable_count += count
        archive_collectable_segment_rows(connection, 
                                         collectable_segment_ids,
                                         options.max_node_offline_time)
        collectable_segment_ids.close()
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info(
            "found {0:,} candidates, collected {1:,} segments".format(
                partition_count, collectable_count
            )
        )
        log.info("program terminates normally")

    # cleanup runs for both success and failure paths
    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
# Example #6
def main():
    """
    main entry point

    return 0 for success (exit code)
    """
    return_value = 0

    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # halt_event is set by the signal handler (e.g. on SIGTERM)
    halt_event = Event()
    set_signal_handler(halt_event)

    node_subprocesses = _start_subprocesses(halt_event)

    try:
        # block until a signal requests shutdown
        halt_event.wait()
    except Exception as instance:
        log.exception(instance)
        return_value = 1

    # best-effort shutdown of all child processes
    for node_subprocess in node_subprocesses:
        node_subprocess.terminate()

    return return_value
def init_setup():
    """Initialize logging, create the global Router, and schedule its init."""
    global _ROUTER
    initialize_logging(LOG_PATH)
    logger = logging.getLogger("init_setup")
    logger.info("setup start")
    _ROUTER = Router()
    # run the router's init on the gevent hub as soon as possible
    gevent.spawn_later(0.0, _ROUTER.init)
    logger.info("setup complete")
# Example #8
def init_setup():
    """Initialize logging, create the module-level Router, and schedule
    its init greenlet on the gevent hub."""
    initialize_logging(LOG_PATH)
    log = logging.getLogger("init_setup")
    log.info("setup start")
    global _ROUTER
    _ROUTER = Router()
    # delay of 0.0 => run as soon as the hub gets control
    gevent.spawn_later(0.0, _ROUTER.init)
    log.info("setup complete")
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)

    Subscribes to cache update events over a zeromq SUB socket and
    applies each one to memcached until halted.
    """
    return_value = 0
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    memcached_client = memcache.Client(_memcached_nodes)

    zeromq_context = zmq.Context()
    sub_socket = _create_sub_socket(zeromq_context)

    # per-channel sequence tracking; None => no sequence seen yet
    expected_sequence = {
        _cache_update_channel : None,
    }

    while not halt_event.is_set():
        try:
            # multipart message: topic, meta, and an optional data frame
            topic = sub_socket.recv()
            assert sub_socket.rcvmore
            meta = sub_socket.recv()
            if sub_socket.rcvmore:
                data = sub_socket.recv()
            else:
                data = ""

            _process_one_event(memcached_client, 
                               expected_sequence, 
                               topic, 
                               meta, 
                               data)

        except KeyboardInterrupt: # convenience for testing
            log.info("keyboard interrupt: terminating normally")
            halt_event.set()
        except zmq.ZMQError as zmq_error:
            # EINTR during shutdown is expected; anything else is an error
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call - ok at shutdown")
            else:
                log.exception("zeromq error processing request")
                return_value = 1
            halt_event.set()
        except Exception:
            log.exception("error processing request")
            return_value = 1
            halt_event.set()

    sub_socket.close()
    zeromq_context.term()

    log.info("program teminates: return value = {0}".format(return_value))
    return return_value
def main():
    """
    main entry point
    return 0 for success (exit code)

    Finds conjoined and segment handoff rows destined for the named
    node and processes them.  Returns 1 on unhandled exception.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    args = parse_commandline()

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context =  zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "handoff_client")
    event_push_client.info("program-start", "handoff_client starts")  

    return_code = 0
    # initialized to None so the cleanup below can tell whether the
    # databases were ever opened
    node_databases = None
    try:
        node_dict = get_node_ids(args.node_name)
        node_databases = get_node_databases()
        conjoined_rows, segment_rows = \
            get_handoff_rows(node_databases, node_dict[args.node_name])
        log.info("found {0} conjoined and {1} segment handoffs".format(
            len(conjoined_rows), len(segment_rows)))
        if len(conjoined_rows)  > 0:
            process_conjoined_rows(halt_event, 
                                   args, 
                                   node_databases, 
                                   conjoined_rows)
        if len(segment_rows)  > 0:
            process_segment_rows(halt_event, 
                                 zeromq_context, 
                                 args, 
                                 node_dict,
                                 node_databases,
                                 segment_rows)
    except Exception as instance:
        log.exception("Uhandled exception {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = 1

    # close per-node database connections if they were opened
    if node_databases is not None:
        for connection in node_databases.values():
            connection.close()
    event_push_client.close()
    zeromq_context.term()

    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
# Example #11
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)

    Subscribes to cache update events over a zeromq SUB socket and
    applies each one to memcached until halted.
    """
    return_value = 0
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    memcached_client = memcache.Client(_memcached_nodes)

    zeromq_context = zmq.Context()
    sub_socket = _create_sub_socket(zeromq_context)

    # per-channel sequence tracking; None => no sequence seen yet
    expected_sequence = {
        _cache_update_channel: None,
    }

    while not halt_event.is_set():
        try:
            # multipart message: topic, meta, and an optional data frame
            topic = sub_socket.recv()
            assert sub_socket.rcvmore
            meta = sub_socket.recv()
            if sub_socket.rcvmore:
                data = sub_socket.recv()
            else:
                data = ""

            _process_one_event(memcached_client, expected_sequence, topic,
                               meta, data)

        except KeyboardInterrupt:  # convenience for testing
            log.info("keyboard interrupt: terminating normally")
            halt_event.set()
        except zmq.ZMQError as zmq_error:
            # EINTR during shutdown is expected; anything else is an error
            if is_interrupted_system_call(zmq_error) and halt_event.is_set():
                log.info("interrupted system call - ok at shutdown")
            else:
                log.exception("zeromq error processing request")
                return_value = 1
            halt_event.set()
        except Exception:
            log.exception("error processing request")
            return_value = 1
            halt_event.set()

    sub_socket.close()
    zeromq_context.term()

    log.info("program teminates: return value = {0}".format(return_value))
    return return_value
# Example #12
def main():
    """
    main entry point
    return 0 for success (exit code)

    Scans candidate partitions for collectable segments, archives the
    collectable segment rows, and reports counts.  Returns -1 on
    database connection failure, -2 on collection failure.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0

    # in-memory buffer accumulating ids of collectable segments
    collectable_segment_ids = io.StringIO()

    partition_count = 0
    collectable_count = 0

    try:
        versioned_collections = get_versioned_collections()
        for partition in generate_candidate_partitions(connection):
            partition_count += 1
            # flag: does this partition belong to a versioned collection?
            versioned_collection = \
                    partition[0].collection_id in versioned_collections
            count = _evaluate_partition(collectable_segment_ids, partition,
                                        versioned_collection)
            collectable_count += count
        archive_collectable_segment_rows(connection, collectable_segment_ids,
                                         options.max_node_offline_time)
        collectable_segment_ids.close()
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("found {0:,} candidates, collected {1:,} segments".format(
            partition_count, collectable_count))
        log.info("program terminates normally")

    # cleanup runs for both success and failure paths
    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
# Example #13
def main():
    """
    main entry point for the web public reader server

    Installs a SIGTERM handler that sets halt_event, then starts the
    WebPublicReaderServer.  Returns -1 on unhandled startup exception;
    otherwise falls through with the result of start() (None here).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    halt_event = Event()
    gevent.signal(signal.SIGTERM, _signal_handler_closure(halt_event))

    try:
        web_public_reader = WebPublicReaderServer(halt_event)
        web_public_reader.start()
    # fixed: 'except Exception, instance' is Python-2-only syntax and a
    # SyntaxError on Python 3; the 'as' form (PEP 3110) works on both
    # and matches the rest of the file
    except Exception as instance:
        log.exception(str(instance))
        return -1
def main():
    """
    main entry point for the web public reader server

    Installs a SIGTERM handler that sets halt_event, then starts the
    WebPublicReaderServer.  Returns -1 on unhandled startup exception;
    otherwise falls through with the result of start() (None here).
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    halt_event = Event()
    gevent.signal(signal.SIGTERM, _signal_handler_closure(halt_event))

    try:
        web_public_reader = WebPublicReaderServer(halt_event)
        web_public_reader.start()
    # fixed: 'except Exception, instance' is Python-2-only syntax and a
    # SyntaxError on Python 3; the 'as' form (PEP 3110) works on both
    # and matches the rest of the file
    except Exception as instance:
        log.exception(str(instance))
        return -1
def main():
    """
    main entry point
    return 0 for success (exit code)

    Unlinks unused and unreachable value files, then rewrites the
    remaining value files to reclaim space.  Returns -1 on database
    connection failure, -2 on collection failure.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database")
        return -1

    zmq_context =  zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")  

    return_code = 0

    try:
        # three phases: drop fully-unused files, drop unreachable files,
        # then rewrite the remainder to compact them
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator =  generate_value_file_references(options, connection)
        savings = rewrite_value_files(
            options, connection, _repository_path, ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")

        # report reclaimed byte counts to the event system
        event_push_client.info(
            "rewrite complete", 
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings
        )  

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
# Example #16
def main():
    """
    main entry point
    return 0 for success (exit code)

    Unlinks unused and unreachable value files, then rewrites the
    remaining value files to reclaim space.  Returns -1 on database
    connection failure, -2 on collection failure.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    options = get_options()

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database")
        return -1

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "garbage_collector")
    event_push_client.info("program-start", "garbage_collector starts")

    return_code = 0

    try:
        # three phases: drop fully-unused files, drop unreachable files,
        # then rewrite the remainder to compact them
        total_unused_value_file_size = unlink_totally_unused_value_files(
            connection, _repository_path)
        unreachable_value_file_size = unlink_unreachable_value_files(
            connection, _repository_path)
        ref_generator = generate_value_file_references(options, connection)
        savings = rewrite_value_files(options, connection, _repository_path,
                                      ref_generator)
    except Exception:
        log.exception("_garbage_collection")
        return_code = -2
    else:
        log.info("program terminates normally")

        # report reclaimed byte counts to the event system
        event_push_client.info(
            "rewrite complete",
            "garbage_collector finished",
            unused_value_file_bytes_reclaimed=total_unused_value_file_size,
            unreachable_value_file_bytes_reclaimed=unreachable_value_file_size,
            rewrite_value_file_savings=savings)

    connection.close()

    event_push_client.close()
    zmq_context.term()

    return return_code
def main():
    """
    main processing module

    Reads a JSON config (path from sys.argv[1]), starts a redis sink
    greenlet and one Pinger greenlet per config entry, then waits for
    the halt event.  Returns the module-level _return_code.
    """
    global _return_code

    initialize_logging(_log_path)
    log = logging.getLogger("main")

    halt_event = Event()
    gevent.signal(signal.SIGTERM, _handle_sigterm, halt_event)
    log.info("program starts")

    # pingers push results onto this queue; the redis sink drains it
    redis_queue = gevent.queue.Queue()
    greenlets = list()

    try:
        config_path = sys.argv[1]
        log.info("reading config from '{0}'".format(config_path))
        with open(config_path) as input_file:
            config = json.load(input_file)

        redis_sink = WebMonitorRedisSink(halt_event, redis_queue)
        # a sink failure sets the halt event so everything shuts down
        redis_sink.link_exception(_redis_exception_closure(halt_event))
        redis_sink.start()

        greenlets.append(redis_sink)

        for config_entry in config:
            pinger = Pinger(halt_event, 
                            _polling_interval, 
                            redis_queue, 
                            config_entry)

            pinger.link_exception(_unhandled_greenlet_exception)
            pinger.start()

            greenlets.append(pinger)

    except Exception as instance:
        log.exception(instance)
        _return_code = 1

    # wait here while the pingers do their job
    halt_event.wait()

    # give each greenlet a bounded chance to finish
    for entry in greenlets:
        entry.join(timeout=3.0)

    log.info("program terminates return code {0}".format(_return_code))
    return _return_code
    def setUp(self):
        """Reset state, clear old stats, and start a space accounting
        server, verifying the process came up."""
        initialize_logging(_log_path)
        # run tearDown first to clean up anything left by a prior test
        self.tearDown()

        # clear out any old stats
        space_accounting_database = SpaceAccountingDatabase()
        space_accounting_database.clear_collection_stats(_collection_id)
        space_accounting_database.commit()

        self._space_accounting_server_process = start_space_accounting_server(
            _local_node_name, _space_accounting_server_address, _space_accounting_pipeline_address
        )
        # poll_process returns None while the process is still running
        poll_result = poll_process(self._space_accounting_server_process)
        self.assertEqual(poll_result, None)
def main():
    """
    main entry point

    return 0 for success (exit code)

    Connects to the zfec server, launches read/write subprocesses, and
    runs the cluster repair.  Returns -3 on unhandled exception.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context =  zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_repair")
    event_push_client.info("program-start", "cluster_repair starts")  

    # REQ socket to the zfec (erasure coding) server
    zfec_server_req_socket = zmq_context.socket(zmq.REQ)
    # bounded linger so close() cannot block forever on unsent messages
    zfec_server_req_socket.setsockopt(zmq.LINGER, 1000)
    log.info("connecting req socket to {0}".format(_zfec_server_address))
    zfec_server_req_socket.connect(_zfec_server_address)

    read_subprocess = _start_read_subprocess()
    write_subprocess = _start_write_subprocess()

    try:
        _repair_cluster(halt_event, 
                        zfec_server_req_socket, 
                        read_subprocess, 
                        write_subprocess)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -3
    finally:
        # shut down subprocesses and sockets on every exit path
        read_subprocess.terminate()
        write_subprocess.terminate()
        event_push_client.close()
        zfec_server_req_socket.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
# Example #20
def main():
    """
    main processing module

    Reads a JSON config (path from sys.argv[1]), starts a redis sink
    greenlet and one Pinger greenlet per config entry, then waits for
    the halt event.  Returns the module-level _return_code.
    """
    global _return_code

    initialize_logging(_log_path)
    log = logging.getLogger("main")

    halt_event = Event()
    gevent.signal(signal.SIGTERM, _handle_sigterm, halt_event)
    log.info("program starts")

    # pingers push results onto this queue; the redis sink drains it
    redis_queue = gevent.queue.Queue()
    greenlets = list()

    try:
        config_path = sys.argv[1]
        log.info("reading config from '{0}'".format(config_path))
        with open(config_path) as input_file:
            config = json.load(input_file)

        redis_sink = WebMonitorRedisSink(halt_event, redis_queue)
        # a sink failure sets the halt event so everything shuts down
        redis_sink.link_exception(_redis_exception_closure(halt_event))
        redis_sink.start()

        greenlets.append(redis_sink)

        for config_entry in config:
            pinger = Pinger(halt_event, _polling_interval, redis_queue,
                            config_entry)

            pinger.link_exception(_unhandled_greenlet_exception)
            pinger.start()

            greenlets.append(pinger)

    except Exception as instance:
        log.exception(instance)
        _return_code = 1

    # wait here while the pingers do their job
    halt_event.wait()

    # give each greenlet a bounded chance to finish
    for entry in greenlets:
        entry.join(timeout=3.0)

    log.info("program terminates return code {0}".format(_return_code))
    return _return_code
# Example #21
def main():
    """Main entry point for cluster simulator.

    Parses the command line, creates or loads the cluster config,
    optionally creates/starts the database, then runs the command
    interpreter loop.  Returns 0 on normal exit, 1 on sanity failure.
    """
    args = parse_cmdline()
    config = ClusterConfig(args)
    # fixed: py2-only 'print expr' statements are a SyntaxError on py3;
    # the single-argument call form works on both 2.x and 3.x
    print(repr(args))

    if not sanity_check(config):
        return 1

    if config.createnew:
        ensure_paths(config)
        old_config = config
    else:
        # keep the original for flags; replace with the persisted config
        old_config = config
        config = ClusterConfig.load(config)

    if old_config.logprune:
        remove_files(config.log_path)

    # save() sets createnew to false, so remember it first
    createnew = config.createnew

    if old_config.createnew and not config.systemdb:
        config.database_users.update(create_database(config))
        print("Saving config to %s" % (config.config_path, ))
        config.save()
    elif not config.systemdb:
        start_database(config)

    # expose the cluster environment to child processes
    os.environ.update(dict(config.env_for_cluster()))

    log_path = os.path.join(config.log_path, _log_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")  # fixed typo: was "progam starts"

    log.info("entering main loop")
    command_interpreter = CommandInterpreter(config, createnew)
    if old_config.start:
        command_interpreter.do_start("all")
        print("Web servers at: %s" % (", ".join(config.web_server_urls), ))
    command_interpreter.cmdloop("sim.nimbus.io")
    log.info("leaving main loop")

    log.info("program ends normally")
    return 0
# Example #22
def main():
    """
    main entry point
    return 0 for success (exit code)

    Finds conjoined and segment handoff rows destined for the named
    node and processes them.  Returns 1 on unhandled exception.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    args = parse_commandline()

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_push_client = EventPushClient(zeromq_context, "handoff_client")
    event_push_client.info("program-start", "handoff_client starts")

    return_code = 0
    # initialized to None so the cleanup below can tell whether the
    # databases were ever opened
    node_databases = None
    try:
        node_dict = get_node_ids(args.node_name)
        node_databases = get_node_databases()
        conjoined_rows, segment_rows = \
            get_handoff_rows(node_databases, node_dict[args.node_name])
        log.info("found {0} conjoined and {1} segment handoffs".format(
            len(conjoined_rows), len(segment_rows)))
        if len(conjoined_rows) > 0:
            process_conjoined_rows(halt_event, args, node_databases,
                                   conjoined_rows)
        if len(segment_rows) > 0:
            process_segment_rows(halt_event, zeromq_context, args, node_dict,
                                 node_databases, segment_rows)
    except Exception as instance:
        log.exception("Uhandled exception {0}".format(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_code = 1

    # close per-node database connections if they were opened
    if node_databases is not None:
        for connection in node_databases.values():
            connection.close()
    event_push_client.close()
    zeromq_context.term()

    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
def main():
    """Main entry point for cluster simulator.

    Parses the command line, creates or loads the cluster config,
    optionally creates/starts the database, then runs the command
    interpreter loop.  Returns 0 on normal exit, 1 on sanity failure.
    """
    args = parse_cmdline()
    config = ClusterConfig(args)
    # fixed: py2-only 'print expr' statements are a SyntaxError on py3;
    # the single-argument call form works on both 2.x and 3.x
    print(repr(args))

    if not sanity_check(config):
        return 1

    if config.createnew:
        ensure_paths(config)
        old_config = config
    else:
        # keep the original for flags; replace with the persisted config
        old_config = config
        config = ClusterConfig.load(config)

    if old_config.logprune:
        remove_files(config.log_path)

    # save() sets createnew to false, so remember it first
    createnew = config.createnew

    if old_config.createnew and not config.systemdb:
        config.database_users.update(create_database(config))
        print("Saving config to %s" % (config.config_path, ))
        config.save()
    elif not config.systemdb:
        start_database(config)

    # expose the cluster environment to child processes
    os.environ.update(dict(config.env_for_cluster()))

    log_path = os.path.join(config.log_path, _log_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")  # fixed typo: was "progam starts"

    log.info("entering main loop")
    command_interpreter = CommandInterpreter(config, createnew)
    if old_config.start:
        command_interpreter.do_start("all")
        print("Web servers at: %s" % (", ".join(config.web_server_urls), ))
    command_interpreter.cmdloop("sim.nimbus.io")
    log.info("leaving main loop")

    log.info("program ends normally")
    return 0
def main():
    """
    main entry point

    return 0 for success (exit code)

    Pulls segment data from all nodes into a work directory and audits
    it.  Returns -1 if halted during the pull, -3 on unhandled error.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context =  zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_inspector")
    event_push_client.info("program-start", "cluster_inspector starts")  

    # if there's any wreckage from a previous run, clear it out
    if os.path.exists(_work_dir):
        log.info("removing old {0}".format(_work_dir))
        shutil.rmtree(_work_dir)
    os.mkdir(_work_dir)

    try:
        pull_segments_from_nodes(halt_event, _work_dir)

        # a shutdown request during the pull aborts before auditing
        if halt_event.is_set():
            log.info("halt_event set (1): exiting")
            return -1

        audit_segments(halt_event, _work_dir)

    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -3

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
# Example #25
def main():
    """
    main entry point

    return 0 for success (exit code)

    Pulls segment data from all nodes into a work directory and audits
    it.  Returns -1 if halted during the pull, -3 on unhandled error.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_inspector")
    event_push_client.info("program-start", "cluster_inspector starts")

    # if there's any wreckage from a previous run, clear it out
    if os.path.exists(_work_dir):
        log.info("removing old {0}".format(_work_dir))
        shutil.rmtree(_work_dir)
    os.mkdir(_work_dir)

    try:
        pull_segments_from_nodes(halt_event, _work_dir)

        # a shutdown request during the pull aborts before auditing
        if halt_event.is_set():
            log.info("halt_event set (1): exiting")
            return -1

        audit_segments(halt_event, _work_dir)

    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return -3

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
# Example #26
    def setUp(self):
        """Reset state, clear old stats, and start a space accounting
        server, verifying the process came up."""
        initialize_logging(_log_path)
        # run tearDown first to clean up anything left by a prior test
        self.tearDown()

        # clear out any old stats
        space_accounting_database = SpaceAccountingDatabase()
        space_accounting_database.clear_collection_stats(_collection_id)
        space_accounting_database.commit()

        self._space_accounting_server_process = \
            start_space_accounting_server(
                _local_node_name,
                _space_accounting_server_address,
                _space_accounting_pipeline_address
            )
        # poll_process returns None while the process is still running
        poll_result = poll_process(self._space_accounting_server_process)
        self.assertEqual(poll_result, None)
# Example #27
def main():
    """
    main entry point

    return 0 for success (exit code)

    Connects to the zfec server, launches read/write subprocesses, and
    runs the cluster repair.  Returns -3 on unhandled exception.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "cluster_repair")
    event_push_client.info("program-start", "cluster_repair starts")

    # REQ socket to the zfec (erasure coding) server
    zfec_server_req_socket = zmq_context.socket(zmq.REQ)
    # bounded linger so close() cannot block forever on unsent messages
    zfec_server_req_socket.setsockopt(zmq.LINGER, 1000)
    log.info("connecting req socket to {0}".format(_zfec_server_address))
    zfec_server_req_socket.connect(_zfec_server_address)

    read_subprocess = _start_read_subprocess()
    write_subprocess = _start_write_subprocess()

    try:
        _repair_cluster(halt_event, zfec_server_req_socket, read_subprocess,
                        write_subprocess)
    except KeyboardInterrupt:
        halt_event.set()
    except Exception as instance:
        log.exception(str(instance))
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return -3
    finally:
        # shut down subprocesses and sockets on every exit path
        read_subprocess.terminate()
        write_subprocess.terminate()
        event_push_client.close()
        zfec_server_req_socket.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
def main():
    """
    main entry point

    Runs _setup, then loops the pollster until the halt event is set,
    then runs _tear_down.  Any unhandled exception in a phase sets
    returncode to 1; each phase is attempted regardless.
    """
    returncode = 0

    initialize_logging(_log_path)

    log = logging.getLogger("main")
    # state is a dict; this code reads "halt-event" and "pollster" keys
    state = _create_state()
    set_signal_handler(state["halt-event"])

    try:
        _setup(state)
    except Exception:
        # sys.exc_info form instead of 'except ... as' — presumably for
        # py2/py3 compatibility
        instance = sys.exc_info()[1]
        log.exception("unhandled exception in _setup")
        log.critical("unhandled exception in _setup {0}".format(
            instance))
        state["halt-event"].set()
        returncode = 1

    log.debug("start halt_event loop")
    while not state["halt-event"].is_set():
        try:
            state["pollster"].run(state["halt-event"])
        except Exception:
            instance = sys.exc_info()[1]
            log.exception("unhandled exception in pollster")
            log.critical("unhandled exception in pollster {0}".format(
                instance))
            state["halt-event"].set()
            returncode = 1
    log.debug("end halt_event loop")

    try:
        _tear_down(state)
    except Exception:
        instance = sys.exc_info()[1]
        log.exception("unhandled exception in _tear_down")
        log.critical("unhandled exception in _tear_down {0}".format(
            instance))
        returncode = 1

    return returncode
def main():
    """
    main entry point
    returns 0 for normal termination (usually SIGTERM)

    An optional command line argument selects the server instance number
    (defaults to 0); it is embedded in the log file path.
    """
    server_number = int(sys.argv[1]) if len(sys.argv) > 1 else 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name,
                                         server_number)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)

    exit_status = 0
    try:
        # serve requests one at a time until we are told to halt
        while not halt_event.is_set():
            _process_one_request(rep_socket)
    except ZfecServerInterrupedSystemCall:
        if halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
            return 0
        log.exception("error processing request")
        exit_status = 1
    except Exception:
        log.exception("error processing request")
        exit_status = 1
    else:
        log.info("program teminates normally")
    finally:
        # the finally clause runs even on the early return above
        rep_socket.close()
        zeromq_context.term()

    return exit_status
Exemple #30
0
def main():
    """
    main entry point

    sys.argv[1] is an index selecting the source node; connect a REQ
    socket to that node's anti-entropy data reader and process repair
    entries.  Returns 0 on success, 1 on failure.
    """
    index = int(sys.argv[1])
    source_node_name = _node_names[index]
    reader_address = _data_reader_anti_entropy_addresses[index]

    log_path = "{0}/nimbusio_cluster_repair_data_reader_{1}_to_{2}.log".format(
        os.environ["NIMBUSIO_LOG_DIR"], source_node_name, _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")

    log.info("program starts: reading from node {0}".format(source_node_name))

    zeromq_context = zmq.Context()

    req_socket = zeromq_context.socket(zmq.REQ)
    req_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting req socket to {0}".format(reader_address))
    req_socket.connect(reader_address)

    exit_status = 0
    try:
        record_count = _process_repair_entries(index,
                                               source_node_name,
                                               req_socket)
    except Exception as instance:
        log.exception(instance)
        exit_status = 1
    else:
        log.info("terminates normally {0} audit records processed".format(
            record_count))
    finally:
        req_socket.close()
        zeromq_context.term()

    return exit_status
Exemple #31
0
def main():
    """
    main entry point for a zfec server instance

    An optional command line argument selects the server instance number
    (defaults to 0); it is embedded in the log file path.  The server
    binds a zeromq REP socket and answers requests one at a time until
    the halt event is set.

    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0
    # server_number distinguishes multiple server instances in the log path
    if len(sys.argv) == 1:
        server_number = 0
    else:
        server_number = int(sys.argv[1])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name, server_number)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)

    try:
        # serve one request at a time until we are told to halt
        while not halt_event.is_set():
            _process_one_request(rep_socket)
    except ZfecServerInterrupedSystemCall:
        # an interrupted system call is the normal way a blocking recv
        # ends when SIGTERM sets the halt event
        if halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
            return 0
        log.exception("error processing request")
        return_value = 1
    except Exception:
        log.exception("error processing request")
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # runs even on the early return above
        rep_socket.close()
        zeromq_context.term()

    return return_value
Exemple #32
0
def main():
    """
    main entry point

    Pull segment data (and damaged-segment data) for one remote node
    into a work directory.  Command line: <work_dir> <node_index>.

    Returns 0 on success, -1 if the database connection fails,
    -2 if pulling segment data fails.
    """
    work_dir, index_str = sys.argv[1:]
    index = int(index_str)
    source_node = _node_names[index]
    db_host = _node_database_hosts[index]
    db_port = _node_database_ports[index]
    db_password = _node_database_passwords[index]

    log_path = "{0}/nimbusio_segment_puller_from_{1}_to_{2}.log".format(
        os.environ["NIMBUSIO_LOG_DIR"], source_node, _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")

    log.info("program starts: work_dir={0}, index={1}, {2}".format(
        work_dir, index, source_node))

    try:
        connection = get_node_connection(source_node, db_password,
                                         db_host, db_port)
    except Exception as instance:
        log.exception("Unable to connect to database {0}".format(instance))
        return -1

    try:
        _pull_segment_data(connection, work_dir, source_node)
        _pull_damaged_segment_data(connection, work_dir, source_node)
    except Exception as instance:
        log.exception("_pull_segment_data failed {0}".format(instance))
        return -2
    finally:
        connection.close()

    log.info("program terminates normally")
    return 0
def main():
    """
    main entry point

    Pull segment data (and damaged-segment data) for one remote node
    into a work directory.  Command line: <work_dir> <node_index>.

    Returns 0 on success, -1 if the database connection fails,
    -2 if pulling segment data fails.
    """
    [work_dir, index_str, ] = sys.argv[1:]
    index = int(index_str)
    node_name = _node_names[index]
    database_host = _node_database_hosts[index]
    database_port = _node_database_ports[index]
    database_password = _node_database_passwords[index]

    log_path = "{0}/nimbusio_segment_puller_from_{1}_to_{2}.log".format(
        os.environ["NIMBUSIO_LOG_DIR"], node_name, _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")

    log.info("program starts: work_dir={0}, index={1}, {2}".format(
        work_dir, index, node_name))

    try:
        connection = get_node_connection(node_name,
                                         database_password,
                                         database_host,
                                         database_port)
    except Exception as instance:
        log.exception("Unable to connect to database {0}".format(instance))
        return -1

    try:
        _pull_segment_data(connection, work_dir, node_name)
        _pull_damaged_segment_data(connection, work_dir, node_name)
    except Exception as instance:
        log.exception("_pull_segment_data failed {0}".format(instance))
        return -2
    finally:
        # close the connection whether or not the pulls succeeded
        connection.close()

    log.info("program terminates normally")
    return 0
Exemple #34
0
def main():
    """
    main entry point

    Loop until the halt event is set (SIGTERM or KeyboardInterrupt):
    keep a database connection open, run one defrag pass per iteration
    inside a transaction, and sleep between passes when there was no
    work to do.

    return 0 for success (exit code)
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")

    connection = None
    file_space_info = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                # capture type and value of the active exception for the
                # event push report, then retry after a delay
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception("database exception",
                                            str(value),
                                            exctype=exctype.__name__)
                log.exception("Exception connecting to database")
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection)
            file_space_sanity_check(file_space_info, _repository_path)

        # try one defrag pass
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection, file_space_info,
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            # report the failed pass but keep running: roll back and let
            # the loop try again
            log.exception(str(instance))
            event_push_client.exception(unhandled_exception_topic,
                                        str(instance),
                                        exctype=instance.__class__.__name__)
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()

            # close the database connection; it will be reopened at the
            # top of the loop if we keep running
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()

    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
Exemple #35
0
def main():
    """
    main entry point

    Parse the max-value-file-time configuration, connect to the local
    node database, and process all inspection work batches inside a
    single transaction.

    return 0 for success (exit code), -1 on any failure
    """
    global _max_value_file_time

    initialize_logging(_log_path)
    log = logging.getLogger("main")

    try:
        _max_value_file_time = parse_timedelta_str(_max_value_file_time_str)
    except Exception as instance:
        log.exception("Unable to parse '{0}' {1}".format(
            _max_value_file_time_str, instance))
        return -1

    log.info("program starts; max_value_file_time = {0}".format(
        _max_value_file_time))

    zmq_context = zmq.Context()

    event_push_client = EventPushClient(zmq_context, "node_inspector")
    event_push_client.info("program-start", "node_inspector starts")

    try:
        connection = get_node_local_connection()
    except Exception as instance:
        log.exception("Exception connecting to database {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -1

    known_value_files = dict()

    # bind batch before the try block: if generate_work() raises before
    # yielding anything, the except handler below would otherwise raise
    # NameError while formatting its log message, masking the real error
    batch = None

    connection.begin_transaction()
    try:
        for batch in generate_work(connection):
            _process_work_batch(connection, known_value_files, batch)
    except Exception as instance:
        connection.rollback()
        log.exception("Exception processing batch {0} {1}".format(
            batch, instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return -1
    else:
        connection.commit()
    finally:
        connection.close()
        event_push_client.close()
        zmq_context.term()

    log.info("program terminates normally")
    return 0
Exemple #36
0
def main():
    """
    main entry point

    Start the ping subprocesses, then loop reading availability messages
    from a PULL socket, polling the ping subprocesses periodically.

    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"], 
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    prepare_ipc_path(_pull_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    pull_socket = _bind_pull_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context, "service_availability")
    event_push_client.info("program-starts", 
                           "service availability monitor starts")

    # pre-bind an empty dict: if _start_ping_processes raises inside the
    # try block, the cleanup below would otherwise hit a NameError
    ping_process_dict = dict()

    message_count = 0
    try:
        ping_process_dict = _start_ping_processes(halt_event)

        while not halt_event.is_set():

            # guard against an empty dict to avoid modulo-by-zero
            if ping_process_dict and \
                message_count % len(ping_process_dict) == 0:
                for ping_process in ping_process_dict.values():
                    poll_subprocess(ping_process.process)

            message = pull_socket.recv_pyobj()
            assert not pull_socket.rcvmore

            _process_one_message(message, ping_process_dict, event_push_client)

            message_count += 1

    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        # an interrupted system call with the halt event set is the
        # normal SIGTERM shutdown path
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminating normally; interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminating normally")

    log.debug("terminating subprocesses")
    _terminate_ping_processes(ping_process_dict)
    pull_socket.close()
    event_push_client.close()
    zeromq_context.term()

    return return_value
def main():
    """
    main entry point for the database pool controller

    Bind a PULL socket and a ROUTER socket, launch the database pool
    worker subprocesses, and dispatch work between the sockets and the
    io controller until the halt event is set.

    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"], 
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    # NOTE(review): resources.halt_event is a *second* Event(), distinct
    # from the halt_event wired to the signal handler above; the main loop
    # below waits on the outer halt_event.  Confirm the inner event is set
    # somewhere else, or whether both should be the same object.
    resources = \
        _resources_tuple(halt_event=Event(),
                         zeromq_context=zeromq_context,
                         reply_push_sockets=dict(),
                         pull_socket=zeromq_context.socket(zmq.PULL),
                         io_controller_push_socket=\
                            zeromq_context.socket(zmq.PUSH),
                         router_socket=zeromq_context.socket(zmq.ROUTER),
                         event_push_client=\
                            EventPushClient(zeromq_context, 
                                            "rs_db_pool_controller"),
                         active_retrieves=dict(),
                         pending_work_queue=deque(),
                         available_ident_queue=deque())

    log.debug("binding to {0}".format(db_controller_pull_socket_uri))
    resources.pull_socket.bind(db_controller_pull_socket_uri)

    log.debug("connecting to {0}".format(io_controller_pull_socket_uri))
    resources.io_controller_push_socket.connect(io_controller_pull_socket_uri)

    resources.router_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("binding to {0}".format(db_controller_router_socket_uri))
    resources.router_socket.bind(db_controller_router_socket_uri)

    # we poll the sockets for readability, we assume we can always
    # write to the router socket
    poller = zmq.Poller()
    poller.register(resources.pull_socket, zmq.POLLIN | zmq.POLLERR)
    poller.register(resources.router_socket, zmq.POLLIN| zmq.POLLERR)

    worker_processes = list()
    for index in range(_worker_count):
        worker_processes.append(_launch_database_pool_worker(index+1))
    
    last_report_time = 0.0
    try:
        while not halt_event.is_set():
            # restart any worker subprocess that has died
            for worker_process in worker_processes:
                poll_subprocess(worker_process)
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message) 
                if active_socket is resources.pull_socket:
                    _read_pull_socket(resources)
                elif active_socket is resources.router_socket:
                    _read_router_socket(resources)
                else:
                    log.error("unknown socket {0}".format(active_socket))
            # periodically report queue sizes to the log and event pusher
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = \
                    "{0:,} active_retrives, " \
                    "{1:,} pending_work_queue entries, " \
                    "{2:,} available_ident_queue entries" \
                    "".format(len(resources.active_retrieves),
                              len(resources.pending_work_queue),
                              len(resources.available_ident_queue))
                log.info(report_message)
                resources.event_push_client.info(
                    "queue_sizes", 
                    report_message,
                    active_retrieves=len(resources.active_retrieves),
                    pending_work_queue=len(resources.pending_work_queue),
                    available_ident_queue=len(resources.available_ident_queue))

                last_report_time = current_time

    except zmq.ZMQError as zmq_error:
        # an interrupted system call with the halt event set is the
        # normal SIGTERM shutdown path
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(unhandled_exception_topic,
                                                  "zeromq_error",
                                                  exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # tear everything down in reverse order of creation
        for worker_process in worker_processes:
            terminate_subprocess(worker_process)
        resources.pull_socket.close()
        resources.io_controller_push_socket.close()
        resources.router_socket.close()
        for push_socket in resources.reply_push_sockets.values():
            push_socket.close()
        resources.event_push_client.close()
        zeromq_context.term()

    return return_value
def main(
    log_path,
    state,
    pre_loop_actions,
    post_loop_actions,
    exception_action=None,
    halt_event=Event(),
):
    """
    This function is run as the main entry point of every time queue driven
    process. It wraps the event loop driven by the time_queue.

    log_path
        The full path to the log file for this process

    state
        State object (usually a dict) passed to callback functions

    pre_loop_actions
        A list of functions to be run before the event loop starts.

        Function arguments are ``(halt_event, state)``

        Functions may return a list of tuples to be added to the time queue
        ``(callback_function, start_time, )``

        In nimbus.io processes this is conventionally a single
        function called ``_startup``, but it can have any name.

    post_loop_actions
        A list of functions to be run after the event loop terminates
        (``halt_event`` set)

        Function argument is ``(state)``; the return value is ignored.

        In nimbus.io processes this function is conventionally named
        ``_tear_down``, but it can have any name.

    exception_action
        A function to be executed when the event loop catches an
        exception from a callback function.

        It takes ``(state)`` as an argument; its return value is ignored.

        In nimbus.io processes this function is used to push a zeromq
        message to an event publisher

    halt_event (optional)
        a ``threading.Event`` that will be set when SIGTERM is detected,
        used to terminate the event loop.

        halt_event is passed as an argument to callback functions.

        NOTE: the default Event is created once at import time and is
        shared by every call that does not pass its own; pass an explicit
        event for isolated use.

    returns 0 for normal termination, nonzero (12) for failure
    """
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("start")

    while not halt_event.is_set():
        try:
            _run_until_halt(
                state,
                pre_loop_actions,
                post_loop_actions,
                halt_event
            )
        # 'except ... as ...' replaces the Python-2-only comma form
        except Exception as instance:
            log.exception(instance)
            # write to stderr without the py2-only 'print >>' syntax
            sys.stderr.write("{0} {1}\n".format(
                instance.__class__.__name__, str(instance)))
            if exception_action is not None:
                exception_action(state)
            return 12

    # the halt event was set (normally by SIGTERM); return 0 as the
    # docstring promises instead of falling off the end (returning None)
    log.info("normal termination")
    return 0
Exemple #39
0
        self.tearDown()

    def tearDown(self):
        """no per-test cleanup is required"""
        pass

    def test_invalid_url(self):
        """an unparseable URL must yield None"""
        self.assertEqual(parse_url("GET", "pork"), None)

    def test_valid_urls(self):
        """every known-good (method, url) pair must parse to its action"""
        for method, url, expected_action in _valid_urls_with_actions:
            parse_result = parse_url(method, url)
            self.assertNotEqual(parse_result, None, (method, url, ))
            action, match_object = parse_result
            self.assertEqual(action, expected_action,
                             (action, expected_action, method, url, ))
            # the per-action checker validates the regex match groups
            matcher = _match_object_dispatch_table[action]
            self.assertTrue(matcher(match_object), (method, url, ))

if __name__ == "__main__":
    # set up logging before handing control to the unittest runner
    initialize_logging(_log_path)
    unittest.main()


Exemple #40
0
def main():
    """
    main entry point for an io worker subprocess

    Command line: <volume_name> <worker_number>.  Connect a DEALER
    socket to the io controller, then loop requesting and processing
    work, keeping an LRU cache of open files and periodically closing
    files that have gone unused.

    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    volume_name = sys.argv[1]
    worker_number = int(sys.argv[2])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         volume_name.replace("/", "_"),
                                         worker_number, _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_io_worker_{0}_{1}".format(volume_name,
                                                      worker_number)
    resources = \
        _resources_tuple(halt_event=halt_event,
                         zeromq_context=zeromq_context,
                         reply_push_sockets=dict(),
                         event_push_client=EventPushClient(zeromq_context,
                                                           event_source_name),
                         dealer_socket=zeromq_context.socket(zmq.DEALER),
                         file_cache=LRUCache(_max_file_cache_size))

    resources.dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(io_controller_router_socket_uri))
    resources.dealer_socket.connect(io_controller_router_socket_uri)

    last_close_pass_time = time.time()
    try:
        while not halt_event.is_set():
            # an occasional pass that closes any open files that haven't
            # been used
            current_time = time.time()
            elapsed_time = current_time - last_close_pass_time
            if elapsed_time > _unused_file_close_interval:
                _make_close_pass(resources, current_time)
                last_close_pass_time = current_time

            _send_work_request(resources, volume_name)
            _process_request(resources)

    except InterruptedSystemCall:
        # an interrupted system call with the halt event set is the
        # normal SIGTERM shutdown path
        if halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(
                unhandled_exception_topic,
                "Interrupted zeromq system call",
                exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # release zeromq resources before terminating the context
        resources.dealer_socket.close()
        for push_socket in resources.reply_push_sockets.values():
            push_socket.close()
        resources.event_push_client.close()
        resources.zeromq_context.term()

    return return_value
Exemple #41
0
def main(
        log_path,
        state,
        pre_loop_actions,
        post_loop_actions,
        exception_action=None,
        halt_event=Event(),
):
    """
    Main entry point of every time-queue driven process; wraps the
    event loop driven by the time_queue.

    log_path
        full path to this process's log file

    state
        state object (usually a dict) passed to callback functions

    pre_loop_actions
        functions run before the event loop starts, each called with
        ``(halt_event, state)``; each may return a list of
        ``(callback_function, start_time, )`` tuples to seed the
        time queue (conventionally a single ``_startup`` function)

    post_loop_actions
        functions run after the event loop terminates (``halt_event``
        set), each called with ``(state)``; return values are ignored
        (conventionally a single ``_tear_down`` function)

    exception_action
        optional function called with ``(state)`` when the event loop
        catches an exception from a callback; its return is ignored
        (conventionally used to push a zeromq event message)

    halt_event (optional)
        a ``threading.Event`` set when SIGTERM is detected; passed to
        callbacks, terminates the loop; pass your own for more control

    returns 0 for normal termination, nonzero for failure
    """
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("start")

    while not halt_event.is_set():
        try:
            _run_until_halt(state,
                            pre_loop_actions,
                            post_loop_actions,
                            halt_event)
        except Exception as instance:
            log.exception(instance)
            if exception_action is not None:
                exception_action(state)
            return 12

    log.info("normal termination")
    return 0
Exemple #42
0
def main():
    """
    main entry point for the retrieve source

    Launch the database pool controller and io controller subprocesses,
    bind a REP socket, and forward incoming requests to the database
    controller until the halt event is set.

    returns 0 for normal termination (usually SIGTERM)
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = \
        _connect_db_controller_push_socket(zeromq_context)
    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability, we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            # restart either controller subprocess if it has died
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we've only registered one socket, so we could use an 'if' here,
            # but this 'for' works ok and it has the same form as the other
            # places where we use poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)

                assert active_socket is rep_socket

                _process_one_request(rep_socket, db_controller_push_socket)

                request_count += 1

            # periodically report the request count, then reset it
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0

    except KeyboardInterrupt:  # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        # an interrupted system call with the halt event set is the
        # normal SIGTERM shutdown path
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program teminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program teminates normally")
    finally:
        # terminate subprocesses first, then release zeromq resources
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def main():
    """
    main entry point

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # the ipc paths must exist before the zeromq sockets can use them
    for internal_socket_uri in internal_socket_uri_list:
        prepare_ipc_path(internal_socket_uri)

    # halt_event is set by the signal handler to request shutdown
    halt_event = Event()
    set_signal_handler(halt_event)

    database_pool_controller = _launch_database_pool_controller()
    io_controller = _launch_io_controller()

    zeromq_context = zmq.Context()
    rep_socket = _bind_rep_socket(zeromq_context)
    db_controller_push_socket = _connect_db_controller_push_socket(zeromq_context)
    event_push_client = EventPushClient(zeromq_context, "retrieve_source")
    event_push_client.info("program-starts", "retrieve source starts")

    # we poll the sockets for readability, we assume we can always
    # write to the push client sockets
    poller = zmq.Poller()
    poller.register(rep_socket, zmq.POLLIN | zmq.POLLERR)

    last_report_time = 0.0
    request_count = 0
    try:
        while not halt_event.is_set():
            poll_subprocess(database_pool_controller)
            poll_subprocess(io_controller)

            # we've only registered one socket, so we could use an 'if' here,
            # but this 'for' works ok and it has the same form as the other
            # places where we use poller
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)

                assert active_socket is rep_socket

                _process_one_request(rep_socket, db_controller_push_socket)

                request_count += 1

            # periodically log and publish the request count, then reset it
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                report_message = "{0:,} requests".format(request_count)
                log.info(report_message)
                event_push_client.info("request_count",
                                       report_message,
                                       request_count=request_count)
                last_report_time = current_time
                request_count = 0

    except KeyboardInterrupt:  # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        # an interrupted system call is the expected result of SIGTERM
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminates normally")
    finally:
        # stop the controller subprocesses before tearing down zeromq
        terminate_subprocess(database_pool_controller)
        terminate_subprocess(io_controller)
        rep_socket.close()
        db_controller_push_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def main():
    """
    main entry point

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    prepare_ipc_path(_pull_socket_uri)

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    pull_socket = _bind_pull_socket(zeromq_context)

    event_push_client = EventPushClient(zeromq_context, "service_availability")
    event_push_client.info("program-starts",
                           "service availability monitor starts")

    # initialized before the try block so the cleanup code below can
    # always reference it, even if _start_ping_processes raises
    ping_process_dict = dict()

    message_count = 0
    try:
        ping_process_dict = _start_ping_processes(halt_event)

        while not halt_event.is_set():

            # check on the ping subprocesses once per cycle of messages
            if message_count % len(ping_process_dict) == 0:
                for ping_process in ping_process_dict.values():
                    poll_subprocess(ping_process.process)

            message = pull_socket.recv_pyobj()
            assert not pull_socket.rcvmore

            _process_one_message(message, ping_process_dict, event_push_client)

            message_count += 1

    except KeyboardInterrupt: # convenience for testing
        log.info("keyboard interrupt: terminating normally")
    except zmq.ZMQError as zmq_error:
        # an interrupted system call is the expected result of SIGTERM
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminating normally; interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "zeromq_error",
                                        exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminating normally")

    log.debug("terminating subprocesses")
    _terminate_ping_processes(ping_process_dict)
    pull_socket.close()
    event_push_client.close()
    zeromq_context.term()

    return return_value
# ---- Exemple #45 (0) -- snippet separator from the original scrape ----
def main():
    """
    main entry point

    returns 0 for normal termination (usually SIGTERM), 1 on error

    This worker is launched with its worker number as sys.argv[1].
    """
    return_value = 0

    worker_number = int(sys.argv[1])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         worker_number,
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_dbpool_worker_{0}".format(worker_number)
    event_push_client = EventPushClient(zeromq_context, event_source_name)

    # a DEALER socket so the controller's ROUTER can address this worker
    dealer_socket = zeromq_context.socket(zmq.DEALER)
    dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(db_controller_router_socket_uri))
    dealer_socket.connect(db_controller_router_socket_uri)

    log.debug("opening local database connection")
    database_connection = get_node_local_connection()

    try:
        # prime the pump: ask the controller for the first work item
        _send_initial_work_request(dealer_socket)
        while not halt_event.is_set():
            _process_one_transaction(dealer_socket,
                                     database_connection,
                                     event_push_client)
    except InterruptedSystemCall:
        # an interrupted system call is the expected result of SIGTERM
        if halt_event.is_set():
            log.info("program terminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            event_push_client.exception(unhandled_exception_topic,
                                        "Interrupted zeromq system call",
                                        exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        event_push_client.exception(unhandled_exception_topic,
                                    str(instance),
                                    exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminates normally")
    finally:
        database_connection.close()
        dealer_socket.close()
        event_push_client.close()
        zeromq_context.term()

    return return_value
def main():
    """
    main entry point

    return 0 for success (exit code)

    Loop until halt_event is set: connect to the node-local database
    when necessary, run one defrag pass in its own transaction, and
    wait between passes when there was nothing to defrag.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    # halt_event is set by the SIGTERM handler to request shutdown
    halt_event = Event()
    signal.signal(signal.SIGTERM, _create_signal_handler(halt_event))

    zmq_context =  zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")  

    # lazily-opened database connection; None means "not connected"
    connection = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                # wait before retrying the connection
                halt_event.wait(_database_retry_interval)
                continue

        # start a transaction
        connection.execute("begin")

        # try one defrag pass; roll back on any failure, commit on success
        bytes_defragged = 0
        try:
            bytes_defragged = _defrag_pass(connection, event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()
                
    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
def main():
    """
    main entry point

    return 0 for success (exit code)

    Loop until halt_event is set: connect to the node-local database
    when necessary, run one defrag pass in its own transaction, and
    wait between passes when there was nothing to defrag.  Optionally
    exits after one idle pass when NIMBUSIO_EXIT_WHEN_DONE is set.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zmq_context =  zmq.Context()

    event_push_client = EventPushClient(zmq_context, "defragger")
    event_push_client.info("program-start", "defragger starts")  

    # lazily-opened database connection; None means "not connected"
    connection = None
    # file space layout, (re)loaded whenever we (re)connect
    file_space_info = None

    while not halt_event.is_set():

        # if we don't have an open database connection, get one
        if connection is None:
            try:
                connection = get_node_local_connection()
            except Exception as instance:
                exctype, value = sys.exc_info()[:2]
                event_push_client.exception(
                    "database exception",
                    str(value),
                    exctype=exctype.__name__
                )
                log.exception("Exception connecting to database")
                # wait before retrying the connection
                halt_event.wait(_database_retry_interval)
                continue

            file_space_info = load_file_space_info(connection) 
            file_space_sanity_check(file_space_info, _repository_path)


        # try one defrag pass in its own transaction;
        # roll back on any failure, commit on success
        bytes_defragged = 0
        connection.begin_transaction()
        try:
            bytes_defragged = _defrag_pass(connection, 
                                           file_space_info, 
                                           event_push_client)
        except KeyboardInterrupt:
            halt_event.set()
            connection.rollback()
        except Exception as instance:
            log.exception(str(instance))
            event_push_client.exception(
                unhandled_exception_topic,
                str(instance),
                exctype=instance.__class__.__name__
            )
            connection.rollback()
        else:
            connection.commit()

        log.info("bytes defragged = {0:,}".format(bytes_defragged))

        # if we didn't do anything on this pass...
        if bytes_defragged == 0:

            # exit if we're done and asked to do single pass
            if int(os.environ.get('NIMBUSIO_EXIT_WHEN_DONE', '0')):
                halt_event.set()

            # close the database connection
            if connection is not None:
                connection.close()
                connection = None

            # wait and try again
            try:
                halt_event.wait(_defrag_check_interval)
            except KeyboardInterrupt:
                halt_event.set()
                
    if connection is not None:
        connection.close()

    event_push_client.close()
    zmq_context.term()

    log.info("program terminates normally")
    return 0
# ---- Exemple #48 (0) -- snippet separator from the original scrape ----
def main():
    """
    main entry point

    returns 0 for normal termination (usually SIGTERM), 1 on error

    This worker is launched with the volume name as sys.argv[1] and its
    worker number as sys.argv[2].
    """
    return_value = 0

    volume_name = sys.argv[1]
    worker_number = int(sys.argv[2])

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         volume_name.replace("/", "_"),
                                         worker_number,
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    event_source_name = "rs_io_worker_{0}_{1}".format(volume_name,
                                                      worker_number)
    # bundle all long-lived state into one named tuple for the helpers
    resources = \
        _resources_tuple(halt_event=halt_event,
                         zeromq_context=zeromq_context,
                         reply_push_sockets=dict(),
                         event_push_client=EventPushClient(zeromq_context,
                                                           event_source_name),
                         dealer_socket=zeromq_context.socket(zmq.DEALER),
                         file_cache=LRUCache(_max_file_cache_size))

    resources.dealer_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("connecting to {0}".format(io_controller_router_socket_uri))
    resources.dealer_socket.connect(io_controller_router_socket_uri)

    last_close_pass_time = time.time()
    try:
        while not halt_event.is_set():
            # an occasional pass that closes any open files that haven't
            # been used
            current_time = time.time()
            elapsed_time = current_time - last_close_pass_time
            if elapsed_time > _unused_file_close_interval:
                _make_close_pass(resources, current_time)
                last_close_pass_time = current_time

            _send_work_request(resources, volume_name)
            _process_request(resources)

    except InterruptedSystemCall:
        # an interrupted system call is the expected result of SIGTERM
        if halt_event.is_set():
            log.info("program terminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(
                unhandled_exception_topic,
                "Interrupted zeromq system call",
                exctype="InterruptedSystemCall")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminates normally")
    finally:
        resources.dealer_socket.close()
        for push_socket in resources.reply_push_sockets.values():
            push_socket.close()
        resources.event_push_client.close()
        resources.zeromq_context.term()

    return return_value
# ---- Exemple #49 (0) -- snippet separator from the original scrape ----
def main():
    """
    main entry point

    return 0 for success (exit code), 1 on unhandled exception

    Flush recently collected Redis stats keys from every storage node in
    the local data center into the central database, under a central
    advisory lock so only one instance runs at a time.
    """
    initialize_logging(_log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context =  zmq.Context()

    event_push_client = EventPushClient(zeromq_context,
                                        "redis_stats_collector")
    event_push_client.info("program-start", "flush_stats_from_redis starts")  

    # don't flush anything newer than 1 minute ago
    current_time = datetime.utcnow()
    timestamp_cutoff = current_time - timedelta(minutes=1)

    return_code = 0
    central_db_connection = None

    collection_ops_accounting_rows = list()

    # values to be added to the dedupe table
    new_dedupes = list()

    # keys to be deleted (a list for each node)
    node_keys_processed = [list() for _ in _node_names]

    try:
        central_db_connection = get_central_connection()

        # On startup, the program connects to the central database and tries 
        # to acquire a pg_advisory_lock appropriate for this program and the 
        # data center it is running in using the pg_try_advisory_lock function.
        # If it cannot acquire the lock, it notes the status of the lock 
        # and exits. This central locking mechanism lets us avoid single points
        # of failure by configuring the program to run on multiple nodes.

        with advisory_lock(central_db_connection, "redis_stats_collector"):
            node_dict = _retrieve_node_dict(central_db_connection)
            for node_name, keys_processed in \
                zip(_node_names, node_keys_processed):
                node_id = node_dict[node_name]
                log.debug("processing node {0} node_id={1}".format(node_name,
                                                                  node_id))

                # The program then selects into memory all recently collected 
                # keys from the central database table 
                # collection_ops_accounting_flush_dedupe and stores them in a 
                # dedupe set. This set allows runs of the collection/flush 
                # program to be idempotent across some time period (
                # but we won't keep the list of old keys forever.) 

                dedupe_set = _retrieve_dedupe_set(central_db_connection, 
                                                  node_id)

                # The program then visits the Redis instance on every storage 
                # node in the local data center, collecting the data from all 
                # past stats keys -- aggregating it into the program's memory.  
                # The aggregation should involve buckets for each 
                # storage_node_id and redis key, corresponding to the columns 
                # in the database.
                _process_one_node(node_name,
                                  node_dict[node_name],
                                  timestamp_cutoff,
                                  dedupe_set,
                                  collection_ops_accounting_rows,
                                  new_dedupes,
                                  keys_processed)

            # After collecting past keys from every storage node, 
            # inside a central database transaction:
            # 1. Insert the collected stats into the central database 
            #    collection_ops_accounting
            # 2. Insert collected keys into recently collected keys 
            #    collection_ops_accounting_flush_dedupe.
            # 3. commit transaction
            log.debug("updating central database")
            central_db_connection.begin_transaction()
            try:
                _insert_accounting_rows(central_db_connection,
                                        collection_ops_accounting_rows)
                _insert_dedupe_rows(central_db_connection, 
                                    timestamp_cutoff, 
                                    new_dedupes)
            except Exception:
                central_db_connection.rollback()
                raise
            else:
                central_db_connection.commit()

            # Then revisit the Redis nodes, and delete the keys we flushed 
            # into the database, and any keys we skipped because they were 
            # found in the dedupe set.
            for node_name, keys_processed in zip(_node_names, 
                                                 node_keys_processed):
                _remove_processed_keys(node_name, keys_processed)

    except Exception as instance:
        log.exception("Unhandled exception {0}".format(instance))
        event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__
        )
        return_code = 1

    if central_db_connection is not None:
        central_db_connection.close()

    event_push_client.close()
    zeromq_context.term()

    log.info("program terminates return_code = {0}".format(return_code))
    return return_code
            "segment-size"      : file_size,
            "segment-adler32"   : file_adler32,
            "segment-md5-digest": b64encode(file_md5.digest()),
            "file-size"         : file_size,
            "file-adler32"      : file_adler32,
            "file-hash"         : b64encode(file_md5.digest()),
            "handoff-node-name" : None,
        }
        reply = send_request_and_get_reply(
            _local_node_name,
            _data_writer_address, 
            _local_node_name,
            _client_address,
            message, 
            data=content_item
        )
        self.assertEqual(reply["message-id"], message_id)
        self.assertEqual(reply["message-type"], "archive-key-final-reply")
        self.assertEqual(reply["result"], "success")

        reply = self._destroy(
            collection_id, key, destroy_timestamp, segment_num
        )
        self.assertEqual(reply["result"], "success", reply["error-message"])


if __name__ == "__main__":
    # configure logging before handing control to the unittest runner
    initialize_logging(_log_path)
    unittest.main()

# ---- Exemple #51 (0) -- snippet separator from the original scrape ----
def main():
    """
    main entry point

    returns 0 for normal termination (usually SIGTERM), 1 on error
    """
    return_value = 0

    log_path = _log_path_template.format(os.environ["NIMBUSIO_LOG_DIR"],
                                         _local_node_name)
    initialize_logging(log_path)
    log = logging.getLogger("main")
    log.info("program starts")

    halt_event = Event()
    set_signal_handler(halt_event)

    zeromq_context = zmq.Context()

    # NOTE(review): resources carries a fresh Event(), not the halt_event
    # wired to the signal handler above -- confirm whether helpers that
    # read resources.halt_event are supposed to see SIGTERM
    resources = \
        _resources_tuple(halt_event=Event(),
                         volume_by_space_id=_volume_name_by_space_id(),
                         pull_socket=zeromq_context.socket(zmq.PULL),
                         router_socket=zeromq_context.socket(zmq.ROUTER),
                         event_push_client=\
                            EventPushClient(zeromq_context,
                                            "rs_io_controller"),
                         pending_work_by_volume=defaultdict(deque),
                         available_ident_by_volume=defaultdict(deque))

    log.debug("binding to {0}".format(io_controller_pull_socket_uri))
    resources.pull_socket.bind(io_controller_pull_socket_uri)

    resources.router_socket.setsockopt(zmq.LINGER, 1000)
    log.debug("binding to {0}".format(io_controller_router_socket_uri))
    resources.router_socket.bind(io_controller_router_socket_uri)

    # we poll the sockets for readability, we assume we can always
    # write to the router socket
    poller = zmq.Poller()
    poller.register(resources.pull_socket, zmq.POLLIN | zmq.POLLERR)
    poller.register(resources.router_socket, zmq.POLLIN | zmq.POLLERR)

    # one group of workers per distinct volume
    worker_processes = list()
    for volume_name in set(resources.volume_by_space_id.values()):
        for index in range(_worker_count):
            worker_processes.append(_launch_io_worker(volume_name, index + 1))

    last_report_time = 0.0
    try:
        while not halt_event.is_set():
            for worker_process in worker_processes:
                poll_subprocess(worker_process)
            for active_socket, event_flags in poller.poll(_poll_timeout):
                if event_flags & zmq.POLLERR:
                    error_message = \
                        "error flags from zmq {0}".format(active_socket)
                    log.error(error_message)
                    raise PollError(error_message)
                if active_socket is resources.pull_socket:
                    _read_pull_socket(resources)
                elif active_socket is resources.router_socket:
                    _read_router_socket(resources)
                else:
                    log.error("unknown socket {0}".format(active_socket))

            # periodically log and publish the size of the pending work queues
            current_time = time.time()
            elapsed_time = current_time - last_report_time
            if elapsed_time > _reporting_interval:
                pending_work = 0
                for volume_queue in resources.pending_work_by_volume.values():
                    pending_work += len(volume_queue)
                report_message = \
                    "{0:,} pending_work entries".format(pending_work)
                log.info(report_message)
                resources.event_push_client.info("queue_sizes",
                                                 report_message,
                                                 pending_work=pending_work)

                last_report_time = current_time

    except zmq.ZMQError as zmq_error:
        # an interrupted system call is the expected result of SIGTERM
        if is_interrupted_system_call(zmq_error) and halt_event.is_set():
            log.info("program terminates normally with interrupted system call")
        else:
            log.exception("zeromq error processing request")
            resources.event_push_client.exception(unhandled_exception_topic,
                                                  "zeromq_error",
                                                  exctype="ZMQError")
            return_value = 1
    except Exception as instance:
        log.exception("error processing request")
        resources.event_push_client.exception(
            unhandled_exception_topic,
            str(instance),
            exctype=instance.__class__.__name__)
        return_value = 1
    else:
        log.info("program terminates normally")
    finally:
        # stop the workers before tearing down zeromq resources
        for worker_process in worker_processes:
            terminate_subprocess(worker_process)
        resources.pull_socket.close()
        resources.router_socket.close()
        resources.event_push_client.close()
        zeromq_context.term()

    return return_value