Example #1
0
# -*- coding: utf-8 -*-
"""
internal_sockets.py

addresses of sockets used internally by retrieve_source
external addresses come from environment variables
"""
import os

from tools.zeromq_util import ipc_socket_uri

# name of this node and the directory holding the ipc socket files;
# both are required, a missing variable raises KeyError at import time
_local_node_name = os.environ["NIMBUSIO_NODE_NAME"]
_socket_dir = os.environ["NIMBUSIO_SOCKET_DIR"]

db_controller_pull_socket_uri = ipc_socket_uri(
    _socket_dir, _local_node_name, "db_controller_pull"
)

db_controller_router_socket_uri = ipc_socket_uri(
    _socket_dir, _local_node_name, "db_controller_router"
)

io_controller_pull_socket_uri = ipc_socket_uri(
    _socket_dir, _local_node_name, "io_controller_pull"
)

io_controller_router_socket_uri = ipc_socket_uri(
    _socket_dir, _local_node_name, "io_controller_router"
)

# every internal socket uri defined above, each listed exactly once
# (the io_controller pull socket used to be missing, with the io_controller
# router socket listed twice in its place)
internal_socket_uri_list = [
    db_controller_pull_socket_uri,
    db_controller_router_socket_uri,
    io_controller_pull_socket_uri,
    io_controller_router_socket_uri,
]
def process_segment_rows(halt_event, 
                         zeromq_context, 
                         args, 
                         node_dict,
                         node_databases,
                         raw_segment_rows):
    """
    process handoffs of segment rows

    Binds a REP socket, starts args.worker_count worker processes and then
    answers every worker request with the next segment row, or with a
    "stop" message once the rows are exhausted. Tombstone rows are applied
    directly to the node databases and are never sent to a worker.

    halt_event
        checked at the top of every iteration; the loop ends once it is set
    zeromq_context
        context used to create the REP socket
    args
        must provide node_name and worker_count; also passed to each worker
    node_dict
        added to the "work" message that answers a worker's "start" request
    node_databases
        passed through to the tombstone and purge helpers
    raw_segment_rows
        input to _generate_segment_rows, which yields
        (source_node_names, segment_row) pairs

    Returns None. The workers are terminated and the socket is closed on
    every exit path, including when an exception propagates.
    """
    log = logging.getLogger("process_segment_rows")

    rep_socket_uri = ipc_socket_uri(_socket_dir, 
                                    args.node_name,
                                    "handoff_client")
    prepare_ipc_path(rep_socket_uri)

    rep_socket = zeromq_context.socket(zmq.REP)
    workers = list()

    # everything after socket creation runs under try/finally so that an
    # unexpected error cannot leave worker processes or the socket behind
    try:
        rep_socket.setsockopt(zmq.SNDHWM, _socket_high_water_mark)
        rep_socket.setsockopt(zmq.RCVHWM, _socket_high_water_mark)
        log.info("binding rep socket to {0}".format(rep_socket_uri))
        rep_socket.bind(rep_socket_uri)

        log.debug("starting workers")
        for index in range(args.worker_count):
            worker_id = str(index+1)
            workers.append(
                _start_worker_process(worker_id, args, rep_socket_uri))

        # loop until all handoffs have been accomplished
        log.debug("start handoffs")
        work_generator = _generate_segment_rows(raw_segment_rows)
        pending_handoff_count = 0
        while not halt_event.is_set():

            # get a segment row to process.
            # If we are at EOF, segment_row = None
            try:
                source_node_names, segment_row = next(work_generator)
            except StopIteration:
                if pending_handoff_count == 0:
                    break
                else:
                    # workers are still busy: keep looping so we can
                    # collect their results and tell them to stop
                    source_node_names, segment_row = None, None

            # if we have a segment row, and it is a tombstone, we can act
            # directly on the node database(s) without sending it to a worker
            if segment_row is not None:
                if segment_row["status"] == segment_status_tombstone:
                    _process_tombstone(node_databases, 
                                       source_node_names, 
                                       segment_row)
                    _purge_handoff_from_source_nodes(
                        node_databases,
                        source_node_names,
                        segment_row["collection_id"],
                        segment_row["key"],
                        segment_row["unified_id"],
                        segment_row["conjoined_part"],
                        segment_row["handoff_node_id"],
                        segment_status_tombstone)
                    continue
                assert segment_row["status"] == segment_status_final, \
                    segment_row["status"]

            # at this point we either have a segment row in final status, or
            # None, indicating no more data
            # block until we have a ready worker
            try:
                request = rep_socket.recv_pyobj()
            except zmq.ZMQError as zmq_error:
                if is_interrupted_system_call(zmq_error) and \
                   halt_event.is_set():
                    log.warning("breaking due to halt_event")
                    break
                raise
            assert not rep_socket.rcvmore

            # see how the worker handled the previous segment (if any)
            initial_request = False
            if request["message-type"] == "start":
                log.info("{0} initial request".format(request["worker-id"]))
                initial_request = True
            elif request["handoff-successful"]:
                log.info("{0} handoff ({1}, {2}) successful".format(
                    request["worker-id"], 
                    request["unified-id"], 
                    request["conjoined-part"]))
                assert pending_handoff_count > 0
                pending_handoff_count -= 1
                _purge_handoff_from_source_nodes(
                    node_databases, 
                    request["source-node-names"],
                    request["collection-id"],
                    request["key"],
                    request["unified-id"],
                    request["conjoined-part"],
                    request["handoff-node-id"],
                    segment_status_final)
            else:
                # a failed handoff is only logged; it is not purged from
                # the source nodes
                log.error("{0} handoff ({1}, {2}) failed: {3}".format(
                    request["worker-id"],
                    request["unified-id"], 
                    request["conjoined-part"],
                    request["error-message"]))
                assert pending_handoff_count > 0
                pending_handoff_count -= 1

            if segment_row is None:
                # if we have no more work, tell the worker to stop
                work_message = {"message-type"        : "stop"}
            else:
                # otherwise, send the segment to the worker 
                work_message = {"message-type"        : "work",
                                "source-node-names"   : source_node_names,
                                "segment-row"         : segment_row}
                # if this is the worker's first request, 
                # send him the node_dict
                if initial_request:
                    work_message["node-dict"] = node_dict
                pending_handoff_count += 1

            # a REP socket must answer every request it receives
            rep_socket.send_pyobj(work_message)

        log.debug("end of handoffs")
    finally:
        try:
            for worker in workers:
                terminate_subprocess(worker)
        finally:
            rep_socket.close()
from tools.standard_logging import initialize_logging
from tools.zeromq_util import is_interrupted_system_call, \
        prepare_ipc_path, \
        ipc_socket_uri
from tools.process_util import identify_program_dir, \
        set_signal_handler, \
        poll_subprocess
from tools.event_push_client import EventPushClient, unhandled_exception_topic

# node names, whitespace separated in NIMBUSIO_NODE_NAME_SEQ
_node_names = os.environ["NIMBUSIO_NODE_NAME_SEQ"].split()
# name of the node this process runs on
_local_node_name = os.environ["NIMBUSIO_NODE_NAME"]
# format slots: {0} and {1} are filled in where the template is used
_log_path_template = "{0}/nimbusio_service_availability_monitor_{1}.log"
# directory holding the ipc socket files
_socket_dir = os.environ["NIMBUSIO_SOCKET_DIR"]
_pull_socket_uri = ipc_socket_uri(
    _socket_dir, _local_node_name, "service_availability_monitor"
)
_pull_socket_hwm = 1000
_poll_timeout = 3000  # milliseconds
_reporting_interval = 60.0  # presumably seconds -- confirm where it is used

# static description of one kind of ping process
_ping_process_desc = namedtuple(
    "PingProcessDesc",
    (
        "module_dir",
        "file_name",
        "service_name",
        "ping_uris",
    ),
)

# one ping process together with the reachable state recorded for it
_ping_process = namedtuple(
    "PingProcess",
    (
        "service_name",
        "node_name",
        "process",
        "reachable_state",
    ),
)
_ping_process_descs = [ 
Example #4
0
# -*- coding: utf-8 -*-
"""
internal_sockets.py

addresses of sockets used internally by retrieve_source
external addresses come from environment variables
"""
import os

from tools.zeromq_util import ipc_socket_uri

# name of this node and the directory holding the ipc socket files;
# both are required, a missing variable raises KeyError at import time
_local_node_name = os.environ["NIMBUSIO_NODE_NAME"]
_socket_dir = os.environ["NIMBUSIO_SOCKET_DIR"]


def _internal_socket_uri(socket_name):
    """
    return the ipc uri of the named socket on the local node
    """
    return ipc_socket_uri(_socket_dir, _local_node_name, socket_name)


db_controller_pull_socket_uri = _internal_socket_uri("db_controller_pull")

db_controller_router_socket_uri = _internal_socket_uri("db_controller_router")

io_controller_pull_socket_uri = _internal_socket_uri("io_controller_pull")

io_controller_router_socket_uri = _internal_socket_uri("io_controller_router")

internal_socket_uri_list = [
    db_controller_pull_socket_uri,
    db_controller_router_socket_uri,
    io_controller_router_socket_uri,
    io_controller_router_socket_uri,
Example #5
0
 def event_publisher_pull_addresses(self):
     """
     return a list with one "nimbusio-event-publisher" ipc socket uri
     for each entry of self.node_names, built under self.socket_path
     """
     socket_path = self.socket_path
     addresses = []
     for node_name in self.node_names:
         addresses.append(
             ipc_socket_uri(socket_path, node_name, "nimbusio-event-publisher")
         )
     return addresses
Example #6
0
 def event_publisher_pull_addresses(self):
     """
     return the "nimbusio-event-publisher" ipc socket uris, one per
     entry of self.node_names, all located under self.socket_path
     """
     # read the socket path once, before touching the node names
     socket_dir = self.socket_path
     socket_name = "nimbusio-event-publisher"
     return [
         ipc_socket_uri(socket_dir, node_name, socket_name)
         for node_name in self.node_names
     ]
Example #7
0
from tools.standard_logging import initialize_logging
from tools.zeromq_util import is_interrupted_system_call, \
        prepare_ipc_path, \
        ipc_socket_uri
from tools.process_util import identify_program_dir, \
        set_signal_handler, \
        poll_subprocess
from tools.event_push_client import EventPushClient, unhandled_exception_topic

# environment: whitespace separated node names, and the local node's name
_node_names = os.environ["NIMBUSIO_NODE_NAME_SEQ"].split()
_local_node_name = os.environ["NIMBUSIO_NODE_NAME"]

# log file path with two format slots, filled in where the template is used
_log_path_template = "{0}/nimbusio_service_availability_monitor_{1}.log"

# ipc address of this program's pull socket
_socket_dir = os.environ["NIMBUSIO_SOCKET_DIR"]
_pull_socket_uri = ipc_socket_uri(
    _socket_dir,
    _local_node_name,
    "service_availability_monitor",
)
_pull_socket_hwm = 1000

_poll_timeout = 3000  # milliseconds
_reporting_interval = 60.0  # NOTE(review): unit not shown here; confirm

# what is needed to describe a ping process for one service
_ping_process_desc_fields = (
    "module_dir",
    "file_name",
    "service_name",
    "ping_uris",
)
_ping_process_desc = namedtuple("PingProcessDesc", _ping_process_desc_fields)

# a ping process plus the reachable state recorded for it
_ping_process_fields = (
    "service_name",
    "node_name",
    "process",
    "reachable_state",
)
_ping_process = namedtuple("PingProcess", _ping_process_fields)
_ping_process_descs = [ 
Example #8
0
def process_segment_rows(halt_event, zeromq_context, args, node_dict,
                         node_databases, raw_segment_rows):
    """
    process handoffs of segment rows

    Binds a REP socket, starts args.worker_count worker processes and then
    answers every worker request with the next segment row, or with a
    "stop" message once the rows are exhausted. Tombstone rows are applied
    directly to the node databases and are never sent to a worker.

    halt_event
        checked at the top of every iteration; the loop ends once it is set
    zeromq_context
        context used to create the REP socket
    args
        must provide node_name and worker_count; also passed to each worker
    node_dict
        added to the "work" message that answers a worker's "start" request
    node_databases
        passed through to the tombstone and purge helpers
    raw_segment_rows
        input to _generate_segment_rows, which yields
        (source_node_names, segment_row) pairs

    Returns None. The workers are terminated and the socket is closed on
    every exit path, including when an exception propagates.
    """
    log = logging.getLogger("process_segment_rows")

    rep_socket_uri = ipc_socket_uri(_socket_dir, args.node_name,
                                    "handoff_client")
    prepare_ipc_path(rep_socket_uri)

    rep_socket = zeromq_context.socket(zmq.REP)
    workers = list()

    # everything after socket creation runs under try/finally so that an
    # unexpected error cannot leave worker processes or the socket behind
    try:
        rep_socket.setsockopt(zmq.SNDHWM, _socket_high_water_mark)
        rep_socket.setsockopt(zmq.RCVHWM, _socket_high_water_mark)
        log.info("binding rep socket to {0}".format(rep_socket_uri))
        rep_socket.bind(rep_socket_uri)

        log.debug("starting workers")
        for index in range(args.worker_count):
            worker_id = str(index + 1)
            workers.append(
                _start_worker_process(worker_id, args, rep_socket_uri))

        # loop until all handoffs have been accomplished
        log.debug("start handoffs")
        work_generator = _generate_segment_rows(raw_segment_rows)
        pending_handoff_count = 0
        while not halt_event.is_set():

            # get a segment row to process.
            # If we are at EOF, segment_row = None
            try:
                source_node_names, segment_row = next(work_generator)
            except StopIteration:
                if pending_handoff_count == 0:
                    break
                else:
                    # workers are still busy: keep looping so we can
                    # collect their results and tell them to stop
                    source_node_names, segment_row = None, None

            # if we have a segment row, and it is a tombstone, we can act
            # directly on the node database(s) without sending it to a worker
            if segment_row is not None:
                if segment_row["status"] == segment_status_tombstone:
                    _process_tombstone(node_databases, source_node_names,
                                       segment_row)
                    _purge_handoff_from_source_nodes(
                        node_databases, source_node_names,
                        segment_row["collection_id"], segment_row["key"],
                        segment_row["unified_id"],
                        segment_row["conjoined_part"],
                        segment_row["handoff_node_id"],
                        segment_status_tombstone)
                    continue
                assert segment_row["status"] == segment_status_final, \
                    segment_row["status"]

            # at this point we either have a segment row in final status, or
            # None, indicating no more data
            # block until we have a ready worker
            try:
                request = rep_socket.recv_pyobj()
            except zmq.ZMQError as zmq_error:
                if is_interrupted_system_call(zmq_error) and \
                        halt_event.is_set():
                    log.warning("breaking due to halt_event")
                    break
                raise
            assert not rep_socket.rcvmore

            # see how the worker handled the previous segment (if any)
            initial_request = False
            if request["message-type"] == "start":
                log.info("{0} initial request".format(request["worker-id"]))
                initial_request = True
            elif request["handoff-successful"]:
                log.info("{0} handoff ({1}, {2}) successful".format(
                    request["worker-id"], request["unified-id"],
                    request["conjoined-part"]))
                assert pending_handoff_count > 0
                pending_handoff_count -= 1
                _purge_handoff_from_source_nodes(
                    node_databases, request["source-node-names"],
                    request["collection-id"], request["key"],
                    request["unified-id"], request["conjoined-part"],
                    request["handoff-node-id"], segment_status_final)
            else:
                # a failed handoff is only logged; it is not purged from
                # the source nodes
                log.error("{0} handoff ({1}, {2}) failed: {3}".format(
                    request["worker-id"], request["unified-id"],
                    request["conjoined-part"], request["error-message"]))
                assert pending_handoff_count > 0
                pending_handoff_count -= 1

            if segment_row is None:
                # if we have no more work, tell the worker to stop
                work_message = {"message-type": "stop"}
            else:
                # otherwise, send the segment to the worker
                work_message = {
                    "message-type": "work",
                    "source-node-names": source_node_names,
                    "segment-row": segment_row
                }
                # if this is the worker's first request,
                # send him the node_dict
                if initial_request:
                    work_message["node-dict"] = node_dict
                pending_handoff_count += 1

            # a REP socket must answer every request it receives
            rep_socket.send_pyobj(work_message)

        log.debug("end of handoffs")
    finally:
        try:
            for worker in workers:
                terminate_subprocess(worker)
        finally:
            rep_socket.close()