Exemple #1
0
def launch_shm_worker():
    ''' Fork a child process that runs the SharedMemoryManager worker

    In the child, a SharedMemoryManager is created from the module level
    ``config``, stored in the global ``shm_mgr`` and run through
    ``run_as_worker()``. In the parent, the function sleeps for one second
    to give the worker time to start and stores the pid of the child in the
    global ``worker_pid``.

    :returns: the value returned by os.fork(): 0 in the child (once
              run_as_worker() has returned), the pid of the child in
              the parent
    :raises OSError: if the process could not be forked
    '''
    global worker_pid, shm_mgr

    # os.fork() reports a failure by raising OSError, it never returns -1,
    # so the descriptive message has to be attached here.
    try:
        pid = os.fork()
    except OSError as err:
        raise OSError(
            err.errno,
            'fork failed, unable to run SharedMemoryManager',
        ) from err

    if pid == 0:
        # run SHM worker in the child process
        shm_mgr = SharedMemoryManager(config)
        shm_mgr.run_as_worker()
    else:
        # send testing request in the parent process,
        # but wait for shm worker first
        time.sleep(1)
        worker_pid = pid

    return pid
Exemple #2
0
class BaseNode():
    ''' The Base Class of Nodes
    '''

    role = None

    def __init__(self, config, role=None):
        ''' Store the configuration and prepare empty process bookkeeping

        :param config: the node configuration, ``config.basic.node_id``
                       is read here
        :param role: optional role of the node, a falsy value keeps the
                     role that is already defined on the class
        '''

        # pids of the sub-processes, filled in once they get started
        self.worker_pids = []
        self.shm_worker_pid = None
        self.pkt_rpter_worker_pid = None

        self.config = config
        self.node_id = config.basic.node_id
        self.role = role if role else self.role

    def _write_master_pid(self):
        ''' Write the pid of the current process into the configured pid file
        '''

        master_pid = os.getpid()
        pid_file = self.config.basic.pid_file

        with open(pid_file, 'w') as fp:
            fp.write(f'{master_pid}')

        logger.debug(
            f'wrote pid file {pid_file} for master process, '
            f'pid: {master_pid}')

    def _read_master_pid(self):
        ''' Read the pid of the master process from the configured pid file

        :returns: the pid stored in the pid file, as an int
        :raises PidFileNotExists: if the pid file does not exist
        :raises ValueError: if the content of the pid file is not an integer
        '''

        pid_path = self.config.basic.pid_file

        try:
            with open(pid_path, 'r') as f:
                content = f.read()
        except FileNotFoundError:
            # PidFileNotExists already tells the whole story, the low-level
            # error would only add noise to the traceback
            raise PidFileNotExists from None

        try:
            return int(content)
        except ValueError as err:
            # keep the conversion error as the cause, it shows the content
            # that could not be parsed
            raise ValueError('pid file has been tampered') from err

    def _handle_term_master(self, signal, sf):
        ''' Termination signal handler of the master process

        Shuts the workers down, then removes the pid file if it exists.

        :param signal: the received signal
        :param sf: second argument passed to signal handlers, unused
        '''

        logger.debug(f'Master process received signal: {signal}')
        logger.debug('The master process starts to shut down workers')
        self.shutdown_workers()

        pid_file = self.config.basic.pid_file
        if not os.path.isfile(pid_file):
            return

        logger.debug(f'Remove pid file: {pid_file}')
        os.remove(pid_file)

    def _handle_term_worker(self, signal, sf):
        ''' Termination signal handler of a normal worker

        Logs the received signal and shuts down self.core.

        :param signal: the received signal
        :param sf: second argument passed to signal handlers, unused
        '''
        worker_pid = os.getpid()
        messages = (
            f'Worker {worker_pid} received signal: {signal}',
            f'Shutting down worker {worker_pid}',
        )
        for msg in messages:
            logger.debug(msg)

        self.core.shutdown()

    def _handle_term_shm(self, signal, sf):
        ''' Termination signal handler of the SharedMemoryManager worker

        Logs the received signal and calls self.shm_mgr.shutdown_worker().

        :param signal: the received signal
        :param sf: second argument passed to signal handlers, unused
        '''
        shm_pid = os.getpid()
        messages = (
            f'SharedMemoryManager {shm_pid} received signal: {signal}',
            f'Shutting down SharedMemoryManager {shm_pid}',
        )
        for msg in messages:
            logger.debug(msg)

        self.shm_mgr.shutdown_worker()

    def _handle_term_pkt_rpter(self, signal, sf):
        ''' Termination signal handler of the SpecialPacketRepeater worker

        Logs the received signal and calls self.pkt_rpter.shutdown().

        :param signal: the received signal
        :param sf: second argument passed to signal handlers, unused
        '''
        rpter_pid = os.getpid()
        messages = (
            f'SpecialPacketRepeater {rpter_pid} received signal: {signal}',
            f'Shutting down SpecialPacketRepeater {rpter_pid}',
        )
        for msg in messages:
            logger.debug(msg)

        self.pkt_rpter.shutdown()

    def _sig_master(self):
        ''' Install the signal handlers of the master process

        SIGHUP is ignored, every signal in TERM_SIGNALS is routed to
        self._handle_term_master.
        '''
        on_term = self._handle_term_master

        sig.signal(sig.SIGHUP, sig.SIG_IGN)
        for term_signal in TERM_SIGNALS:
            sig.signal(term_signal, on_term)

    def _sig_normal_worker(self):
        ''' Install the signal handlers of a normal worker

        SIGHUP is ignored, every signal in TERM_SIGNALS is routed to
        self._handle_term_worker.
        '''
        on_term = self._handle_term_worker

        sig.signal(sig.SIGHUP, sig.SIG_IGN)
        for term_signal in TERM_SIGNALS:
            sig.signal(term_signal, on_term)

    def _sig_shm_worker(self):
        ''' Install the signal handlers of the SharedMemoryManager worker

        SIGHUP is ignored, every signal in TERM_SIGNALS is routed to
        self._handle_term_shm.
        '''
        on_term = self._handle_term_shm

        sig.signal(sig.SIGHUP, sig.SIG_IGN)
        for term_signal in TERM_SIGNALS:
            sig.signal(term_signal, on_term)

    def _sig_pkt_rpter_worker(self):
        ''' Install the signal handlers of the SpecialPacketRepeater worker

        SIGHUP is ignored, every signal in TERM_SIGNALS is routed to
        self._handle_term_pkt_rpter.
        '''
        on_term = self._handle_term_pkt_rpter

        sig.signal(sig.SIGHUP, sig.SIG_IGN)
        for term_signal in TERM_SIGNALS:
            sig.signal(term_signal, on_term)

    def shutdown_workers(self):
        ''' Terminate all normal workers, then the SharedMemoryManager worker

        SIGTERM is sent to every pid in self.worker_pids, then the method
        polls every 0.5 seconds until none of them is left. Only after that
        the SharedMemoryManager worker is terminated and waited for.

        The method tolerates workers that have already been reaped and a
        SharedMemoryManager worker that has not been started yet, so that
        it can be used safely from a signal handler at any time.
        '''

        for pid in self.worker_pids:
            self._kill(pid)

        # wait for workers to exit
        remaining = list(self.worker_pids)
        while True:
            for pid in list(remaining):
                if self._process_exists(pid):
                    try:
                        # reap the worker if it has become a zombie
                        os.waitpid(pid, os.WNOHANG)
                    except ChildProcessError:
                        # The pid exists but it is not a child of this
                        # process (anymore), e.g. the worker has been reaped
                        # and its pid got reused. There is nothing left to
                        # wait for.
                        logger.debug(f'Worker {pid} terminated')
                        remaining.remove(pid)
                else:
                    logger.debug(f'Worker {pid} terminated')
                    remaining.remove(pid)

            if not remaining:
                break

            time.sleep(0.5)

        # shutdown SharedMemoryManager worker at last
        shm_pid = self.shm_worker_pid
        if shm_pid is not None:
            self._kill(shm_pid)
            try:
                os.waitpid(shm_pid, 0)
            except ChildProcessError:
                # the worker has exited and been reaped already,
                # e.g. by a waitpid(-1, ...) call
                pass
            logger.debug(f'SharedMemoryManager worker {shm_pid} terminated')

        logger.debug('All workers terminated')

    def _kill(self, pid):
        ''' Send SIGTERM to the given process

        A process that cannot be found is silently ignored.

        :param pid: pid of the process to terminate
        '''
        logger.debug(f'Sending SIGTERM to {pid}')

        try:
            os.kill(pid, sig.SIGTERM)
        except ProcessLookupError:
            return

    def _process_exists(self, pid):
        ''' Probe a process by sending signal 0 to it

        :param pid: pid of the process to probe
        :returns: False if os.kill raised an OSError, True otherwise
        '''
        try:
            os.kill(pid, 0)
        except OSError:
            alive = False
            logger.debug(f'Process {pid} not exists')
        else:
            alive = True
            logger.debug(f'Process {pid} exists')

        return alive

    def daemonize(self):
        ''' Detach the current process by forking twice

        The process that calls this method exits right after the first fork.
        After the second fork, the intermediate process installs a handler
        that exits on every signal in TERM_SIGNALS and sleeps, while the
        grandchild installs the master signal handlers, sends SIGTERM to
        the intermediate process and calls os.setsid().
        '''
        pid = os.fork()
        # NOTE(review): os.fork() is documented to raise OSError on failure
        # instead of returning -1, so this branch looks unreachable - confirm
        if pid == -1:
            raise OSError('fork failed when doing daemonize')
        elif pid > 0:
            # double fork magic: the original process ends here
            sys.exit(0)

        pid = os.fork()
        if pid == -1:
            raise OSError('fork failed when doing daemonize')

        # signal handler of the intermediate process, simply exits
        def quit(sg, sf):
            sys.exit(0)

        if pid > 0:
            # intermediate process: wait to be terminated by the grandchild
            for s in TERM_SIGNALS:
                sig.signal(s, quit)
            time.sleep(5)
            # NOTE(review): if no signal arrives during the sleep, this
            # process falls through and returns from daemonize() as well.
            # Confirm whether that is intended.
        else:
            # grandchild: becomes the master process of the node
            self._sig_master()
            ppid = os.getppid()
            os.kill(ppid, sig.SIGTERM)
            os.setsid()

        logger.debug('Node daemonized')

    def _start_shm_mgr(self):
        ''' Create the SharedMemoryManager and fork a worker process for it

        The manager is created before forking and stored in self.shm_mgr.
        The parent stores the pid of the child in self.shm_worker_pid.
        The child never returns from this method: it exits with status 0
        once run_as_worker() has returned, and with status 1 if
        run_as_worker() raised an Exception.

        :raises OSError: if the process could not be forked
        '''
        self.shm_mgr = SharedMemoryManager(self.config)

        # start SharedMemoryManager worker
        # os.fork() reports a failure by raising OSError, it never
        # returns -1, so the message has to be attached here.
        try:
            pid = os.fork()
        except OSError as err:
            raise OSError(err.errno, 'fork failed') from err

        if pid == 0:
            self._sig_shm_worker()
            try:
                self.shm_mgr.run_as_worker()
            except Exception:
                err_msg = traceback.format_exc()
                shm_logger.error(
                    f'Unexpected error occurred, SHM worker crashed. '
                    f'Traceback:\n{err_msg}')
                sys.exit(1)

            sys.exit(0)  # the sub-process ends here
        else:
            self.shm_worker_pid = pid
            logger.info(f'Started SharedMemoryManager: {pid}')

    def _start_pkt_rpter(self):
        '''
        Fork a worker process that runs a SpecialPacketRepeater

        The Repeater needs some components from the node (self.efferent and
        self.protocol_wrapper), so we shouldn't use this method before
        components are initialized.

        The child never returns from this method: it exits with status 0
        once the repeater has stopped running, and with status 1 if the
        repeater raised an Exception. The parent creates its own
        SpecialPacketRepeater instance as self.pkt_rpter and stores the pid
        of the child in self.pkt_rpter_worker_pid.

        :raises OSError: if the process could not be forked
        '''

        # os.fork() reports a failure by raising OSError, it never
        # returns -1, so the message has to be attached here.
        try:
            pid = os.fork()
        except OSError as err:
            raise OSError(err.errno, 'fork failed') from err

        if pid == 0:
            self._sig_pkt_rpter_worker()
            self.pkt_rpter = SpecialPacketRepeater(
                self.config,
                self.efferent,
                self.protocol_wrapper,
            )

            try:
                self.pkt_rpter.init_shm()
                self.pkt_rpter.run()
            except Exception:
                err_msg = traceback.format_exc()
                logger.error(
                    f'Unexpected error occurred, SpecialPacketRepeater worker '
                    f'crashed. Traceback:\n{err_msg}')
                sys.exit(1)

            sys.exit(0)  # the sub-process ends here
        else:
            self.pkt_rpter = SpecialPacketRepeater(
                self.config,
                self.efferent,
                self.protocol_wrapper,
            )
            self.pkt_rpter_worker_pid = pid
            logger.info(f'Started SpecialPacketRepeater: {pid}')

    def _load_modules(self):
        ''' Instantiate the modules used by the current process

        Creates the afferent, efferent, protocol wrapper, logic handler,
        core and special packet manager, starts the packet repeater worker
        and finally initializes the shared memory of the modules.
        '''
        config = self.config
        role = self.role

        self.afferent_cls = AFFERENT_MAPPING[role]
        self.main_afferent = self.afferent_cls(config)

        self.efferent = UDPTransmitter(config)

        self.protocol_wrapper = ProtocolWrapper(
            config, HeaderFormat, DataPktFormat, CtrlPktFormat,
            ConnCtrlPktFormat)

        self.logic_handler_cls = LOGIC_HANDLER_MAPPING[role]
        self.logic_handler = self.logic_handler_cls(config)

        self.core_cls = CORE_MAPPING.get(role)
        core_components = {
            'main_afferent': self.main_afferent,
            'minor_afferents': [],
            'efferent': self.efferent,
            'logic_handler': self.logic_handler,
            'protocol_wrapper': self.protocol_wrapper,
        }
        self.core = self.core_cls(config, **core_components)

        self.pkt_mgr = SpecialPacketManager(config)

        # The packet repeater is a part of the packet manager, so it is
        # treated as a normal module. Each worker shall have its own packet
        # repeater instead of sharing one, unlike the shared memory manager
        # worker.
        self._start_pkt_rpter()

        self.logic_handler.init_shm()

        core = self.core
        core.init_shm()
        core.self_allocate_core_id()

        self.pkt_mgr.init_shm()

        logger.debug(f'Worker {os.getpid()} loaded modules')

    def _clean_modules(self):
        ''' Tear down what has been set up by self._load_modules

        Terminates the packet repeater worker and waits for it, shuts the
        core down, destroys the main afferent, closes the shared memory of
        the modules and finally drops the references to the modules.
        '''
        rpter_pid = self.pkt_rpter_worker_pid
        self._kill(rpter_pid)
        os.waitpid(rpter_pid, 0)
        logger.debug(f'SpecialPacketRepeater worker {rpter_pid} terminated')

        self.core.shutdown()
        self.main_afferent.destroy()

        self.logic_handler.close_shm()
        self.core.close_shm()
        self.pkt_mgr.close_shm()

        # drop the references to the modules
        module_attrs = (
            'main_afferent',
            'efferent',
            'protocol_wrapper',
            'logic_handler',
            'core',
            'pkt_mgr',
        )
        for attr in module_attrs:
            setattr(self, attr, None)

        logger.debug(f'Worker {os.getpid()} cleaned modules')

    @staticmethod
    def get_context():
        ''' Return the NodeContext class

        Declared as a static method: the function takes no ``self``, so
        without the decorator it could only be called on the class and
        raised a TypeError when called on an instance.
        '''
        return NodeContext

    def _create_context(self):
        ''' Publish the modules of the current process on NodeContext

        Besides the modules, the pid of the packet repeater worker, the
        local ip, the listen port and a new IDGenerator are stored.
        '''
        ctx = NodeContext

        ctx.pkt_rpter_pid = self.pkt_rpter_worker_pid
        ctx.local_ip = get_localhost_ip()
        ctx.listen_port = self.config.net.aff_listen_port
        ctx.core = self.core
        ctx.main_efferent = self.efferent
        ctx.protocol_wrapper = self.protocol_wrapper
        ctx.pkt_mgr = self.pkt_mgr

        core_id = self.core.core_id
        ctx.id_generator = IDGenerator(self.node_id, core_id)

        logger.debug(f'Worker {os.getpid()} created NodeContext')

    def _clean_context(self):
        ''' Reset everything that self._create_context stored on NodeContext

        NodeContext.pid is not touched, it is not set by
        self._create_context.
        '''
        NodeContext.pkt_rpter_pid = None
        NodeContext.id_generator = None
        NodeContext.local_ip = None
        NodeContext.listen_port = None
        NodeContext.core = None
        NodeContext.main_efferent = None
        NodeContext.protocol_wrapper = None
        # pkt_mgr is set by self._create_context as well, so it has to be
        # reset here too, otherwise a stale manager stays reachable
        NodeContext.pkt_mgr = None

        pid = os.getpid()
        logger.debug(f'Worker {pid} cleaned NodeContext')

    def join_cluster(self):
        ''' Send the join request and let the core run for a while

        This method never returns normally. If self.core.run_for_a_while(5)
        returns, the attempt is considered timed out. Any other outcome is
        presumably reported by an exception raised from within the core -
        verify against the core implementation.

        :raises RuntimeError: if the node is a controller
        :raises TimeoutError: if self.core.run_for_a_while(5) returned
        '''
        if self.role != Roles.CONTROLLER:
            core = self.core
            core.request_to_join_cluster()
            core.run_for_a_while(5)
            raise TimeoutError

        raise RuntimeError(
            'Controller node is the root node of the cluster')

    def run(self):
        ''' Start the node: daemonize, join the cluster and run the workers

        Steps, in order:

        1. Refuse to start (log and return) if the pid file can be read or
           exists with a content that is not an integer.
        2. Daemonize, write the pid file and start the SharedMemoryManager
           worker.
        3. Unless the node is a controller, load the modules in the master
           process, try to join the cluster and clean the modules again.
           If joining fails or times out, self._on_break() is invoked and
           the method returns.
        4. Fork config.basic.worker_amount workers, each of them loads its
           own modules, runs self.core.run() and exits afterwards.
        5. In the master process, wait until there are no children left.
        '''
        pid_fl = self.config.basic.pid_file
        try:
            pid = self._read_master_pid()
            # NOTE(review): on stdlib loggers `warn` is a deprecated alias
            # of `warning` - confirm what kind of object `logger` is
            logger.warn(
                f'\n\tThe Neverland node is already running or the pid file\n'
                f'\t{pid_fl} is not removed, current pid: {pid}.\n'
                f'\tMake sure that the node is not running and try again.\n\n'
                f'\tIf you need to run multiple node on this computer, then\n'
                f'\tyou need to at least configure another pid file for it.')
            return
        except ValueError:
            logger.error(
                f'\n\tThe pid file {pid_fl} exists but seems it\'s not\n'
                f'\twritten by the Neverland node. Please make sure the node\n'
                f'\tis not running and the pid file is not occupied.')
            return
        except PidFileNotExists:
            # no pid file, the node is free to start
            pass

        self.daemonize()
        NodeContext.pid = os.getpid()

        self._write_master_pid()
        self._start_shm_mgr()

        # Before we start workers, we need to join the cluster first.
        if self.role != Roles.CONTROLLER:
            # Before we join the cluster, we need to load modules at first,
            # once we have joined the cluster, modules in the Master worker
            # shall be removed.
            self._load_modules()
            self._create_context()

            # join_cluster() never returns normally, the outcome is always
            # reported by one of the exceptions handled below
            try:
                self.join_cluster()
            except SuccessfullyJoinedCluster:
                logger.info('Successfully joined the cluster.')
            except FailedToJoinCluster:
                logger.error('Cannot join the cluster, request not permitted')
                self._clean_modules()
                self._clean_context()
                self._on_break()
                return
            except TimeoutError:
                logger.error(
                    'No response from entrance node, Failed to join the cluster'
                )
                self._clean_modules()
                self._clean_context()
                self._on_break()
                return

            self._clean_modules()
            self._clean_context()

        # start normal workers
        worker_amount = self.config.basic.worker_amount
        for _ in range(worker_amount):
            pid = os.fork()
            # executed in both the master and the new worker
            NodeContext.pid = os.getpid()

            # NOTE(review): os.fork() is documented to raise OSError on
            # failure instead of returning -1, so this branch looks
            # unreachable - confirm
            if pid == -1:
                raise OSError('fork failed')
            elif pid == 0:
                # worker process
                self._sig_normal_worker()
                self._load_modules()
                self._create_context()

                try:
                    self.core.run()
                except Exception:
                    err_msg = traceback.format_exc()
                    logger.error(f'Unexpected error occurred, node crashed. '
                                 f'Traceback:\n{err_msg}')

                    self._clean_modules()
                    self._clean_context()
                    sys.exit(1)

                self._clean_modules()
                self._clean_context()
                sys.exit(0)  # the sub-process ends here
            else:
                # master process
                self.worker_pids.append(pid)
                logger.info(f'Started Worker: {pid}')

        # keep the master process alive until there are no children left,
        # os.waitpid raises ChildProcessError in that case
        while True:
            try:
                os.waitpid(-1, 0)
            except ChildProcessError:
                break

    def shutdown(self):
        ''' Send SIGTERM to the master process recorded in the pid file

        Errors raised by self._read_master_pid are not handled here.
        '''
        self._kill(self._read_master_pid())
        logger.info('Sent SIGTERM to the master process')

    def _on_break(self):
        '''
        A hook to be invoked when self.run has been broken off by some
        exception: terminates the SharedMemoryManager worker, waits for it
        and removes the pid file.
        '''

        shm_worker = self.shm_worker_pid
        self._kill(shm_worker)
        os.waitpid(shm_worker, 0)
        logger.debug(f'SharedMemoryManager worker {shm_worker} terminated')

        pid_file = self.config.basic.pid_file
        os.remove(pid_file)

        logger.debug(f'Removed pid file: {pid_file}')
        logger.info('Master process exits.\n\n')