Exemple #1
0
def consul_util(mocker):
    """
    Build a ConsulUtil instance whose KV and catalog accessors are patched
    to raise, and whose node-identity getters return fixed values.

    `mocker` is expected to expose `patch.object` (presumably the
    pytest-mock fixture -- confirm against the test setup).
    """
    consul = ConsulUtil()
    denied = RuntimeError('Not allowed')
    patch = mocker.patch.object

    # (attribute of `consul` holding the target object, method to forbid)
    forbidden = (
        ('kv', 'kv_get'),
        ('kv', 'kv_put'),
        ('kv', 'kv_put_in_transaction'),
        ('kv', 'kv_delete_in_transaction'),
        ('catalog', 'get_services'),
        ('catalog', 'get_service_names'),
    )
    for holder, method in forbidden:
        patch(getattr(consul, holder), method, side_effect=denied)

    # Getters that must answer with a canned value instead of raising.
    canned = (
        ('get_local_nodename', 'localhost'),
        ('get_hax_hostname', 'localhost'),
        ('get_hax_ip_address', '192.168.0.28'),
    )
    for method, value in canned:
        patch(consul, method, return_value=value)

    return consul
Exemple #2
0
def main():
    """
    Entry point: configure logging, start the Motr API and the helper
    threads, then run the HTTP server in the calling thread.

    Any exception raised while starting Motr or running the server is
    logged; `motr.close()` is always invoked on the way out.
    """
    # Note: no logging must happen before this call.
    # Otherwise the log configuration will not apply.
    _setup_logging()

    # [KN] The elements in the queue will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by Python thread created by
    # _run_thread(ConsumerThread(..)) function.
    #
    # [KN] Note: The server is launched in the main thread.
    work_queue = Queue(maxsize=8)

    consul: ConsulUtil = ConsulUtil()
    cfg = _get_motr_fids(consul)

    LOG.info('Welcome to HaX')
    options_msg = (
        f'Setting up ha_link interface with the options as follows: '
        f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
        f'HA fid = {cfg.ha_fid}, RM fid = {cfg.rm_fid}')
    LOG.info(options_msg)

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(queue=work_queue, rm_fid=cfg.rm_fid, ffi=ffi, herald=herald)

    # Note that consumer thread must be started before we invoke motr.start(..)
    # Reason: hax process will send entrypoint request and somebody needs
    # to reply it.
    consumer = _run_thread(ConsumerThread(work_queue, motr))
    try:
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   rm_service=cfg.rm_fid)
        LOG.info('Motr API has been started')

        service_monitor = _run_thread(ServiceMonitor(work_queue))
        stats_updater = _run_thread(FsStatsUpdater(motr, interval_sec=30))
        workers = [consumer, stats_updater, service_monitor]

        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        run_server(work_queue, herald, threads_to_wait=workers)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.close()
Exemple #3
0
 def _generate_sub_services(self,
                            note: HaNoteStruct,
                            cns: ConsulUtil,
                            notify_devices=True) -> List[HaNoteStruct]:
     """
     Produce HA notes for the services enclosed by the process in `note`.

     Every service returned by `cns.get_services_by_parent_process()` gets
     a note carrying the same state as `note`. When `notify_devices` is
     true, the notes from `self._generate_sub_disks()` are appended too.
     """
     state = note.no_state
     process_fid = Fid.from_struct(note.no_id)
     services = cns.get_services_by_parent_process(process_fid)
     LOG.debug('Process fid=%s encloses %s services as follows: %s',
               process_fid, len(services), services)

     result = []
     for svc in services:
         result.append(HaNoteStruct(no_id=svc.fid.to_c(), no_state=state))

     if not notify_devices:
         return result

     result.extend(self._generate_sub_disks(note, services, cns))
     return result
Exemple #4
0
def run_server(
    queue: Queue,
    herald: DeliveryHerald,
    consul_util: ConsulUtil,
    threads_to_wait: List[StoppableThread] = [],
    port=8008,
):
    """
    Run the hax HTTP server; blocks until `web.run_app()` returns.

    The server binds to the address reported by
    `consul_util.get_hax_ip_address()` on `port`. Once the server returns
    (normally or with an exception), every thread in `threads_to_wait` is
    asked to stop and then joined.

    The default list for `threads_to_wait` is only iterated here, never
    mutated.
    """
    hax_ip = consul_util.get_hax_ip_address()

    # We can't use broad 0.0.0.0 IP address to make it possible to run
    # multiple hax instances at the same machine (i.e. in failover situation).
    # Instead, every hax will use a private IP only.
    bind_host = hax_ip

    # Note that bq-delivered mechanism must use a unique node name rather than
    # broad '0.0.0.0' that doesn't identify the node from outside.
    offsets = OffsetStorage(hax_ip, key_prefix='bq-delivered')
    inbox_filter = InboxFilter(offsets)

    conf_obj = ConfObjUtil(consul_util)

    app = web.Application(middlewares=[encode_exception])
    routes = [
        web.get('/', hello_reply),
        web.post('/', process_ha_states(queue, consul_util)),
        web.post(
            '/watcher/bq',
            process_bq_update(inbox_filter,
                              BQProcessor(queue, herald, conf_obj))),
        web.post('/api/v1/sns/{operation}', process_sns_operation(queue)),
        web.get('/api/v1/sns/repair-status',
                get_sns_status(queue, SnsRepairStatus)),
        web.get('/api/v1/sns/rebalance-status',
                get_sns_status(queue, SnsRebalanceStatus)),
    ]
    app.add_routes(routes)

    LOG.info(f'Starting HTTP server at {bind_host}:{port} ...')
    try:
        web.run_app(app, host=bind_host, port=port)
        LOG.debug('Server stopped normally')
    finally:
        LOG.debug('Stopping the threads')
        # Signal all threads first, then wait for each of them.
        for worker in threads_to_wait:
            worker.stop()
        for worker in threads_to_wait:
            worker.join()

        LOG.info('The http server has stopped')
Exemple #5
0
def prepare(args):
    """
    Start Consul, persist node configuration through `Utils`, then stop
    Consul again.

    `args.config[0]` is the ConfStore URL. After the facts are saved, the
    current leader session (if one can be obtained) is destroyed; a
    failure at that step is only logged at debug level.
    """
    conf_url = args.config[0]
    utils = Utils(ConfStoreProvider(conf_url))
    stop_event = Event()
    conf_dir = get_config_dir(conf_url)
    log_dir = get_log_dir(conf_url)

    _create_consul_namespace(conf_dir)
    consul_starter = _start_consul(utils, stop_event, conf_dir, log_dir,
                                   conf_url)

    utils.save_config_path(conf_url)
    utils.save_log_path()
    utils.save_node_facts()
    utils.save_drives_info()

    try:
        consul: ConsulUtil = ConsulUtil()
        consul.destroy_session(consul.get_leader_session_no_wait())
    except Exception:
        logging.debug('No leader is elected yet')

    stop_consul_blocking(consul_starter)
Exemple #6
0
    def broadcast_ha_states(self, ha_states: List[HAState]) -> List[MessageId]:
        """
        Broadcast the given HA states, plus notes for their sub-services,
        over ha_link and return the resulting message ids.

        A state whose status is `ServiceHealth.OK` is sent as
        `M0_NC_ONLINE`; any other status is sent as `M0_NC_FAILED`.
        """
        LOG.debug('Broadcasting HA states %s over ha_link', ha_states)
        cns = ConsulUtil()

        notes = []
        for state in ha_states:
            c_fid = state.fid.to_c()
            if state.status == ServiceHealth.OK:
                motr_state = HaNoteStruct.M0_NC_ONLINE
            else:
                motr_state = HaNoteStruct.M0_NC_FAILED
            note = HaNoteStruct(c_fid, motr_state)
            notes.append(note)
            notes.extend(self._generate_sub_services(note, cns))

        message_ids: List[MessageId] = self._ffi.ha_broadcast(
            self._ha_ctx, make_array(HaNoteStruct, notes), len(notes))
        LOG.debug(
            'Broadcast HA state complete with the following message_ids = %s',
            message_ids)
        return message_ids
def kv_cleanup():
    """
    Delete the Hare-owned entries from the Consul KV in one transaction.

    If the cluster is running it is shut down first.

    Raises:
        RuntimeError: if the delete transaction reports failure.
    """
    util: ConsulUtil = ConsulUtil()

    if is_cluster_running():
        logging.info('Cluster is running, shutting down')
        shutdown_cluster()

    # Single keys are removed as-is; prefixes are removed recursively.
    single_keys = ('epoch', 'eq-epoch', 'last_fidk', 'leader')
    prefixes = ('m0conf/', 'processes/', 'stats/')
    keys: List[KeyDelete] = [
        KeyDelete(name=key, recurse=False) for key in single_keys
    ]
    keys.extend(KeyDelete(name=prefix, recurse=True) for prefix in prefixes)

    logging.info('Deleting Hare KV entries (%s)', keys)
    deleted = util.kv.kv_delete_in_transaction(keys)
    if not deleted:
        raise RuntimeError('Error during key delete in transaction')
Exemple #8
0
def init(args):
    """
    Run mkfs on this node (when required) and wait until every data node
    reports mkfs completion.

    Consul and hax are started for the duration of the operation and
    stopped afterwards, on failure as well as on success.

    Args:
        args: parsed command line; `args.config[0]` is the ConfStore URL.

    Raises:
        RuntimeError: on any failure; the original exception is attached
            as `__cause__`.
    """
    # Bind the starters up-front: the error handler below inspects them and
    # must not fail with UnboundLocalError (hiding the real error) when the
    # failure happens before Consul/hax have been launched.
    consul_starter = None
    hax_starter = None
    try:
        url = args.config[0]

        if not is_mkfs_required(url):
            return

        conf = ConfStoreProvider(url)
        utils = Utils(conf)
        cns_utils = ConsulUtil()
        stop_event = Event()
        config_dir = get_config_dir(url)
        log_dir = get_log_dir(url)
        # Starting consul and hax
        consul_starter = _start_consul(utils, stop_event,
                                       config_dir, log_dir, url)
        hax_starter = _start_hax(utils, stop_event, config_dir, log_dir)
        hostname = utils.get_local_hostname()
        # Cleanup old mkfs state
        cleanup_mkfs_state(utils, cns_utils)
        start_mkfs_parallel(hostname, config_dir)
        # Update mkfs state
        set_mkfs_done_for(hostname, cns_utils)
        data_nodes = conf.get_hostnames_for_service(
            Const.SERVICE_MOTR_IO.value)

        # Wait for other nodes to complete.
        # This will block.
        while not is_mkfs_done_on_all_nodes(utils, cns_utils,
                                            data_nodes):
            sleep(5)
        # Stopping hax and consul
        stop_hax_blocking(hax_starter)
        stop_consul_blocking(consul_starter)
    except Exception as error:
        # Only stop what has actually been started.
        if hax_starter:
            stop_hax_blocking(hax_starter)
        if consul_starter:
            stop_consul_blocking(consul_starter)
        raise RuntimeError(
            f'Error while initializing cluster :key={error}') from error
Exemple #9
0
def _start_consul(utils: Utils,
                  stop_event: Event,
                  hare_local_dir: str,
                  hare_log_dir: str,
                  url: str):
    """
    Create and start a ConsulStarter for the local node and return it.

    Peers come from the 'cortx>external>consul>endpoints' ConfStore entry;
    only endpoints of the form 'tcp://...' are used. The Consul node name
    is the local hostname followed by ':' and the first 8 characters of a
    freshly generated UUID; it is recorded via `save_consul_node_name()`.
    """
    consul_root = f'{hare_local_dir}/consul'

    provider = ConfStoreProvider(url)
    node_id = uuid.uuid4()
    consul_endpoints = provider.get('cortx>external>consul>endpoints')
    cns_utils: ConsulUtil = ConsulUtil()
    hostname = utils.get_local_hostname()

    # Strip the leading 'tcp://' from every endpoint. Endpoints with any
    # other scheme are ignored.
    peers = []
    for endpoint in consul_endpoints:
        scheme, *remainder = endpoint.split('/')
        if scheme == 'tcp:':
            # remainder[0] is the empty string between the two slashes.
            peers.append('/'.join(remainder[1:]))

    bind_addr = socket.gethostbyname(hostname)
    short_id = str(node_id)[:8]
    consul_nodename = hostname + ':' + short_id

    consul_starter = ConsulStarter(
        utils=utils,
        cns_utils=cns_utils,
        stop_event=stop_event,
        log_dir=hare_log_dir,
        data_dir=f'{consul_root}/data',
        config_dir=f'{consul_root}/config',
        node_id=str(node_id),
        node_name=consul_nodename,
        peers=peers,
        bind_addr=bind_addr)
    consul_starter.start()
    save_consul_node_name(cns_utils, consul_nodename, hostname)

    return consul_starter
Exemple #10
0
    def test_process_failure(self):
        # Verifies that broadcasting a FAILED state for a process triggers
        # the expected ConsulUtil updates and that the broadcast note list
        # marks the node, enclosure, controller, process, service and drive
        # as failed.
        consul_util = ConsulUtil()
        consul_cache = InvocationCache()
        # Restrict the FFI mock to the single attribute listed in `spec`.
        ffi = Mock(spec=['init_motr_api'])
        motr = Motr(ffi, None, None, consul_util)

        # Setup for the test: notification of a process failure
        # - failure here is an ios service and a disk
        # - dummy Consul reports all processes on the node are failed
        # - expect the node, enclosure, controller, drive,
        #   process, and service to all be marked as failed
        #
        # Static names and fids for the setup are given here.
        node_name = 'testnode'

        hax_fid = Fid(0x7200000000000001, 0x6)
        site_fid = Fid(0x5300000000000001, 0x1)
        rack_fid = Fid(0x6100000000000001, 0x2)
        node_fid = Fid(0x6e00000000000001, 0x3)
        encl_fid = Fid(0x6500000000000001, 0x4)
        ctrl_fid = Fid(0x6300000000000001, 0x5)
        process_fid = Fid(0x7200000000000001, 0x15)
        service_fid = Fid(0x7300000000000001, 0xe)
        service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
        drive_fid = Fid(0x6b00000000000001, 0x11)
        # KV key and value that the batched controller-state update is
        # expected to carry (checked against motr._write_updates below).
        ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
            site_fid, rack_fid, encl_fid, ctrl_fid)
        ctrl_state = '{"state": "M0_NC_FAILED"}'

        # Set mock return values for the necessary Consul calls
        motr._is_mkfs = Mock(return_value=False)
        consul_util.get_hax_fid = Mock(return_value=hax_fid)
        consul_util.is_proc_client = Mock(return_value=False)
        consul_util.get_services_by_parent_process = Mock(
            return_value=[service_fid_typed])
        consul_util.get_disks_by_parent_process = Mock(
            return_value=[drive_fid])
        consul_util.get_process_node = Mock(return_value=node_name)
        consul_util.get_node_name_by_fid = Mock(return_value=node_name)
        consul_util.get_node_fid = Mock(return_value=node_fid)
        consul_util.get_node_encl_fid = Mock(return_value=encl_fid)
        consul_util.get_node_ctrl_fids = Mock(return_value=[ctrl_fid])

        # These failure indications are here to trigger specific code paths for
        # node failure. Additional tests can cover different scenarios (e.g.
        # drive failure but node still up), which will set different results
        # for these calls.
        consul_util.all_io_services_failed = Mock(return_value=True)
        consul_util.get_sdev_state = Mock(
            return_value=HaNoteStruct.M0_NC_FAILED)
        consul_util.get_ctrl_state = Mock(
            return_value=m0HaObjState.M0_NC_FAILED)
        consul_util.get_ctrl_state_updates = Mock(
            return_value=[PutKV(key=ctrl_path, value=ctrl_state)])

        # We'll use these mocks to check that expected updates are happening.
        consul_util.update_drive_state = Mock()
        consul_util.set_process_state = Mock()
        consul_util.set_node_state = Mock()
        consul_util.set_encl_state = Mock()
        motr._ha_broadcast = Mock()
        motr._write_updates = Mock()

        # Send the mock event.
        motr.broadcast_ha_states(
            [HAState(fid=process_fid, status=ObjHealth.FAILED)],
            notify_devices=True,
            broadcast_hax_only=False,
            kv_cache=consul_cache)

        # ConsulUtil is responsible for the actual KV updates, just check
        # here that the appropriate util function is called for each
        # component.
        # Note that the drive is expected to be reported as OFFLINE, while
        # the process, node and enclosure are expected as FAILED.
        consul_util.update_drive_state.assert_called_with([drive_fid],
                                                          ObjHealth.OFFLINE,
                                                          device_event=False)
        consul_util.set_process_state.assert_called_with(
            process_fid, ObjHealth.FAILED)
        consul_util.set_node_state.assert_called_with(node_fid,
                                                      ObjHealth.FAILED)
        consul_util.set_encl_state.assert_called_with(encl_fid,
                                                      ObjHealth.FAILED,
                                                      kv_cache=consul_cache)
        # This KV update is batched, so the check looks different.
        motr._write_updates.assert_any_call(
            [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

        # Check hax broadcast. We should see states updated to FAILED.
        # The note list is the first positional argument of the most recent
        # motr._ha_broadcast call.
        broadcast_list = motr._ha_broadcast.call_args[0][0]
        self.assertTrue(_has_failed_note(broadcast_list, node_fid))
        self.assertTrue(_has_failed_note(broadcast_list, encl_fid))
        self.assertTrue(_has_failed_note(broadcast_list, ctrl_fid))
        self.assertTrue(_has_failed_note(broadcast_list, process_fid))
        self.assertTrue(_has_failed_note(broadcast_list, service_fid))
        self.assertTrue(_has_failed_note(broadcast_list, drive_fid))
Exemple #11
0
class ConsumerThread(StoppableThread):
    """
    The only Motr-aware thread in whole HaX. This thread pulls messages from
    the multithreaded Queue and considers the messages as commands. Every such
    a command describes what should be sent to Motr land.

    The thread exits gracefully when it receives message of type Die (i.e.
    it is a 'poison pill').

    NOTE(review): no handling of a Die message is visible in this class; the
    loop in _do_work() terminates when stop() has set `is_stopped` and the
    queue is found empty. Confirm whether the paragraph above is outdated.
    """
    def __init__(self, q: Queue, motr: Motr, herald: DeliveryHerald):
        # `q` and `motr` are forwarded to _do_work() as the thread target
        # arguments.
        super().__init__(target=self._do_work,
                         name='qconsumer',
                         args=(q, motr))
        # Checked by _do_work() while it waits for a new message.
        self.is_stopped = False
        self.consul = ConsulUtil()
        self.eq_publisher = EQPublisher()
        # Used to wait for delivery of broadcast messages.
        self.herald = herald

    def stop(self) -> None:
        """Ask the thread to exit; takes effect while the queue is empty."""
        self.is_stopped = True

    @repeat_if_fails(wait_seconds=1)
    def _update_process_status(self, event: ConfHaProcess) -> None:
        """Forward the process event to ConsulUtil.update_process_status()."""
        # If a consul-related exception appears, it will
        # be processed by repeat_if_fails.
        #
        # This thread will become blocked until that
        # intermittent error gets resolved.
        self.consul.update_process_status(event)

    def update_process_failure(self, ha_states: List[HAState]) -> None:
        """
        For every state with status ServiceHealth.FAILED, record a
        M0_CONF_HA_PROCESS_STOPPED event for that fid. Other states are
        skipped.
        """
        for state in ha_states:
            if state.status == ServiceHealth.FAILED:
                m0status = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                # NOTE(review): chp_type=3 and chp_pid=0 are hard-coded; the
                # meaning of type 3 is not visible here -- confirm.
                pevent = ConfHaProcess(chp_event=m0status,
                                       chp_type=3,
                                       chp_pid=0,
                                       fid=state.fid)
                self._update_process_status(pevent)

    def _do_work(self, q: Queue, motr: Motr):
        """
        Thread body: poll `q` and dispatch every message by its type.

        An exception raised while handling a message is logged and the loop
        goes on. The loop ends when `is_stopped` is set and no message is
        pending; in that case the thread is detached from Motr via
        shun_motr_thread().
        """
        ffi = motr._ffi
        LOG.info('Handler thread has started')
        ffi.adopt_motr_thread()

        def pull_msg():
            # Non-blocking read; None means the queue is currently empty.
            try:
                return q.get(block=False)
            except Empty:
                return None

        try:
            while True:
                try:
                    LOG.debug('Waiting for the next message')

                    # Poll every 0.2 sec so that a stop request is noticed
                    # even when no messages arrive.
                    item = pull_msg()
                    while item is None:
                        time.sleep(0.2)
                        if self.is_stopped:
                            raise StopIteration()
                        item = pull_msg()

                    LOG.debug('Got %s message from queue', item)
                    # FirstEntrypointRequest is tested before
                    # EntrypointRequest (presumably because it is a more
                    # specific kind of request -- confirm): the process is
                    # broadcast as FAILED and the reply is sent only after
                    # wait_for_all() returns.
                    if isinstance(item, FirstEntrypointRequest):
                        LOG.debug('first entrypoint request, broadcast FAILED')
                        ids: List[MessageId] = motr.broadcast_ha_states([
                            HAState(fid=item.process_fid,
                                    status=ServiceHealth.FAILED)
                        ])
                        LOG.debug('waiting for broadcast of %s for ep: %s',
                                  ids, item.remote_rpc_endpoint)
                        self.herald.wait_for_all(HaLinkMessagePromise(ids))
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying any Exception is caught. In such a
                        # case, the motr process will receive EAGAIN and
                        # hence will need to make new attempt by itself
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            item.states)
                        self.update_process_failure(item.states)
                        # Hand the message ids back to the requester, if it
                        # supplied a reply queue.
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    # SNS status requests: the result is put to the
                    # requester's reply queue.
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    # SNS control commands: no reply is sent back.
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)

                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    # Stop request: propagate to the outer handler.
                    raise
                except Exception:
                    # no op, swallow the exception
                    LOG.exception('**ERROR**')
        except StopIteration:
            ffi.shun_motr_thread()
        finally:
            LOG.info('Handler thread has exited')
Exemple #12
0
 def __init__(self, consul_util: Optional[ConsulUtil]):
     """Use the supplied ConsulUtil, or create one when none is given."""
     if consul_util:
         self.consul = consul_util
     else:
         self.consul = ConsulUtil()
Exemple #13
0
def _get_motr_fids(util: ConsulUtil) -> HL_Fids:
    """
    Query `util` for the hax endpoint and the hax, HA and RM fids (in that
    order) and return them bundled as HL_Fids.
    """
    return HL_Fids(
        util.get_hax_endpoint(),
        util.get_hax_fid(),
        util.get_ha_fid(),
        util.get_rm_fid(),
    )
Exemple #14
0
def get_hare_motr_s3_processes(utils: ConsulUtil) -> Dict[str, List[Fid]]:
    """
    Map every node name known to the Consul catalog to the fids returned
    by `utils.get_node_hare_motr_s3_fids()` for that node.
    """
    return {
        node_name: utils.get_node_hare_motr_s3_fids(node_name)
        for node_name in utils.catalog.get_node_names()
    }
Exemple #15
0
class FsStatsUpdater(StoppableThread):
    """
    Thread that periodically fetches filesystem statistics from Motr and
    stores them through ConsulUtil.update_fs_stats().

    Statistics are requested only while every m0d status reported by Consul
    is 'M0_CONF_HA_PROCESS_STARTED'.
    """
    def __init__(self, motr: Motr, interval_sec=5):
        """
        Args:
            motr: Motr API wrapper; passed to _execute() as the thread
                argument.
            interval_sec: pause (in seconds) between two iterations.
        """
        super().__init__(target=self._execute,
                         name='fs-stats-updater',
                         args=(motr, ))
        self.stopped = False
        self.consul = ConsulUtil()
        self.interval_sec = interval_sec
        # Set by stop() so that a pending _sleep() returns immediately.
        self.event = Event()

    def stop(self) -> None:
        """Request the thread to exit and interrupt any pending sleep."""
        LOG.debug('Stop signal received')
        self.stopped = True
        self.event.set()

    def _sleep(self, interval_sec) -> None:
        """
        Wait up to `interval_sec` seconds.

        Raises:
            InterruptedException: if stop() was called before or during
                the wait.
        """
        interrupted = self.event.wait(timeout=interval_sec)
        if interrupted:
            raise InterruptedException()

    @log_exception
    def _execute(self, motr: Motr):
        """
        Thread body: poll filesystem stats every `interval_sec` seconds
        until the thread is stopped.
        """
        # Resolved before the `try` so that the `finally` clause below never
        # refers to an unbound name.
        ffi = motr._ffi
        try:
            LOG.info('filesystem stats updater thread has started')
            ffi.adopt_motr_thread()
            self._ensure_motr_all_started()
            while not self.stopped:
                started = self._ioservices_running()
                if not all(started):
                    self._sleep(self.interval_sec)
                    continue
                result: int = motr.start_rconfc()
                if result == 0:
                    stats = motr.get_filesystem_stats()
                    motr.stop_rconfc()
                    # When no stats were returned, fall through to the sleep
                    # below instead of retrying right away: an immediate
                    # retry would restart rconfc in a tight loop.
                    if stats:
                        self._publish(stats)
                self._sleep(self.interval_sec)
        except InterruptedException:
            # No op. _sleep() has interrupted before the timeout exceeded:
            # the application is shutting down.
            # There are no resources that we need to dispose specially.
            pass
        except Exception:
            LOG.exception('Aborting due to an error')
        finally:
            LOG.debug('Releasing motr-related resources for this thread')
            ffi.shun_motr_thread()
            LOG.debug('filesystem stats updater thread exited')

    def _publish(self, stats) -> None:
        """Timestamp `stats` and store them via ConsulUtil."""
        LOG.debug('FS stats are as follows: %s', stats)
        now_time = datetime.datetime.now()
        data = FsStatsWithTime(stats=stats,
                               timestamp=now_time.timestamp(),
                               date=now_time.isoformat())
        try:
            self.consul.update_fs_stats(data)
        except HAConsistencyException:
            LOG.debug('Failed to update Consul KV '
                      'due to an intermittent error. The '
                      'error is swallowed since new attempts '
                      'will be made timely')

    def _ioservices_running(self) -> List[bool]:
        """
        Return one flag per m0d status reported by Consul: True when the
        second element of the status entry is 'M0_CONF_HA_PROCESS_STARTED'.
        """
        statuses = self.consul.get_m0d_statuses()
        LOG.debug('The following statuses received: %s', statuses)
        started = ['M0_CONF_HA_PROCESS_STARTED' == v[1] for v in statuses]
        return started

    def _ensure_motr_all_started(self):
        """
        Block until _ioservices_running() reports all processes as started,
        re-checking every 5 seconds. Raises InterruptedException (from
        _sleep()) if stop() is called meanwhile.
        """
        while True:
            started = self._ioservices_running()
            if all(started):
                LOG.debug('According to Consul all confds have been started')
                return
            self._sleep(5)
Exemple #16
0
    def _update_process_status(self, p: WorkPlanner, motr: Motr,
                               event: ConfHaProcess) -> None:
        """
        Handle a process event: broadcast the matching HA state for
        STARTED/STOPPED events, store the event in Consul and, for STARTED
        events, try to emit a node 'online' notification.

        A failure while sending the notification is only logged.
        """
        LOG.info('Updating process status: %s', event.fid)
        # If a consul-related exception appears, it will
        # be processed by repeat_if_fails.
        #
        # This thread will become blocked until that
        # intermittent error gets resolved.
        started = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
        stopped = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
        mkfs = m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS
        known_types = (mkfs,
                       m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
                       m0HaProcessType.M0_CONF_HA_PROCESS_OTHER)
        health_by_event = ((started, ObjHealth.OK),
                           (stopped, ObjHealth.FAILED))
        # (process type, process event) -> health to broadcast
        motr_to_svc_status = {
            (proc_type, proc_event): health
            for proc_type in known_types
            for proc_event, health in health_by_event
        }

        if event.chp_event in (started, stopped):
            svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
            # Motr-mkfs processes do not require updates on their peer
            # mkfs processes. Motr-mkfs is an independent and typically a
            # one-time operation. So avoid broadcasting a motr-mkfs state
            # to the peer motr-mkfs processes but hax still needs to be
            # notified in-order to disconnect the hax-motr halink when
            # motr-mkfs process stops.
            # The same restriction applies when the event is about hax
            # itself.
            broadcast_hax_only = bool(
                event.chp_type == mkfs
                or event.fid == self.consul.get_hax_fid())

            LOG.debug('chp_type %d broadcast_hax_only %s', event.chp_type,
                      broadcast_hax_only)
            motr.broadcast_ha_states(
                [HAState(fid=event.fid, status=svc_status)],
                broadcast_hax_only=broadcast_hax_only)
        self.consul.update_process_status(event)

        # If we are receiving M0_CONF_HA_PROCESS_STARTED for M0D processes
        # then we will check if all the M0D processes on the local node are
        # started. If yes then we are going to send node online event to
        # MessageBus
        if not (event.chp_event == started):
            return

        try:
            producer = get_producer(ConsulUtil())
            if not producer:
                LOG.warning('Could not sent an event as producer'
                            ' is not available')
            else:
                producer.check_and_send(parent_resource_type=ObjT.NODE,
                                        fid=event.fid,
                                        resource_status='online')
        except Exception as e:
            LOG.warning("Send event failed due to '%s'", e)
Exemple #17
0
def main():
    """
    Entry point of the hax process.

    Configures logging, locale and dependency injection, installs a SIGINT
    handler, reads the configuration through ConsulUtil, starts the Motr
    API together with the worker threads and finally runs the server in
    the calling thread. motr.fini() is invoked once the server returns or
    an exception is raised after the consumer threads were launched.
    """
    # Logging has to be configured first: anything logged before this call
    # would not be covered by the log configuration.
    setup_logging()
    set_locale()
    inject.configure(di_configuration)

    hax_state = inject.instance(HaxGlobalState)

    # [KN] The elements in the work planner will appear if
    # 1. A callback is invoked from ha_link (this will happen in a motr
    #    thread which must be free ASAP)
    # 2. A new HA notification has come from Consul via HTTP
    # [KN] The messages are consumed by Python threads created by
    # _run_qconsumer_thread function.
    #
    # [KN] Note: The server is launched in the main thread.
    planner = WorkPlanner()

    def handle_signal(_signum, _frame):
        # Flag the global state as stopping and shut the planner down.
        hax_state.set_stopping()
        planner.shutdown()

    # The handler is installed before Consul is touched so that hax can
    # exit early if Consul is not available (otherwise _get_motr_fids() may
    # be retrying forever even if the hax process needs to shutdown).
    signal.signal(signal.SIGINT, handle_signal)

    util: ConsulUtil = ConsulUtil()
    # Removing the session on hax start happens on every node, so leader
    # election may keep re-triggering until the final hax node starts,
    # which delays further bootstrapping operations.
    # NOTE(review): the stale session is nevertheless removed right here --
    # confirm that this is the intended behaviour.
    _remove_stale_session(util)
    cfg: HL_Fids = _get_motr_fids(util)
    http_port = util.get_hax_http_port()
    util.init_motr_processes_status()

    LOG.info('Welcome to HaX')
    LOG.info(f'Setting up ha_link interface with the options as follows: '
             f'hax fid = {cfg.hax_fid}, hax endpoint = {cfg.hax_ep}, '
             f'HA fid = {cfg.ha_fid}')

    ffi = HaxFFI()
    herald = DeliveryHerald()
    motr = Motr(planner=planner, ffi=ffi, herald=herald, consul_util=util)

    # The consumer threads must be running before motr.start(..) is
    # invoked: hax process will send entrypoint request and somebody needs
    # to reply it.

    # TODO make the number of threads configurable
    consumers = []
    for idx in range(32):
        consumers.append(
            _run_qconsumer_thread(planner, motr, herald, util, idx))

    try:
        # [KN] We use just the first profile for Spiel API for now.
        first_profile = cfg.profiles[0]
        motr.start(cfg.hax_ep,
                   process=cfg.hax_fid,
                   ha_service=cfg.ha_fid,
                   profile=first_profile)
        LOG.info('Motr API has been started')

        rconfc_starter = _run_rconfc_starter_thread(motr, consul_util=util)
        stats_updater = _run_stats_updater_thread(motr, consul_util=util)
        bc_updater = _run_bc_updater_thread(motr, consul_util=util)
        event_poller = _run_thread(create_ha_thread(planner, util))

        # Every thread the server is asked to wait for.
        waited_threads = consumers + [
            stats_updater, bc_updater, rconfc_starter, event_poller
        ]

        # [KN] This is a blocking call. It will work until the program is
        # terminated by signal
        runner = ServerRunner(planner,
                              herald,
                              consul_util=util,
                              hax_state=hax_state)
        runner.run(threads_to_wait=waited_threads, port=http_port)
    except Exception:
        LOG.exception('Exiting due to an exception')
    finally:
        motr.fini()
Exemple #18
0
class ServiceMonitor(StoppableThread):
    """
    Thread that keeps an eye on the health of the services.

    The health status of every monitored service is polled from Consul
    (via Health API) and the observed changes are broadcast to Motr land
    by putting BroadcastHAStates messages to the queue.
    """
    def __init__(self, queue: Queue, interval_sec=1):
        """
        Constructor.

        queue - the multithreaded blocking queue that receives the
        BroadcastHAStates messages (assuming that the queue is being read
        out by ConsumerThread).

        interval_sec - float value, represents the delay between the
        polling iterations.
        """
        super().__init__(target=self._execute, name='service-monitor')
        self.q = queue
        self.interval_sec = interval_sec
        # 'stopped' ends the polling loop, 'event' cuts the sleep short.
        self.stopped = False
        self.event = Event()
        self.consul = ConsulUtil()

    def stop(self) -> None:
        """Ask the polling loop to finish and wake it up if it sleeps."""
        LOG.debug('Stop signal received')
        # The flag is raised before the event is set so that the loop
        # already sees it when the wait is interrupted.
        self.stopped = True
        self.event.set()

    def _sleep(self, interval_sec) -> bool:
        """
        Wait on the internal event for at most interval_sec seconds.

        Returns the result of Event.wait(): True if the event was set
        (see stop()), False if the timeout elapsed.
        """
        return self.event.wait(timeout=interval_sec)

    def _get_services(self) -> List[str]:
        """Return the catalog service names, 'consul' itself left out."""
        excluded = {'consul'}
        monitored = []
        for service in self.consul.catalog_service_names():
            if service not in excluded:
                monitored.append(service)
        return monitored

    def _broadcast(self, state_list: List[HAState]) -> None:
        """Put the states to the queue; an empty list is ignored."""
        if state_list:
            LOG.debug('Changes in statuses: %s', state_list)
            self.q.put(BroadcastHAStates(states=state_list, reply_to=None))

    def _collect_changes(
            self, names: List[str],
            last_seen: Dict[str, ServiceHealth]) -> List[HAState]:
        """
        Query the health of every service in names.

        Returns the states whose status differs from the one stored in
        last_seen. last_seen is updated in place as soon as a difference
        is found, i.e. even if a later query raises.
        """
        changed: List[HAState] = []
        for name in names:
            health: HAState = self.consul.get_local_service_health(name)
            if health.status != last_seen[name]:
                changed.append(health)
                last_seen[name] = health.status
                LOG.debug('%s is now %s', name, health.status)
        return changed

    def _execute(self):
        """
        Thread body.

        The list of services is read once; after that the services are
        polled every interval_sec seconds until stop() is invoked or an
        unexpected exception is raised.
        """
        monitored: List[str] = self._get_services()
        LOG.debug('The following services will be monitored %s', monitored)
        # Nothing is known about the services before the first poll.
        last_seen: Dict[str, ServiceHealth] = dict.fromkeys(
            monitored, ServiceHealth.UNKNOWN)
        try:
            while not self.stopped:
                try:
                    self._broadcast(
                        self._collect_changes(monitored, last_seen))
                except HAConsistencyException:
                    # Ignored on purpose - the next iteration will simply
                    # try again.
                    pass
                self._sleep(self.interval_sec)
        except Exception:
            LOG.exception('Aborting due to an error')
        finally:
            LOG.debug('Thread exited')