Example #1
    def ha_nvec_set_process(self, event: HaNvecSetEvent) -> None:
        LOG.debug('Processing HaNvecSetEvent (nvec size = %s)',
                  len(event.nvec))
        self.consul_util.get_all_nodes()
        ha_states: List[HAState] = []
        bcast_ss: List[HAState] = []
        for n in event.nvec:
            fid = Fid.from_struct(n.note.no_id)
            obj_health = ObjHealth.from_ha_note_state(n.note.no_state)
            ha_states.append(HAState(fid, obj_health))
            if n.note.no_state in {
                    HaNoteStruct.M0_NC_REPAIRED, HaNoteStruct.M0_NC_ONLINE
            }:
                bcast_ss.append(HAState(fid, obj_health))

            # In case of failed repair, roll back to failed state.
            elif n.note.no_state == HaNoteStruct.M0_NC_REPAIR:
                obj_health = ObjHealth.from_ha_note_state(
                    HaNoteStruct.M0_NC_FAILED)
                bcast_ss.append(HAState(fid, obj_health))

            # In case of failed rebalance, roll back to repaired state.
            elif n.note.no_state == HaNoteStruct.M0_NC_REBALANCE:
                obj_health = ObjHealth.from_ha_note_state(
                    HaNoteStruct.M0_NC_REPAIRED)
                bcast_ss.append(HAState(fid, obj_health))

        LOG.debug('got ha_states %s', ha_states)
        if bcast_ss:
            self.broadcast_ha_states(bcast_ss)
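
The rollback branching above can also be written as a lookup table. Below is a minimal, hypothetical sketch of that refactor; it assumes the same HaNoteStruct constants, Fid/HAState/ObjHealth helpers and surrounding loop context as Example #1 and is not the project's actual code.

    # Hypothetical: map an incoming HA-note state to the state that should be
    # broadcast (REPAIR rolls back to FAILED, REBALANCE rolls back to
    # REPAIRED; ONLINE and REPAIRED are broadcast as-is).
    broadcast_note_state = {
        HaNoteStruct.M0_NC_ONLINE: HaNoteStruct.M0_NC_ONLINE,
        HaNoteStruct.M0_NC_REPAIRED: HaNoteStruct.M0_NC_REPAIRED,
        HaNoteStruct.M0_NC_REPAIR: HaNoteStruct.M0_NC_FAILED,
        HaNoteStruct.M0_NC_REBALANCE: HaNoteStruct.M0_NC_REPAIRED,
    }
    for n in event.nvec:
        fid = Fid.from_struct(n.note.no_id)
        obj_health = ObjHealth.from_ha_note_state(n.note.no_state)
        ha_states.append(HAState(fid, obj_health))
        if n.note.no_state in broadcast_note_state:
            bcast_ss.append(HAState(fid, ObjHealth.from_ha_note_state(
                broadcast_note_state[n.note.no_state])))
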
Example #2
def to_ha_states(data: Any, consul_util: ConsulUtil) -> List[HAState]:
    """Converts a dictionary, obtained from JSON data, into a list of
    HA states.

    Format of an HA state: HAState(fid= <service fid>, status= <state>),
    where <state> is a ServiceHealth value.
    """
    if not data:
        return []

    def get_svc_node(checks: List[Dict[str, Any]], svc_id: str) -> str:
        for x in checks:
            if x.get('ServiceID') == svc_id:
                return str(x.get('Node'))
        return ""

    def get_svc_health(item: Any) -> ServiceHealth:
        node = get_svc_node(item['Checks'], item['Service']['ID'])
        LOG.debug('Checking current state of the process %s',
                  item['Service']['ID'])
        status: ServiceHealth = consul_util.get_service_health(
            item['Service']['Service'], node, int(item['Service']['ID']))
        return status

    return [
        HAState(fid=create_process_fid(int(t['Service']['ID'])),
                status=get_svc_health(t)) for t in data
    ]
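
Converters like the one above are typically invoked from the HTTP endpoint that receives Consul health updates, and the result is handed to the work planner, which is what the test in Example #3 asserts. The sketch below is only a hedged illustration of that wiring, assuming aiohttp is used and the planner and ConsulUtil instances are stored on the application object; the handler name and app keys are hypothetical.

from aiohttp import web

async def hypothetical_service_health_handler(request: web.Request):
    # Hypothetical lookup keys; the real application wiring may differ.
    planner = request.app['planner']
    consul_util = request.app['consul_util']
    data = await request.json()
    ha_states = to_ha_states(data, consul_util)
    if ha_states:
        planner.add_command(
            BroadcastHAStates(states=ha_states, reply_to=None))
    return web.json_response({})
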
Example #3
async def test_service_health_broadcast(hax_client, planner, status: str,
                                        health: ServiceHealth):
    service_health = [{
        'Node': {
            'Node': 'localhost',
            'Address': '10.1.10.12',
        },
        'Service': {
            'ID': '12',
            'Service': 'ios',
            'Tags': [],
            'Port': 8000,
        },
        'Checks': [
            {
                'Node': '12',
                'CheckID': 'service:ios',
                'Name': "Service 'ios' check",
                'Status': status,
                'Notes': '',
                'Output': '',
                'ServiceID': '12',
                'ServiceName': 'ios',
            },
        ],
    }]
    resp = await hax_client.post('/', json=service_health)
    assert resp.status == 200
    assert planner.add_command.called
    planner.add_command.assert_called_once_with(
        BroadcastHAStates(
            states=[HAState(fid=Fid(0x7200000000000001, 12), status=health)],
            reply_to=None))
Example #4
 def _update_process_status(self, p: WorkPlanner, motr: Motr,
                            event: ConfHaProcess) -> None:
     # If a consul-related exception appears, it will
     # be processed by repeat_if_fails.
     #
     # This thread will become blocked until that
     # intermittent error gets resolved.
     motr_to_svc_status = {
         (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
          m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): ServiceHealth.OK,
         (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
          m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): ServiceHealth.OFFLINE,
         (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
          m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): ServiceHealth.OK,
         (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
          m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): ServiceHealth.FAILED,
         (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
          m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): ServiceHealth.OK,
         (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
          m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): ServiceHealth.FAILED
     }
     self.consul.update_process_status(event)
     if event.chp_event in (m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED,
                            m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
         svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
         motr.broadcast_ha_states(
             [HAState(fid=event.fid, status=svc_status)])
Example #5
 def fn():
     proc_state_to_objhealth = {
         'M0_CONF_HA_PROCESS_STARTING': ObjHealth.OFFLINE,
         'M0_CONF_HA_PROCESS_STARTED': ObjHealth.OK,
         'M0_CONF_HA_PROCESS_STOPPING': ObjHealth.OFFLINE,
         'M0_CONF_HA_PROCESS_STOPPED': ObjHealth.OFFLINE
     }
     # import pudb.remote
     # pudb.remote.set_trace(term_size=(80, 40), port=9998)
     ha_states: List[HAState] = []
     LOG.debug('process status: %s', data)
     for item in data:
         proc_val = base64.b64decode(item['Value'])
         proc_status = json.loads(str(proc_val.decode('utf-8')))
         LOG.debug('process update item key %s item val: %s',
                   item['Key'].split('/')[1], proc_status)
         proc_fid = Fid.parse(item['Key'].split('/')[1])
         proc_state = proc_status['state']
         proc_type = proc_status['type']
         if (proc_type != 'M0_CONF_HA_PROCESS_M0MKFS'
                 and proc_state in ('M0_CONF_HA_PROCESS_STARTED',
                                    'M0_CONF_HA_PROCESS_STOPPED')):
             ha_states.append(
                 HAState(fid=proc_fid,
                         status=proc_state_to_objhealth[proc_state]))
             planner.add_command(
                 BroadcastHAStates(states=ha_states, reply_to=None))
Example #6
def test_nonmkfs_process_stop_causes_drive_offline(mocker, motr, consul_util):
    mocker.patch.object(consul_util.kv,
                        'kv_get',
                        side_effect=create_stub_get('M0_CONF_HA_PROCESS_M0D'))
    mocker.patch.object(consul_util.kv, 'kv_put', return_value=0)
    mocker.patch.object(consul_util, 'update_drive_state')
    mocker.patch.object(consul_util,
                        'get_node_fid',
                        return_value=Fid(0x6e00000000000001, 0x3))
    mocker.patch.object(consul_util,
                        'get_node_encl_fid',
                        return_value=Fid(0x6500000000000001, 0x4))

    motr.broadcast_ha_states([
        HAState(fid=Fid(0x7200000000000001, 0x15), status=ServiceHealth.FAILED)
    ])

    assert consul_util.update_drive_state.called, \
        'The drive state should be updated in Consul KV'

    traces = motr._ffi.traces
    assert AssertionPlan(
        tr_and(tr_method('ha_broadcast'),
               contains_drive())).exists(traces), \
        'DRIVE must be broadcast when non-MKFS process is stopped'
Example #7
 def to_ha_state(self, objinfo: dict) -> Optional[HAState]:
     try:
         sdev_fid = self.confobjutil.drive_to_sdev_fid(
             objinfo['node'], objinfo['device'])
     except KeyError as error:
         logging.error('Invalid json payload, no key (%s) present', error)
         return None
     return HAState(sdev_fid, status=objinfo['state'])
Example #8
 def update_process_failure(self, planner: WorkPlanner,
                            ha_states: List[HAState]) -> List[HAState]:
     new_ha_states: List[HAState] = []
     for state in ha_states:
         # We are only concerned with process statuses.
         if state.fid.container == ObjT.PROCESS.value:
             current_status = self.consul.get_process_current_status(
                 state.status, state.fid)
             if current_status == ServiceHealth.OK:
                 if (self.consul.get_process_local_status(
                         state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                     continue
             if current_status in (ServiceHealth.FAILED,
                                   ServiceHealth.STOPPED):
                 if (self.consul.get_process_local_status(
                         state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                      # Consul may report the failure of a process multiple
                      # times, so we don't want to send duplicate failure
                      # notifications, as they may delay cleanup activities.
                     continue
              # XXX:
              # Sometimes a Consul event can be delayed, so the state it
              # reports for a given process may already be in the past,
              # e.g. Consul reported a process failure, but by the time hax
              # received the event the process had already restarted. In
              # this case the event still needs to be handled. It is also
              # possible that Consul reported a failure before the process
              # status was updated in the Consul services catalog; in such
              # a case the reported status may be accurate and cannot simply
              # be dropped. These scenarios must be revisited.
             if current_status not in (ServiceHealth.UNKNOWN,
                                       ServiceHealth.OFFLINE):
                  # We also need to account for and report the failure of
                  # remote Motr processes to this node's hax and motr
                  # processes. When Consul reports a remote process failure,
                  # hax confirms its current status from the Consul KV,
                  # updates the list of failed services and also adds it to
                  # the broadcast list.
                 if current_status != ServiceHealth.OK:
                     event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                 else:
                     event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                 self.consul.update_process_status(
                     ConfHaProcess(
                         chp_event=event,
                         chp_type=int(
                             m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                         chp_pid=0,
                         fid=state.fid))
             new_ha_states.append(
                 HAState(fid=state.fid, status=current_status))
         else:
             new_ha_states.append(state)
     return new_ha_states
Example #9
 def to_ha_state(self, objinfo: Dict[str, str]) -> Optional[HAState]:
     try:
         sdev_fid = self.confobjutil.drive_to_sdev_fid(
             objinfo['node'], objinfo['device'])
         state = ServiceHealth.OK if objinfo[
             'state'] == 'online' else ServiceHealth.FAILED
     except KeyError as error:
         LOG.error('Invalid json payload, no key (%s) present', error)
         return None
     return HAState(sdev_fid, status=state)
Example #10
 def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
     logging.info('fid=%s, chp_event=%s', fid, chp_event)
     self.queue.put(
         ProcessEvent(
             ConfHaProcess(chp_event=chp_event,
                           chp_type=chp_type,
                           chp_pid=chp_pid,
                           fid=fid)))
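     # Note: chp_event value 3 corresponds to M0_CONF_HA_PROCESS_STOPPED in
     # Motr's process event enumeration, hence the 'offline' broadcast below.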
     if chp_event == 3:
         self.queue.put(
             BroadcastHAStates(states=[HAState(fid=fid, status='offline')],
                               reply_to=None))
Example #11
    def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
        fid = Fid.parse(payload['conf_sdev'])
        if fid.is_null():
            LOG.debug('Fid is 0:0. Skipping the message.')
            return

        q: Queue = Queue(1)
        self.planner.add_command(
            BroadcastHAStates(states=[HAState(fid, status=ObjHealth.FAILED)],
                              reply_to=q))
        ids: List[MessageId] = q.get()
        self.herald.wait_for_any(HaLinkMessagePromise(ids))
Example #12
    def handle_ioq_stob_error(self, payload: Dict[str, Any]) -> None:
        fid = Fid.parse(payload['conf_sdev'])
        if fid.is_null():
            logging.debug('Fid is 0:0. Skipping the message.')
            return

        q: Queue = Queue(1)
        self.queue.put(
            BroadcastHAStates(states=[HAState(fid, status='offline')],
                              reply_to=q))
        ids: List[MessageId] = q.get()
        self.herald.wait_for_any(HaLinkMessagePromise(ids))
Example #13
async def test_bq_stob_message_deserialized(hax_client, planner, herald,
                                            consul_util, mocker):
    def fake_get(key, allow_null):
        # ret = {'bq-delivered/192.168.0.28': ''}
        ret = {'bq-delivered/localhost': ''}
        return ret[key]

    mocker.patch.object(herald, 'wait_for_any')
    #
    # InboxFilter will try to read epoch - let's mock KV operations
    stob = StobId(Fid(12, 13), Fid(14, 15))
    msg = StobIoqError(fid=Fid(5, 6),
                       conf_sdev=Fid(0x103, 0x204),
                       stob_id=stob,
                       fd=42,
                       opcode=4,
                       rc=2,
                       offset=0xBF,
                       size=100,
                       bshift=4)

    # Here we make sure that a real StobIoqError can be used as the payload
    # for a STOB_IOQ_ERROR bq message.
    stob_payload = dump_json(msg)
    parsed_stob = simplejson.loads(stob_payload)

    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {'message_type': 'STOB_IOQ_ERROR', 'payload': parsed_stob}
    event_str = simplejson.dumps(event_payload)
    b64: bytes = b64encode(event_str.encode())
    b64_str = b64.decode()

    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]
    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)
    # Validate now
    if resp.status != 200:
        resp_json = await resp.json()
        logging.getLogger('hax').debug('Response: %s', resp_json)
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x103, 0x204), status=ObjHealth.FAILED)]))
Example #14
 def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
     LOG.info('fid=%s, chp_event=%s', fid, chp_event)
     self.queue.put(
         ProcessEvent(
             ConfHaProcess(chp_event=chp_event,
                           chp_type=chp_type,
                           chp_pid=chp_pid,
                           fid=fid)))
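      # chp_event value 3 corresponds to M0_CONF_HA_PROCESS_STOPPED, hence
      # the FAILED broadcast below.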
     if chp_event == 3:
         self.queue.put(
             BroadcastHAStates(
                 states=[HAState(fid=fid, status=ServiceHealth.FAILED)],
                 reply_to=None))
Example #15
 def update_process_failure(self, q: Queue,
                            ha_states: List[HAState]) -> List[HAState]:
     new_ha_states: List[HAState] = []
     for state in ha_states:
         # We are only concerned with process statuses.
         if state.fid.container == ObjT.PROCESS.value:
             current_status = self.consul.get_process_current_status(
                 state.status, state.fid)
             if current_status == ServiceHealth.FAILED:
                 self.consul.service_health_to_m0dstatus_update(
                     state.fid, current_status)
             elif current_status == ServiceHealth.UNKNOWN:
                 # The service status is UNKNOWN, which means hax was
                 # notified about a process failure but could not confirm
                 # whether the process is still in a failed state or has
                 # failed and restarted. So we do not lose the event;
                 # instead we try again to confirm the real-time process
                 # status by enqueuing a broadcast event specific to this
                 # process.
                 # It is expected that the process status eventually gets
                 # confirmed as either failed or passing (OK).
                 # This situation typically arises due to a delay in
                 # receiving the failure notification, during which the
                 # corresponding process might be restarting or might have
                 # already restarted. Thus it is important to confirm the
                 # real-time status of the process before broadcasting a
                 # failure.
                 current_status = ServiceHealth.OK
                 q.put(
                     BroadcastHAStates(states=[
                         HAState(fid=state.fid, status=ServiceHealth.FAILED)
                     ],
                                       reply_to=None))
             new_ha_states.append(
                 HAState(fid=state.fid, status=current_status))
         else:
             new_ha_states.append(state)
     return new_ha_states
Example #16
    def handle(self, msg: Event) -> None:
        node_fid = self.cns.get_node_fid(msg.node_id)
        if not node_fid:
            LOG.warn('Unknown [node_id=%s] provided. HA event is ignored',
                     msg.node_id)
            return

        get_health = self._get_status_by_text

        self.planner.add_command(
            BroadcastHAStates(states=[
                HAState(fid=node_fid, status=get_health(msg.event_type))
            ],
                              reply_to=None))
Example #17
    def _process_event_cb(self, fid, chp_event, chp_type, chp_pid):
        LOG.info('fid=%s, chp_event=%s', fid, chp_event)
        self.queue.put(
            ProcessEvent(
                ConfHaProcess(chp_event=chp_event,
                              chp_type=chp_type,
                              chp_pid=chp_pid,
                              fid=fid)))

        if chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0D:
            if chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED:
                self.queue.put(
                    BroadcastHAStates(
                        states=[HAState(fid=fid, status=ServiceHealth.OK)],
                        reply_to=None))
Example #18
    def get_local_service_health(self, service_name: str) -> HAState:
        local_nodename = self.get_local_nodename()
        srv_data: List[Dict[str, Any]] = self._get_service_health(service_name)
        local_services = [
            srv for srv in srv_data if srv['Node']['Node'] == local_nodename
        ]
        if not local_services:
            raise RuntimeError(
                f'Node {local_nodename} has no service {service_name}')
        service = local_services[0]

        ok = all(x.get('Status') == 'passing' for x in service['Checks'])
        status = ServiceHealth.OK if ok else ServiceHealth.FAILED
        fid = create_process_fid(int(service['Service']['ID']))
        return HAState(fid=fid, status=status)
Example #19
 def _update_process_status(self, q: Queue, event: ConfHaProcess) -> None:
     # If a consul-related exception appears, it will
     # be processed by repeat_if_fails.
     #
     # This thread will become blocked until that
     # intermittent error gets resolved.
     self.consul.update_process_status(event)
     svc_status = m0HaProcessEvent.event_to_svchealth(event.chp_event)
     if event.chp_type == m0HaProcessType.M0_CONF_HA_PROCESS_M0D:
         # Broadcast the received motr process status to other motr
         # processes in the cluster.
         q.put(
             BroadcastHAStates(
                 states=[HAState(fid=event.fid, status=svc_status)],
                 reply_to=None))
Example #20
 def to_ha_state(self, objinfo: Dict[str, str]) -> Optional[HAState]:
     hastate_to_objstate = {
         'online': ObjHealth.OK,
         'failed': ObjHealth.FAILED,
         'offline': ObjHealth.OFFLINE,
         'repair': ObjHealth.REPAIR,
         'repaired': ObjHealth.REPAIRED,
         'rebalance': ObjHealth.REBALANCE
     }
     try:
         sdev_fid = self.confobjutil.drive_to_sdev_fid(
             objinfo['node'], objinfo['device'])
         state = hastate_to_objstate[objinfo['state']]
     except KeyError as error:
         LOG.error('Invalid json payload, no key (%s) present', error)
         return None
     return HAState(sdev_fid, status=state)
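
A hypothetical invocation of the handler above, just to illustrate the expected payload shape; the object name `handler` and the field values are made up, and only the 'node', 'device' and 'state' keys are actually read:

    objinfo = {'node': 'srvnode-1', 'device': '/dev/sdb', 'state': 'repaired'}
    ha_state = handler.to_ha_state(objinfo)
    # ha_state is None if a key is missing or the state is unknown, otherwise
    # HAState(<sdev fid>, status=ObjHealth.REPAIRED).
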
Example #21
def to_ha_states(data: Any, consul_util: ConsulUtil) -> List[HAState]:
    """Converts a dictionary, obtained from JSON data, into a list of
    HA states.

    Format of an HA state: HAState(fid= <service fid>, status= <state>),
    where <state> is ServiceHealth.OK or ServiceHealth.FAILED.
    """
    if not data:
        return []

    def get_status(checks: List[Dict[str, Any]]) -> ServiceHealth:
        ok = all(x.get('Status') == 'passing' for x in checks)
        return ServiceHealth.OK if ok else ServiceHealth.FAILED

    return [
        HAState(fid=create_process_fid(int(t['Service']['ID'])),
                status=get_status(t['Checks'])) for t in data
    ]
Example #22
    def handle(self, msg: Event) -> None:
        node_name = self.cns.get_node_name_by_machineid(msg.resource_id,
                                                        allow_null=True)
        if not node_name:
            LOG.warn('Unknown [resource_id=%s] provided. HA event is ignored',
                     msg.resource_id)
            return
        node_fid = self.cns.get_node_fid(node_name)
        if not node_fid:
            LOG.warn('Unknown [node_name=%s] provided. HA event is ignored',
                     node_name)
            return
        get_health = self._get_status_by_text

        self.planner.add_command(
            BroadcastHAStates(states=[
                HAState(fid=node_fid, status=get_health(msg.event_type))
            ],
                              reply_to=None))
Example #23
async def test_bq_stob_message_type_recognized(hax_client, planner, herald,
                                               consul_util, mocker):
    def fake_get(key):
        ret = {'bq-delivered/192.168.0.28': ''}
        return ret[key]

    mocker.patch.object(herald, 'wait_for_any')
    #
    # InboxFilter will try to read epoch - let's mock KV operations
    mocker.patch.object(consul_util.kv, 'kv_put')
    mocker.patch.object(consul_util.kv, 'kv_get', fake_get)
    event_payload = {
        'message_type': 'STOB_IOQ_ERROR',
        'payload': {
            'fid': '0x1:0x2',
            'conf_sdev': '0x1:0x4'
        }
    }
    event_str = simplejson.dumps(event_payload)
    b64: bytes = b64encode(event_str.encode())
    b64_str = b64.decode()

    payload = [{
        'Key': 'bq/12',
        'CreateIndex': 1793,
        'ModifyIndex': 1793,
        'LockIndex': 0,
        'Flags': 0,
        'Value': b64_str,
        'Session': ''
    }]
    # Test execution
    resp = await hax_client.post('/watcher/bq', json=payload)
    # Validate now
    if resp.status != 200:
        resp_json = await resp.json()
        logging.getLogger('hax').debug('Response: %s', resp_json)
    assert resp.status == 200
    planner.add_command.assert_called_once_with(
        ContainsStates(
            [HAState(fid=Fid(0x1, 0x4), status=ServiceHealth.FAILED)]))
Example #24
def to_ha_states(data: Any, consul_util: ConsulUtil) -> List[HAState]:
    """Converts a dictionary, obtained from JSON data, into a list of
    HA states.

    Format of an HA state: HAState(fid= <service fid>, status= <state>),
    where <state> is ObjHealth.OK or ObjHealth.FAILED.
    """
    if not data:
        return []

    ha_states = []
    for node in data:
        svc_status = ObjHealth.OK
        for check in node['Checks']:
            if check.get('Status') != 'passing':
                svc_status = ObjHealth.FAILED
            svc_id = check.get('ServiceID')
            if svc_id:
                ha_states.append(
                    HAState(fid=create_process_fid(int(svc_id)),
                            status=svc_status))
    LOG.debug('Reporting ha states: %s', ha_states)
    return ha_states
Example #25
 def update_process_failure(self, planner: WorkPlanner,
                            ha_states: List[HAState]) -> List[HAState]:
     new_ha_states: List[HAState] = []
     for state in ha_states:
         # We are only concerned with process statuses.
         if state.fid.container == ObjT.PROCESS.value:
             current_status = self.consul.get_process_current_status(
                 state.status, state.fid)
             if current_status == ServiceHealth.OK:
                 if (self.consul.get_process_local_status(
                         state.fid) == 'M0_CONF_HA_PROCESS_STARTED'):
                     continue
             if current_status in (ServiceHealth.FAILED,
                                   ServiceHealth.STOPPED):
                 if (self.consul.get_process_local_status(
                         state.fid) == 'M0_CONF_HA_PROCESS_STOPPED'):
                      # Consul may report the failure of a process multiple
                      # times, so we don't want to send duplicate failure
                      # notifications, as they may delay cleanup activities.
                     continue
             if current_status == ServiceHealth.UNKNOWN:
                  # The service status is UNKNOWN, which means hax was
                  # notified about a process failure but could not confirm
                  # whether the process is still in a failed state or has
                  # failed and restarted. So we do not lose the event;
                  # instead we try again to confirm the real-time process
                  # status by enqueuing a broadcast event specific to this
                  # process.
                  # It is expected that the process status eventually gets
                  # confirmed as either failed or passing (OK).
                  # This situation typically arises due to a delay in
                  # receiving the failure notification, during which the
                  # corresponding process might be restarting or might have
                  # already restarted. Thus it is important to confirm the
                  # real-time status of the process before broadcasting a
                  # failure.
                 current_status = ServiceHealth.UNKNOWN
                 planner.add_command(
                     BroadcastHAStates(states=[
                         HAState(fid=state.fid, status=ServiceHealth.FAILED)
                     ],
                         reply_to=None))
             if current_status not in (ServiceHealth.UNKNOWN,
                                       ServiceHealth.OFFLINE):
                  # We also need to account for and report the failure of
                  # remote Motr processes to this node's hax and motr
                  # processes. When Consul reports a remote process failure,
                  # hax confirms its current status from the Consul KV,
                  # updates the list of failed services and also adds it to
                  # the broadcast list.
                 if current_status != ServiceHealth.OK:
                     event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED
                 else:
                     event = m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
                 self.consul.update_process_status(
                     ConfHaProcess(
                         chp_event=event,
                         chp_type=int(
                             m0HaProcessType.M0_CONF_HA_PROCESS_M0D),
                         chp_pid=0,
                         fid=state.fid))
             new_ha_states.append(
                 HAState(fid=state.fid, status=current_status))
         else:
             new_ha_states.append(state)
     return new_ha_states
Example #26
    def test_process_failure(self):
        consul_util = ConsulUtil()
        consul_cache = InvocationCache()
        ffi = Mock(spec=['init_motr_api'])
        motr = Motr(ffi, None, None, consul_util)

        # Setup for the test: notification of a process failure
        # - failure here is an ios service and a disk
        # - dummy Consul reports all processes on the node are failed
        # - expect the node, enclosure, controller, drive,
        #   process, and service to all be marked as failed
        #
        # Static names and fids for the setup are given here.
        node_name = 'testnode'

        hax_fid = Fid(0x7200000000000001, 0x6)
        site_fid = Fid(0x5300000000000001, 0x1)
        rack_fid = Fid(0x6100000000000001, 0x2)
        node_fid = Fid(0x6e00000000000001, 0x3)
        encl_fid = Fid(0x6500000000000001, 0x4)
        ctrl_fid = Fid(0x6300000000000001, 0x5)
        process_fid = Fid(0x7200000000000001, 0x15)
        service_fid = Fid(0x7300000000000001, 0xe)
        service_fid_typed = FidWithType(fid=service_fid, service_type='ios')
        drive_fid = Fid(0x6b00000000000001, 0x11)
        ctrl_path = 'm0conf/sites/{}/racks/{}/encls/{}/ctrls/{}'.format(
            site_fid, rack_fid, encl_fid, ctrl_fid)
        ctrl_state = '{"state": "M0_NC_FAILED"}'

        # Set mock return values for the necessary Consul calls
        motr._is_mkfs = Mock(return_value=False)
        consul_util.get_hax_fid = Mock(return_value=hax_fid)
        consul_util.is_proc_client = Mock(return_value=False)
        consul_util.get_services_by_parent_process = Mock(
            return_value=[service_fid_typed])
        consul_util.get_disks_by_parent_process = Mock(
            return_value=[drive_fid])
        consul_util.get_process_node = Mock(return_value=node_name)
        consul_util.get_node_name_by_fid = Mock(return_value=node_name)
        consul_util.get_node_fid = Mock(return_value=node_fid)
        consul_util.get_node_encl_fid = Mock(return_value=encl_fid)
        consul_util.get_node_ctrl_fids = Mock(return_value=[ctrl_fid])

        # These failure indications are here to trigger specific code paths for
        # node failure. Additional tests can cover different scenarios (e.g.
        # drive failure but node still up), which will set different results
        # for these calls.
        consul_util.all_io_services_failed = Mock(return_value=True)
        consul_util.get_sdev_state = Mock(
            return_value=HaNoteStruct.M0_NC_FAILED)
        consul_util.get_ctrl_state = Mock(
            return_value=m0HaObjState.M0_NC_FAILED)
        consul_util.get_ctrl_state_updates = Mock(
            return_value=[PutKV(key=ctrl_path, value=ctrl_state)])

        # We'll use these mocks to check that expected updates are happening.
        consul_util.update_drive_state = Mock()
        consul_util.set_process_state = Mock()
        consul_util.set_node_state = Mock()
        consul_util.set_encl_state = Mock()
        motr._ha_broadcast = Mock()
        motr._write_updates = Mock()

        # Send the mock event.
        motr.broadcast_ha_states(
            [HAState(fid=process_fid, status=ObjHealth.FAILED)],
            notify_devices=True,
            broadcast_hax_only=False,
            kv_cache=consul_cache)

        # ConsulUtil is responsible for the actual KV updates; here we just
        # check that the appropriate util function is called for each
        # component.
        consul_util.update_drive_state.assert_called_with([drive_fid],
                                                          ObjHealth.OFFLINE,
                                                          device_event=False)
        consul_util.set_process_state.assert_called_with(
            process_fid, ObjHealth.FAILED)
        consul_util.set_node_state.assert_called_with(node_fid,
                                                      ObjHealth.FAILED)
        consul_util.set_encl_state.assert_called_with(encl_fid,
                                                      ObjHealth.FAILED,
                                                      kv_cache=consul_cache)
        # This KV update is batched, so the check looks different.
        motr._write_updates.assert_any_call(
            [PutKV(key=ctrl_path, value=ctrl_state)], consul_cache)

        # Check hax broadcast. We should see states updated to FAILED.
        broadcast_list = motr._ha_broadcast.call_args[0][0]
        self.assertTrue(_has_failed_note(broadcast_list, node_fid))
        self.assertTrue(_has_failed_note(broadcast_list, encl_fid))
        self.assertTrue(_has_failed_note(broadcast_list, ctrl_fid))
        self.assertTrue(_has_failed_note(broadcast_list, process_fid))
        self.assertTrue(_has_failed_note(broadcast_list, service_fid))
        self.assertTrue(_has_failed_note(broadcast_list, drive_fid))
Example #27
def test_broadcast_io_service_failure(mocker, planner, motr, consumer,
                                      consul_util):
    def new_kv(key: str, val: str):
        return {
            'Key': key,
            'CreateIndex': 1793,
            'ModifyIndex': 1793,
            'LockIndex': 0,
            'Flags': 0,
            'Value': val,
            'Session': ''
        }

    def my_get(key: str, recurse: bool = False, **kwds):
        if key == 'm0conf/nodes' and recurse:
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "offline"
                  })), ('m0conf/nodes/cmu/processes/6/services/rm', '16'),
                 ('m0conf/nodes/localhost/processes/7/services/rms', '17'),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services'
                  '/0x7300000000000001:0x17',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0xa/services/0x7300000000000001'
                  ':0xc', json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif key == 'm0conf/sites' and recurse:
            return [
                new_kv(k, v) for k, v in
                [('m0conf/sites/0x5300000000000001:0x1/racks'
                  '/0x6100000000000001:0x2/encls/0x6500000000000001:0x4'
                  '/ctrls/0x6300000000000001:0x5',
                  json.dumps({"state": "M0_NC_UNKNOWN"})),
                 ('m0conf/sites/0x5300000000000001:0x1/racks'
                  '/0x6100000000000001:0x2/encls/0x6500000000000001:0x4'
                  '/ctrls/0x6300000000000001:0x6',
                  json.dumps({"state": "M0_NC_UNKNOWN"}))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3'
              '/processes' and recurse):
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services/0x7300000000000001'
                  ':0x17', json.dumps({
                      "name": "ios",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0xa/services/0x7300000000000001:0xc',
                  json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0x15' and recurse):
            return [
                new_kv(k, v) for k, v in
                [('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15',
                  json.dumps({
                      "name": "m0_server",
                      "state": "failed"
                  })),
                 ('m0conf/nodes/0x6e00000000000001:0x3/processes'
                  '/0x7200000000000001:0x15/services/0x7300000000000001'
                  ':0x17', json.dumps({
                      "name": "ios",
                      "state": "failed"
                  }))]
            ]
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0x15'):
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3/processes'
                '/0x7200000000000001:0x15',
                json.dumps({
                    "name": "m0_server",
                    "state": "failed"
                }))
        elif (key == 'm0conf/nodes/0x6e00000000000001:0x3/processes'
              '/0x7200000000000001:0xa'):
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3/processes'
                '/0x7200000000000001:0xa',
                json.dumps({
                    "name": "m0_server",
                    "state": "online"
                }))
        elif key == 'm0conf/nodes/localhost/processes/7/services/rms':
            return new_kv('m0conf/nodes/localhost/processes/7/services/rms',
                          '17')
        elif key == 'localhost/processes/0x7200000000000001:0x15':
            return new_kv(
                'localhost/processes/0x7200000000000001',
                json.dumps({
                    'type': 'M0_CONF_HA_PROCESS_OTHER',
                    'state': 'Unknown'
                }))
        elif key == 'm0conf/nodes/0x6e00000000000001:0x3':
            return new_kv(
                'm0conf/nodes/0x6e00000000000001:0x3',
                json.dumps({
                    "name": "localhost",
                    "state": "M0_NC_UNKNOWN"
                }))
        raise RuntimeError(f'Unexpected call: key={key}, recurse={recurse}')

    mocker.patch.object(consul_util.kv, 'kv_get', side_effect=my_get)
    # TODO: Handle 'kv_put' by updating kv returned by 'kv_get'
    mocker.patch.object(consul_util.kv, 'kv_put', return_value=0)
    mocker.patch.object(consul_util,
                        'get_node_fid',
                        return_value=Fid(0x6e00000000000001, 0x3))
    mocker.patch.object(consul_util,
                        'get_node_encl_fid',
                        return_value=Fid(0x6500000000000001, 0x4))

    motr.broadcast_ha_states([
        HAState(fid=Fid(0x7200000000000001, 0x15), status=ServiceHealth.FAILED)
    ])

    traces = motr._ffi.traces
    assert AssertionPlan(tr_and(
        tr_method('ha_broadcast'),
        io_service_failed())).exists(traces), 'IOservice failure not broadcast'
    assert AssertionPlan(tr_and(tr_method('ha_broadcast'),
                                node_fid_failed())).not_exists(traces), \
        'Node failure should not be broadcast'
Example #28
    def _do_work(self, q: Queue, motr: Motr):
        ffi = motr._ffi
        LOG.info('Handler thread has started')
        ffi.adopt_motr_thread()

        def pull_msg():
            try:
                return q.get(block=False)
            except Empty:
                return None

        try:
            while True:
                try:
                    LOG.debug('Waiting for the next message')

                    item = pull_msg()
                    while item is None:
                        time.sleep(0.2)
                        if self.is_stopped:
                            raise StopIteration()
                        item = pull_msg()

                    LOG.debug('Got %s message from queue', item)
                    if isinstance(item, FirstEntrypointRequest):
                        LOG.debug('first entrypoint request, broadcast FAILED')
                        ids: List[MessageId] = motr.broadcast_ha_states([
                            HAState(fid=item.process_fid,
                                    status=ServiceHealth.FAILED)
                        ])
                        LOG.debug('waiting for broadcast of %s for ep: %s',
                                  ids, item.remote_rpc_endpoint)
                        self.herald.wait_for_all(HaLinkMessagePromise(ids))
                        motr.send_entrypoint_request_reply(
                            EntrypointRequest(
                                reply_context=item.reply_context,
                                req_id=item.req_id,
                                remote_rpc_endpoint=item.remote_rpc_endpoint,
                                process_fid=item.process_fid,
                                git_rev=item.git_rev,
                                pid=item.pid,
                                is_first_request=item.is_first_request))
                    elif isinstance(item, EntrypointRequest):
                        # While replying, any exception is caught. In such a
                        # case the motr process will receive EAGAIN and hence
                        # will need to make a new attempt by itself.
                        motr.send_entrypoint_request_reply(item)
                    elif isinstance(item, ProcessEvent):
                        self._update_process_status(q, item.evt)
                    elif isinstance(item, HaNvecGetEvent):
                        fn = motr.ha_nvec_get_reply
                        # If a consul-related exception appears, it will
                        # be processed by repeat_if_fails.
                        #
                        # This thread will become blocked until that
                        # intermittent error gets resolved.
                        decorated = (repeat_if_fails(wait_seconds=5))(fn)
                        decorated(item)
                    elif isinstance(item, BroadcastHAStates):
                        LOG.info('HA states: %s', item.states)
                        ha_states = self.update_process_failure(q, item.states)
                        result: List[MessageId] = motr.broadcast_ha_states(
                            ha_states)
                        if item.reply_to:
                            item.reply_to.put(result)
                    elif isinstance(item, StobIoqError):
                        LOG.info('Stob IOQ: %s', item.fid)
                        payload = dump_json(item)
                        LOG.debug('Stob IOQ JSON: %s', payload)
                        offset = self.eq_publisher.publish('stob-ioq', payload)
                        LOG.debug('Written to epoch: %s', offset)
                    elif isinstance(item, SnsRepairStatus):
                        LOG.info('Requesting SNS repair status')
                        status = motr.get_repair_status(item.fid)
                        LOG.info('SNS repair status is received: %s', status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStatus):
                        LOG.info('Requesting SNS rebalance status')
                        status = motr.get_rebalance_status(item.fid)
                        LOG.info('SNS rebalance status is received: %s',
                                 status)
                        item.reply_to.put(status)
                    elif isinstance(item, SnsRebalanceStart):
                        LOG.info('Requesting SNS rebalance start')
                        motr.start_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceStop):
                        LOG.info('Requesting SNS rebalance stop')
                        motr.stop_rebalance(item.fid)
                    elif isinstance(item, SnsRebalancePause):
                        LOG.info('Requesting SNS rebalance pause')
                        motr.pause_rebalance(item.fid)
                    elif isinstance(item, SnsRebalanceResume):
                        LOG.info('Requesting SNS rebalance resume')
                        motr.resume_rebalance(item.fid)
                    elif isinstance(item, SnsRepairStart):
                        LOG.info('Requesting SNS repair start')
                        motr.start_repair(item.fid)
                    elif isinstance(item, SnsRepairStop):
                        LOG.info('Requesting SNS repair stop')
                        motr.stop_repair(item.fid)
                    elif isinstance(item, SnsRepairPause):
                        LOG.info('Requesting SNS repair pause')
                        motr.pause_repair(item.fid)
                    elif isinstance(item, SnsRepairResume):
                        LOG.info('Requesting SNS repair resume')
                        motr.resume_repair(item.fid)

                    else:
                        LOG.warning('Unsupported event type received: %s',
                                    item)
                except StopIteration:
                    raise
                except Exception:
                    # no op, swallow the exception
                    LOG.exception('**ERROR**')
        except StopIteration:
            ffi.shun_motr_thread()
        finally:
            LOG.info('Handler thread has exited')
Example #29
 def update_process_failure(self, planner: WorkPlanner,
                            ha_states: List[HAState]) -> List[HAState]:
     new_ha_states: List[HAState] = []
     proc_Health_to_status = {
         ObjHealth.OFFLINE: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
         ObjHealth.FAILED: m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED,
         ObjHealth.OK: m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED
     }
     try:
         for state in ha_states:
             if state.fid.container == ObjT.PROCESS.value:
                 current_status = self.consul.get_process_current_status(
                     state.status, state.fid)
                 if current_status == ObjHealth.UNKNOWN:
                     continue
                 proc_status_remote = self.consul.get_process_status(
                                          state.fid)
                 proc_status: Any = None
                  # MKFS states are updated by the node corresponding to a
                  # given process, so we ignore notifications for mkfs
                  # processes.
                 if proc_status_remote.proc_type in (
                         'Unknown',
                         m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS.name):
                     continue
                 proc_type = m0HaProcessType.str_to_Enum(
                      proc_status_remote.proc_type)
                  # The following cases are handled here:
                  # 1. Delayed Consul service failure notification:
                  # -  We re-confirm the current process state before
                  #    notifying the process as offline/failed.
                  # 2. Consul reported a process failure and the current
                  #    process state is offline (this means the corresponding
                  #    node is online, i.e. hax and Consul are online):
                  # -  The process's status in the Consul KV might not be
                  #    updated, as the process died abruptly. In this case we
                  #    handle it as a local process failure, update the
                  #    process status in the Consul KV and notify motr.
                  # 3. Consul reported a process failure and the current
                  #    process state is failed (this means the node
                  #    corresponding to the process also failed, i.e. hax and
                  #    Consul are no longer running):
                  # -  The process's status in the Consul KV might not be
                  #    updated, as the node went down abruptly. In this case,
                  #    when Consul reports failures for the corresponding
                  #    node's processes, Hare verifies the node status and
                  #    the Hare RC node processes the failures accordingly.
                  #    This may take some time if the Consul servers lose
                  #    quorum and need time to sync up the state.
                  # 4. Consul reported a process failure, probably due to
                  #    mkfs process completion (m0tr mkfs and m0d share the
                  #    same fid), which got delayed while the process is now
                  #    starting:
                  # -  Hare checks the current status of the process, but it
                  #    is possible that the process state is not yet synced
                  #    up within the quorum. In this case we continue
                  #    processing the failure event; once the process starts
                  #    successfully, Hare will eventually update and notify
                  #    the process state.
                  # 5. For some reason Consul may report a process as
                  #    offline and subsequently report it as online; this
                  #    may happen due to an intermittent monitor failure:
                  # -  Hare must handle the change in process states
                  #    accordingly in order to maintain the eventual
                  #    consistency of the cluster state.
                 proc_status = proc_Health_to_status.get(current_status)
                 LOG.debug('current_status: %s proc_status_remote: %s',
                           current_status, proc_status_remote.proc_status)
                 if proc_status is not None:
                     LOG.debug('proc_status: %s', proc_status.name)
                     if proc_status_remote.proc_status != proc_status.name:
                         if (self.consul.am_i_rc() or
                                 self.consul.is_proc_local(state.fid)):
                             # Probably process node failed, in such a
                             # case, only RC must be allowed to update
                             # the process's persistent state.
                             # Or, if the node's alive then allow the node
                             # to update the local process's state.
                             self.consul.update_process_status(
                                 ConfHaProcess(chp_event=proc_status,
                                               chp_type=proc_type,
                                               chp_pid=0,
                                               fid=state.fid))
                         # RC or not RC, i.e. even without persistent state
                         # update, it is important that the notification to
                         # local motr processes must still be sent.
                         new_ha_states.append(
                             HAState(fid=state.fid, status=current_status))
                     if not self.consul.is_proc_local(state.fid):
                         proc_status_local = (
                             self.consul.get_process_status_local(
                                 state.fid))
                          # Consul monitors a process every second and this
                          # notification is sent to every node. Thus, to
                          # avoid notifying about the same process status
                          # multiple times, every node maintains a local
                          # copy of the remote process status, which is
                          # checked every time a Consul notification is
                          # received; the status is then notified locally
                          # to all the local motr processes.
                         if (proc_status_local.proc_status !=
                                 proc_status.name):
                             self.consul.update_process_status_local(
                                 ConfHaProcess(chp_event=proc_status,
                                               chp_type=proc_type,
                                               chp_pid=0,
                                               fid=state.fid))
                             new_ha_states.append(
                                 HAState(fid=state.fid,
                                         status=current_status))
                     else:
                         continue
             else:
                 new_ha_states.append(state)
     except Exception as e:
         raise HAConsistencyException('failed to process ha states') from e
     return new_ha_states
Example #30
    def _update_process_status(self, p: WorkPlanner, motr: Motr,
                               event: ConfHaProcess) -> None:
        LOG.info('Updating process status: %s', event.fid)
        # If a consul-related exception appears, it will
        # be processed by repeat_if_fails.
        #
        # This thread will become blocked until that
        # intermittent error gets resolved.
        motr_to_svc_status = {
            (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
                m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (
                    ObjHealth.OK),
            (m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS,
                m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (
                    ObjHealth.FAILED),
            (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
                m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (
                    ObjHealth.OK),
            (m0HaProcessType.M0_CONF_HA_PROCESS_M0D,
                m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (
                    ObjHealth.FAILED),
            (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
                m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED): (
                    ObjHealth.OK),
            (m0HaProcessType.M0_CONF_HA_PROCESS_OTHER,
                m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED): (
                    ObjHealth.FAILED)}
        if event.chp_event in (m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED,
                               m0HaProcessEvent.M0_CONF_HA_PROCESS_STOPPED):
            svc_status = motr_to_svc_status[(event.chp_type, event.chp_event)]
            broadcast_hax_only = False
            if ((event.chp_type ==
                 m0HaProcessType.M0_CONF_HA_PROCESS_M0MKFS) or
               (event.fid == self.consul.get_hax_fid())):
                # Motr-mkfs processes do not require updates on their peer
                # mkfs processes. Motr-mkfs is an independent and typically a
                # one-time operation. So we avoid broadcasting a motr-mkfs
                # state to the peer motr-mkfs processes, but hax still needs
                # to be notified in order to disconnect the hax-motr halink
                # when the motr-mkfs process stops.
                broadcast_hax_only = True

            LOG.debug('chp_type %d broadcast_hax_only %s', event.chp_type,
                      broadcast_hax_only)
            motr.broadcast_ha_states(
                [HAState(fid=event.fid, status=svc_status)],
                broadcast_hax_only=broadcast_hax_only)
        self.consul.update_process_status(event)

        # If we receive M0_CONF_HA_PROCESS_STARTED for M0D processes, check
        # whether all the M0D processes on the local node have started. If
        # so, send a node online event to the MessageBus.
        if event.chp_event == m0HaProcessEvent.M0_CONF_HA_PROCESS_STARTED:
            try:
                util: ConsulUtil = ConsulUtil()
                producer = get_producer(util)
                if producer:
                    producer.check_and_send(parent_resource_type=ObjT.NODE,
                                            fid=event.fid,
                                            resource_status='online')
                else:
                    LOG.warning('Could not send an event as producer'
                                ' is not available')
            except Exception as e:
                LOG.warning("Send event failed due to '%s'", e)