def poll_node_osds_state(cls):
    # type: () -> None
    """
    Poll the device state of every AlbaNode that is part of an AlbaNodeCluster
    The scan itself runs while holding the 'albanodes_osd_state' configuration lock so Alba is not queried too often
    A node of which the 'red' devices outnumber the 'green' and 'warning' devices combined is considered too dire:
    a failover for that node is started in a separate thread
    All started threads are joined once the polling lock has been released
    When the lock cannot be acquired, this is logged and nothing else happens
    :return: None
    :rtype: NoneType
    """
    failover_threads = []
    try:
        with Configuration.lock('albanodes_osd_state', wait=5, expiration=60):
            for cluster in AlbaNodeClusterList.get_alba_node_clusters():
                for alba_node in cluster.albanodes:
                    devices = alba_node.local_summary['devices']
                    usable_devices = devices['green'] + devices['warning']
                    if len(usable_devices) >= len(devices['red']):
                        # Enough devices are still usable, leave this node alone
                        continue
                    # @todo to offload to celery or not...
                    # Offloading so the polling lock can be released faster
                    worker = Thread(target=cls.initiate_failover,
                                    args=(alba_node.guid, ))
                    worker.start()
                    failover_threads.append(worker)
        # The lock is released at this point, wait for the failovers to finish
        for worker in failover_threads:
            worker.join()
    except NoLockAvailableException:
        cls._logger.info('Unable to acquire the lock')
    def initiate_failover(cls, node_guid):
        # type: (basestring) -> None
        """
        Initiate an OSD failover for a particular AlbaNode
        This AlbaNode has to be part of an AlbaNodeCluster with multiple AlbaNodes
        The other nodes of the cluster are tried in random order. A candidate is accepted once it answers a
        metadata request (at most 3 attempts, 5 seconds apart). The failing node is then powered off through IPMI
        and the method returns as soon as that power off succeeded
        NOTE(review): this method only powers off the failing node, nothing in here hands the OSDs over to the
        selected failover node
        :param node_guid: Guid of the AlbaNode
        :type node_guid: basestring
        :raises ValueError: When the node has no cluster or when the cluster has no other nodes
        :raises RuntimeError: When every candidate was either unresponsive or the IPMI power off failed
        :return: None
        :rtype: NoneType
        """
        max_attempts = 3
        retry_delay = 5
        with Configuration.lock('albanode_{0}_failover'.format(node_guid),
                                wait=5,
                                expiration=60):
            node = AlbaNode(node_guid)
            node_cluster = node.albanode_cluster
            if node_cluster is None:
                raise ValueError(
                    'Unable to failover Node with guid {0} as it has no relation to a cluster'
                    .format(node_guid))
            other_node_guids = [
                guid for guid in node_cluster.albanode_guids
                if guid != node_guid
            ]
            if not other_node_guids:
                raise ValueError(
                    'Unable to failover Node with guid {0} as there are no failover candidates'
                    .format(node_guid))
            # Visit the failover candidates in random order
            random.shuffle(other_node_guids)
            for candidate_guid in other_node_guids:
                failover_node = AlbaNode(candidate_guid)
                cls._logger.info(
                    'Checking if Node with guid {0} is responsive so a failover can happen'
                    .format(failover_node.guid))
                responsive = False
                for attempt in range(1, max_attempts + 1):
                    try:
                        failover_node.client.get_metadata()
                    except Exception:
                        cls._logger.exception(
                            'Node with guid {0} is not responsive'.format(
                                failover_node.guid))
                        # Only back off when another attempt will follow
                        if attempt < max_attempts:
                            time.sleep(retry_delay)
                    else:
                        responsive = True
                        break
                if not responsive:
                    # Another node must be selected
                    cls._logger.error(
                        'Node with guid {0} is not responsive. Looking for another node'
                        .format(failover_node.guid))
                    continue
                # Kill current node through IPMI
                ipmi_info = node.ipmi_info
                try:
                    ipmi_controller = IPMIController(client=cls._client,
                                                     **ipmi_info)
                    ipmi_controller.power_off_node()
                except Exception:
                    # Best effort: log and retry with the next candidate
                    cls._logger.exception(
                        'Unable to control node with guid {0} through IPMI'.
                        format(node_guid))
                    continue
                # The node was powered off: stop here instead of looping over
                # the remaining candidates and reporting a failure afterwards
                return

        raise RuntimeError('No failover happened. Exhausted all options')