Example #1
    def on_change_status(self, prev_value, current_value):
        if current_value is None:
            self.status = "unhealthy"
            self.save()

            _ctc = \
                NS.tendrl.objects.ClusterTendrlContext(
                    integration_id=self.integration_id
                ).load()

            msg = "Cluster {0} moved to unhealthy state".format(
                _ctc.cluster_name
            )
            event_utils.emit_event(
                "cluster_health_status",
                "unhealthy",
                msg,
                "cluster_{0}".format(
                    _ctc.integration_id
                ),
                "WARNING",
                integration_id=_ctc.integration_id,
                cluster_name=_ctc.cluster_name,
                sds_name=_ctc.sds_name
            )
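Across these examples, emit_event is always called with five positional arguments (resource, current value, message, instance, severity) plus optional keyword arguments such as tags, alert_notify, node_id, integration_id, cluster_name and sds_name. A minimal sketch of a compatible helper signature, inferred purely from these call sites rather than taken from the actual tendrl event_utils implementation:

# Hypothetical stand-in inferred from the call sites in these examples;
# the real tendrl event_utils.emit_event may differ.
def emit_event(resource, curr_value, msg, instance, severity,
               alert_notify=False, tags=None, **context):
    # resource   -- e.g. "cluster_health_status", "node_status", "brick_status"
    # curr_value -- new value of the resource, e.g. "unhealthy" or "DOWN"
    # msg        -- human readable alert message
    # instance   -- instance id, e.g. "cluster_<id>" or "volume_x|brick_y"
    # severity   -- "INFO" or "WARNING"
    # context    -- extra ids such as node_id, integration_id, cluster_name
    payload = {
        "resource": resource,
        "current_value": curr_value,
        "message": msg,
        "instance": instance,
        "severity": severity,
        "alert_notify": alert_notify,
        "tags": tags or {},
    }
    payload.update(context)
    return payload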
Example #2
    def on_change(self, attr, prev_value, current_value):
        if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
            _tc = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
            # Check node is managed
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id).load()
            if current_value is None and str(_cnc.is_managed).lower() == "yes":
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       self.status,
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "WARNING",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
                # Loading cluster_node_context will also load node_context
                # and it will be updated with the latest values
                _cnc_new = \
                    NS.tendrl.objects.ClusterNodeContext(
                        node_id=self.node_id,
                        integration_id=_tc.integration_id,
                        first_sync_done=_cnc.first_sync_done,
                        is_managed=_cnc.is_managed
                    )
                _cnc_new.save()
                del _cnc_new
                # Update cluster details
                self.update_cluster_details(_tc.integration_id)
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                if _tc.sds_name in ["gluster", "RHGS"]:
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id, self.fqdn))

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write("{0}/status".format(brick.key),
                                             "Stopped")
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
            elif current_value == "UP" and str(
                    _cnc.is_managed).lower() == "yes":
                msg = "{0} is UP".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       "UP",
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "INFO",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
            del _cnc
Example #3
def sync_cluster_status(volumes, sync_ttl):
    degraded_count = 0
    is_healthy = True

    # If the ImportCluster flow has failed,
    # mark the cluster status as unhealthy
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id).load()
    if _cluster.current_job.get('job_name', '') == "ImportCluster" and \
        _cluster.current_job.get('status', '') == "failed":
        is_healthy = False

    # Calculate status based on volumes status
    if len(volumes) > 0:
        volume_states = _derive_volume_states(volumes)
        for vol_id, state in volume_states.iteritems():
            if 'down' in state or 'partial' in state:
                is_healthy = False
            if 'degraded' in state:
                degraded_count += 1

    # Change status based on node status
    cmd = cmd_utils.Command('gluster pool list', True)
    out, err, rc = cmd.run()
    peer_count = 0
    if not err:
        out_lines = out.split('\n')
        connected = True
        for index in range(1, len(out_lines)):
            peer_count += 1
            node_status_det = out_lines[index].split('\t')
            if len(node_status_det) > 2:
                if node_status_det[2].strip() != 'Connected':
                    connected = connected and False
        if not connected:
            is_healthy = False

    cluster_gd = NS.gluster.objects.GlobalDetails().load()
    old_status = cluster_gd.status or 'unhealthy'
    curr_status = 'healthy' if is_healthy else 'unhealthy'
    if curr_status != old_status:
        msg = ("Health status of cluster: %s "
               "changed from %s to %s") % (NS.tendrl_context.integration_id,
                                           old_status, curr_status)
        instance = "cluster_%s" % NS.tendrl_context.integration_id
        event_utils.emit_event(
            "cluster_health_status", curr_status, msg, instance,
            'WARNING' if curr_status == 'unhealthy' else 'INFO')

    # Persist the cluster status
    NS.gluster.objects.GlobalDetails(
        status='healthy' if is_healthy else 'unhealthy',
        peer_count=peer_count,
        vol_count=len(volumes),
        volume_up_degraded=degraded_count).save(ttl=sync_ttl)
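sync_cluster_status above derives cluster health from three inputs: a failed ImportCluster job, the per-volume states, and peer connectivity from gluster pool list. The same rule distilled into a small pure function for clarity (name and signature are illustrative, not part of tendrl):

def derive_cluster_health(import_failed, volume_states, peers_connected):
    # import_failed   -- True when the last ImportCluster job failed
    # volume_states   -- list of per-volume state strings
    # peers_connected -- list of booleans, one per peer
    degraded_count = sum(1 for state in volume_states if 'degraded' in state)
    is_healthy = not (
        import_failed
        or any('down' in s or 'partial' in s for s in volume_states)
        or not all(peers_connected)
    )
    return ('healthy' if is_healthy else 'unhealthy'), degraded_count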
Example #4
def test_emit_event():
    setattr(__builtin__, "NS", maps.NamedDict())
    NS.publisher_id = 0
    NS.node_context = maps.NamedDict(fqdn="test",
                                     node_id="0")
    NS.tendrl_context = maps.NamedDict(integration_id="",
                                       cluster_name="",
                                       sds_name="")
    emit_event("test", "test", "test", "test", "test",
               tags=maps.NamedDict(entity_type="brick"))
    emit_event("test", "test", "test", "test", "test",
               tags=maps.NamedDict(entity_type="volume"))
Example #5
def brick_status_alert(hostname):
    try:
        # fetching brick details of disconnected node
        lock = None
        path = "clusters/%s/Bricks/all/%s" % (NS.tendrl_context.integration_id,
                                              hostname)
        lock = etcd.Lock(NS._int.client, path)
        lock.acquire(blocking=True, lock_ttl=60)
        if lock.is_acquired:
            bricks = NS.gluster.objects.Brick(fqdn=hostname).load_all()
            for brick in bricks:
                if brick.status.lower() == BRICK_STARTED:
                    # raise an alert for brick
                    msg = ("Status of brick: %s "
                           "under volume %s in cluster %s chan"
                           "ged from %s to %s") % (
                               brick.brick_path, brick.vol_name,
                               NS.tendrl_context.integration_id,
                               BRICK_STARTED.title(), BRICK_STOPPED.title())
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name,
                        brick.brick_path,
                    )
                    event_utils.emit_event("brick_status",
                                           BRICK_STOPPED.title(),
                                           msg,
                                           instance,
                                           'WARNING',
                                           tags={
                                               "entity_type":
                                               RESOURCE_TYPE_BRICK,
                                               "volume_name": brick.vol_name,
                                               "node_id": brick.node_id,
                                               "fqdn": brick.hostname
                                           })
                    # Update brick status as stopped
                    brick.status = BRICK_STOPPED.title()
                    brick.save()
                    lock.release()
    except (etcd.EtcdException, KeyError, ValueError, AttributeError) as ex:
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message":
                                 "Unable to raise an brick status "
                                 "alert for host %s" % hostname,
                                 "exception":
                                 ex
                             }))
    finally:
        if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
            lock.release()
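brick_status_alert serializes its etcd updates with a per-path lock (etcd.Lock with acquire/is_acquired/release, always released in the finally block). The skeleton of that pattern, using only the python-etcd calls already shown above; the helper name and wrapper are illustrative:

import etcd

def with_etcd_lock(client, path, work, ttl=60):
    # Run work() only while holding the etcd lock on `path`.
    lock = etcd.Lock(client, path)
    try:
        lock.acquire(blocking=True, lock_ttl=ttl)
        if lock.is_acquired:
            return work()
    finally:
        if lock.is_acquired:
            lock.release()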
Example #6
def sync_volume_rebalance_status(volumes):
    for volume in volumes:
        rebal_status_list = []
        if "Distribute" in volume.vol_type or (
                "arbiter" in volume.vol_type and
            (int(volume.brick_count) > int(volume.replica_count))):
            vol_rebal_details = NS.gluster.objects.RebalanceDetails(
                vol_id=volume.vol_id).load_all()
            for entry in vol_rebal_details:
                rebal_status_list.append(entry.rebal_status)
            if not rebal_status_list:
                continue

            new_rebal_status = "unknown"

            if all(item == "not_started" for item in rebal_status_list):
                new_rebal_status = "not_started"
            else:
                # remove not_started states from the list as these are
                # from nodes that are not involved in the rebalance
                rebal_status_list = filter(
                    lambda state: state != 'not_started', rebal_status_list)
                if "failed" in rebal_status_list:
                    new_rebal_status = "failed"
                elif "layout_fix_failed" in rebal_status_list:
                    new_rebal_status = "layout_fix_failed"
                elif "layout_fix_started" in rebal_status_list:
                    new_rebal_status = "layout_fix_started"
                elif "started" in rebal_status_list:
                    new_rebal_status = "started"
                elif all(item == "completed" for item in rebal_status_list):
                    new_rebal_status = "completed"
                elif all(item == "stopped" for item in rebal_status_list):
                    new_rebal_status = "stopped"
                elif all(item == "layout_fix_"
                         "complete" for item in rebal_status_list):
                    new_rebal_status = "layout_fix_complete"
                elif all(item == "layout_fix_"
                         "stopped" for item in rebal_status_list):
                    new_rebal_status = "layout_fix_stopped"

            if volume.rebal_status != "" and \
                new_rebal_status != volume.rebal_status:
                msg = ("Volume:%s rebalance status has %s") % (
                    volume.name, new_rebal_status)
                instance = "volume_%s" % volume.name
                event_utils.emit_event("rebalance_status", new_rebal_status,
                                       msg, instance, 'INFO')

            volume.rebal_status = new_rebal_status
            volume.save()
Example #7
    def on_change(self, attr, prev_value, current_value):
        if attr == "status":
            if current_value is None:
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       self.status,
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "WARNING",
                                       node_id=self.node_id)

                _tc = NS.tendrl.objects.TendrlContext(
                    node_id=self.node_id).load()
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                    _msg = "node_sync, STALE provisioner node "\
                        "found! re-configuring monitoring "\
                        "(job-id: %s) on this node"
                    payload = {
                        "tags": ["tendrl/node_%s" % self.node_id],
                        "run": "tendrl.flows.ConfigureMonitoring",
                        "status": "new",
                        "parameters": {
                            'TendrlContext.integration_id': _tc.integration_id
                        },
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    NS.tendrl.objects.Job(job_id=_job_id,
                                          status="new",
                                          payload=payload).save()
                    logger.log("debug", NS.publisher_id,
                               {"message": _msg % _job_id})

                if _tc.sds_name == "gluster":
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id, self.fqdn))

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write("{0}/status".format(brick.key),
                                             "Stopped")
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
Example #8
def test_emit_event():
    setattr(__builtin__, "NS", maps.NamedDict())
    NS.publisher_id = 0
    NS.node_context = maps.NamedDict(fqdn="test", node_id="0")
    NS.tendrl_context = maps.NamedDict(integration_id="",
                                       cluster_name="",
                                       sds_name="")
    emit_event("test",
               "test",
               "test",
               "test",
               "test",
               tags=maps.NamedDict(entity_type="brick"))
    emit_event("test",
               "test",
               "test",
               "test",
               "test",
               tags=maps.NamedDict(entity_type="volume"))
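Both test examples rebuild the global NS namespace by hand before calling emit_event. Under pytest the same setup could be shared through a fixture; a sketch that assumes only the maps.NamedDict and __builtin__ usage already shown in the tests:

import __builtin__

import maps
import pytest

@pytest.fixture
def fake_ns():
    # Minimal global namespace mirroring the setup in the tests above.
    setattr(__builtin__, "NS", maps.NamedDict())
    NS.publisher_id = 0
    NS.node_context = maps.NamedDict(fqdn="test", node_id="0")
    NS.tendrl_context = maps.NamedDict(integration_id="",
                                       cluster_name="",
                                       sds_name="")
    return NS

A test would then accept fake_ns as an argument and call emit_event exactly as in the examples above.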
Example #9
    def on_change_status(self, prev_value, current_value):
        if current_value is None:
            self.status = "unhealthy"
            self.save()

            _ctc = \
                NS.tendrl.objects.ClusterTendrlContext(
                    integration_id=self.integration_id
                ).load()

            msg = "Cluster {0} moved to unhealthy state".format(
                _ctc.cluster_name)
            event_utils.emit_event("cluster_health_status",
                                   "unhealthy",
                                   msg,
                                   "cluster_{0}".format(_ctc.integration_id),
                                   "WARNING",
                                   integration_id=_ctc.integration_id,
                                   cluster_name=_ctc.cluster_name,
                                   sds_name=_ctc.sds_name)
Example #10
def process_events():
    events = NS.gluster.objects.NativeEvents().load_all()
    if events:
        for event in events:
            try:
                event.tags = json.loads(event.tags)
            except(TypeError, ValueError):
                # tags can be None
                pass
            if event.severity == "recovery" and not event.recovery_processed:
                # this particular event is a recovery event,
                # so process this event and delete it
                event_utils.emit_event(
                    event.context.split("|")[0],
                    event.current_value,
                    event.message,
                    event.context,
                    "INFO",
                    tags=event.tags
                )
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context,
                    recovery_processed=True
                )
                processed_event.save(ttl=POST_RECOVERY_TTL)
                continue

            if event.alert_notify and not event.processed:
                event_utils.emit_event(
                    event.context.split("|")[0],
                    event.current_value,
                    event.message,
                    event.context,
                    event.severity.upper(),
                    alert_notify=event.alert_notify,
                    tags=event.tags
                )
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context,
                    processed=True
                )
                processed_event.save(NOTIFICATION_TTL)
                continue

            if event.severity == "warning" and not event.processed:
                event_utils.emit_event(
                    event.context.split("|")[0],
                    event.current_value,
                    event.message,
                    event.context,
                    "WARNING",
                    tags=event.tags
                )
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context,
                    processed=True
                )
                processed_event.save()
                continue
Example #11
def process_events():
    events = NS.gluster.objects.NativeEvents().load_all()
    if events:
        for event in events:
            try:
                event.tags = json.loads(event.tags)
            except (TypeError, ValueError):
                # tags can be None
                pass
            if event.severity == "recovery" and not event.recovery_processed:
                # this particular event is a recovery event,
                # so process this event and delete it
                event_utils.emit_event(event.context.split("|")[0],
                                       event.current_value,
                                       event.message,
                                       event.context,
                                       "INFO",
                                       tags=event.tags)
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context, recovery_processed=True)
                processed_event.save(ttl=POST_RECOVERY_TTL)
                continue

            if event.alert_notify and not event.processed:
                event_utils.emit_event(event.context.split("|")[0],
                                       event.current_value,
                                       event.message,
                                       event.context,
                                       event.severity.upper(),
                                       alert_notify=event.alert_notify,
                                       tags=event.tags)
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context, processed=True)
                processed_event.save(NOTIFICATION_TTL)
                continue

            if event.severity == "warning" and not event.processed:
                event_utils.emit_event(event.context.split("|")[0],
                                       event.current_value,
                                       event.message,
                                       event.context,
                                       "WARNING",
                                       tags=event.tags)
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context, processed=True)
                processed_event.save()
                continue
Example #12
def brick_status_alert(hostname):
    try:
        # fetching brick details of disconnected node
        lock = None
        path = "clusters/%s/Bricks/all/%s" % (
            NS.tendrl_context.integration_id,
            hostname
        )
        lock = etcd.Lock(
            NS._int.client,
            path
        )
        lock.acquire(
            blocking=True,
            lock_ttl=60
        )
        if lock.is_acquired:
            bricks = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=hostname
            ).load_all()
            for brick in bricks:
                if brick.status.lower() == BRICK_STARTED:
                    # raise an alert for brick
                    msg = (
                        "Brick:%s in volume:%s has %s") % (
                            brick.brick_path,
                            brick.vol_name,
                            BRICK_STOPPED.title()
                        )
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name,
                        brick.brick_path,
                    )
                    event_utils.emit_event(
                        "brick_status",
                        BRICK_STOPPED.title(),
                        msg,
                        instance,
                        'WARNING',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": brick.vol_name,
                              "node_id": brick.node_id,
                              "fqdn": brick.hostname
                              }
                    )
                    # Update brick status as stopped
                    brick.status = BRICK_STOPPED.title()
                    brick.save()
                    lock.release()
    except (
        etcd.EtcdException,
        KeyError,
        ValueError,
        AttributeError
    ) as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Unable to raise an brick status "
                               "alert for host %s" % hostname,
                    "exception": ex
                }
            )
        )
    finally:
        if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
            lock.release()
Example #13
    def run(self):
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "%s running" % self.__class__.__name__}
        )

        gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
        gluster_brick_dir.save()

        cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        if cluster.cluster_network in [None, ""]:
            try:
                node_networks = NS.tendrl.objects.NodeNetwork().load_all()
                cluster.cluster_network = node_networks[0].subnet
                cluster.save()
            except etcd.EtcdKeyNotFound as ex:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Failed to sync cluster network details"}
                )
        _sleep = 0
        while not self._complete.is_set():
            # To detect out of band deletes
            # refresh gluster object inventory at config['sync_interval']
            SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
            NS.node_context = NS.node_context.load()
            NS.tendrl_context = NS.tendrl_context.load()
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            try:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if (_cluster.status == "importing" and (
                    _cluster.current_job['status'] == 'failed')) or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                    time.sleep(_sleep)
                    continue

                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                _cnc.is_managed = "yes"
                _cnc.save()
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state',
                        'detail'
                    ]
                )
                raw_data = ini2json.ini_to_dict(
                    '/var/run/glusterd-state'
                )
                subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state-vol-opts',
                        'volumeoptions'
                    ]
                )
                raw_data_options = ini2json.ini_to_dict(
                    '/var/run/glusterd-state-vol-opts'
                )
                subprocess.call(
                    [
                        'rm',
                        '-rf',
                        '/var/run/glusterd-state-vol-opts'
                    ]
                )
                sync_object = NS.gluster.objects.\
                    SyncObject(data=json.dumps(raw_data))
                sync_object.save()

                if "Peers" in raw_data:
                    index = 1
                    peers = raw_data["Peers"]
                    disconnected_hosts = []
                    while True:
                        try:
                            peer = NS.tendrl.\
                                objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    hostname=peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    state=peers['peer%s.state' % index],
                                    connected=peers['peer%s.connected' % index]
                                )
                            try:
                                stored_peer_status = None
                                # find peer detail using hostname
                                ip = socket.gethostbyname(
                                    peers['peer%s.primary_hostname' % index]
                                )
                                node_id = etcd_utils.read(
                                    "/indexes/ip/%s" % ip
                                ).value
                                stored_peer = NS.tendrl.objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    node_id=node_id
                                ).load()
                                stored_peer_status = stored_peer.connected
                                current_status = peers[
                                    'peer%s.connected' % index
                                ]
                                if stored_peer_status and \
                                    current_status != stored_peer_status:
                                    msg = (
                                        "Peer %s in cluster %s "
                                        "is %s"
                                    ) % (
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ],
                                        _cluster.short_name,
                                        current_status
                                    )
                                    instance = "peer_%s" % peers[
                                        'peer%s.primary_hostname' % index
                                    ]
                                    event_utils.emit_event(
                                        "peer_status",
                                        current_status,
                                        msg,
                                        instance,
                                        'WARNING'
                                        if current_status != 'Connected'
                                        else 'INFO'
                                    )
                                    # save current status in actual peer
                                    # directory also
                                    stored_peer.connected = current_status
                                    stored_peer.save()
                                    # Disconnected host name to
                                    # raise brick alert
                                    if current_status.lower() == \
                                        "disconnected":
                                        disconnected_hosts.append(
                                            peers[
                                                'peer%s.primary_hostname' %
                                                index
                                            ]
                                        )
                            except etcd.EtcdKeyNotFound:
                                pass
                            SYNC_TTL += 5
                            peer.save(ttl=SYNC_TTL)
                            index += 1
                        except KeyError:
                            break
                    # Raise an alert for bricks when peer disconnected
                    # or node goes down
                    for disconnected_host in disconnected_hosts:
                        brick_status_alert(
                            disconnected_host
                        )
                if "Volumes" in raw_data:
                    # create devicetree using lsblk
                    devicetree = get_device_tree()
                    # find lvs
                    lvs = brick_utilization.get_lvs()
                    index = 1
                    volumes = raw_data['Volumes']
                    total_brick_count = 0
                    while True:
                        try:
                            b_count = sync_volumes(
                                volumes, index,
                                raw_data_options.get('Volume Options'),
                                SYNC_TTL + VOLUME_TTL,
                                _cluster.short_name,
                                devicetree,
                                lvs
                            )
                            index += 1
                            SYNC_TTL += 1
                            total_brick_count += b_count - 1
                        except KeyError:
                            global VOLUME_TTL
                            # from second sync volume ttl is
                            # SYNC_TTL + (no.volumes) * 20 +
                            # (no.of.bricks) * 10 + 160
                            if index > 1:
                                volume_count = index - 1
                                # When all nodes are down, all volumes are
                                # marked down. Node status TTL is 160, so make
                                # sure volumes are still present in etcd while
                                # raising the volume down alert
                                VOLUME_TTL = (volume_count * 20) + (
                                    total_brick_count * 10) + 160
                            break
                    # populate the volume specific options
                    reg_ex = re.compile("^volume[0-9]+.options+")
                    options = {}
                    for key in volumes.keys():
                        if reg_ex.match(key):
                            options[key] = volumes[key]
                    for key in options.keys():
                        volname = key.split('.')[0]
                        vol_id = volumes['%s.id' % volname]
                        dict1 = {}
                        for k, v in options.items():
                            if k.startswith('%s.options' % volname):
                                dict1['.'.join(k.split(".")[2:])] = v
                                options.pop(k, None)
                        volume = NS.tendrl.objects.GlusterVolume(
                            NS.tendrl_context.integration_id,
                            vol_id=vol_id
                        ).load()
                        if volume.options is not None:
                            dest = dict(volume.options)
                            dest.update(dict1)
                            volume.options = dest
                            volume.save()

                # Sync cluster global details
                if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                    all_volumes = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id
                    ).load_all() or []
                    volumes = []
                    for volume in all_volumes:
                        if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                            # Refresh volume TTL only on the first sync;
                            # the TTL increases with the number of volumes
                            if _cnc.first_sync_done in [None, "no", ""]:
                                etcd_utils.refresh(
                                    volume.value,
                                    SYNC_TTL + VOLUME_TTL
                                )
                            volumes.append(volume)
                    cluster_status.sync_cluster_status(
                        volumes, SYNC_TTL + VOLUME_TTL
                    )
                    utilization.sync_utilization_details(volumes)
                    client_connections.sync_volume_connections(volumes)
                    georep_details.aggregate_session_status()
                    try:
                        evt.process_events()
                    except etcd.EtcdKeyNotFound:
                        pass
                    rebalance_status.sync_volume_rebalance_status(volumes)
                    rebalance_status.sync_volume_rebalance_estimated_time(
                        volumes
                    )
                    snapshots.sync_volume_snapshots(
                        raw_data['Volumes'],
                        int(NS.config.data.get(
                            "sync_interval", 10
                        )) + len(volumes) * 4
                    )
                    # update alert count
                    update_cluster_alert_count()
                # check and enable volume profiling
                if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                    self._update_volume_profiling()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if _cluster.exists():
                    _cluster = _cluster.load()
                    _cluster.last_sync = str(tendrl_now())
                    # Mark the first sync done flag
                    _cnc = NS.tendrl.objects.ClusterNodeContext(
                        node_id=NS.node_context.node_id
                    ).load()
                    if _cnc.first_sync_done in [None, "no"]:
                        _cnc.first_sync_done = "yes"
                        _cnc.save()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                        _cluster.save()
            except Exception as ex:
                Event(
                    ExceptionMessage(
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": "gluster sds state sync error",
                                 "exception": ex
                                 }
                    )
                )
            try:
                etcd_utils.read(
                    '/clusters/%s/_sync_now' %
                    NS.tendrl_context.integration_id
                )
                continue
            except etcd.EtcdKeyNotFound:
                pass

            time.sleep(_sleep)

        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "%s complete" % self.__class__.__name__}
        )
Example #14
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree,
    lvs
):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on volume. skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass

        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume.vol_type = "arbiter" \
            if int(volumes['volume%s.arbiter_count' % index]) > 0 \
            else volumes['volume%s.type' % index]
        volume.name = volumes['volume%s.name' % index]
        volume.transport_type = volumes['volume%s.transport_type' % index]
        volume.status = volumes['volume%s.status' % index]
        volume.brick_count = volumes['volume%s.brickcount' % index]
        volume.snap_count = volumes['volume%s.snap_count' % index]
        volume.stripe_count = volumes['volume%s.stripe_count' % index]
        volume.replica_count = volumes['volume%s.replica_count' % index]
        volume.subvol_count = volumes['volume%s.subvol_count' % index]
        volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
        volume.disperse_count = volumes['volume%s.disperse_count' % index]
        volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
        volume.quorum_status = volumes['volume%s.quorum_status' % index]
        volume.snapd_status = volumes[
            'volume%s.snapd_svc.online_status' % index]
        volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
        if NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).exists():
            existing_vol = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
            # Raise an alert for the profiling value change
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" % (
                       volumes['volume%s.name' % index],
                       cluster_short_name,
                       volume_profiling_old_value,
                       volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            event_utils.emit_event(
                "volume_profiling_status",
                volume_profiling_new_value,
                msg,
                instance,
                'INFO',
                tags={
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": volumes[
                        'volume%s.name' % index
                    ]
                }
            )
        volume.save(ttl=sync_ttl)
        # Save the default values of volume options
        vol_opt_dict = {}
        for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[
                vol_options[
                    'volume%s.options.key%s' % (index, opt_count)
                ]
            ] = vol_options[
                'volume%s.options.value%s' % (index, opt_count)
            ]
        volume.options = vol_opt_dict
        volume.save()

    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 addresses of the current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                    "any ipv4 networks for node"
                    " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except(TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s' '.path' % (
                index,
                b_index
            )].split(":")[-1].replace("/", "_")

            # Raise alerts if the brick status changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                    current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s"
                           ) % (
                               volumes['volume%s.brick%s' '.path' % (
                                   index,
                                   b_index
                               )],
                               volumes['volume%s.' 'name' % index],
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index,
                            b_index
                        )]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.' 'name' % index]
                              }
                    )

            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"

            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )

            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)],
                lvs
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )

            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index
                                )
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index
                                )
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index
                                )
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index
                                )
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
Example #15
 def update_cluster_details(self, integration_id):
     try:
         nodes = etcd_utils.read(
             "/clusters/%s/nodes" % integration_id
         )
         for node in nodes.leaves:
             _cnc = NS.tendrl.objects.ClusterNodeContext(
                 node_id=node.key.split("/")[-1],
                 integration_id=integration_id
             ).load()
             # Verify all nodes in a cluster are down
             if str(_cnc.status).lower() != "down" and \
                     str(_cnc.is_managed).lower() == "yes":
                 # Any one managed node not down don't update
                 # cluster details, No need to consider unmanaged
                 # nodes
                 return
         # when all managed nodes are down update cluster details
         global_details = NS.tendrl.objects.GlobalDetails(
             integration_id=integration_id
         ).load()
         # Update cluster as unhealthy
         if global_details.status.lower() == "healthy":
             global_details.status = "unhealthy"
             global_details.save()
             _cluster = NS.tendrl.objects.Cluster(
                 integration_id=integration_id
             ).load()
             msg = "Cluster:%s is %s" % (
                 _cluster.short_name, "unhealthy")
             instance = "cluster_%s" % integration_id
             event_utils.emit_event(
                 "cluster_health_status",
                 "unhealthy",
                 msg,
                 instance,
                 'WARNING',
                 integration_id=integration_id
             )
         # Update all bricks are down
         nodes = etcd_utils.read(
             "/clusters/%s/Bricks/all" % integration_id
         )
         for node in nodes.leaves:
             bricks = NS.tendrl.objects.GlusterBrick(
                 integration_id,
                 fqdn=node.key.split("/")[-1]
             ).load_all()
             for brick in bricks:
                 if brick.status.lower() != "stopped":
                     brick.status = "Stopped"
                     brick.save()
                     msg = ("Brick:%s in volume:%s has %s") % (
                         brick.brick_path,
                         brick.vol_name,
                         "Stopped"
                     )
                     instance = "volume_%s|brick_%s" % (
                         brick.vol_name,
                         brick.brick_path
                     )
                     event_utils.emit_event(
                         "brick_status",
                         "Stopped",
                         msg,
                         instance,
                         "WARNING",
                         integration_id=integration_id,
                         tags={"entity_type": "brick",
                               "volume_name": brick.vol_name,
                               "node_id": brick.node_id
                               }
                     )
         # Update all volumes are down
         volumes = NS.tendrl.objects.GlusterVolume(
             integration_id
         ).load_all()
         for volume in volumes:
             if volume.state.lower() != "down":
                 volume.state = "down"
                 volume.status = "Stopped"
                 volume.save()
                 msg = "Volume:%s is %s" % (volume.name, "down")
                 instance = "volume_%s" % volume.name
                 event_utils.emit_event(
                     "volume_state",
                     "down",
                     msg,
                     instance,
                     "WARNING",
                     integration_id=integration_id,
                     tags={"entity_type": "volume",
                           "volume_name": volume.name
                           }
                 )
     except etcd.EtcdKeyNotFound:
         pass
Example #16
    def run(self):
        logger.log("info", NS.publisher_id,
                   {"message": "%s running" % self.__class__.__name__})

        gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
        gluster_brick_dir.save()

        try:
            etcd_utils.read("clusters/%s/"
                            "cluster_network" %
                            NS.tendrl_context.integration_id)
        except etcd.EtcdKeyNotFound:
            try:
                node_networks = etcd_utils.read("nodes/%s/Networks" %
                                                NS.node_context.node_id)
                # TODO(team): this logic needs to change later to
                # support multiple networks for the gluster use case
                node_network = NS.tendrl.objects.NodeNetwork(
                    interface=node_networks.leaves.next().key.split(
                        '/')[-1]).load()
                cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id).load()
                cluster.cluster_network = node_network.subnet
                cluster.save()
            except etcd.EtcdKeyNotFound as ex:
                logger.log(
                    "error", NS.publisher_id,
                    {"message": "Failed to sync cluster network details"})

        if NS.tendrl_context.integration_id:
            # Initialize the node alert count
            try:
                key = 'clusters/%s/nodes/%s/alert_counters' % (
                    NS.tendrl_context.integration_id, NS.node_context.node_id)
                etcd_utils.read(key)
            except (etcd.EtcdException) as ex:
                if type(ex) == etcd.EtcdKeyNotFound:
                    NS.tendrl.objects.ClusterNodeAlertCounters(
                        node_id=NS.node_context.node_id,
                        integration_id=NS.tendrl_context.integration_id).save(
                        )
        _sleep = 0
        while not self._complete.is_set():
            # To detect out-of-band deletes, refresh the gluster
            # object inventory every config['sync_interval']
            SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
            NS.node_context = NS.node_context.load()
            NS.tendrl_context = NS.tendrl_context.load()
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            try:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id).load()
                if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                    continue

                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id).load()
                _cnc.is_managed = "yes"
                _cnc.save()
                subprocess.call([
                    'gluster', 'get-state', 'glusterd', 'odir', '/var/run',
                    'file', 'glusterd-state', 'detail'
                ])
                raw_data = ini2json.ini_to_dict('/var/run/glusterd-state')
                subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
                subprocess.call([
                    'gluster', 'get-state', 'glusterd', 'odir', '/var/run',
                    'file', 'glusterd-state-vol-opts', 'volumeoptions'
                ])
                raw_data_options = ini2json.ini_to_dict(
                    '/var/run/glusterd-state-vol-opts')
                subprocess.call(
                    ['rm', '-rf', '/var/run/glusterd-state-vol-opts'])
                sync_object = NS.gluster.objects.\
                    SyncObject(data=json.dumps(raw_data))
                sync_object.save()

                if "Peers" in raw_data:
                    index = 1
                    peers = raw_data["Peers"]
                    disconnected_hosts = []
                    while True:
                        try:
                            peer = NS.tendrl.\
                                objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    hostname=peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    state=peers['peer%s.state' % index],
                                    connected=peers['peer%s.connected' % index]
                                )
                            try:
                                stored_peer_status = etcd_utils.read(
                                    "clusters/%s/nodes/%s/Peers/%s/connected" %
                                    (NS.tendrl_context.integration_id,
                                     NS.node_context.node_id,
                                     peers['peer%s.uuid' % index])).value
                                current_status = peers['peer%s.connected' %
                                                       index]
                                if stored_peer_status != "" and \
                                        current_status != stored_peer_status:
                                    msg = (
                                        "Status of peer: %s in cluster %s "
                                        "changed from %s to %s") % (
                                            peers['peer%s.primary_hostname' %
                                                  index],
                                            NS.tendrl_context.integration_id,
                                            stored_peer_status, current_status)
                                    instance = "peer_%s" % peers[
                                        'peer%s.primary_hostname' % index]
                                    event_utils.emit_event(
                                        "peer_status", current_status, msg,
                                        instance, 'WARNING'
                                        if current_status != 'Connected' else
                                        'INFO')
                                    # Disconnected host name to
                                    # raise brick alert
                                    if current_status.lower() == \
                                            "disconnected":
                                        disconnected_hosts.append(
                                            peers['peer%s.primary_hostname' %
                                                  index])
                            except etcd.EtcdKeyNotFound:
                                pass
                            SYNC_TTL += 5
                            peer.save(ttl=SYNC_TTL)
                            index += 1
                        except KeyError:
                            break
                    # Raise an alert for bricks when a peer disconnects
                    # or a node goes down
                    for disconnected_host in disconnected_hosts:
                        brick_status_alert(disconnected_host)
                if "Volumes" in raw_data:
                    index = 1
                    volumes = raw_data['Volumes']
                    while True:
                        try:
                            sync_volumes(
                                volumes,
                                index,
                                raw_data_options.get('Volume Options'),
                                # sync_interval + 100 + no of peers + 350
                                SYNC_TTL + 350)
                            index += 1
                            SYNC_TTL += 1
                        except KeyError:
                            break
                    # populate the volume specific options
                    reg_ex = re.compile("^volume[0-9]+.options+")
                    options = {}
                    for key in volumes.keys():
                        if reg_ex.match(key):
                            options[key] = volumes[key]
                    for key in options.keys():
                        volname = key.split('.')[0]
                        vol_id = volumes['%s.id' % volname]
                        dict1 = {}
                        for k, v in options.items():
                            if k.startswith('%s.options' % volname):
                                dict1['.'.join(k.split(".")[2:])] = v
                                options.pop(k, None)
                        NS.gluster.objects.VolumeOptions(vol_id=vol_id,
                                                         options=dict1).save()

                # Sync cluster global details
                if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                    all_volumes = NS.gluster.objects.Volume().load_all() or []
                    volumes = []
                    for volume in all_volumes:
                        if not str(volume.deleted).lower() == "true" or \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed']:
                            volumes.append(volume)
                    cluster_status.sync_cluster_status(volumes, SYNC_TTL + 350)
                    utilization.sync_utilization_details(volumes)
                    client_connections.sync_volume_connections(volumes)
                    georep_details.aggregate_session_status()
                    evt.process_events()
                    rebalance_status.sync_volume_rebalance_status(volumes)
                    rebalance_status.sync_volume_rebalance_estimated_time(
                        volumes)
                    snapshots.sync_volume_snapshots(
                        raw_data['Volumes'],
                        int(NS.config.data.get("sync_interval", 10)) +
                        len(volumes) * 4)

                # Check and enable/disable volume profiling
                if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                    self._enable_disable_volume_profiling()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id)
                if _cluster.exists():
                    _cluster = _cluster.load()
                    _cluster.last_sync = str(tendrl_now())
                    # Mark the first sync done flag
                    _cnc = NS.tendrl.objects.ClusterNodeContext(
                        node_id=NS.node_context.node_id).load()
                    if _cnc.first_sync_done in [None, "no"]:
                        _cnc.first_sync_done = "yes"
                        _cnc.save()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                        _cluster.save()
                    # Initialize alert count
                    try:
                        alerts_count_key = '/clusters/%s/alert_counters' % (
                            NS.tendrl_context.integration_id)
                        etcd_utils.read(alerts_count_key)
                    except (etcd.EtcdException) as ex:
                        if type(ex) == etcd.EtcdKeyNotFound:
                            NS.tendrl.objects.ClusterAlertCounters(
                                integration_id=NS.tendrl_context.integration_id
                            ).save()

            except Exception as ex:
                Event(
                    ExceptionMessage(priority="error",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message":
                                         "gluster sds state sync error",
                                         "exception": ex
                                     }))
            try:
                etcd_utils.read('/clusters/%s/_sync_now' %
                                NS.tendrl_context.integration_id)
                continue
            except etcd.EtcdKeyNotFound:
                pass

            time.sleep(_sleep)

        logger.log("debug", NS.publisher_id,
                   {"message": "%s complete" % self.__class__.__name__})
Example #17
0
def sync_volumes(volumes, index, vol_options, sync_ttl):
    # Instantiate the blivet class; it is used for
    # getting brick_device_details
    b = blivet.Blivet()

    # reset blivet during every sync to get latest information
    # about storage devices in the machine
    b.reset()
    devicetree = b.devicetree
    node_context = NS.node_context.load()
    tag_list = node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.gluster.objects.Volume(vol_id=volumes['volume%s.id' %
                                                               index]).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                _volume.current_job.get('status', '') == 'in_progress':
                # There is an active job on the volume; skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           NS.tendrl_context.integration_id,
                           stored_volume_status, current_status)
                instance = "volume_%s" % volumes['volume%s.name' % index]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped' else 'INFO',
                    tags={
                        "entity_type": RESOURCE_TYPE_VOLUME,
                        "volume_name": volumes['volume%s.name' % index]
                    })
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass

        volume = NS.gluster.objects.Volume(
            vol_id=volumes['volume%s.id' % index],
            vol_type="arbiter"
            if int(volumes['volume%s.arbiter_count' % index]) > 0 else
            volumes['volume%s.type' % index],
            name=volumes['volume%s.name' % index],
            transport_type=volumes['volume%s.transport_type' % index],
            status=volumes['volume%s.status' % index],
            brick_count=volumes['volume%s.brickcount' % index],
            snap_count=volumes['volume%s.snap_count' % index],
            stripe_count=volumes['volume%s.stripe_count' % index],
            replica_count=volumes['volume%s.replica_count' % index],
            subvol_count=volumes['volume%s.subvol_count' % index],
            arbiter_count=volumes['volume%s.arbiter_count' % index],
            disperse_count=volumes['volume%s.disperse_count' % index],
            redundancy_count=volumes['volume%s.redundancy_count' % index],
            quorum_status=volumes['volume%s.quorum_status' % index],
            snapd_status=volumes['volume%s.snapd_svc.online_status' % index],
            snapd_inited=volumes['volume%s.snapd_svc.inited' % index],
        )
        if NS.gluster.objects.Volume(vol_id=volumes['volume%s.id' %
                                                    index]).exists():
            existing_vol = NS.gluster.objects.Volume(
                vol_id=volumes['volume%s.id' % index]).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
            # Raise an alert for the volume profiling value change
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" %
                   (volumes['volume%s.name' % index],
                    NS.tendrl_context.integration_id,
                    volume_profiling_old_value, volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            event_utils.emit_event("volume_profiling_status",
                                   volume_profiling_new_value,
                                   msg,
                                   instance,
                                   'INFO',
                                   tags={
                                       "entity_type":
                                       RESOURCE_TYPE_BRICK,
                                       "volume_name":
                                       volumes['volume%s.name' % index]
                                   })
        volume.save(ttl=sync_ttl)

        # Initialize volume alert count
        try:
            volume_alert_count_key = '/clusters/%s/Volumes/%s/'\
                                     'alert_counters' % (
                                         NS.tendrl_context.integration_id,
                                         volumes['volume%s.id' % index]
                                     )
            etcd_utils.read(volume_alert_count_key)
        except (etcd.EtcdException) as ex:
            if type(ex) == etcd.EtcdKeyNotFound:
                NS.gluster.objects.VolumeAlertCounters(
                    integration_id=NS.tendrl_context.integration_id,
                    volume_id=volumes['volume%s.id' % index]).save()
        # Save the default values of volume options
        vol_opt_dict = {}
        for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[vol_options['volume%s.options.key%s' %
                                     (index, opt_count)]] = vol_options[
                                         'volume%s.options.value%s' %
                                         (index, opt_count)]
        NS.gluster.objects.VolumeOptions(
            vol_id=volume.vol_id, options=vol_opt_dict).save(ttl=sync_ttl)

    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # IPv4 addresses of the current node
    try:
        network_ip = []
        networks = NS._int.client.read("nodes/%s/Networks" %
                                       NS.node_context.node_id)
        for interface in networks.leaves:
            key = interface.key.split("/")[-1]
            network = NS.tendrl.objects.NodeNetwork(interface=key).load()
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(priority="debug",
                             publisher=NS.publisher_id,
                             payload={
                                 "message":
                                 "Could not find "
                                 "any ipv4 networks for node"
                                 " %s" % NS.node_context.node_id,
                                 "exception":
                                 ex
                             }))
    while True:
        try:
            # Update bricks node-wise
            hostname = volumes['volume%s.brick%s.hostname' % (index, b_index)]
            if (NS.node_context.fqdn != hostname) and (hostname
                                                       not in network_ip):
                b_index += 1
                continue
            sub_vol_size = (int(volumes['volume%s.brickcount' % index])) / int(
                volumes['volume%s.subvol_count' % index])
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s'
                                  '.path' %
                                  (index, b_index)].split(":")[-1].replace(
                                      "/", "_")

            # Raise alerts if the brick status changes
            try:
                sbs = NS._int.client.read(
                    "clusters/%s/Bricks/all/"
                    "%s/%s/status" %
                    (NS.tendrl_context.integration_id, NS.node_context.fqdn,
                     brick_name.split(":_")[-1])).value
                current_status = volumes.get('volume%s.brick%s.status' %
                                             (index, b_index))
                if current_status != sbs:
                    msg = ("Status of brick: %s "
                           "under volume %s in cluster %s chan"
                           "ged from %s to %s") % (
                               volumes['volume%s.brick%s'
                                       '.path' % (index, b_index)],
                               volumes['volume%s.'
                                       'name' % index],
                               NS.tendrl_context.integration_id, sbs,
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (index, b_index)])
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped' else 'INFO',
                        tags={
                            "entity_type": RESOURCE_TYPE_BRICK,
                            "volume_name": volumes['volume%s.'
                                                   'name' % index]
                        })

            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"

            vol_brick_path = brk_pth % (NS.tendrl_context.integration_id,
                                        volumes['volume%s.id' % index],
                                        str((b_index - 1) / sub_vol_size),
                                        brick_name)

            NS._int.wclient.write(vol_brick_path, "")

            brick = NS.gluster.objects.Brick(
                NS.node_context.fqdn,
                brick_name.split(":_")[-1],
                name=brick_name,
                vol_id=volumes['volume%s.id' % index],
                sequence_number=b_index,
                brick_path=volumes['volume%s.brick%s.path' % (index, b_index)],
                hostname=volumes.get('volume%s.brick%s.hostname' %
                                     (index, b_index)),
                port=volumes.get('volume%s.brick%s.port' % (index, b_index)),
                vol_name=volumes['volume%s.name' % index],
                used=True,
                node_id=NS.node_context.node_id,
                status=volumes.get('volume%s.brick%s.status' %
                                   (index, b_index)),
                filesystem_type=volumes.get(
                    'volume%s.brick%s.filesystem_type' % (index, b_index)),
                mount_opts=volumes.get('volume%s.brick%s.mount_options' %
                                       (index, b_index)),
                utilization=brick_utilization.brick_utilization(
                    volumes['volume%s.brick%s.path' % (index, b_index)]),
                client_count=volumes.get('volume%s.brick%s.client_count' %
                                         (index, b_index)),
                is_arbiter=volumes.get('volume%s.brick%s.is_arbiter' %
                                       (index, b_index)),
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )

            # Sync the brick client details
            c_index = 1
            if volumes.get('volume%s.brick%s.client_count' %
                           (index, b_index)) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' %
                                (index, b_index, c_index)],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' %
                                (index, b_index, c_index)],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' %
                                (index, b_index, c_index)],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' %
                                (index, b_index, c_index)]).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
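
The etcd path assembled through brk_pth above rests on one piece of arithmetic: with brickcount bricks spread over subvol_count subvolumes, the 1-based brick index b_index falls into subvolume (b_index - 1) / sub_vol_size. The short sketch below walks through that derivation with made-up identifiers; it mirrors only the string handling in sync_volumes and talks to no etcd.

# Worked sketch of the subvolume / brick-path arithmetic used above.
# integration_id, vol_id, fqdn and the brick paths are made-up values.
integration_id = "11111111-2222-3333-4444-555555555555"
vol_id = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"
fqdn = "node1.example.com"
brick_count = 6
subvol_count = 3
sub_vol_size = brick_count // subvol_count  # bricks per subvolume

for b_index in range(1, brick_count + 1):
    brick_path = "%s:/gluster/brick%d" % (fqdn, b_index)
    # fqdn plus the path with '/' replaced by '_', as in sync_volumes
    brick_name = "%s:%s" % (fqdn, brick_path.split(":")[-1].replace("/", "_"))
    subvolume = (b_index - 1) // sub_vol_size
    key = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s" % (
        integration_id, vol_id, subvolume, brick_name)
    print(key)
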
Example #18
0
def sync(sync_ttl):
    try:
        NS.node_context = NS.node_context.load()
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Running SDS detection"}
        )
        try:
            sds_discovery_manager = sds_manager.SDSDiscoveryManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Failed to init SDSDiscoveryManager.",
                             "exception": ex
                             }
                )
            )
            return

        # Execute the SDS discovery plugins and tag the nodes with data
        for plugin in sds_discovery_manager.get_available_plugins():
            sds_details = plugin.discover_storage_system()
            if sds_details is None:
                break

            if "peers" in sds_details and NS.tendrl_context.integration_id:
                _cnc = NS.tendrl.objects.ClusterNodeContext().load()
                this_peer_uuid = ""
                if _cnc.is_managed != "yes" or not NS.node_context.fqdn:
                    for peer_uuid, data in sds_details.get("peers",
                                                           {}).iteritems():
                        peer = NS.tendrl.objects.GlusterPeer(
                            peer_uuid=peer_uuid,
                            hostname=data['hostname'],
                            connected=data['connected']
                        )
                        peer.save()
                        if data['hostname'] == "localhost":
                            this_peer_uuid = peer_uuid

                    # Figure out the hostname used to probe this peer
                    integration_id_index_key = \
                        "indexes/tags/tendrl/integration/%s" %\
                        NS.tendrl_context.integration_id
                    _node_ids = etcd_utils.read(integration_id_index_key).value
                    _node_ids = json.loads(_node_ids)
                    for _node_id in _node_ids:
                        if _node_id != NS.node_context.node_id:
                            peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=this_peer_uuid, node_id=_node_id
                            ).load()
                            if peer.hostname:
                                NS.node_context.pkey = peer.hostname
                                NS.node_context.fqdn = peer.hostname
                                NS.node_context.ipv4_addr = \
                                    socket.gethostbyname(
                                        peer.hostname
                                    )
                                NS.node_context.save()
                                break

            if ('detected_cluster_id' in sds_details and sds_details[
                    'detected_cluster_id'] != ""):
                try:
                    integration_index_key = \
                        "indexes/detected_cluster_id_to_integration_id/" \
                        "%s" % sds_details['detected_cluster_id']
                    dc = NS.tendrl.objects.DetectedCluster().load()
                    if dc is None or dc.detected_cluster_id is None:
                        time.sleep(sync_ttl)
                        integration_id = str(uuid.uuid4())
                        try:
                            etcd_utils.write(
                                integration_index_key,
                                integration_id,
                                prevExist=False
                            )
                        except etcd.EtcdAlreadyExist:
                            pass

                    _ptag = None
                    if NS.tendrl_context.integration_id:
                        _ptag = "provisioner/%s" % \
                            NS.tendrl_context.integration_id

                        if _ptag in NS.node_context.tags:
                            if dc.detected_cluster_id and \
                                dc.detected_cluster_id != sds_details.get(
                                    'detected_cluster_id'):

                                # Gluster peer list has changed
                                integration_id = \
                                    NS.tendrl_context.integration_id
                                etcd_utils.write(
                                    integration_index_key,
                                    integration_id
                                )
                                # Set the cluster status as new peer detected
                                _cluster = NS.tendrl.objects.Cluster(
                                    integration_id=integration_id
                                ).load()
                                _cluster.status = "new_peers_detected"
                                _cluster.save()
                                # Raise an alert about the new peers
                                msg = "New peers identified in cluster: %s. " \
                                    "Make sure tendrl-ansible is executed " \
                                    "for the new nodes so that expand " \
                                    "cluster option can be triggered" % \
                                    _cluster.short_name
                                event_utils.emit_event(
                                    "cluster_status",
                                    "new_peers_detected",
                                    msg,
                                    "cluster_{0}".format(integration_id),
                                    "WARNING",
                                    integration_id=integration_id
                                )
                            _cluster = NS.tendrl.objects.Cluster(
                                integration_id=NS.tendrl_context.integration_id
                            ).load()
                            if _cluster.status == "new_peers_detected":
                                peers = []
                                cmd = subprocess.Popen(
                                    "gluster pool list",
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE
                                )
                                out, err = cmd.communicate()
                                if err or out is None or \
                                    "Connection failed" in out:
                                    pass  # set the no of peers as zero
                                if out:
                                    lines = out.split('\n')[1:]
                                    for line in lines:
                                        if line.strip() != '':
                                            peers.append(line.split()[0])
                                nodes_ids = json.loads(etcd_utils.read(
                                    "indexes/tags/tendrl/integration/%s" %
                                    NS.tendrl_context.integration_id
                                ).value)
                                if len(nodes_ids) == len(peers):
                                    # All the nodes have node-agents
                                    # running and are known to tendrl
                                    msg = "New nodes in cluster: %s have " \
                                        "node agents running now. Cluster " \
                                        "is ready to expand." % \
                                        _cluster.short_name
                                    event_utils.emit_event(
                                        "cluster_status",
                                        "expand_pending",
                                        msg,
                                        "cluster_{0}".format(
                                            NS.tendrl_context.integration_id
                                        ),
                                        "INFO",
                                        integration_id=NS.tendrl_context.
                                        integration_id
                                    )
                                    # Set the cluster status accordingly
                                    _cluster.status = 'expand_pending'
                                    _cluster.save()
                    loop_count = 0
                    while True:
                        # Wait till provisioner node assigns
                        # integration_id for this detected_cluster_id
                        if loop_count >= 72:
                            return
                        try:
                            time.sleep(5)
                            integration_id = etcd_utils.read(
                                integration_index_key).value
                            if integration_id:
                                break
                        except etcd.EtcdKeyNotFound:
                            loop_count += 1
                            continue

                    NS.tendrl_context.integration_id = integration_id
                    NS.tendrl_context.cluster_id = sds_details.get(
                        'detected_cluster_id')
                    NS.tendrl_context.cluster_name = sds_details.get(
                        'detected_cluster_name')
                    NS.tendrl_context.sds_name = sds_details.get(
                        'pkg_name')
                    NS.tendrl_context.sds_version = sds_details.get(
                        'pkg_version')
                    NS.tendrl_context.save()

                    NS.node_context = NS.node_context.load()
                    integration_tag = "tendrl/integration/%s" % \
                                      integration_id
                    detected_cluster_tag = "detected_cluster/%s" % \
                                           sds_details[
                                               'detected_cluster_id']
                    NS.node_context.tags += [detected_cluster_tag,
                                             integration_tag]
                    NS.node_context.tags = list(set(NS.node_context.tags))
                    NS.node_context.save()

                    NS.tendrl.objects.DetectedCluster(
                        detected_cluster_id=sds_details.get(
                            'detected_cluster_id'),
                        detected_cluster_name=sds_details.get(
                            'detected_cluster_name'),
                        sds_pkg_name=sds_details.get('pkg_name'),
                        sds_pkg_version=sds_details.get('pkg_version'),
                    ).save()
                    _cluster = NS.tendrl.objects.Cluster(
                        integration_id=NS.tendrl_context.integration_id
                    ).load()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] \
                        and _cluster.status in [None, ""]:
                        _cluster.save()

                except (etcd.EtcdException, KeyError) as ex:
                    Event(
                        ExceptionMessage(
                            priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": "Failed SDS detection",
                                     "exception": ex
                                     }
                        )
                    )
                break
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": "node_sync "
                                    "SDS detection failed: " +
                                    ex.message,
                         "exception": ex}
            )
        )
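
A small but load-bearing step in the example above is the parsing of gluster pool list: the header row is skipped, the first column of every non-empty line is collected, and the resulting count is compared against the node ids indexed in etcd. The sketch below isolates just that parsing; the sample output string is illustrative, not captured from a real cluster.

# Sketch of the `gluster pool list` parsing used above. sample_out is a
# made-up stand-in for the subprocess output on a gluster node.
sample_out = (
    "UUID\t\t\tHostname\t\tState\n"
    "uuid-1\tnode1.example.com\tConnected\n"
    "uuid-2\tnode2.example.com\tConnected\n"
    "\n"
)


def parse_pool_list(out):
    peers = []
    for line in out.split("\n")[1:]:  # skip the header row
        if line.strip() != "":
            peers.append(line.split()[0])  # first column per peer
    return peers


print(parse_pool_list(sample_out))  # ['uuid-1', 'uuid-2']
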
Example #19
0
    def run(self):
        logger.log("info", NS.publisher_id,
                   {"message": "%s running" % self.__class__.__name__})
        NS.node_context = NS.node_context.load()
        current_tags = list(NS.node_context.tags)
        current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
        NS.node_context.tags = list(set(current_tags))
        NS.node_context.status = "UP"
        NS.node_context.save()
        _sleep = 0
        msg = "{0} is UP".format(NS.node_context.fqdn)
        event_utils.emit_event("node_status",
                               "UP",
                               msg,
                               "node_{0}".format(NS.node_context.fqdn),
                               "INFO",
                               node_id=NS.node_context.node_id)
        while not self._complete.is_set():
            _sync_ttl = int(NS.config.data.get("sync_interval", 10)) + 100
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "in_progress"

            current_tags = list(NS.node_context.tags)
            current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
            NS.node_context.tags = list(set(current_tags))
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)
            NS.tendrl_context = NS.tendrl_context.load()

            sync_service_and_index_thread = threading.Thread(
                target=services_and_index_sync.sync, args=(_sync_ttl, ))
            sync_service_and_index_thread.daemon = True
            sync_service_and_index_thread.start()
            sync_service_and_index_thread.join()

            NS.node_context = NS.node_context.load()
            if "tendrl/monitor" in NS.node_context.tags:
                check_all_managed_node_status_thread = threading.Thread(
                    target=check_all_managed_nodes_status.run)
                check_all_managed_node_status_thread.daemon = True
                check_all_managed_node_status_thread.start()
                check_all_managed_node_status_thread.join()

                check_cluster_status_thread = threading.Thread(
                    target=check_cluster_status.run)
                check_cluster_status_thread.daemon = True
                check_cluster_status_thread.start()
                check_cluster_status_thread.join()

            if "tendrl/monitor" not in NS.node_context.tags:
                sync_cluster_contexts_thread = threading.Thread(
                    target=cluster_contexts_sync.sync, args=(_sync_ttl, ))
                sync_cluster_contexts_thread.daemon = True
                sync_cluster_contexts_thread.start()
                sync_cluster_contexts_thread.join()

            platform_detect_thread = threading.Thread(
                target=platform_detect.sync)
            platform_detect_thread.daemon = True
            platform_detect_thread.start()
            platform_detect_thread.join()

            if "tendrl/monitor" not in NS.node_context.tags:
                sds_detect_thread = threading.Thread(target=sds_detect.sync,
                                                     args=(_sleep, ))

                sds_detect_thread.daemon = True
                sds_detect_thread.start()
                sds_detect_thread.join()

            NS.tendrl_context = NS.tendrl_context.load()

            try:
                NS.tendrl.objects.Os().save()
                NS.tendrl.objects.Cpu().save()
                NS.tendrl.objects.Memory().save()
            except Exception as ex:
                Event(
                    ExceptionMessage(priority="error",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message":
                                         "node_sync "
                                         "os/cpu/memory sync failed: " +
                                         ex.message,
                                         "exception":
                                         ex
                                     }))
                NS.node_context = NS.node_context.load()
                NS.node_context.sync_status = "failed"
                NS.node_context.last_sync = str(time_utils.now())
                NS.node_context.status = "UP"
                NS.node_context.save(ttl=_sync_ttl)
                time.sleep(_sleep)

            sync_disks_thread = threading.Thread(target=disk_sync.sync)
            sync_disks_thread.daemon = True
            sync_disks_thread.start()
            sync_disks_thread.join()

            sync_networks_thread = threading.Thread(target=network_sync.sync)
            sync_networks_thread.daemon = True
            sync_networks_thread.start()
            sync_networks_thread.join()

            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "done"
            NS.node_context.last_sync = str(time_utils.now())
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)

            if "tendrl/monitor" not in NS.node_context.tags:
                sync_cluster_contexts_thread = threading.Thread(
                    target=cluster_contexts_sync.sync, args=(_sync_ttl, ))
                sync_cluster_contexts_thread.daemon = True
                sync_cluster_contexts_thread.start()
                sync_cluster_contexts_thread.join()
            # Update node alert count
            if not NS.tendrl.objects.ClusterNodeAlertCounters().exists():
                update_cluster_node_alert_count()
            time.sleep(_sleep)
        logger.log("info", NS.publisher_id,
                   {"message": "%s complete" % self.__class__.__name__})
    def run(self):
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "%s running" % self.__class__.__name__}
        )

        gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
        gluster_brick_dir.save()

        cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        if cluster.cluster_network in [None, ""]:
            try:
                node_networks = NS.tendrl.objects.NodeNetwork().load_all()
                cluster.cluster_network = node_networks[0].subnet
                cluster.save()
            except etcd.EtcdKeyNotFound as ex:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Failed to sync cluster network details"}
                )
        _sleep = 0
        while not self._complete.is_set():
            # To detect out-of-band deletes, refresh the gluster
            # object inventory every config['sync_interval']
            SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
            NS.node_context = NS.node_context.load()
            NS.tendrl_context = NS.tendrl_context.load()
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            try:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                    continue

                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                _cnc.is_managed = "yes"
                _cnc.save()
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state',
                        'detail'
                    ]
                )
                raw_data = ini2json.ini_to_dict(
                    '/var/run/glusterd-state'
                )
                subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state-vol-opts',
                        'volumeoptions'
                    ]
                )
                raw_data_options = ini2json.ini_to_dict(
                    '/var/run/glusterd-state-vol-opts'
                )
                subprocess.call(
                    [
                        'rm',
                        '-rf',
                        '/var/run/glusterd-state-vol-opts'
                    ]
                )
                sync_object = NS.gluster.objects.\
                    SyncObject(data=json.dumps(raw_data))
                sync_object.save()

                if "Peers" in raw_data:
                    index = 1
                    peers = raw_data["Peers"]
                    disconnected_hosts = []
                    while True:
                        try:
                            peer = NS.tendrl.\
                                objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    hostname=peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    state=peers['peer%s.state' % index],
                                    connected=peers['peer%s.connected' % index]
                                )
                            try:
                                stored_peer_status = None
                                # find peer detail using hostname
                                ip = socket.gethostbyname(
                                    peers['peer%s.primary_hostname' % index]
                                )
                                node_id = etcd_utils.read(
                                    "/indexes/ip/%s" % ip
                                ).value
                                stored_peer = NS.tendrl.objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    node_id=node_id
                                ).load()
                                stored_peer_status = stored_peer.connected
                                current_status = peers[
                                    'peer%s.connected' % index
                                ]
                                if stored_peer_status and \
                                    current_status != stored_peer_status:
                                    msg = (
                                        "Peer %s in cluster %s "
                                        "is %s"
                                    ) % (
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ],
                                        _cluster.short_name,
                                        current_status
                                    )
                                    instance = "peer_%s" % peers[
                                        'peer%s.primary_hostname' % index
                                    ]
                                    event_utils.emit_event(
                                        "peer_status",
                                        current_status,
                                        msg,
                                        instance,
                                        'WARNING' if current_status !=
                                        'Connected'
                                        else 'INFO'
                                    )
                                    # Also save the current status in the
                                    # actual peer directory
                                    stored_peer.connected = current_status
                                    stored_peer.save()
                                    # Disconnected host name to
                                    # raise brick alert
                                    if current_status.lower() == \
                                        "disconnected":
                                        disconnected_hosts.append(
                                            peers[
                                                'peer%s.primary_hostname' %
                                                index
                                            ]
                                        )
                            except etcd.EtcdKeyNotFound:
                                pass
                            SYNC_TTL += 5
                            peer.save(ttl=SYNC_TTL)
                            index += 1
                        except KeyError:
                            break
                    # Raise an alert for bricks when a peer disconnects
                    # or a node goes down
                    for disconnected_host in disconnected_hosts:
                        brick_status_alert(
                            disconnected_host
                        )
                if "Volumes" in raw_data:
                    index = 1
                    volumes = raw_data['Volumes']
                    # Instantiate the blivet class; it is used for
                    # getting brick_device_details
                    b = blivet.Blivet()

                    # reset blivet during every sync to get latest information
                    # about storage devices in the machine
                    b.reset()
                    devicetree = b.devicetree
                    total_brick_count = 0
                    while True:
                        try:
                            b_count = sync_volumes(
                                volumes, index,
                                raw_data_options.get('Volume Options'),
                                SYNC_TTL + VOLUME_TTL,
                                _cluster.short_name,
                                devicetree
                            )
                            index += 1
                            SYNC_TTL += 1
                            total_brick_count += b_count - 1
                        except KeyError:
                            global VOLUME_TTL
                            # From the second sync onwards the volume
                            # TTL is SYNC_TTL + (no. of volumes) * 20 +
                            # (no. of bricks) * 10 + 160
                            if index > 1:
                                volume_count = index - 1
                                # When all nodes are down, all volumes are
                                # marked down. The node status TTL is 160,
                                # so make sure the volumes are still present
                                # in etcd while raising the volume down alert
                                VOLUME_TTL = (volume_count * 20) + (
                                    total_brick_count * 10) + 160
                            break
                    # populate the volume specific options
                    reg_ex = re.compile("^volume[0-9]+.options+")
                    options = {}
                    for key in volumes.keys():
                        if reg_ex.match(key):
                            options[key] = volumes[key]
                    for key in options.keys():
                        volname = key.split('.')[0]
                        vol_id = volumes['%s.id' % volname]
                        dict1 = {}
                        for k, v in options.items():
                            if k.startswith('%s.options' % volname):
                                dict1['.'.join(k.split(".")[2:])] = v
                                options.pop(k, None)
                        volume = NS.tendrl.objects.GlusterVolume(
                            NS.tendrl_context.integration_id,
                            vol_id=vol_id
                        ).load()
                        if volume.options is not None:
                            dest = dict(volume.options)
                            dest.update(dict1)
                            volume.options = dest
                            volume.save()

                # Sync cluster global details
                if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                    all_volumes = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id
                    ).load_all() or []
                    volumes = []
                    for volume in all_volumes:
                        if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                            # Refresh the volume TTL only on the first
                            # sync; it grows with the number of volumes
                            if _cnc.first_sync_done in [None, "no", ""]:
                                etcd_utils.refresh(
                                    volume.value,
                                    SYNC_TTL + VOLUME_TTL
                                )
                            volumes.append(volume)
                    cluster_status.sync_cluster_status(
                        volumes, SYNC_TTL + VOLUME_TTL
                    )
                    utilization.sync_utilization_details(volumes)
                    client_connections.sync_volume_connections(volumes)
                    georep_details.aggregate_session_status()
                    try:
                        evt.process_events()
                    except etcd.EtcdKeyNotFound:
                        pass
                    rebalance_status.sync_volume_rebalance_status(volumes)
                    rebalance_status.sync_volume_rebalance_estimated_time(
                        volumes
                    )
                    snapshots.sync_volume_snapshots(
                        raw_data['Volumes'],
                        int(NS.config.data.get(
                            "sync_interval", 10
                        )) + len(volumes) * 4
                    )
                    # update alert count
                    update_cluster_alert_count()
                # check and enable volume profiling
                if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                    self._enable_disable_volume_profiling()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if _cluster.exists():
                    _cluster = _cluster.load()
                    _cluster.last_sync = str(tendrl_now())
                    # Mark the first sync done flag
                    _cnc = NS.tendrl.objects.ClusterNodeContext(
                        node_id=NS.node_context.node_id
                    ).load()
                    if _cnc.first_sync_done in [None, "no"]:
                        _cnc.first_sync_done = "yes"
                        _cnc.save()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                        _cluster.save()
            except Exception as ex:
                Event(
                    ExceptionMessage(
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": "gluster sds state sync error",
                                 "exception": ex
                                 }
                    )
                )
            try:
                etcd_utils.read(
                    '/clusters/%s/_sync_now' %
                    NS.tendrl_context.integration_id
                )
                continue
            except etcd.EtcdKeyNotFound:
                pass

            time.sleep(_sleep)

        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "%s complete" % self.__class__.__name__}
        )
Beispiel #21
0
def sync_volume_rebalance_status(volumes):
    for volume in volumes:
        rebal_status_list = []
        if "Distribute" in volume.vol_type or (
                "arbiter" in volume.vol_type and (
                int(volume.brick_count) > int(volume.replica_count))
        ):
            vol_rebal_details = NS.gluster.objects.RebalanceDetails(
                vol_id=volume.vol_id
            ).load_all()
            for entry in vol_rebal_details:
                rebal_status_list.append(entry.rebal_status)
            if not rebal_status_list:
                continue

            new_rebal_status = "unknown"

            if all(item == "not_started" for item in rebal_status_list):
                new_rebal_status = "not_started"
            else:
                # remove not_started states from the list as these are
                # from nodes that are not involved in the rebalance
                rebal_status_list = filter(
                    lambda state: state != 'not_started', rebal_status_list
                )
                if "failed" in rebal_status_list:
                    new_rebal_status = "failed"
                elif "layout_fix_failed" in rebal_status_list:
                    new_rebal_status = "layout_fix_failed"
                elif "layout_fix_started" in rebal_status_list:
                    new_rebal_status = "layout_fix_started"
                elif "started" in rebal_status_list:
                    new_rebal_status = "started"
                elif all(item == "completed" for item in rebal_status_list):
                    new_rebal_status = "completed"
                elif all(item == "stopped" for item in rebal_status_list):
                    new_rebal_status = "stopped"
                elif all(
                        item == "layout_fix_"
                        "complete" for item in rebal_status_list
                ):
                    new_rebal_status = "layout_fix_complete"
                elif all(
                        item == "layout_fix_"
                        "stopped" for item in rebal_status_list
                ):
                    new_rebal_status = "layout_fix_stopped"

            if volume.rebal_status != "" and \
                new_rebal_status != volume.rebal_status:
                msg = ("Volume:%s rebalance status has %s") % (
                    volume.name,
                    new_rebal_status)
                instance = "volume_%s" % volume.name
                event_utils.emit_event(
                    "rebalance_status",
                    new_rebal_status,
                    msg,
                    instance,
                    'INFO'
                )

            volume.rebal_status = new_rebal_status
            volume.save()
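
The block above collapses the per-node rebalance states into one volume-level
state. A minimal standalone sketch of that priority reduction (not part of the
Tendrl module; the status names are the ones used above):

def reduce_rebal_states(states):
    # Drop nodes that never took part in the rebalance
    active = [s for s in states if s != "not_started"]
    if not active:
        return "not_started"
    # First match wins, worst condition first
    for flag in ("failed", "layout_fix_failed",
                 "layout_fix_started", "started"):
        if flag in active:
            return flag
    # Otherwise the state must be uniform across the involved nodes
    for uniform in ("completed", "stopped",
                    "layout_fix_complete", "layout_fix_stopped"):
        if all(s == uniform for s in active):
            return uniform
    return "unknown"

print(reduce_rebal_states(["completed", "started", "not_started"]))  # started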
Beispiel #22
0
    def update_cluster_details(self, integration_id):
        try:
            nodes = etcd_utils.read("/clusters/%s/nodes" % integration_id)
            for node in nodes.leaves:
                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node.key.split("/")[-1],
                    integration_id=integration_id).load()
                # Verify whether all nodes in the cluster are down
                if str(_cnc.status).lower() != "down" and \
                        str(_cnc.is_managed).lower() == "yes":
                    # At least one managed node is still up, so don't
                    # update the cluster details; unmanaged nodes are
                    # not considered
                    return
            # When all managed nodes are down, update the cluster details
            global_details = NS.tendrl.objects.GlobalDetails(
                integration_id=integration_id).load()
            # Mark the cluster as unhealthy
            if global_details.status.lower() == "healthy":
                global_details.status = "unhealthy"
                global_details.save()
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=integration_id).load()
                msg = "Cluster:%s is %s" % (_cluster.short_name, "unhealthy")
                instance = "cluster_%s" % integration_id
                event_utils.emit_event(
                    "cluster_health_status",
                    "unhealthy",
                    msg,
                    instance,
                    'WARNING',
                    integration_id=integration_id
                )
            # Mark all bricks as down
            nodes = etcd_utils.read("/clusters/%s/Bricks/all" % integration_id)
            for node in nodes.leaves:
                bricks = NS.tendrl.objects.GlusterBrick(
                    integration_id, fqdn=node.key.split("/")[-1]).load_all()
                for brick in bricks:
                    if brick.status.lower() != "stopped":
                        brick.status = "Stopped"
                        brick.save()
                        msg = ("Brick:%s in volume:%s has %s") % (
                            brick.brick_path, brick.vol_name, "Stopped")
                        instance = "volume_%s|brick_%s" % (
                            brick.vol_name, brick.brick_path)
                        event_utils.emit_event(
                            "brick_status",
                            "Stopped",
                            msg,
                            instance,
                            "WARNING",
                            integration_id=integration_id,
                            tags={
                                "entity_type": "brick",
                                "volume_name": brick.vol_name,
                                "node_id": brick.node_id
                            }
                        )
            # Mark all volumes as down
            volumes = NS.tendrl.objects.GlusterVolume(
                integration_id).load_all()
            for volume in volumes:
                if volume.state.lower() != "down":
                    volume.state = "down"
                    volume.status = "Stopped"
                    volume.save()
                    msg = "Volume:%s is %s" % (volume.name, "down")
                    instance = "volume_%s" % volume.name
                    event_utils.emit_event(
                        "volume_state",
                        "down",
                        msg,
                        instance,
                        "WARNING",
                        integration_id=integration_id,
                        tags={
                            "entity_type": "volume",
                            "volume_name": volume.name
                        }
                    )
        except etcd.EtcdKeyNotFound:
            pass
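
The early return above means the cluster is only marked unhealthy once every
managed node is down; unmanaged nodes are ignored. A minimal sketch of that
guard on plain dictionaries (hypothetical data, not the Tendrl objects):

def all_managed_nodes_down(node_contexts):
    # node_contexts: list of dicts with "status" and "is_managed" keys
    for ctx in node_contexts:
        if str(ctx.get("status")).lower() != "down" and \
                str(ctx.get("is_managed")).lower() == "yes":
            # A managed node is still up
            return False
    return True

nodes = [{"status": "DOWN", "is_managed": "yes"},
         {"status": "UP", "is_managed": "no"}]
print(all_managed_nodes_down(nodes))  # True, the unmanaged UP node is ignored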
Beispiel #23
0
def _derive_volume_states(volumes):
    out_dict = {}
    for volume in volumes:
        if volume.status == "Stopped":
            out_dict[volume.vol_id] = "down"
        else:
            subvol_count = 0
            bricks = []
            subvol_states = []
            while True:
                try:
                    subvol = NS._int.client.read(
                        "clusters/%s/Volumes/%s/Bricks/subvolume%s" %
                        (NS.tendrl_context.integration_id, volume.vol_id,
                         subvol_count))
                    state = 0
                    for entry in subvol.leaves:
                        brick_name = entry.key.split("/")[-1]
                        fetched_brick = NS.gluster.objects.Brick(
                            brick_name.split(":")[0],
                            brick_name.split(":_")[-1]).load()
                        if not fetched_brick.status:
                            fetched_brick.status = "Stopped"
                        bricks.append(fetched_brick)
                        if fetched_brick.status != "Started":
                            state += 1
                    subvol_states.append(state)
                    subvol_count += 1
                except etcd.EtcdKeyNotFound:
                    break

            total_bricks = len(bricks)
            up_bricks = 0
            for brick in bricks:
                if brick.status == "Started":
                    up_bricks += 1
            if total_bricks == 0 or total_bricks < int(volume.brick_count):
                # No brick details updated for the volume yet
                out_dict[volume.vol_id] = 'unknown'
            elif up_bricks == 0:
                out_dict[volume.vol_id] = 'down'
            else:
                out_dict[volume.vol_id] = 'up'
                if int(volume.replica_count) > 1 or \
                    int(volume.disperse_count) > 0:
                    worst_subvol = max(subvol_states)
                    if worst_subvol > 0:
                        subvol_prob = max(int(volume.replica_count),
                                          int(volume.redundancy_count) + 1)
                        if worst_subvol == subvol_prob:
                            # if this volume contains only one subvolume,
                            # and the bricks down > redundancy level
                            # then the volume state needs to show down
                            if subvol_count == 1:
                                out_dict[volume.vol_id] = 'down'
                            else:
                                out_dict[volume.vol_id] = '(partial)'
                        else:
                            out_dict[volume.vol_id] = '(degraded)'
                else:
                    # This volume is not 'protected', so any brick
                    # disruption leads straight to a 'partial'
                    # availability state
                    if up_bricks != total_bricks:
                        out_dict[volume.vol_id] = '(partial)'
        # Raise the alert if volume state changes
        if volume.state != "" and \
            out_dict[volume.vol_id] != volume.state:
            msg = "State of volume: %s " \
                  "changed from %s to %s" % (
                      volume.name,
                      volume.state,
                      out_dict[volume.vol_id]
                  )
            instance = "volume_%s" % volume.name
            event_utils.emit_event(
                "volume_state",
                out_dict[volume.vol_id],
                msg,
                instance,
                'INFO' if out_dict[volume.vol_id] == 'up' else 'WARNING',
                tags={
                    "entity_type": RESOURCE_TYPE_VOLUME,
                    "volume_name": volume.name
                })
        # Save the volume status
        volume.state = out_dict[volume.vol_id]
        volume.save()

    return out_dict
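
For a replicated or dispersed volume, the branch above grades the volume by
its worst subvolume: some bricks down means '(degraded)', a whole subvolume
down means '(partial)', or 'down' when it is the only subvolume. A short
worked sketch of that decision with hypothetical counts:

def classify(replica_count, redundancy_count, subvol_states, subvol_count):
    # subvol_states: number of bricks that are not "Started", per subvolume
    worst = max(subvol_states)
    if worst == 0:
        return "up"
    threshold = max(replica_count, redundancy_count + 1)
    if worst == threshold:
        return "down" if subvol_count == 1 else "(partial)"
    return "(degraded)"

print(classify(3, 0, [1, 0], 2))  # (degraded): one replica down
print(classify(3, 0, [3, 0], 2))  # (partial): one whole subvolume down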
Beispiel #24
0
def save_georep_details(volumes, index):
    pair_index = 1
    while True:
        try:
            session_id = "{0}_{1}_{2}".format(
                volumes['volume%s.pair%s.master_volume' % (index, pair_index)],
                volumes['volume%s.pair%s.slave' %
                        (index, pair_index)].split("::")[-1],
                volumes['volume%s.pair%s.session_slave' %
                        (index, pair_index)].split(":")[-1])
            pair_name = "{0}-{1}".format(
                volumes['volume%s.pair%s.master_node' % (index, pair_index)],
                volumes['volume%s.pair%s.master_brick' %
                        (index, pair_index)].replace("/", "_"))

            readable_pair_name = "{0}:{1}".format(
                volumes['volume%s.pair%s.master_node' % (index, pair_index)],
                volumes['volume%s.pair%s.master_brick' % (index, pair_index)])

            try:
                pair = NS.gluster.objects.GeoReplicationPair(
                    vol_id=volumes['volume%s.id' % index],
                    session_id=session_id,
                    pair=pair_name).load()
                fetched_pair_status = None
                if pair:
                    fetched_pair_status = pair.status
                pair_status = volumes['volume%s.pair%s.status' %
                                      (index, pair_index)]
                if fetched_pair_status and \
                    fetched_pair_status != pair_status and \
                    pair_status.lower() == 'faulty':
                    msg = ("Geo-replication between %s "
                           "and %s is faulty") % (readable_pair_name,
                                                  volumes['volume%s.name' %
                                                          index])
                    instance = "volume_%s|georep_%s" % (
                        volumes['volume%s.name' % index], pair_name)
                    event_utils.emit_event("georep_status",
                                           pair_status,
                                           msg,
                                           instance,
                                           'WARNING',
                                           tags={
                                               "entity_type":
                                               RESOURCE_TYPE_VOLUME,
                                               "volume_name":
                                               volumes['volume%s.name' % index]
                                           })
                if fetched_pair_status and \
                    fetched_pair_status.lower() == 'faulty' and \
                    pair_status.lower() in ['active', 'passive']:
                    msg = ("Geo-replication between %s "
                           "and %s is %s") % (readable_pair_name,
                                              volumes['volume%s.name' % index],
                                              pair_status)
                    instance = "volume_%s|georep_%s" % (
                        volumes['volume%s.name' % index], pair_name)
                    event_utils.emit_event("georep_status",
                                           pair_status,
                                           msg,
                                           instance,
                                           'INFO',
                                           tags={
                                               "entity_type":
                                               RESOURCE_TYPE_VOLUME,
                                               "volume_name":
                                               volumes['volume%s.name' % index]
                                           })
            except etcd.EtcdKeyNotFound:
                pass

            pair = NS.gluster.objects.GeoReplicationPair(
                vol_id=volumes['volume%s.id' % index],
                session_id=session_id,
                pair=pair_name,
                master_volume=volumes['volume%s.pair%s.master_volume' %
                                      (index, pair_index)],
                master_brick=volumes['volume%s.pair%s.master_brick' %
                                     (index, pair_index)],
                master_node=volumes['volume%s.pair%s.master_node' %
                                    (index, pair_index)],
                slave_user=volumes['volume%s.pair%s.slave_user' %
                                   (index, pair_index)],
                slave=volumes['volume%s.pair%s.slave' % (index, pair_index)],
                slave_node=volumes['volume%s.pair%s.slave_node' %
                                   (index, pair_index)],
                status=volumes['volume%s.pair%s.status' % (index, pair_index)],
                crawl_status=volumes['volume%s.pair%s.crawl_status' %
                                     (index, pair_index)],
                last_synced=volumes['volume%s.pair%s.last_synced' %
                                    (index, pair_index)],
                entry=volumes['volume%s.pair%s.entry' % (index, pair_index)],
                data=volumes['volume%s.pair%s.data' % (index, pair_index)],
                meta=volumes['volume%s.pair%s.meta' % (index, pair_index)],
                failures=volumes['volume%s.pair%s.failures' %
                                 (index, pair_index)],
                checkpoint_time=volumes['volume%s.pair%s.checkpoint_time' %
                                        (index, pair_index)],
                checkpoint_completed=volumes[
                    'volume%s.pair%s.checkpoint_completed' %
                    (index, pair_index)],
                checkpoint_completed_time=volumes[
                    'volume%s.pair%s.checkpoint_completion_time' %
                    (index, pair_index)])
        except KeyError:
            break
        pair.save()
        pair_index += 1
    return
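
save_georep_details walks a flat dictionary keyed by volume and pair index and
stops at the first missing pair. A minimal illustration of the key shape it
expects (all values below are made up):

volumes = {
    "volume1.id": "0000-aaaa",
    "volume1.name": "vol1",
    "volume1.pair1.master_volume": "vol1",
    "volume1.pair1.master_node": "node1",
    "volume1.pair1.master_brick": "/bricks/b1",
    "volume1.pair1.slave": "geoaccount@slavehost::slavevol",
    "volume1.pair1.session_slave": "ssh://slavehost::slavevol",
    "volume1.pair1.status": "Active",
    # remaining pair fields (crawl_status, last_synced, ...) follow
}

session_id = "{0}_{1}_{2}".format(
    volumes["volume1.pair1.master_volume"],
    volumes["volume1.pair1.slave"].split("::")[-1],
    volumes["volume1.pair1.session_slave"].split(":")[-1])
print(session_id)  # vol1_slavevol_slavevol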
Beispiel #25
0
def sync_cluster_status(volumes, sync_ttl):
    degraded_count = 0
    is_healthy = True

    # If a failed ImportCluster flow exists,
    # mark the cluster status as unhealthy
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if _cluster.current_job.get('job_name', '') == "ImportCluster" and \
        _cluster.current_job.get('status', '') == "failed":
        is_healthy = False

    # Calculate status based on the volume states
    if len(volumes) > 0:
        volume_states = _derive_volume_states(volumes)
        for vol_id, state in volume_states.iteritems():
            if 'down' in state or 'partial' in state:
                is_healthy = False
            if 'degraded' in state:
                degraded_count += 1

    # Change status based on node status
    cmd = cmd_utils.Command(
        'gluster pool list', True
    )
    out, err, rc = cmd.run()
    peer_count = 0
    if not err:
        out_lines = out.split('\n')
        connected = True
        for index in range(1, len(out_lines)):
            peer_count += 1
            node_status_det = out_lines[index].split('\t')
            if len(node_status_det) > 2:
                if node_status_det[2].strip() != 'Connected':
                    connected = False
        if not connected:
            is_healthy = False

    cluster_gd = NS.tendrl.objects.GlobalDetails(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    old_status = cluster_gd.status or 'unhealthy'
    curr_status = 'healthy' if is_healthy else 'unhealthy'
    if curr_status != old_status:
        msg = ("Cluster:%s is %s"
               ) % (
                   _cluster.short_name,
                   curr_status)
        instance = "cluster_%s" % NS.tendrl_context.integration_id
        event_utils.emit_event(
            "cluster_health_status",
            curr_status,
            msg,
            instance,
            'WARNING' if curr_status == 'unhealthy'
            else 'INFO'
        )

    # Persist the cluster status
    NS.tendrl.objects.GlobalDetails(
        integration_id=NS.tendrl_context.integration_id,
        status='healthy' if is_healthy else 'unhealthy',
        peer_count=peer_count,
        vol_count=len(volumes),
        volume_up_degraded=degraded_count
    ).save(ttl=sync_ttl)
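
The peer check above skips the header line of the gluster pool list output and
flags the cluster as unhealthy if any peer is not Connected. A standalone
sketch of that parsing on sample output (the sample text is illustrative, not
captured from a real cluster):

sample = ("UUID\tHostname\tState\n"
          "1111aaaa\tnode1.example.com\tConnected\n"
          "2222bbbb\tnode2.example.com\tDisconnected\n")

out_lines = sample.strip().split("\n")
peer_count = 0
connected = True
for index in range(1, len(out_lines)):  # skip the header line
    peer_count += 1
    fields = out_lines[index].split("\t")
    if len(fields) > 2 and fields[2].strip() != "Connected":
        connected = False

print(peer_count, connected)  # 2 False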
Beispiel #26
0
def _derive_volume_states(volumes):
    out_dict = {}
    for volume in volumes:
        if volume.status == "Stopped":
            out_dict[volume.vol_id] = "down"
        else:
            subvol_count = 0
            bricks = []
            subvol_states = []
            while True:
                try:
                    subvol = etcd_utils.read(
                        "clusters/%s/Volumes/%s/Bricks/subvolume%s" % (
                            NS.tendrl_context.integration_id,
                            volume.vol_id,
                            subvol_count
                        )
                    )
                    state = 0
                    for entry in subvol.leaves:
                        brick_name = entry.key.split("/")[-1]
                        fetched_brick = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            brick_name.split(":")[0],
                            brick_name.split(":_")[-1]
                        ).load()
                        if not fetched_brick.status:
                            fetched_brick.status = "Stopped"
                        bricks.append(fetched_brick)
                        if fetched_brick.status != "Started":
                            state += 1
                    subvol_states.append(state)
                    subvol_count += 1
                except etcd.EtcdKeyNotFound:
                    break

            total_bricks = len(bricks)
            up_bricks = 0
            for brick in bricks:
                if brick.status == "Started":
                    up_bricks += 1
            if total_bricks == 0 or total_bricks < int(volume.brick_count):
                # No brick details updated for the volume yet
                out_dict[volume.vol_id] = 'unknown'
            elif up_bricks == 0:
                out_dict[volume.vol_id] = 'down'
            else:
                out_dict[volume.vol_id] = 'up'
                if int(volume.replica_count) > 1 or \
                    int(volume.disperse_count) > 0:
                    worst_subvol = max(subvol_states)
                    if worst_subvol > 0:
                        subvol_prob = max(
                            int(volume.replica_count),
                            int(volume.redundancy_count) + 1
                        )
                        if worst_subvol == subvol_prob:
                            # if this volume contains only one subvolume,
                            # and the bricks down > redundancy level
                            # then the volume state needs to show down
                            if subvol_count == 1:
                                out_dict[volume.vol_id] = 'down'
                            else:
                                out_dict[volume.vol_id] = '(partial)'
                        else:
                            out_dict[volume.vol_id] = '(degraded)'
                else:
                    # This volume is not 'protected', so any brick
                    # disruption leads straight to a 'partial'
                    # availability state
                    if up_bricks != total_bricks:
                        out_dict[volume.vol_id] = '(partial)'
        # Raise the alert if volume state changes
        if volume.state != "" and \
            out_dict[volume.vol_id] not in [volume.state, 'unknown']:
            msg = "Volume:%s is %s" % (volume.name, out_dict[volume.vol_id])
            instance = "volume_%s" % volume.name
            event_utils.emit_event(
                "volume_state",
                out_dict[volume.vol_id],
                msg,
                instance,
                'INFO' if out_dict[volume.vol_id] == 'up' else 'WARNING',
                tags={"entity_type": RESOURCE_TYPE_VOLUME,
                      "volume_name": volume.name
                      }
            )
        # Save the volume status
        volume.state = out_dict[volume.vol_id]
        volume.save()

    return out_dict
Beispiel #27
0
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree
):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                _volume.current_job.get('status', '') == 'in_progress':
                # There is an active job on the volume; skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass

        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume.vol_type = "arbiter" \
            if int(volumes['volume%s.arbiter_count' % index]) > 0 \
            else volumes['volume%s.type' % index]
        volume.name = volumes['volume%s.name' % index]
        volume.transport_type = volumes['volume%s.transport_type' % index]
        volume.status = volumes['volume%s.status' % index]
        volume.brick_count = volumes['volume%s.brickcount' % index]
        volume.snap_count = volumes['volume%s.snap_count' % index]
        volume.stripe_count = volumes['volume%s.stripe_count' % index]
        volume.replica_count = volumes['volume%s.replica_count' % index]
        volume.subvol_count = volumes['volume%s.subvol_count' % index]
        volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
        volume.disperse_count = volumes['volume%s.disperse_count' % index]
        volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
        volume.quorum_status = volumes['volume%s.quorum_status' % index]
        volume.snapd_status = volumes[
            'volume%s.snapd_svc.online_status' % index]
        volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
        if NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).exists():
            existing_vol = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
            # Raise an alert when the profiling value changes
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" % (
                       volumes['volume%s.name' % index],
                       cluster_short_name,
                       volume_profiling_old_value,
                       volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            event_utils.emit_event(
                "volume_profiling_status",
                volume_profiling_new_value,
                msg,
                instance,
                'INFO',
                tags={
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": volumes[
                        'volume%s.name' % index
                    ]
                }
            )
        volume.save(ttl=sync_ttl)
        # Save the default values of volume options
        vol_opt_dict = {}
        for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[
                vol_options[
                    'volume%s.options.key%s' % (index, opt_count)
                ]
            ] = vol_options[
                'volume%s.options.value%s' % (index, opt_count)
            ]
        volume.options = vol_opt_dict
        volume.save()

    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 addresses of the current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                    "any ipv4 networks for node"
                    " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update bricks node-wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except(TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s' '.path' % (
                index,
                b_index
            )].split(":")[-1].replace("/", "_")

            # Raise alerts if the brick status changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                    current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s"
                           ) % (
                               volumes['volume%s.brick%s' '.path' % (
                                   index,
                                   b_index
                               )],
                               volumes['volume%s.' 'name' % index],
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index,
                            b_index
                        )]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.' 'name' % index]
                              }
                    )

            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"

            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )

            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)]
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )

            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index
                                )
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index
                                )
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index
                                )
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index
                                )
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
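
The subvolume a brick is filed under comes from its position in the brick
list: with a hypothetical 6 bricks split over 2 subvolumes, each subvolume
holds 3 bricks, so bricks 1-3 map to subvolume0 and bricks 4-6 to subvolume1.
A short sketch of that arithmetic (integer division, as in the code above):

brick_count = 6    # volumeN.brickcount (hypothetical)
subvol_count = 2   # volumeN.subvol_count (hypothetical)
sub_vol_size = brick_count // subvol_count

for b_index in range(1, brick_count + 1):
    print(b_index, "subvolume%s" % ((b_index - 1) // sub_vol_size))
# bricks 1-3 -> subvolume0, bricks 4-6 -> subvolume1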
Beispiel #28
0
    def on_change(self, attr, prev_value, current_value):
        if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
            _tc = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
            # Check node is managed
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id).load()
            if current_value is None and str(_cnc.is_managed).lower() == "yes":
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       self.status,
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "WARNING",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
                # Load cluster_node_context will load node_context
                # and it will be updated with latest values
                _cnc_new = \
                    NS.tendrl.objects.ClusterNodeContext(
                        node_id=self.node_id,
                        integration_id=_tc.integration_id,
                        first_sync_done=_cnc.first_sync_done,
                        is_managed=_cnc.is_managed
                    )
                _cnc_new.save()
                del _cnc_new
                # Update cluster details
                self.update_cluster_details(_tc.integration_id)
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                    _msg = "node_sync, STALE provisioner node "\
                        "found! re-configuring monitoring "\
                        "(job-id: %s) on this node"
                    payload = {
                        "tags": ["tendrl/node_%s" % self.node_id],
                        "run": "tendrl.flows.ConfigureMonitoring",
                        "status": "new",
                        "parameters": {
                            'TendrlContext.integration_id': _tc.integration_id
                        },
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    NS.tendrl.objects.Job(job_id=_job_id,
                                          status="new",
                                          payload=payload).save()
                    logger.log("debug", NS.publisher_id,
                               {"message": _msg % _job_id})

                if _tc.sds_name in ["gluster", "RHGS"]:
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id, self.fqdn))

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write("{0}/status".format(brick.key),
                                             "Stopped")
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
            elif current_value == "UP" and str(
                    _cnc.is_managed).lower() == "yes":
                msg = "{0} is UP".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       "UP",
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "INFO",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
            del _cnc
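
When a stale provisioner tag is found, the handler above queues a
tendrl.flows.ConfigureMonitoring flow by saving a Job with a fresh UUID. A
minimal sketch of just the payload construction (standalone; the Job object
itself is not recreated here):

import uuid

def build_reconfigure_payload(node_id, integration_id):
    # Same shape as the payload saved in on_change above
    return {
        "tags": ["tendrl/node_%s" % node_id],
        "run": "tendrl.flows.ConfigureMonitoring",
        "status": "new",
        "parameters": {
            "TendrlContext.integration_id": integration_id
        },
        "type": "node"
    }

job_id = str(uuid.uuid4())
payload = build_reconfigure_payload("node-0000", "cluster-0000")
print(job_id, payload["run"])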
Beispiel #29
0
def save_georep_details(volumes, index):
    pair_index = 1
    while True:
        try:
            session_id = "{0}_{1}_{2}".format(
                volumes[
                    'volume%s.pair%s.master_volume' % (
                        index, pair_index
                    )
                ],
                volumes[
                    'volume%s.pair%s.slave' % (
                        index, pair_index)
                ].split("::")[-1],
                volumes[
                    'volume%s.pair%s.session_slave' % (
                        index, pair_index)
                ].split(":")[-1]
            )
            pair_name = "{0}-{1}".format(
                volumes[
                    'volume%s.pair%s.master_node' % (
                        index, pair_index)
                ],
                volumes[
                    'volume%s.pair%s.master_brick' % (
                        index, pair_index)
                ].replace("/", "_")
            )

            readable_pair_name = "{0}:{1}".format(
                volumes[
                    'volume%s.pair%s.master_node' % (
                        index, pair_index)
                ],
                volumes[
                    'volume%s.pair%s.master_brick' % (
                        index, pair_index)
                ]
            )

            try:
                pair = NS.gluster.objects.GeoReplicationPair(
                    vol_id=volumes['volume%s.id' % index],
                    session_id=session_id,
                    pair=pair_name
                ).load()
                fetched_pair_status = None
                if pair:
                    fetched_pair_status = pair.status
                pair_status = volumes[
                    'volume%s.pair%s.status' % (index, pair_index)
                ]
                if fetched_pair_status and \
                    fetched_pair_status != pair_status and \
                    pair_status.lower() == 'faulty':
                    msg = ("Geo-replication between %s "
                           "and %s is faulty") % (
                               readable_pair_name,
                               volumes['volume%s.name' % index])
                    instance = "volume_%s|georep_%s" % (
                        volumes['volume%s.name' % index],
                        pair_name
                    )
                    event_utils.emit_event(
                        "georep_status",
                        pair_status,
                        msg,
                        instance,
                        'WARNING',
                        tags={"entity_type": RESOURCE_TYPE_VOLUME,
                              "volume_name": volumes['volume%s.name' % index]
                              }
                    )
                if fetched_pair_status and \
                    fetched_pair_status.lower() == 'faulty' and \
                    pair_status.lower() in ['active', 'passive']:
                    msg = ("Geo-replication between %s "
                           "and %s is %s") % (
                               readable_pair_name,
                               volumes['volume%s.name' % index],
                               pair_status)
                    instance = "volume_%s|georep_%s" % (
                        volumes['volume%s.name' % index],
                        pair_name
                    )
                    event_utils.emit_event(
                        "georep_status",
                        pair_status,
                        msg,
                        instance,
                        'INFO',
                        tags={"entity_type": RESOURCE_TYPE_VOLUME,
                              "volume_name": volumes['volume%s.name' % index]
                              }
                    )
            except etcd.EtcdKeyNotFound:
                pass

            pair = NS.gluster.objects.GeoReplicationPair(
                vol_id=volumes['volume%s.id' % index],
                session_id=session_id,
                pair=pair_name,
                master_volume=volumes[
                    'volume%s.pair%s.master_volume' % (
                        index, pair_index)],
                master_brick=volumes[
                    'volume%s.pair%s.master_brick' % (
                        index, pair_index)],
                master_node=volumes[
                    'volume%s.pair%s.master_node' % (
                        index, pair_index)],
                slave_user=volumes[
                    'volume%s.pair%s.slave_user' % (
                        index, pair_index)],
                slave=volumes[
                    'volume%s.pair%s.slave' % (
                        index, pair_index)],
                slave_node=volumes[
                    'volume%s.pair%s.slave_node' % (
                        index, pair_index)],
                status=volumes[
                    'volume%s.pair%s.status' % (
                        index, pair_index)],
                crawl_status=volumes[
                    'volume%s.pair%s.crawl_status' % (
                        index, pair_index)],
                last_synced=volumes[
                    'volume%s.pair%s.last_synced' % (
                        index, pair_index)],
                entry=volumes[
                    'volume%s.pair%s.entry' % (
                        index, pair_index)],
                data=volumes[
                    'volume%s.pair%s.data' % (
                        index, pair_index)],
                meta=volumes[
                    'volume%s.pair%s.meta' % (
                        index, pair_index)],
                failures=volumes[
                    'volume%s.pair%s.failures' % (
                        index, pair_index)],
                checkpoint_time=volumes[
                    'volume%s.pair%s.checkpoint_time' % (
                        index, pair_index)],
                checkpoint_completed=volumes[
                    'volume%s.pair%s.checkpoint_completed' % (
                        index, pair_index)],
                checkpoint_completed_time=volumes[
                    'volume%s.pair%s.checkpoint_completion_time' % (
                        index, pair_index)]
            )
        except KeyError:
            break
        pair.save()
        pair_index += 1
    return
Beispiel #30
0
    def on_change(self, attr, prev_value, current_value):
        if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
            _tc = NS.tendrl.objects.TendrlContext(
                node_id=self.node_id
            ).load()
            # Check node is managed
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id
            ).load()
            if current_value is None and str(_cnc.is_managed).lower() == "yes":
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event(
                    "node_status",
                    self.status,
                    msg,
                    "node_{0}".format(self.fqdn),
                    "WARNING",
                    node_id=self.node_id,
                    integration_id=_tc.integration_id
                )
                # Load cluster_node_context will load node_context
                # and it will be updated with latest values
                _cnc_new = \
                    NS.tendrl.objects.ClusterNodeContext(
                        node_id=self.node_id,
                        integration_id=_tc.integration_id,
                        first_sync_done=_cnc.first_sync_done,
                        is_managed=_cnc.is_managed
                    )
                _cnc_new.save()
                del _cnc_new
                # Update cluster details
                self.update_cluster_details(_tc.integration_id)
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                if _tc.sds_name in ["gluster", "RHGS"]:
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id,
                            self.fqdn
                        )
                    )

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write(
                                "{0}/status".format(brick.key),
                                "Stopped"
                            )
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
            elif current_value == "UP" and str(
                    _cnc.is_managed).lower() == "yes":
                msg = "{0} is UP".format(self.fqdn)
                event_utils.emit_event(
                    "node_status",
                    "UP",
                    msg,
                    "node_{0}".format(self.fqdn),
                    "INFO",
                    node_id=self.node_id,
                    integration_id=_tc.integration_id
                )
            del _cnc