Example #1
def alert_job_status(curr_value, msg, integration_id=None, cluster_name=None):
    alert = {}
    alert['source'] = NS.publisher_id
    alert['classification'] = 'cluster'
    alert['pid'] = os.getpid()
    alert['time_stamp'] = tendrl_now().isoformat()
    alert['alert_type'] = 'STATUS'
    severity = "INFO"
    if curr_value.lower() == "failed":
        severity = "WARNING"
    alert['severity'] = severity
    alert['resource'] = 'job_status'
    alert['current_value'] = curr_value
    alert['tags'] = dict(
        message=msg,
        integration_id=integration_id or
        NS.tendrl_context.integration_id,
        cluster_name=cluster_name or
        NS.tendrl_context.cluster_name,
        sds_name=NS.tendrl_context.sds_name,
        fqdn=NS.node_context.fqdn
    )
    alert['node_id'] = NS.node_context.node_id
    if not NS.node_context.node_id:
        return
    logger.log(
        "notice",
        "alerting",
        {'message': json.dumps(alert)}
    )
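For orientation, a hypothetical call site for alert_job_status, assuming NS.publisher_id, NS.tendrl_context and NS.node_context have already been populated by the running integration (the integration id and cluster name below are made up):

# Hypothetical usage; values are illustrative only.
alert_job_status(
    "failed",
    "Import cluster job failed",
    integration_id="77deef29-b8e5-4dc5-8247-21e2a409a66a",  # made-up id
    cluster_name="gluster-cluster-1"                        # made-up name
)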
 def format_alert(self, alert_json):
     alert = self.parse_alert_metrics(alert_json)
     try:
         alert["alert_id"] = None
         alert["node_id"] = utils.find_node_id(
             alert['tags']['integration_id'], alert['tags']['fqdn'])
         alert["time_stamp"] = tendrl_now().isoformat()
         alert["resource"] = self.representive_name
         alert['alert_type'] = constants.ALERT_TYPE
         alert['significance'] = constants.SIGNIFICANCE_HIGH
         alert['pid'] = utils.find_grafana_pid()
         alert['source'] = constants.ALERT_SOURCE
         alert['tags']['fqdn'] = alert['tags']['fqdn']
         if alert_json['State'] == constants.GRAFANA_ALERT:
             if "critical" in alert_json['Name'].lower():
                 alert['severity'] = \
                     constants.TENDRL_SEVERITY_MAP['critical']
             else:
                 alert['severity'] = \
                     constants.TENDRL_SEVERITY_MAP['warning']
             alert['tags']['message'] = (
                 "Cpu utilization on node %s in %s"
                 " at %s %% and running out of cpu" %
                 (alert['tags']['fqdn'],
                  alert['tags']['cluster_short_name'],
                  alert['current_value']))
         elif alert_json['State'] == constants.GRAFANA_CLEAR_ALERT:
             # Identifying clear alert from which panel critical/warning
             if "critical" in alert_json['Name'].lower():
                 alert['tags']['clear_alert'] = \
                     constants.TENDRL_SEVERITY_MAP['critical']
             elif "warning" in alert_json['Name'].lower():
                 alert['tags']['clear_alert'] = \
                     constants.TENDRL_SEVERITY_MAP['warning']
             alert['severity'] = constants.TENDRL_SEVERITY_MAP['info']
             alert['tags']['message'] = \
                 ("Cpu utilization on node %s in"
                     " %s back to normal" % (
                         alert['tags']['fqdn'],
                         alert['tags']['cluster_short_name']))
         else:
             logger.log(
                 "error", NS.publisher_id, {
                     "message": "Unsupported alert %s "
                     "severity" % alert_json
                 })
             raise InvalidAlertSeverity
         return alert
     except (KeyError, CalledProcessError, EtcdKeyNotFound, NodeNotFound,
             InvalidAlertSeverity) as ex:
         Event(
             ExceptionMessage(
                 "debug", NS.publisher_id, {
                     "message":
                     "Error in converting grafana"
                     "alert into tendrl alert %s" % alert_json,
                     "exception":
                     ex
                 }))
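The Grafana webhook payload that format_alert consumes is not shown in this example; a minimal illustrative stand-in, limited to the keys the code above actually reads and assuming constants.GRAFANA_ALERT corresponds to Grafana's alerting state, might look like:

# Hypothetical webhook body; only keys referenced by format_alert are shown.
alert_json = {
    "Name": "cpu_utilization_critical_alert",  # checked for "critical"/"warning"
    "State": "alerting"                        # compared against constants.GRAFANA_ALERT
}
# parse_alert_metrics(alert_json) is expected to supply the remaining fields
# (tags, current_value) used in the message formatting above.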
Example #3
 def get_node_status(self, node_id):
     last_seen_at = central_store_util.get_node_last_seen_at(node_id)
     if last_seen_at:
         interval = (tendrl_now() - datetime.datetime.strptime(
             last_seen_at[:-6],
             "%Y-%m-%dT%H:%M:%S.%f").replace(tzinfo=utc)).total_seconds()
         if interval < 5:
             return pm_consts.STATUS_UP
         else:
             return pm_consts.STATUS_DOWN
     return pm_consts.STATUS_NOT_MONITORED
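The [:-6] slice strips the "+00:00" UTC offset that tendrl_now().isoformat() appends, so the rest of the string parses with a plain %f format. A self-contained sketch of the same round trip, using the standard-library timezone object in place of whatever utc helper the surrounding module imports:

import datetime

utc = datetime.timezone.utc  # stand-in for the utc object used above

stamp = datetime.datetime.now(utc).isoformat()  # e.g. '2024-01-01T10:00:00.123456+00:00'
parsed = datetime.datetime.strptime(
    stamp[:-6], "%Y-%m-%dT%H:%M:%S.%f").replace(tzinfo=utc)
age_seconds = (datetime.datetime.now(utc) - parsed).total_seconds()
print("UP" if age_seconds < 5 else "DOWN")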
Example #4
def emit_event(resource, curr_value, msg, instance,
               severity, alert_notify=False, tags={},
               integration_id=None, cluster_name=None,
               sds_name=None, node_id=None):
    alert = {}
    alert['source'] = NS.publisher_id
    alert['node_id'] = node_id
    alert['pid'] = os.getpid()
    alert['time_stamp'] = tendrl_now().isoformat()
    alert['alert_type'] = 'STATUS'
    alert['severity'] = severity
    alert['resource'] = resource
    alert['current_value'] = curr_value
    alert['tags'] = dict(
        plugin_instance=instance,
        message=msg,
        integration_id=integration_id or NS.tendrl_context.integration_id,
        cluster_name=cluster_name or NS.tendrl_context.cluster_name
    )
    if "entity_type" in tags:
        if tags["entity_type"] == BRICK_ENTITY:
            alert['node_id'] = tags.get(
                "node_id", NS.node_context.node_id
            )
            alert['tags']['fqdn'] = tags.get(
                "fqdn", NS.node_context.fqdn
            )
            alert['tags']['volume_name'] = tags.get(
                'volume_name', None
            )
        elif tags["entity_type"] == VOLUME_ENTITY:
            alert['tags']['volume_name'] = tags.get(
                'volume_name', None
            )
    payload = {'message': json.dumps(alert)}
    payload['alert_condition_state'] = severity
    payload['alert_condition_status'] = resource

    if alert_notify:
        payload['alert_notify'] = alert_notify

    if severity == "INFO":
        payload['alert_condition_unset'] = True
    else:
        payload['alert_condition_unset'] = False
    logger.log(
        "notice",
        "alerting",
        payload,
        integration_id=integration_id
    )
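A hypothetical call for a brick-level alert; BRICK_ENTITY and the NS context objects are assumed to be provided by the gluster integration, and the host, path and volume names below are made up:

# Hypothetical usage; values are illustrative only.
emit_event(
    "brick_status",
    "Stopped",
    "Brick host1:/bricks/b1 in volume vol1 is Stopped",
    "brick_host1|bricks|b1",
    "CRITICAL",
    alert_notify=True,
    tags={
        "entity_type": BRICK_ENTITY,
        "volume_name": "vol1",
        "node_id": NS.node_context.node_id,
        "fqdn": NS.node_context.fqdn
    }
)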
Example #5
def emit_event(resource,
               curr_value,
               msg,
               instance,
               severity,
               alert_notify=False,
               tags={},
               integration_id=None,
               cluster_name=None,
               sds_name=None,
               node_id=None):
    alert = {}
    alert['source'] = NS.publisher_id
    alert['node_id'] = node_id
    alert['pid'] = os.getpid()
    alert['time_stamp'] = tendrl_now().isoformat()
    alert['alert_type'] = 'STATUS'
    alert['severity'] = severity
    alert['resource'] = resource
    alert['current_value'] = curr_value
    alert['tags'] = dict(
        plugin_instance=instance,
        message=msg,
        integration_id=integration_id or NS.tendrl_context.integration_id,
        cluster_name=cluster_name or NS.tendrl_context.cluster_name,
        sds_name=sds_name or NS.tendrl_context.sds_name,
    )
    if "entity_type" in tags:
        if tags["entity_type"] == BRICK_ENTITY:
            alert['node_id'] = tags.get("node_id", NS.node_context.node_id)
            alert['tags']['fqdn'] = tags.get("fqdn", NS.node_context.fqdn)
            alert['tags']['volume_name'] = tags.get('volume_name', None)
        elif tags["entity_type"] == VOLUME_ENTITY:
            alert['tags']['volume_name'] = tags.get('volume_name', None)
    payload = {'message': json.dumps(alert)}
    payload['alert_condition_state'] = severity
    payload['alert_condition_status'] = resource

    if alert_notify:
        payload['alert_notify'] = alert_notify

    if severity == "INFO":
        payload['alert_condition_unset'] = True
    else:
        payload['alert_condition_unset'] = False
    logger.log("notice", "alerting", payload)
Example #6
 def _emit_event(self, resource, curr_value, msg, instance):
     alert = {}
     alert['source'] = NS.publisher_id
     alert['pid'] = os.getpid()
     alert['time_stamp'] = tendrl_now().isoformat()
     alert['alert_type'] = 'status'
     severity = "INFO"
     if curr_value.lower() == "stopped":
         severity = "CRITICAL"
     alert['severity'] = severity
     alert['resource'] = resource
     alert['current_value'] = curr_value
     alert['tags'] = dict(plugin_instance=instance,
                          message=msg,
                          cluster_id=NS.tendrl_context.integration_id,
                          cluster_name=NS.tendrl_context.cluster_name,
                          sds_name=NS.tendrl_context.sds_name,
                          fqdn=socket.getfqdn())
     alert['node_id'] = NS.node_context.node_id
     if not NS.node_context.node_id:
         return
     Event(Message("notice", "alerting", {'message': json.dumps(alert)}))
    def run(self):
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "%s running" % self.__class__.__name__}
        )

        gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
        gluster_brick_dir.save()

        cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        if cluster.cluster_network in [None, ""]:
            try:
                node_networks = NS.tendrl.objects.NodeNetwork().load_all()
                cluster.cluster_network = node_networks[0].subnet
                cluster.save()
            except etcd.EtcdKeyNotFound as ex:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Failed to sync cluster network details"}
                )
        _sleep = 0
        while not self._complete.is_set():
            # To detect out of band deletes
            # refresh gluster object inventory at config['sync_interval']
            SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
            NS.node_context = NS.node_context.load()
            NS.tendrl_context = NS.tendrl_context.load()
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            try:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                    continue

                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                _cnc.is_managed = "yes"
                _cnc.save()
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state',
                        'detail'
                    ]
                )
                raw_data = ini2json.ini_to_dict(
                    '/var/run/glusterd-state'
                )
                subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state-vol-opts',
                        'volumeoptions'
                    ]
                )
                raw_data_options = ini2json.ini_to_dict(
                    '/var/run/glusterd-state-vol-opts'
                )
                subprocess.call(
                    [
                        'rm',
                        '-rf',
                        '/var/run/glusterd-state-vol-opts'
                    ]
                )
                sync_object = NS.gluster.objects.\
                    SyncObject(data=json.dumps(raw_data))
                sync_object.save()

                if "Peers" in raw_data:
                    index = 1
                    peers = raw_data["Peers"]
                    disconnected_hosts = []
                    while True:
                        try:
                            peer = NS.tendrl.\
                                objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    hostname=peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    state=peers['peer%s.state' % index],
                                    connected=peers['peer%s.connected' % index]
                                )
                            try:
                                stored_peer_status = None
                                # find peer detail using hostname
                                ip = socket.gethostbyname(
                                    peers['peer%s.primary_hostname' % index]
                                )
                                node_id = etcd_utils.read(
                                    "/indexes/ip/%s" % ip
                                ).value
                                stored_peer = NS.tendrl.objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    node_id=node_id
                                ).load()
                                stored_peer_status = stored_peer.connected
                                current_status = peers[
                                    'peer%s.connected' % index
                                ]
                                if stored_peer_status and \
                                    current_status != stored_peer_status:
                                    msg = (
                                        "Peer %s in cluster %s "
                                        "is %s"
                                    ) % (
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ],
                                        _cluster.short_name,
                                        current_status
                                    )
                                    instance = "peer_%s" % peers[
                                        'peer%s.primary_hostname' % index
                                    ]
                                    event_utils.emit_event(
                                        "peer_status",
                                        current_status,
                                        msg,
                                        instance,
                                        'WARNING' if current_status !=
                                        'Connected'
                                        else 'INFO'
                                    )
                                    # save current status in actual peer
                                    # directory also
                                    stored_peer.connected = current_status
                                    stored_peer.save()
                                    # Disconnected host name to
                                    # raise brick alert
                                    if current_status.lower() == \
                                        "disconnected":
                                        disconnected_hosts.append(
                                            peers[
                                                'peer%s.primary_hostname' %
                                                index
                                            ]
                                        )
                            except etcd.EtcdKeyNotFound:
                                pass
                            SYNC_TTL += 5
                            peer.save(ttl=SYNC_TTL)
                            index += 1
                        except KeyError:
                            break
                    # Raise an alert for bricks when peer disconnected
                    # or node goes down
                    for disconnected_host in disconnected_hosts:
                        brick_status_alert(
                            disconnected_host
                        )
                if "Volumes" in raw_data:
                    index = 1
                    volumes = raw_data['Volumes']
                    # instantiating blivet class, this will be used for
                    # getting brick_device_details
                    b = blivet.Blivet()

                    # reset blivet during every sync to get latest information
                    # about storage devices in the machine
                    b.reset()
                    devicetree = b.devicetree
                    total_brick_count = 0
                    while True:
                        try:
                            b_count = sync_volumes(
                                volumes, index,
                                raw_data_options.get('Volume Options'),
                                SYNC_TTL + VOLUME_TTL,
                                _cluster.short_name,
                                devicetree
                            )
                            index += 1
                            SYNC_TTL += 1
                            total_brick_count += b_count - 1
                        except KeyError:
                            global VOLUME_TTL
                            # from second sync volume ttl is
                            # SYNC_TTL + (no.volumes) * 20 +
                            # (no.of.bricks) * 10 + 160
                            if index > 1:
                                volume_count = index - 1
                                # When all nodes are down we mark all volumes
                                # as down; node status TTL is 160, so make
                                # sure the volumes are still present in etcd
                                # while raising the volume down alert
                                VOLUME_TTL = (volume_count * 20) + (
                                    total_brick_count * 10) + 160
                            break
                    # populate the volume specific options
                    reg_ex = re.compile("^volume[0-9]+.options+")
                    options = {}
                    for key in volumes.keys():
                        if reg_ex.match(key):
                            options[key] = volumes[key]
                    for key in options.keys():
                        volname = key.split('.')[0]
                        vol_id = volumes['%s.id' % volname]
                        dict1 = {}
                        for k, v in options.items():
                            if k.startswith('%s.options' % volname):
                                dict1['.'.join(k.split(".")[2:])] = v
                                options.pop(k, None)
                        volume = NS.tendrl.objects.GlusterVolume(
                            NS.tendrl_context.integration_id,
                            vol_id=vol_id
                        ).load()
                        if volume.options is not None:
                            dest = dict(volume.options)
                            dest.update(dict1)
                            volume.options = dest
                            volume.save()

                # Sync cluster global details
                if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                    all_volumes = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id
                    ).load_all() or []
                    volumes = []
                    for volume in all_volumes:
                        if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                            # only for first sync refresh volume TTL
                            # It will increase TTL based on no.of volumes
                            if _cnc.first_sync_done in [None, "no", ""]:
                                etcd_utils.refresh(
                                    volume.value,
                                    SYNC_TTL + VOLUME_TTL
                                )
                            volumes.append(volume)
                    cluster_status.sync_cluster_status(
                        volumes, SYNC_TTL + VOLUME_TTL
                    )
                    utilization.sync_utilization_details(volumes)
                    client_connections.sync_volume_connections(volumes)
                    georep_details.aggregate_session_status()
                    try:
                        evt.process_events()
                    except etcd.EtcdKeyNotFound:
                        pass
                    rebalance_status.sync_volume_rebalance_status(volumes)
                    rebalance_status.sync_volume_rebalance_estimated_time(
                        volumes
                    )
                    snapshots.sync_volume_snapshots(
                        raw_data['Volumes'],
                        int(NS.config.data.get(
                            "sync_interval", 10
                        )) + len(volumes) * 4
                    )
                    # update alert count
                    update_cluster_alert_count()
                # check and enable volume profiling
                if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                    self._enable_disable_volume_profiling()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if _cluster.exists():
                    _cluster = _cluster.load()
                    _cluster.last_sync = str(tendrl_now())
                    # Mark the first sync done flag
                    _cnc = NS.tendrl.objects.ClusterNodeContext(
                        node_id=NS.node_context.node_id
                    ).load()
                    if _cnc.first_sync_done in [None, "no"]:
                        _cnc.first_sync_done = "yes"
                        _cnc.save()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                        _cluster.save()
            except Exception as ex:
                Event(
                    ExceptionMessage(
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": "gluster sds state sync error",
                                 "exception": ex
                                 }
                    )
                )
            try:
                etcd_utils.read(
                    '/clusters/%s/_sync_now' %
                    NS.tendrl_context.integration_id
                )
                continue
            except etcd.EtcdKeyNotFound:
                pass

            time.sleep(_sleep)

        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "%s complete" % self.__class__.__name__}
        )
Example #8
from tendrl.commons.utils.time_utils import now as tendrl_now

tendrl_collectd_severity_map = {
    'FAILURE': 'CRITICAL',
    'WARNING': 'WARNING',
    'OK': 'INFO',
    'OKAY': 'INFO'
}

config = load_config('node-monitoring',
                     '/etc/tendrl/node-monitoring/node-monitoring.conf.yaml')

central_store = etcd_client(host=config['etcd_connection'],
                            port=config['etcd_port'])

timestamp = tendrl_now().isoformat()

if is_collectd_imported:
    sys.path.append('/usr/lib64/collectd')
'''Collectd forks an instance of this plugin per detected threshold breach.
Read the details of the threshold breach from the standard input of the
current fork.'''


def get_notification():
    collectd_alert = {}
    is_end_of_dictionary = False
    for line in sys.stdin:
        if not line.strip():
            is_end_of_dictionary = True
            continue
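The example is cut off inside get_notification; a self-contained sketch of the same idea, assuming collectd's usual exec-notification layout on stdin ("Key: value" header lines, a blank line, then the free-form message), could look like:

import sys


def read_collectd_notification(stream=sys.stdin):
    # Parse "Key: value" header lines until the first blank line, then
    # collect the remaining lines as the free-form notification message.
    headers = {}
    message_lines = []
    in_message = False
    for line in stream:
        if in_message:
            message_lines.append(line.rstrip("\n"))
        elif not line.strip():
            in_message = True
        elif ":" in line:
            key, value = line.split(":", 1)
            headers[key.strip()] = value.strip()
    headers["message"] = "\n".join(message_lines)
    return headers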
def update_last_seen_at():
    etcd_utils.write(
        '/monitoring/nodes/%s/last_seen_at' % NS.node_context.node_id,
        tendrl_now().isoformat())
    def run(self):
        # To detect out of band deletes
        # refresh gluster object inventory at config['sync_interval']
        # Default is 260 seconds
        SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 250

        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={"message": "%s running" % self.__class__.__name__}))

        gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
        gluster_brick_dir.save()

        try:
            etcd_utils.read("clusters/%s/"
                            "cluster_network" %
                            NS.tendrl_context.integration_id)
        except etcd.EtcdKeyNotFound:
            try:
                node_networks = etcd_utils.read("nodes/%s/Networks" %
                                                NS.node_context.node_id)
                # TODO(team) this logic needs to change later
                # multiple networks supported for gluster use case
                node_network = NS.tendrl.objects.NodeNetwork(
                    interface=node_networks.leaves.next().key.split(
                        '/')[-1]).load()
                cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id).load()
                cluster.cluster_network = node_network.subnet
                cluster.save()
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    Message(priority="error",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Failed to sync cluster network details"
                            }))

        _sleep = 0
        while not self._complete.is_set():
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            try:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id).load()
                if _cluster.import_status == "failed":
                    continue

                try:
                    NS._int.wclient.write("clusters/%s/"
                                          "sync_status" %
                                          NS.tendrl_context.integration_id,
                                          "in_progress",
                                          prevExist=False)
                except (etcd.EtcdAlreadyExist, etcd.EtcdCompareFailed) as ex:
                    pass

                subprocess.call([
                    'gluster', 'get-state', 'glusterd', 'odir', '/var/run',
                    'file', 'glusterd-state', 'detail'
                ])
                raw_data = ini2json.ini_to_dict('/var/run/glusterd-state')
                subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
                subprocess.call([
                    'gluster', 'get-state', 'glusterd', 'odir', '/var/run',
                    'file', 'glusterd-state-vol-opts', 'volumeoptions'
                ])
                raw_data_options = ini2json.ini_to_dict(
                    '/var/run/glusterd-state-vol-opts')
                subprocess.call(
                    ['rm', '-rf', '/var/run/glusterd-state-vol-opts'])
                sync_object = NS.gluster.objects.\
                    SyncObject(data=json.dumps(raw_data))
                sync_object.save()

                if "Peers" in raw_data:
                    index = 1
                    peers = raw_data["Peers"]
                    while True:
                        try:
                            peer = NS.gluster.\
                                objects.Peer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    hostname=peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    state=peers['peer%s.state' % index],
                                    connected=peers['peer%s.connected' % index]
                                )
                            try:
                                stored_peer_status = NS._int.client.read(
                                    "clusters/%s/Peers/%s/connected" %
                                    (NS.tendrl_context.integration_id,
                                     peers['peer%s.uuid' % index])).value
                                current_status = peers['peer%s.connected' %
                                                       index]
                                if stored_peer_status != "" and \
                                    current_status != stored_peer_status:
                                    msg = (
                                        "Status of peer: %s in cluster %s "
                                        "changed from %s to %s") % (
                                            peers['peer%s.primary_hostname' %
                                                  index],
                                            NS.tendrl_context.integration_id,
                                            stored_peer_status, current_status)
                                    instance = "peer_%s" % peers[
                                        'peer%s.primary_hostname' % index]
                                    event_utils.emit_event(
                                        "peer_status", current_status, msg,
                                        instance, 'WARNING'
                                        if current_status != 'Connected' else
                                        'INFO')
                            except etcd.EtcdKeyNotFound:
                                pass

                            peer.save(ttl=SYNC_TTL)
                            index += 1
                        except KeyError:
                            break
                if "Volumes" in raw_data:
                    index = 1
                    volumes = raw_data['Volumes']
                    while True:
                        try:
                            sync_volumes(
                                volumes, index,
                                raw_data_options.get('Volume Options'))
                            index += 1
                        except KeyError:
                            break
                    # populate the volume specific options
                    reg_ex = re.compile("^volume[0-9]+.options+")
                    options = {}
                    for key in volumes.keys():
                        if reg_ex.match(key):
                            options[key] = volumes[key]
                    for key in options.keys():
                        volname = key.split('.')[0]
                        vol_id = volumes['%s.id' % volname]
                        dict1 = {}
                        for k, v in options.items():
                            if k.startswith('%s.options' % volname):
                                dict1['.'.join(k.split(".")[2:])] = v
                                options.pop(k, None)
                        NS.gluster.objects.VolumeOptions(vol_id=vol_id,
                                                         options=dict1).save()

                # Sync cluster global details
                if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                    all_volumes = NS.gluster.objects.Volume().load_all() or []
                    volumes = []
                    for volume in all_volumes:
                        if not str(volume.deleted).lower() == "true":
                            volumes.append(volume)
                    cluster_status.sync_cluster_status(volumes)
                    utilization.sync_utilization_details(volumes)
                    client_connections.sync_volume_connections(volumes)
                    georep_details.aggregate_session_status()
                    evt.process_events()
                    rebalance_status.sync_volume_rebalance_status(volumes)
                    rebalance_status.sync_volume_rebalance_estimated_time(
                        volumes)
                    snapshots.sync_volume_snapshots(
                        raw_data['Volumes'],
                        int(NS.config.data.get("sync_interval", 10)) +
                        len(volumes) * 10)

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id)
                if _cluster.exists():
                    _cluster = _cluster.load()
                    _cluster.sync_status = "done"
                    _cluster.last_sync = str(tendrl_now())
                    _cluster.is_managed = "yes"
                    _cluster.save()
                    # Initialize alert count
                    try:
                        alerts_count_key = '/clusters/%s/alert_counters' % (
                            NS.tendrl_context.integration_id)
                        etcd_utils.read(alerts_count_key)
                    except (etcd.EtcdException) as ex:
                        if type(ex) == etcd.EtcdKeyNotFound:
                            ClusterAlertCounters(
                                integration_id=NS.tendrl_context.integration_id
                            ).save()
                # check and enable volume profiling
                if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                    self._enable_disable_volume_profiling()

            except Exception as ex:
                Event(
                    ExceptionMessage(priority="error",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message":
                                         "gluster sds state sync error",
                                         "exception": ex
                                     }))
            try:
                etcd_utils.read('/clusters/%s/_sync_now' %
                                NS.tendrl_context.integration_id)
                continue
            except etcd.EtcdKeyNotFound:
                pass

            time.sleep(_sleep)

        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": "%s complete" % self.__class__.__name__}))
Example #11
 def format_alert(self, alert_json):
     alert = self.parse_alert_metrics(alert_json)
     try:
         alert["alert_id"] = None
         alert["node_id"] = utils.find_node_id(
             alert['tags']['integration_id'], alert['tags']['fqdn'])
         alert["time_stamp"] = tendrl_now().isoformat()
         alert["resource"] = self.representive_name
         alert['alert_type'] = constants.ALERT_TYPE
         alert['significance'] = constants.SIGNIFICANCE_HIGH
         alert['pid'] = utils.find_grafana_pid()
         alert['source'] = constants.ALERT_SOURCE
         alert['tags']['cluster_name'] = utils.find_cluster_name(
             alert['tags']['integration_id'])
         alert["tags"]["volume_name"] = utils.find_volume_name(
             alert['tags']['integration_id'],
             alert['tags']['fqdn'].replace('_', '.'),
             alert['tags']['brick_path'].strip(":").replace(
                 grafana_constants.BRICK_PATH_SEPARATOR, '_'))
         if alert_json['State'] == constants.GRAFANA_ALERT:
             if "critical" in alert_json['Name'].lower():
                 alert['severity'] = \
                     constants.TENDRL_SEVERITY_MAP['critical']
             else:
                 alert['severity'] = \
                     constants.TENDRL_SEVERITY_MAP['warning']
             # Modify brick path symbol to slash(/) in alert message
             alert['tags']['message'] = (
                 "Brick utilization on %s:%s in %s "
                 "at %s %% and nearing full capacity" %
                 (alert['tags']['fqdn'],
                  alert['tags']['brick_path'].replace(
                      grafana_constants.BRICK_PATH_SEPARATOR, "/"),
                  alert["tags"]["volume_name"], alert['current_value']))
         elif alert_json['State'] == constants.GRAFANA_CLEAR_ALERT:
             # Identifying clear alert from which panel critical/warning
             if "critical" in alert_json['Name'].lower():
                 alert['tags']['clear_alert'] = \
                     constants.TENDRL_SEVERITY_MAP['critical']
             elif "warning" in alert_json['Name'].lower():
                 alert['tags']['clear_alert'] = \
                     constants.TENDRL_SEVERITY_MAP['warning']
             alert['severity'] = constants.TENDRL_SEVERITY_MAP['info']
             # Modify brick path symbol to slash(/) in alert message
             alert['tags']['message'] = (
                 "Brick utilization of %s:%s in %s "
                 "back to normal" %
                 (alert['tags']['fqdn'],
                  alert['tags']['brick_path'].replace(
                      grafana_constants.BRICK_PATH_SEPARATOR,
                      "/"), alert["tags"]["volume_name"]))
         else:
             logger.log(
                 "error", NS.publisher_id, {
                     "message": "Unsupported alert %s "
                     "severity" % alert_json
                 })
             raise InvalidAlertSeverity
         return alert
     except (KeyError, CalledProcessError, EtcdKeyNotFound, NodeNotFound,
             InvalidAlertSeverity) as ex:
         Event(
             ExceptionMessage(
                 "debug", NS.publisher_id, {
                     "message":
                     "Error in converting grafana"
                     "alert into tendrl alert %s" % alert_json,
                     "exception":
                     ex
                 }))
Example #12
    def run(self):
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "%s running" % self.__class__.__name__}
        )

        gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
        gluster_brick_dir.save()

        cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        if cluster.cluster_network in [None, ""]:
            try:
                node_networks = NS.tendrl.objects.NodeNetwork().load_all()
                cluster.cluster_network = node_networks[0].subnet
                cluster.save()
            except etcd.EtcdKeyNotFound as ex:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Failed to sync cluster network details"}
                )
        _sleep = 0
        while not self._complete.is_set():
            # To detect out of band deletes
            # refresh gluster object inventory at config['sync_interval']
            SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
            NS.node_context = NS.node_context.load()
            NS.tendrl_context = NS.tendrl_context.load()
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            try:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if (_cluster.status == "importing" and (
                    _cluster.current_job['status'] == 'failed')) or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                    time.sleep(_sleep)
                    continue

                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                _cnc.is_managed = "yes"
                _cnc.save()
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state',
                        'detail'
                    ]
                )
                raw_data = ini2json.ini_to_dict(
                    '/var/run/glusterd-state'
                )
                subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state-vol-opts',
                        'volumeoptions'
                    ]
                )
                raw_data_options = ini2json.ini_to_dict(
                    '/var/run/glusterd-state-vol-opts'
                )
                subprocess.call(
                    [
                        'rm',
                        '-rf',
                        '/var/run/glusterd-state-vol-opts'
                    ]
                )
                sync_object = NS.gluster.objects.\
                    SyncObject(data=json.dumps(raw_data))
                sync_object.save()

                if "Peers" in raw_data:
                    index = 1
                    peers = raw_data["Peers"]
                    disconnected_hosts = []
                    while True:
                        try:
                            peer = NS.tendrl.\
                                objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    hostname=peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    state=peers['peer%s.state' % index],
                                    connected=peers['peer%s.connected' % index]
                                )
                            try:
                                stored_peer_status = None
                                # find peer detail using hostname
                                ip = socket.gethostbyname(
                                    peers['peer%s.primary_hostname' % index]
                                )
                                node_id = etcd_utils.read(
                                    "/indexes/ip/%s" % ip
                                ).value
                                stored_peer = NS.tendrl.objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    node_id=node_id
                                ).load()
                                stored_peer_status = stored_peer.connected
                                current_status = peers[
                                    'peer%s.connected' % index
                                ]
                                if stored_peer_status and \
                                    current_status != stored_peer_status:
                                    msg = (
                                        "Peer %s in cluster %s "
                                        "is %s"
                                    ) % (
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ],
                                        _cluster.short_name,
                                        current_status
                                    )
                                    instance = "peer_%s" % peers[
                                        'peer%s.primary_hostname' % index
                                    ]
                                    event_utils.emit_event(
                                        "peer_status",
                                        current_status,
                                        msg,
                                        instance,
                                        'WARNING'
                                        if current_status != 'Connected'
                                        else 'INFO'
                                    )
                                    # save current status in actual peer
                                    # directory also
                                    stored_peer.connected = current_status
                                    stored_peer.save()
                                    # Disconnected host name to
                                    # raise brick alert
                                    if current_status.lower() == \
                                        "disconnected":
                                        disconnected_hosts.append(
                                            peers[
                                                'peer%s.primary_hostname' %
                                                index
                                            ]
                                        )
                            except etcd.EtcdKeyNotFound:
                                pass
                            SYNC_TTL += 5
                            peer.save(ttl=SYNC_TTL)
                            index += 1
                        except KeyError:
                            break
                    # Raise an alert for bricks when peer disconnected
                    # or node goes down
                    for disconnected_host in disconnected_hosts:
                        brick_status_alert(
                            disconnected_host
                        )
                if "Volumes" in raw_data:
                    # create devicetree using lsblk
                    devicetree = get_device_tree()
                    # find lvs
                    lvs = brick_utilization.get_lvs()
                    index = 1
                    volumes = raw_data['Volumes']
                    total_brick_count = 0
                    while True:
                        try:
                            b_count = sync_volumes(
                                volumes, index,
                                raw_data_options.get('Volume Options'),
                                SYNC_TTL + VOLUME_TTL,
                                _cluster.short_name,
                                devicetree,
                                lvs
                            )
                            index += 1
                            SYNC_TTL += 1
                            total_brick_count += b_count - 1
                        except KeyError:
                            global VOLUME_TTL
                            # from second sync volume ttl is
                            # SYNC_TTL + (no.volumes) * 20 +
                            # (no.of.bricks) * 10 + 160
                            if index > 1:
                                volume_count = index - 1
                                # When all nodes are down we mark all volumes
                                # as down; node status TTL is 160, so make
                                # sure the volumes are still present in etcd
                                # while raising the volume down alert
                                VOLUME_TTL = (volume_count * 20) + (
                                    total_brick_count * 10) + 160
                            break
                    # populate the volume specific options
                    reg_ex = re.compile("^volume[0-9]+.options+")
                    options = {}
                    for key in volumes.keys():
                        if reg_ex.match(key):
                            options[key] = volumes[key]
                    for key in options.keys():
                        volname = key.split('.')[0]
                        vol_id = volumes['%s.id' % volname]
                        dict1 = {}
                        for k, v in options.items():
                            if k.startswith('%s.options' % volname):
                                dict1['.'.join(k.split(".")[2:])] = v
                                options.pop(k, None)
                        volume = NS.tendrl.objects.GlusterVolume(
                            NS.tendrl_context.integration_id,
                            vol_id=vol_id
                        ).load()
                        if volume.options is not None:
                            dest = dict(volume.options)
                            dest.update(dict1)
                            volume.options = dest
                            volume.save()

                # Sync cluster global details
                if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                    all_volumes = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id
                    ).load_all() or []
                    volumes = []
                    for volume in all_volumes:
                        if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                            # only for first sync refresh volume TTL
                            # It will increase TTL based on no.of volumes
                            if _cnc.first_sync_done in [None, "no", ""]:
                                etcd_utils.refresh(
                                    volume.value,
                                    SYNC_TTL + VOLUME_TTL
                                )
                            volumes.append(volume)
                    cluster_status.sync_cluster_status(
                        volumes, SYNC_TTL + VOLUME_TTL
                    )
                    utilization.sync_utilization_details(volumes)
                    client_connections.sync_volume_connections(volumes)
                    georep_details.aggregate_session_status()
                    try:
                        evt.process_events()
                    except etcd.EtcdKeyNotFound:
                        pass
                    rebalance_status.sync_volume_rebalance_status(volumes)
                    rebalance_status.sync_volume_rebalance_estimated_time(
                        volumes
                    )
                    snapshots.sync_volume_snapshots(
                        raw_data['Volumes'],
                        int(NS.config.data.get(
                            "sync_interval", 10
                        )) + len(volumes) * 4
                    )
                    # update alert count
                    update_cluster_alert_count()
                # check and enable volume profiling
                if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                    self._update_volume_profiling()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if _cluster.exists():
                    _cluster = _cluster.load()
                    _cluster.last_sync = str(tendrl_now())
                    # Mark the first sync done flag
                    _cnc = NS.tendrl.objects.ClusterNodeContext(
                        node_id=NS.node_context.node_id
                    ).load()
                    if _cnc.first_sync_done in [None, "no"]:
                        _cnc.first_sync_done = "yes"
                        _cnc.save()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                        _cluster.save()
            except Exception as ex:
                Event(
                    ExceptionMessage(
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": "gluster sds state sync error",
                                 "exception": ex
                                 }
                    )
                )
            try:
                etcd_utils.read(
                    '/clusters/%s/_sync_now' %
                    NS.tendrl_context.integration_id
                )
                continue
            except etcd.EtcdKeyNotFound:
                pass

            time.sleep(_sleep)

        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "%s complete" % self.__class__.__name__}
        )
def update_last_seen_at():
    NS._int.wclient.write(
        '/monitoring/nodes/%s/last_seen_at' % NS.node_context.node_id,
        tendrl_now().isoformat()
    )