def on_change_status(self, prev_value, current_value):
    if current_value is None:
        self.status = "unhealthy"
        self.save()
        _ctc = NS.tendrl.objects.ClusterTendrlContext(
            integration_id=self.integration_id
        ).load()
        msg = "Cluster {0} moved to unhealthy state".format(
            _ctc.cluster_name
        )
        event_utils.emit_event(
            "cluster_health_status",
            "unhealthy",
            msg,
            "cluster_{0}".format(_ctc.integration_id),
            "WARNING",
            integration_id=_ctc.integration_id,
            cluster_name=_ctc.cluster_name,
            sds_name=_ctc.sds_name
        )
def on_change(self, attr, prev_value, current_value):
    if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
        _tc = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
        # Check whether the node is managed
        _cnc = NS.tendrl.objects.ClusterNodeContext(
            node_id=self.node_id,
            integration_id=_tc.integration_id
        ).load()
        if current_value is None and str(_cnc.is_managed).lower() == "yes":
            self.status = "DOWN"
            self.save()
            msg = "Node {0} is DOWN".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                self.status,
                msg,
                "node_{0}".format(self.fqdn),
                "WARNING",
                node_id=self.node_id,
                integration_id=_tc.integration_id
            )
            # Loading ClusterNodeContext also loads the underlying
            # NodeContext, which gets updated with the latest values
            _cnc_new = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id,
                first_sync_done=_cnc.first_sync_done,
                is_managed=_cnc.is_managed
            )
            _cnc_new.save()
            del _cnc_new
            # Update cluster details
            self.update_cluster_details(_tc.integration_id)
            _tag = "provisioner/%s" % _tc.integration_id
            if _tag in self.tags:
                _index_key = "/indexes/tags/%s" % _tag
                self.tags.remove(_tag)
                self.save()
                etcd_utils.delete(_index_key)
            if _tc.sds_name in ["gluster", "RHGS"]:
                bricks = etcd_utils.read(
                    "clusters/{0}/Bricks/all/{1}".format(
                        _tc.integration_id,
                        self.fqdn
                    )
                )
                for brick in bricks.leaves:
                    try:
                        etcd_utils.write(
                            "{0}/status".format(brick.key),
                            "Stopped"
                        )
                    except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                        pass
        elif current_value == "UP" and str(_cnc.is_managed).lower() == "yes":
            msg = "{0} is UP".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                "UP",
                msg,
                "node_{0}".format(self.fqdn),
                "INFO",
                node_id=self.node_id,
                integration_id=_tc.integration_id
            )
        del _cnc
def sync_cluster_status(volumes, sync_ttl):
    degraded_count = 0
    is_healthy = True
    # If there is a failed import cluster flow,
    # mark the cluster status as unhealthy
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if _cluster.current_job.get('job_name', '') == "ImportCluster" and \
            _cluster.current_job.get('status', '') == "failed":
        is_healthy = False
    # Calculate status based on volume statuses
    if len(volumes) > 0:
        volume_states = _derive_volume_states(volumes)
        for vol_id, state in volume_states.iteritems():
            if 'down' in state or 'partial' in state:
                is_healthy = False
            if 'degraded' in state:
                degraded_count += 1
    # Change status based on node status
    cmd = cmd_utils.Command('gluster pool list', True)
    out, err, rc = cmd.run()
    peer_count = 0
    if not err:
        out_lines = out.split('\n')
        connected = True
        for index in range(1, len(out_lines)):
            peer_count += 1
            node_status_det = out_lines[index].split('\t')
            if len(node_status_det) > 2:
                if node_status_det[2].strip() != 'Connected':
                    connected = connected and False
        if not connected:
            is_healthy = False
    cluster_gd = NS.gluster.objects.GlobalDetails().load()
    old_status = cluster_gd.status or 'unhealthy'
    curr_status = 'healthy' if is_healthy else 'unhealthy'
    if curr_status != old_status:
        msg = ("Health status of cluster: %s "
               "changed from %s to %s") % (
            NS.tendrl_context.integration_id,
            old_status,
            curr_status)
        instance = "cluster_%s" % NS.tendrl_context.integration_id
        event_utils.emit_event(
            "cluster_health_status",
            curr_status,
            msg,
            instance,
            'WARNING' if curr_status == 'unhealthy' else 'INFO')
    # Persist the cluster status
    NS.gluster.objects.GlobalDetails(
        status='healthy' if is_healthy else 'unhealthy',
        peer_count=peer_count,
        vol_count=len(volumes),
        volume_up_degraded=degraded_count).save(ttl=sync_ttl)
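# For reference, the `gluster pool list` parsing above expects tab-separated
# rows after a header line. A minimal, standalone sketch of the same check,
# run against illustrative output (not captured from a real cluster):
sample_out = (
    "UUID\tHostname\tState\n"
    "5f6d...\tnode1.example.com\tConnected\n"
    "8a2b...\tnode2.example.com\tDisconnected"
)
connected = True
for line in sample_out.split('\n')[1:]:  # skip the header row
    fields = line.split('\t')
    if len(fields) > 2 and fields[2].strip() != 'Connected':
        connected = False
print(connected)  # False, because node2 is Disconnected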
def test_emit_event():
    setattr(__builtin__, "NS", maps.NamedDict())
    NS.publisher_id = 0
    NS.node_context = maps.NamedDict(fqdn="test", node_id="0")
    NS.tendrl_context = maps.NamedDict(
        integration_id="",
        cluster_name="",
        sds_name=""
    )
    emit_event("test", "test", "test", "test", "test",
               tags=maps.NamedDict(entity_type="brick"))
    emit_event("test", "test", "test", "test", "test",
               tags=maps.NamedDict(entity_type="volume"))
def brick_status_alert(hostname):
    try:
        # fetch brick details of the disconnected node
        lock = None
        path = "clusters/%s/Bricks/all/%s" % (
            NS.tendrl_context.integration_id,
            hostname)
        lock = etcd.Lock(NS._int.client, path)
        lock.acquire(blocking=True, lock_ttl=60)
        if lock.is_acquired:
            bricks = NS.gluster.objects.Brick(fqdn=hostname).load_all()
            for brick in bricks:
                if brick.status.lower() == BRICK_STARTED:
                    # raise an alert for the brick
                    msg = ("Status of brick: %s "
                           "under volume %s in cluster %s changed "
                           "from %s to %s") % (
                        brick.brick_path,
                        brick.vol_name,
                        NS.tendrl_context.integration_id,
                        BRICK_STARTED.title(),
                        BRICK_STOPPED.title())
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name,
                        brick.brick_path,
                    )
                    event_utils.emit_event(
                        "brick_status",
                        BRICK_STOPPED.title(),
                        msg,
                        instance,
                        'WARNING',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": brick.vol_name,
                              "node_id": brick.node_id,
                              "fqdn": brick.hostname})
                    # Update brick status as stopped
                    brick.status = BRICK_STOPPED.title()
                    brick.save()
            lock.release()
    except (etcd.EtcdException, KeyError, ValueError, AttributeError) as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Unable to raise a brick status "
                               "alert for host %s" % hostname,
                    "exception": ex
                }
            )
        )
    finally:
        if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
            lock.release()
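# The function above serializes concurrent alert runs through python-etcd's
# distributed lock. A minimal sketch of that acquire/release pattern, with an
# illustrative lock name and a local etcd assumed on the default port:
import etcd

client = etcd.Client(host="127.0.0.1", port=2379)
lock = etcd.Lock(client, "bricks-demo-host")
# lock_ttl lets the lock expire on its own if the holder dies
# before calling release()
lock.acquire(blocking=True, lock_ttl=60)
try:
    if lock.is_acquired:
        pass  # guarded work goes here
finally:
    if lock.is_acquired:
        lock.release()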
def sync_volume_rebalance_status(volumes):
    for volume in volumes:
        rebal_status_list = []
        if "Distribute" in volume.vol_type or (
                "arbiter" in volume.vol_type and
                (int(volume.brick_count) > int(volume.replica_count))):
            vol_rebal_details = NS.gluster.objects.RebalanceDetails(
                vol_id=volume.vol_id
            ).load_all()
            for entry in vol_rebal_details:
                rebal_status_list.append(entry.rebal_status)
            if not rebal_status_list:
                continue
            new_rebal_status = "unknown"
            if all(item == "not_started" for item in rebal_status_list):
                new_rebal_status = "not_started"
            else:
                # remove not_started states from the list as these are
                # from nodes that are not involved in rebalance
                rebal_status_list = filter(
                    lambda state: state != 'not_started',
                    rebal_status_list)
                if "failed" in rebal_status_list:
                    new_rebal_status = "failed"
                elif "layout_fix_failed" in rebal_status_list:
                    new_rebal_status = "layout_fix_failed"
                elif "layout_fix_started" in rebal_status_list:
                    new_rebal_status = "layout_fix_started"
                elif "started" in rebal_status_list:
                    new_rebal_status = "started"
                elif all(item == "completed"
                         for item in rebal_status_list):
                    new_rebal_status = "completed"
                elif all(item == "stopped"
                         for item in rebal_status_list):
                    new_rebal_status = "stopped"
                elif all(item == "layout_fix_complete"
                         for item in rebal_status_list):
                    new_rebal_status = "layout_fix_complete"
                elif all(item == "layout_fix_stopped"
                         for item in rebal_status_list):
                    new_rebal_status = "layout_fix_stopped"
            if volume.rebal_status != "" and \
                    new_rebal_status != volume.rebal_status:
                msg = "Volume:%s rebalance status changed to %s" % (
                    volume.name,
                    new_rebal_status)
                instance = "volume_%s" % volume.name
                event_utils.emit_event(
                    "rebalance_status",
                    new_rebal_status,
                    msg,
                    instance,
                    'INFO')
                volume.rebal_status = new_rebal_status
                volume.save()
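# The precedence encoded in the elif chain above is easier to see in
# isolation. A standalone sketch of the same aggregation (the helper name is
# hypothetical, not part of the module):
def aggregate_rebal_status(statuses):
    if all(s == "not_started" for s in statuses):
        return "not_started"
    # nodes not participating in the rebalance report not_started
    statuses = [s for s in statuses if s != "not_started"]
    for state in ("failed", "layout_fix_failed",
                  "layout_fix_started", "started"):
        if state in statuses:
            return state
    for state in ("completed", "stopped",
                  "layout_fix_complete", "layout_fix_stopped"):
        if all(s == state for s in statuses):
            return state
    return "unknown"

print(aggregate_rebal_status(["completed", "not_started", "completed"]))
# -> completed
print(aggregate_rebal_status(["started", "completed"]))
# -> started: any node still running keeps the volume in "started"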
def on_change(self, attr, prev_value, current_value):
    if attr == "status":
        if current_value is None:
            self.status = "DOWN"
            self.save()
            msg = "Node {0} is DOWN".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                self.status,
                msg,
                "node_{0}".format(self.fqdn),
                "WARNING",
                node_id=self.node_id)
            _tc = NS.tendrl.objects.TendrlContext(
                node_id=self.node_id).load()
            _tag = "provisioner/%s" % _tc.integration_id
            if _tag in self.tags:
                _index_key = "/indexes/tags/%s" % _tag
                self.tags.remove(_tag)
                self.save()
                etcd_utils.delete(_index_key)
                _msg = "node_sync, STALE provisioner node " \
                       "found! re-configuring monitoring " \
                       "(job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % self.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id': _tc.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(
                    job_id=_job_id,
                    status="new",
                    payload=payload).save()
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {"message": _msg % _job_id})
            if _tc.sds_name == "gluster":
                bricks = etcd_utils.read(
                    "clusters/{0}/Bricks/all/{1}".format(
                        _tc.integration_id,
                        self.fqdn))
                for brick in bricks.leaves:
                    try:
                        etcd_utils.write(
                            "{0}/status".format(brick.key),
                            "Stopped")
                    except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                        pass
def process_events():
    events = NS.gluster.objects.NativeEvents().load_all()
    if events:
        for event in events:
            try:
                event.tags = json.loads(event.tags)
            except (TypeError, ValueError):
                # tags can be None
                pass
            if event.severity == "recovery" and not event.recovery_processed:
                # this particular event is a recovery event,
                # so process it and let the marker expire
                event_utils.emit_event(
                    event.context.split("|")[0],
                    event.current_value,
                    event.message,
                    event.context,
                    "INFO",
                    tags=event.tags)
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context,
                    recovery_processed=True)
                processed_event.save(ttl=POST_RECOVERY_TTL)
                continue
            if event.alert_notify and not event.processed:
                event_utils.emit_event(
                    event.context.split("|")[0],
                    event.current_value,
                    event.message,
                    event.context,
                    event.severity.upper(),
                    alert_notify=event.alert_notify,
                    tags=event.tags)
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context,
                    processed=True)
                processed_event.save(NOTIFICATION_TTL)
                continue
            if event.severity == "warning" and not event.processed:
                event_utils.emit_event(
                    event.context.split("|")[0],
                    event.current_value,
                    event.message,
                    event.context,
                    "WARNING",
                    tags=event.tags)
                processed_event = NS.gluster.objects.NativeEvents(
                    event.context,
                    processed=True)
                processed_event.save()
                continue
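# The recovery branch above leans on etcd key TTLs: saving the event with
# recovery_processed=True and a TTL lets etcd drop the marker once
# POST_RECOVERY_TTL elapses, so the event is not re-emitted in the meantime.
# A minimal sketch of that expiring-marker pattern with python-etcd (the key
# path and TTL are illustrative):
import etcd

client = etcd.Client(host="127.0.0.1", port=2379)
# etcd removes the key automatically after 60 seconds; reading it after
# expiry raises etcd.EtcdKeyNotFound
client.write("/events/demo/recovery_processed", "True", ttl=60)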
def brick_status_alert(hostname):
    try:
        # fetch brick details of the disconnected node
        lock = None
        path = "clusters/%s/Bricks/all/%s" % (
            NS.tendrl_context.integration_id,
            hostname)
        lock = etcd.Lock(NS._int.client, path)
        lock.acquire(blocking=True, lock_ttl=60)
        if lock.is_acquired:
            bricks = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=hostname
            ).load_all()
            for brick in bricks:
                if brick.status.lower() == BRICK_STARTED:
                    # raise an alert for the brick
                    msg = "Brick:%s in volume:%s has %s" % (
                        brick.brick_path,
                        brick.vol_name,
                        BRICK_STOPPED.title())
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name,
                        brick.brick_path,
                    )
                    event_utils.emit_event(
                        "brick_status",
                        BRICK_STOPPED.title(),
                        msg,
                        instance,
                        'WARNING',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": brick.vol_name,
                              "node_id": brick.node_id,
                              "fqdn": brick.hostname})
                    # Update brick status as stopped
                    brick.status = BRICK_STOPPED.title()
                    brick.save()
            lock.release()
    except (etcd.EtcdException, KeyError, ValueError, AttributeError) as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Unable to raise a brick status "
                               "alert for host %s" % hostname,
                    "exception": ex
                }
            )
        )
    finally:
        if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
            lock.release()
def run(self):
    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )
    gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
    gluster_brick_dir.save()
    cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if cluster.cluster_network in [None, ""]:
        try:
            node_networks = NS.tendrl.objects.NodeNetwork().load_all()
            cluster.cluster_network = node_networks[0].subnet
            cluster.save()
        except etcd.EtcdKeyNotFound as ex:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to sync cluster network details"}
            )
    _sleep = 0
    while not self._complete.is_set():
        # To detect out of band deletes
        # refresh gluster object inventory at config['sync_interval']
        SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
        NS.node_context = NS.node_context.load()
        NS.tendrl_context = NS.tendrl_context.load()
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                time.sleep(_sleep)
                continue
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=NS.node_context.node_id
            ).load()
            _cnc.is_managed = "yes"
            _cnc.save()
            subprocess.call(
                [
                    'gluster', 'get-state', 'glusterd',
                    'odir', '/var/run',
                    'file', 'glusterd-state',
                    'detail'
                ]
            )
            raw_data = ini2json.ini_to_dict('/var/run/glusterd-state')
            subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
            subprocess.call(
                [
                    'gluster', 'get-state', 'glusterd',
                    'odir', '/var/run',
                    'file', 'glusterd-state-vol-opts',
                    'volumeoptions'
                ]
            )
            raw_data_options = ini2json.ini_to_dict(
                '/var/run/glusterd-state-vol-opts'
            )
            subprocess.call(
                ['rm', '-rf', '/var/run/glusterd-state-vol-opts']
            )
            sync_object = NS.gluster.objects.SyncObject(
                data=json.dumps(raw_data)
            )
            sync_object.save()
            if "Peers" in raw_data:
                index = 1
                peers = raw_data["Peers"]
                disconnected_hosts = []
                while True:
                    try:
                        peer = NS.tendrl.objects.GlusterPeer(
                            peer_uuid=peers['peer%s.uuid' % index],
                            hostname=peers[
                                'peer%s.primary_hostname' % index
                            ],
                            state=peers['peer%s.state' % index],
                            connected=peers['peer%s.connected' % index]
                        )
                        try:
                            stored_peer_status = None
                            # find peer detail using hostname
                            ip = socket.gethostbyname(
                                peers['peer%s.primary_hostname' % index]
                            )
                            node_id = etcd_utils.read(
                                "/indexes/ip/%s" % ip
                            ).value
                            stored_peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                node_id=node_id
                            ).load()
                            stored_peer_status = stored_peer.connected
                            current_status = peers[
                                'peer%s.connected' % index
                            ]
                            if stored_peer_status and \
                                    current_status != stored_peer_status:
                                msg = "Peer %s in cluster %s is %s" % (
                                    peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    _cluster.short_name,
                                    current_status
                                )
                                instance = "peer_%s" % peers[
                                    'peer%s.primary_hostname' % index
                                ]
                                event_utils.emit_event(
                                    "peer_status",
                                    current_status,
                                    msg,
                                    instance,
                                    'WARNING'
                                    if current_status != 'Connected'
                                    else 'INFO'
                                )
                                # save current status in actual peer
                                # directory also
                                stored_peer.connected = current_status
                                stored_peer.save()
                                # Disconnected host name to
                                # raise brick alert
                                if current_status.lower() == \
                                        "disconnected":
                                    disconnected_hosts.append(
                                        peers[
                                            'peer%s.primary_hostname'
                                            % index
                                        ]
                                    )
                        except etcd.EtcdKeyNotFound:
                            pass
                        SYNC_TTL += 5
                        peer.save(ttl=SYNC_TTL)
                        index += 1
                    except KeyError:
                        break
                # Raise an alert for bricks when peer disconnected
                # or node goes down
                for disconnected_host in disconnected_hosts:
                    brick_status_alert(disconnected_host)
            if "Volumes" in raw_data:
                # create devicetree using lsblk
                devicetree = get_device_tree()
                # find lvs
                lvs = brick_utilization.get_lvs()
                index = 1
                volumes = raw_data['Volumes']
                total_brick_count = 0
                while True:
                    try:
                        b_count = sync_volumes(
                            volumes, index,
                            raw_data_options.get('Volume Options'),
                            SYNC_TTL + VOLUME_TTL,
                            _cluster.short_name,
                            devicetree,
                            lvs
                        )
                        index += 1
                        SYNC_TTL += 1
                        total_brick_count += b_count - 1
                    except KeyError:
                        global VOLUME_TTL
                        # from the second sync, volume ttl is
                        # SYNC_TTL + (no. of volumes) * 20 +
                        # (no. of bricks) * 10 + 160
                        if index > 1:
                            volume_count = index - 1
                            # When all nodes are down we are updating all
                            # volumes are down, node status TTL is 160,
                            # So make sure volumes are present in etcd
                            # while raising volume down alert
                            VOLUME_TTL = (volume_count * 20) + (
                                total_brick_count * 10) + 160
                        break
                # populate the volume specific options
                reg_ex = re.compile("^volume[0-9]+.options+")
                options = {}
                for key in volumes.keys():
                    if reg_ex.match(key):
                        options[key] = volumes[key]
                for key in options.keys():
                    volname = key.split('.')[0]
                    vol_id = volumes['%s.id' % volname]
                    dict1 = {}
                    for k, v in options.items():
                        if k.startswith('%s.options' % volname):
                            dict1['.'.join(k.split(".")[2:])] = v
                            options.pop(k, None)
                    volume = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id,
                        vol_id=vol_id
                    ).load()
                    if volume.options is not None:
                        dest = dict(volume.options)
                        dest.update(dict1)
                        volume.options = dest
                        volume.save()
            # Sync cluster global details
            if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                all_volumes = NS.tendrl.objects.GlusterVolume(
                    NS.tendrl_context.integration_id
                ).load_all() or []
                volumes = []
                for volume in all_volumes:
                    if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                        # only for first sync refresh volume TTL
                        # It will increase TTL based on no. of volumes
                        if _cnc.first_sync_done in [None, "no", ""]:
                            etcd_utils.refresh(
                                volume.value,
                                SYNC_TTL + VOLUME_TTL
                            )
                        volumes.append(volume)
                cluster_status.sync_cluster_status(
                    volumes, SYNC_TTL + VOLUME_TTL
                )
                utilization.sync_utilization_details(volumes)
                client_connections.sync_volume_connections(volumes)
                georep_details.aggregate_session_status()
                try:
                    evt.process_events()
                except etcd.EtcdKeyNotFound:
                    pass
                rebalance_status.sync_volume_rebalance_status(volumes)
                rebalance_status.sync_volume_rebalance_estimated_time(
                    volumes
                )
                snapshots.sync_volume_snapshots(
                    raw_data['Volumes'],
                    int(NS.config.data.get(
                        "sync_interval", 10
                    )) + len(volumes) * 4
                )
                # update alert count
                update_cluster_alert_count()
            # check and enable volume profiling
            if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                self._update_volume_profiling()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if _cluster.exists():
                _cluster = _cluster.load()
                _cluster.last_sync = str(tendrl_now())
                # Mark the first sync done flag
                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                if _cnc.first_sync_done in [None, "no"]:
                    _cnc.first_sync_done = "yes"
                    _cnc.save()
                if _cluster.current_job.get(
                    'status', ''
                ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                    _cluster.save()
        except Exception as ex:
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": "gluster sds state sync error",
                             "exception": ex}
                )
            )
        try:
            etcd_utils.read(
                '/clusters/%s/_sync_now' %
                NS.tendrl_context.integration_id
            )
            continue
        except etcd.EtcdKeyNotFound:
            pass
        time.sleep(_sleep)
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s complete" % self.__class__.__name__}
    )
def sync_volumes(volumes, index, vol_options, sync_ttl,
                 cluster_short_name, devicetree, lvs):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                    _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on the volume, skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                    current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                    volumes['volume%s.name' % index],
                    cluster_short_name,
                    stored_volume_status,
                    current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]}
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass
    volume = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id,
        vol_id=volumes['volume%s.id' % index]
    ).load()
    volume.vol_type = "arbiter" \
        if int(volumes['volume%s.arbiter_count' % index]) > 0 \
        else volumes['volume%s.type' % index]
    volume.name = volumes['volume%s.name' % index]
    volume.transport_type = volumes['volume%s.transport_type' % index]
    volume.status = volumes['volume%s.status' % index]
    volume.brick_count = volumes['volume%s.brickcount' % index]
    volume.snap_count = volumes['volume%s.snap_count' % index]
    volume.stripe_count = volumes['volume%s.stripe_count' % index]
    volume.replica_count = volumes['volume%s.replica_count' % index]
    volume.subvol_count = volumes['volume%s.subvol_count' % index]
    volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
    volume.disperse_count = volumes['volume%s.disperse_count' % index]
    volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
    volume.quorum_status = volumes['volume%s.quorum_status' % index]
    volume.snapd_status = volumes[
        'volume%s.snapd_svc.online_status' % index]
    volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
    if NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id,
        vol_id=volumes['volume%s.id' % index]
    ).exists():
        existing_vol = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume_profiling_old_value = existing_vol.profiling_enabled
    else:
        volume_profiling_old_value = volume.profiling_enabled
    if ('volume%s.profile_enabled' % index) in volumes:
        value = int(volumes['volume%s.profile_enabled' % index])
        if value == 1:
            volume_profiling_new_value = "yes"
        else:
            volume_profiling_new_value = "no"
    else:
        volume_profiling_new_value = None
    volume.profiling_enabled = volume_profiling_new_value
    if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
        # Raise alert for the profiling value change
        msg = ("Value of volume profiling for volume: %s "
               "of cluster %s changed from %s to %s" % (
                   volumes['volume%s.name' % index],
                   cluster_short_name,
                   volume_profiling_old_value,
                   volume_profiling_new_value))
        instance = "volume_%s" % volumes['volume%s.name' % index]
        event_utils.emit_event(
            "volume_profiling_status",
            volume_profiling_new_value,
            msg,
            instance,
            'INFO',
            tags={
                "entity_type": RESOURCE_TYPE_BRICK,
                "volume_name": volumes['volume%s.name' % index]
            }
        )
    volume.save(ttl=sync_ttl)
    # Save the default values of volume options
    vol_opt_dict = {}
    for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
        vol_opt_dict[
            vol_options['volume%s.options.key%s' % (index, opt_count)]
        ] = vol_options[
            'volume%s.options.value%s' % (index, opt_count)
        ]
    volume.options = vol_opt_dict
    volume.save()
    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)
    b_index = 1
    # ipv4 address of current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                               "any ipv4 networks for node"
                               " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except (TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ].split(":")[-1].replace("/", "_")
            # Raise alerts if the brick path changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                        current_status != stored_brick.status:
                    msg = "Brick:%s in volume:%s has %s" % (
                        volumes['volume%s.brick%s.path' % (
                            index, b_index)],
                        volumes['volume%s.name' % index],
                        current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index, b_index)]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.name' % index]}
                    )
            except etcd.EtcdKeyNotFound:
                pass
            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"
            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )
            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)],
                lvs
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.update_brick_device_details(
                brick_name,
                volumes['volume%s.brick%s.path' % (index, b_index)],
                devicetree,
                sync_ttl
            )
            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index)
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index)
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index)
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index)
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
def update_cluster_details(self, integration_id):
    try:
        nodes = etcd_utils.read(
            "/clusters/%s/nodes" % integration_id
        )
        for node in nodes.leaves:
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=node.key.split("/")[-1],
                integration_id=integration_id
            ).load()
            # Verify all nodes in the cluster are down
            if str(_cnc.status).lower() != "down" and \
                    str(_cnc.is_managed).lower() == "yes":
                # If any managed node is not down, don't update the
                # cluster details; unmanaged nodes need not be
                # considered
                return
        # when all managed nodes are down, update cluster details
        global_details = NS.tendrl.objects.GlobalDetails(
            integration_id=integration_id
        ).load()
        # Update cluster as unhealthy
        if global_details.status.lower() == "healthy":
            global_details.status = "unhealthy"
            global_details.save()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id
            ).load()
            msg = "Cluster:%s is %s" % (
                _cluster.short_name, "unhealthy")
            instance = "cluster_%s" % integration_id
            event_utils.emit_event(
                "cluster_health_status",
                "unhealthy",
                msg,
                instance,
                'WARNING',
                integration_id=integration_id
            )
        # Mark all bricks as down
        nodes = etcd_utils.read(
            "/clusters/%s/Bricks/all" % integration_id
        )
        for node in nodes.leaves:
            bricks = NS.tendrl.objects.GlusterBrick(
                integration_id,
                fqdn=node.key.split("/")[-1]
            ).load_all()
            for brick in bricks:
                if brick.status.lower() != "stopped":
                    brick.status = "Stopped"
                    brick.save()
                    msg = "Brick:%s in volume:%s has %s" % (
                        brick.brick_path,
                        brick.vol_name,
                        "Stopped"
                    )
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name,
                        brick.brick_path
                    )
                    event_utils.emit_event(
                        "brick_status",
                        "Stopped",
                        msg,
                        instance,
                        "WARNING",
                        integration_id=integration_id,
                        tags={"entity_type": "brick",
                              "volume_name": brick.vol_name,
                              "node_id": brick.node_id}
                    )
        # Mark all volumes as down
        volumes = NS.tendrl.objects.GlusterVolume(
            integration_id
        ).load_all()
        for volume in volumes:
            if volume.state.lower() != "down":
                volume.state = "down"
                volume.status = "Stopped"
                volume.save()
                msg = "Volume:%s is %s" % (volume.name, "down")
                instance = "volume_%s" % volume.name
                event_utils.emit_event(
                    "volume_state",
                    "down",
                    msg,
                    instance,
                    "WARNING",
                    integration_id=integration_id,
                    tags={"entity_type": "volume",
                          "volume_name": volume.name}
                )
    except etcd.EtcdKeyNotFound:
        pass
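# The directory scans above use python-etcd's directory results: reading a
# directory key returns an EtcdResult whose .leaves yields one child per
# entry. A minimal sketch (the path is illustrative):
import etcd

client = etcd.Client(host="127.0.0.1", port=2379)
nodes = client.read("/clusters/demo-cluster/nodes")
for node in nodes.leaves:
    node_id = node.key.split("/")[-1]  # last path segment is the id
    print(node_id)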
def run(self):
    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__})
    gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
    gluster_brick_dir.save()
    try:
        etcd_utils.read(
            "clusters/%s/cluster_network" %
            NS.tendrl_context.integration_id)
    except etcd.EtcdKeyNotFound:
        try:
            node_networks = etcd_utils.read(
                "nodes/%s/Networks" % NS.node_context.node_id)
            # TODO(team) this logic needs to change later
            # multiple networks supported for gluster use case
            node_network = NS.tendrl.objects.NodeNetwork(
                interface=node_networks.leaves.next().key.split(
                    '/')[-1]).load()
            cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            cluster.cluster_network = node_network.subnet
            cluster.save()
        except etcd.EtcdKeyNotFound as ex:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to sync cluster network details"})
    if NS.tendrl_context.integration_id:
        # Initialize the node alert count
        try:
            key = 'clusters/%s/nodes/%s/alert_counters' % (
                NS.tendrl_context.integration_id,
                NS.node_context.node_id)
            etcd_utils.read(key)
        except etcd.EtcdException as ex:
            if type(ex) == etcd.EtcdKeyNotFound:
                NS.tendrl.objects.ClusterNodeAlertCounters(
                    node_id=NS.node_context.node_id,
                    integration_id=NS.tendrl_context.integration_id
                ).save()
    _sleep = 0
    while not self._complete.is_set():
        # To detect out of band deletes
        # refresh gluster object inventory at config['sync_interval']
        SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
        NS.node_context = NS.node_context.load()
        NS.tendrl_context = NS.tendrl_context.load()
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                continue
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=NS.node_context.node_id).load()
            _cnc.is_managed = "yes"
            _cnc.save()
            subprocess.call([
                'gluster', 'get-state', 'glusterd',
                'odir', '/var/run',
                'file', 'glusterd-state',
                'detail'
            ])
            raw_data = ini2json.ini_to_dict('/var/run/glusterd-state')
            subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
            subprocess.call([
                'gluster', 'get-state', 'glusterd',
                'odir', '/var/run',
                'file', 'glusterd-state-vol-opts',
                'volumeoptions'
            ])
            raw_data_options = ini2json.ini_to_dict(
                '/var/run/glusterd-state-vol-opts')
            subprocess.call(
                ['rm', '-rf', '/var/run/glusterd-state-vol-opts'])
            sync_object = NS.gluster.objects.SyncObject(
                data=json.dumps(raw_data))
            sync_object.save()
            if "Peers" in raw_data:
                index = 1
                peers = raw_data["Peers"]
                disconnected_hosts = []
                while True:
                    try:
                        peer = NS.tendrl.objects.GlusterPeer(
                            peer_uuid=peers['peer%s.uuid' % index],
                            hostname=peers[
                                'peer%s.primary_hostname' % index
                            ],
                            state=peers['peer%s.state' % index],
                            connected=peers['peer%s.connected' % index]
                        )
                        try:
                            stored_peer_status = etcd_utils.read(
                                "clusters/%s/nodes/%s/Peers/%s/connected"
                                % (NS.tendrl_context.integration_id,
                                   NS.node_context.node_id,
                                   peers['peer%s.uuid' % index])
                            ).value
                            current_status = peers[
                                'peer%s.connected' % index
                            ]
                            if stored_peer_status != "" and \
                                    current_status != stored_peer_status:
                                msg = (
                                    "Status of peer: %s in cluster %s "
                                    "changed from %s to %s") % (
                                    peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    NS.tendrl_context.integration_id,
                                    stored_peer_status,
                                    current_status)
                                instance = "peer_%s" % peers[
                                    'peer%s.primary_hostname' % index
                                ]
                                event_utils.emit_event(
                                    "peer_status",
                                    current_status,
                                    msg,
                                    instance,
                                    'WARNING'
                                    if current_status != 'Connected'
                                    else 'INFO')
                                # Disconnected host name to
                                # raise brick alert
                                if current_status.lower() == \
                                        "disconnected":
                                    disconnected_hosts.append(
                                        peers[
                                            'peer%s.primary_hostname'
                                            % index
                                        ])
                        except etcd.EtcdKeyNotFound:
                            pass
                        SYNC_TTL += 5
                        peer.save(ttl=SYNC_TTL)
                        index += 1
                    except KeyError:
                        break
                # Raise an alert for bricks when peer disconnected
                # or node goes down
                for disconnected_host in disconnected_hosts:
                    brick_status_alert(disconnected_host)
            if "Volumes" in raw_data:
                index = 1
                volumes = raw_data['Volumes']
                while True:
                    try:
                        sync_volumes(
                            volumes, index,
                            raw_data_options.get('Volume Options'),
                            # sync_interval + 100 + no of peers + 350
                            SYNC_TTL + 350)
                        index += 1
                        SYNC_TTL += 1
                    except KeyError:
                        break
                # populate the volume specific options
                reg_ex = re.compile("^volume[0-9]+.options+")
                options = {}
                for key in volumes.keys():
                    if reg_ex.match(key):
                        options[key] = volumes[key]
                for key in options.keys():
                    volname = key.split('.')[0]
                    vol_id = volumes['%s.id' % volname]
                    dict1 = {}
                    for k, v in options.items():
                        if k.startswith('%s.options' % volname):
                            dict1['.'.join(k.split(".")[2:])] = v
                            options.pop(k, None)
                    NS.gluster.objects.VolumeOptions(
                        vol_id=vol_id,
                        options=dict1).save()
            # Sync cluster global details
            if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                all_volumes = NS.gluster.objects.Volume().load_all() or []
                volumes = []
                for volume in all_volumes:
                    if not str(volume.deleted).lower() == "true" or \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed']:
                        volumes.append(volume)
                cluster_status.sync_cluster_status(
                    volumes, SYNC_TTL + 350)
                utilization.sync_utilization_details(volumes)
                client_connections.sync_volume_connections(volumes)
                georep_details.aggregate_session_status()
                evt.process_events()
                rebalance_status.sync_volume_rebalance_status(volumes)
                rebalance_status.sync_volume_rebalance_estimated_time(
                    volumes)
                snapshots.sync_volume_snapshots(
                    raw_data['Volumes'],
                    int(NS.config.data.get("sync_interval", 10)) +
                    len(volumes) * 4)
            # check and enable volume profiling
            if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                self._enable_disable_volume_profiling()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id)
            if _cluster.exists():
                _cluster = _cluster.load()
                _cluster.last_sync = str(tendrl_now())
                # Mark the first sync done flag
                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id).load()
                if _cnc.first_sync_done in [None, "no"]:
                    _cnc.first_sync_done = "yes"
                    _cnc.save()
                if _cluster.current_job.get(
                    'status', ''
                ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                    _cluster.save()
            # Initialize the cluster alert count
            try:
                alerts_count_key = '/clusters/%s/alert_counters' % (
                    NS.tendrl_context.integration_id)
                etcd_utils.read(alerts_count_key)
            except etcd.EtcdException as ex:
                if type(ex) == etcd.EtcdKeyNotFound:
                    NS.tendrl.objects.ClusterAlertCounters(
                        integration_id=NS.tendrl_context.integration_id
                    ).save()
        except Exception as ex:
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "gluster sds state sync error",
                        "exception": ex
                    }))
        try:
            etcd_utils.read(
                '/clusters/%s/_sync_now' %
                NS.tendrl_context.integration_id)
            continue
        except etcd.EtcdKeyNotFound:
            pass
        time.sleep(_sleep)
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s complete" % self.__class__.__name__})
def sync_volumes(volumes, index, vol_options, sync_ttl):
    # instantiate the blivet class; this will be used for
    # getting brick_device_details
    b = blivet.Blivet()
    # reset blivet during every sync to get the latest information
    # about storage devices in the machine
    b.reset()
    devicetree = b.devicetree
    node_context = NS.node_context.load()
    tag_list = node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.gluster.objects.Volume(
                vol_id=volumes['volume%s.id' % index]).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                    _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on the volume, skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                    current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                    volumes['volume%s.name' % index],
                    NS.tendrl_context.integration_id,
                    stored_volume_status,
                    current_status)
                instance = "volume_%s" % volumes['volume%s.name' % index]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped' else 'INFO',
                    tags={
                        "entity_type": RESOURCE_TYPE_VOLUME,
                        "volume_name": volumes['volume%s.name' % index]
                    })
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass
    volume = NS.gluster.objects.Volume(
        vol_id=volumes['volume%s.id' % index],
        vol_type="arbiter"
        if int(volumes['volume%s.arbiter_count' % index]) > 0
        else volumes['volume%s.type' % index],
        name=volumes['volume%s.name' % index],
        transport_type=volumes['volume%s.transport_type' % index],
        status=volumes['volume%s.status' % index],
        brick_count=volumes['volume%s.brickcount' % index],
        snap_count=volumes['volume%s.snap_count' % index],
        stripe_count=volumes['volume%s.stripe_count' % index],
        replica_count=volumes['volume%s.replica_count' % index],
        subvol_count=volumes['volume%s.subvol_count' % index],
        arbiter_count=volumes['volume%s.arbiter_count' % index],
        disperse_count=volumes['volume%s.disperse_count' % index],
        redundancy_count=volumes['volume%s.redundancy_count' % index],
        quorum_status=volumes['volume%s.quorum_status' % index],
        snapd_status=volumes[
            'volume%s.snapd_svc.online_status' % index],
        snapd_inited=volumes['volume%s.snapd_svc.inited' % index],
    )
    if NS.gluster.objects.Volume(
        vol_id=volumes['volume%s.id' % index]
    ).exists():
        existing_vol = NS.gluster.objects.Volume(
            vol_id=volumes['volume%s.id' % index]).load()
        volume_profiling_old_value = existing_vol.profiling_enabled
    else:
        volume_profiling_old_value = volume.profiling_enabled
    if ('volume%s.profile_enabled' % index) in volumes:
        value = int(volumes['volume%s.profile_enabled' % index])
        if value == 1:
            volume_profiling_new_value = "yes"
        else:
            volume_profiling_new_value = "no"
    else:
        volume_profiling_new_value = None
    volume.profiling_enabled = volume_profiling_new_value
    if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
        # Raise alert for the profiling value change
        msg = ("Value of volume profiling for volume: %s "
               "of cluster %s changed from %s to %s" % (
                   volumes['volume%s.name' % index],
                   NS.tendrl_context.integration_id,
                   volume_profiling_old_value,
                   volume_profiling_new_value))
        instance = "volume_%s" % volumes['volume%s.name' % index]
        event_utils.emit_event(
            "volume_profiling_status",
            volume_profiling_new_value,
            msg,
            instance,
            'INFO',
            tags={
                "entity_type": RESOURCE_TYPE_BRICK,
                "volume_name": volumes['volume%s.name' % index]
            })
    volume.save(ttl=sync_ttl)
    # Initialize the volume alert count
    try:
        volume_alert_count_key = '/clusters/%s/Volumes/%s/' \
            'alert_counters' % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index]
            )
        etcd_utils.read(volume_alert_count_key)
    except etcd.EtcdException as ex:
        if type(ex) == etcd.EtcdKeyNotFound:
            NS.gluster.objects.VolumeAlertCounters(
                integration_id=NS.tendrl_context.integration_id,
                volume_id=volumes['volume%s.id' % index]).save()
    # Save the default values of volume options
    vol_opt_dict = {}
    for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
        vol_opt_dict[
            vol_options['volume%s.options.key%s' % (index, opt_count)]
        ] = vol_options[
            'volume%s.options.value%s' % (index, opt_count)
        ]
    NS.gluster.objects.VolumeOptions(
        vol_id=volume.vol_id,
        options=vol_opt_dict).save(ttl=sync_ttl)
    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)
    b_index = 1
    # ipv4 address of current node
    try:
        network_ip = []
        networks = NS._int.client.read(
            "nodes/%s/Networks" % NS.node_context.node_id)
        for interface in networks.leaves:
            key = interface.key.split("/")[-1]
            network = NS.tendrl.objects.NodeNetwork(interface=key).load()
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                               "any ipv4 networks for node"
                               " %s" % NS.node_context.node_id,
                    "exception": ex
                }))
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)]
            if (NS.node_context.fqdn != hostname) and \
                    (hostname not in network_ip):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index])
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ].split(":")[-1].replace("/", "_")
            # Raise alerts if the brick path changes
            try:
                sbs = NS._int.client.read(
                    "clusters/%s/Bricks/all/"
                    "%s/%s/status" % (
                        NS.tendrl_context.integration_id,
                        NS.node_context.fqdn,
                        brick_name.split(":_")[-1])).value
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index))
                if current_status != sbs:
                    msg = ("Status of brick: %s "
                           "under volume %s in cluster %s changed "
                           "from %s to %s") % (
                        volumes['volume%s.brick%s.path' % (
                            index, b_index)],
                        volumes['volume%s.name' % index],
                        NS.tendrl_context.integration_id,
                        sbs,
                        current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index, b_index)])
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={
                            "entity_type": RESOURCE_TYPE_BRICK,
                            "volume_name": volumes[
                                'volume%s.name' % index]
                        })
            except etcd.EtcdKeyNotFound:
                pass
            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"
            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name)
            NS._int.wclient.write(vol_brick_path, "")
            brick = NS.gluster.objects.Brick(
                NS.node_context.fqdn,
                brick_name.split(":_")[-1],
                name=brick_name,
                vol_id=volumes['volume%s.id' % index],
                sequence_number=b_index,
                brick_path=volumes[
                    'volume%s.brick%s.path' % (index, b_index)],
                hostname=volumes.get(
                    'volume%s.brick%s.hostname' % (index, b_index)),
                port=volumes.get(
                    'volume%s.brick%s.port' % (index, b_index)),
                vol_name=volumes['volume%s.name' % index],
                used=True,
                node_id=NS.node_context.node_id,
                status=volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)),
                filesystem_type=volumes.get(
                    'volume%s.brick%s.filesystem_type' % (
                        index, b_index)),
                mount_opts=volumes.get(
                    'volume%s.brick%s.mount_options' % (index, b_index)),
                utilization=brick_utilization.brick_utilization(
                    volumes['volume%s.brick%s.path' % (index, b_index)]),
                client_count=volumes.get(
                    'volume%s.brick%s.client_count' % (index, b_index)),
                is_arbiter=volumes.get(
                    'volume%s.brick%s.is_arbiter' % (index, b_index)),
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.update_brick_device_details(
                brick_name,
                volumes['volume%s.brick%s.path' % (index, b_index)],
                devicetree,
                sync_ttl
            )
            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index)],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index)],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index)],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index)]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
def sync(sync_ttl):
    try:
        NS.node_context = NS.node_context.load()
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Running SDS detection"}
        )
        try:
            sds_discovery_manager = sds_manager.SDSDiscoveryManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Failed to init "
                                        "SDSDiscoveryManager.",
                             "exception": ex}
                )
            )
            return
        # Execute the SDS discovery plugins and tag the nodes with data
        for plugin in sds_discovery_manager.get_available_plugins():
            sds_details = plugin.discover_storage_system()
            if sds_details is None:
                break
            if "peers" in sds_details and NS.tendrl_context.integration_id:
                _cnc = NS.tendrl.objects.ClusterNodeContext().load()
                this_peer_uuid = ""
                if _cnc.is_managed != "yes" or not NS.node_context.fqdn:
                    for peer_uuid, data in sds_details.get(
                            "peers", {}).iteritems():
                        peer = NS.tendrl.objects.GlusterPeer(
                            peer_uuid=peer_uuid,
                            hostname=data['hostname'],
                            connected=data['connected']
                        )
                        peer.save()
                        if data['hostname'] == "localhost":
                            this_peer_uuid = peer_uuid
                    # Figure out the hostname used to probe this peer
                    integration_id_index_key = \
                        "indexes/tags/tendrl/integration/%s" % \
                        NS.tendrl_context.integration_id
                    _node_ids = etcd_utils.read(
                        integration_id_index_key).value
                    _node_ids = json.loads(_node_ids)
                    for _node_id in _node_ids:
                        if _node_id != NS.node_context.node_id:
                            peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=this_peer_uuid,
                                node_id=_node_id
                            ).load()
                            if peer.hostname:
                                NS.node_context.pkey = peer.hostname
                                NS.node_context.fqdn = peer.hostname
                                NS.node_context.ipv4_addr = \
                                    socket.gethostbyname(peer.hostname)
                                NS.node_context.save()
                                break
            if ('detected_cluster_id' in sds_details and
                    sds_details['detected_cluster_id'] != ""):
                try:
                    integration_index_key = \
                        "indexes/detected_cluster_id_to_integration_id/" \
                        "%s" % sds_details['detected_cluster_id']
                    dc = NS.tendrl.objects.DetectedCluster().load()
                    if dc is None or dc.detected_cluster_id is None:
                        time.sleep(sync_ttl)
                        integration_id = str(uuid.uuid4())
                        try:
                            etcd_utils.write(
                                integration_index_key,
                                integration_id,
                                prevExist=False
                            )
                        except etcd.EtcdAlreadyExist:
                            pass
                    _ptag = None
                    if NS.tendrl_context.integration_id:
                        _ptag = "provisioner/%s" % \
                            NS.tendrl_context.integration_id
                    if _ptag in NS.node_context.tags:
                        if dc.detected_cluster_id and \
                                dc.detected_cluster_id != sds_details.get(
                                    'detected_cluster_id'):
                            # Gluster peer list has changed
                            integration_id = \
                                NS.tendrl_context.integration_id
                            etcd_utils.write(
                                integration_index_key,
                                integration_id
                            )
                            # Set the cluster status as new peer detected
                            _cluster = NS.tendrl.objects.Cluster(
                                integration_id=integration_id
                            ).load()
                            _cluster.status = "new_peers_detected"
                            _cluster.save()
                            # Raise an alert regarding the same
                            msg = "New peers identified in cluster: %s. " \
                                  "Make sure tendrl-ansible is executed " \
                                  "for the new nodes so that expand " \
                                  "cluster option can be triggered" % \
                                  _cluster.short_name
                            event_utils.emit_event(
                                "cluster_status",
                                "new_peers_detected",
                                msg,
                                "cluster_{0}".format(integration_id),
                                "WARNING",
                                integration_id=integration_id
                            )
                        _cluster = NS.tendrl.objects.Cluster(
                            integration_id=NS.tendrl_context.integration_id
                        ).load()
                        if _cluster.status == "new_peers_detected":
                            peers = []
                            cmd = subprocess.Popen(
                                "gluster pool list",
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE
                            )
                            out, err = cmd.communicate()
                            if err or out is None or \
                                    "Connection failed" in out:
                                pass  # treat the number of peers as zero
                            if out:
                                lines = out.split('\n')[1:]
                                for line in lines:
                                    if line.strip() != '':
                                        peers.append(line.split()[0])
                            nodes_ids = json.loads(etcd_utils.read(
                                "indexes/tags/tendrl/integration/%s" %
                                NS.tendrl_context.integration_id
                            ).value)
                            if len(nodes_ids) == len(peers):
                                # All the nodes have node-agents
                                # running and are known to tendrl
                                msg = "New nodes in cluster: %s have " \
                                      "node agents running now. Cluster " \
                                      "is ready to expand." % \
                                      _cluster.short_name
                                event_utils.emit_event(
                                    "cluster_status",
                                    "expand_pending",
                                    msg,
                                    "cluster_{0}".format(
                                        NS.tendrl_context.integration_id
                                    ),
                                    "INFO",
                                    integration_id=NS.tendrl_context.
                                    integration_id
                                )
                                # Set the cluster status accordingly
                                _cluster.status = 'expand_pending'
                                _cluster.save()
                    loop_count = 0
                    while True:
                        # Wait till the provisioner node assigns an
                        # integration_id for this detected_cluster_id
                        if loop_count >= 72:
                            return
                        try:
                            time.sleep(5)
                            integration_id = etcd_utils.read(
                                integration_index_key).value
                            if integration_id:
                                break
                        except etcd.EtcdKeyNotFound:
                            loop_count += 1
                            continue
                    NS.tendrl_context.integration_id = integration_id
                    NS.tendrl_context.cluster_id = sds_details.get(
                        'detected_cluster_id')
                    NS.tendrl_context.cluster_name = sds_details.get(
                        'detected_cluster_name')
                    NS.tendrl_context.sds_name = sds_details.get(
                        'pkg_name')
                    NS.tendrl_context.sds_version = sds_details.get(
                        'pkg_version')
                    NS.tendrl_context.save()
                    NS.node_context = NS.node_context.load()
                    integration_tag = "tendrl/integration/%s" % \
                        integration_id
                    detected_cluster_tag = "detected_cluster/%s" % \
                        sds_details['detected_cluster_id']
                    NS.node_context.tags += [detected_cluster_tag,
                                             integration_tag]
                    NS.node_context.tags = list(set(NS.node_context.tags))
                    NS.node_context.save()
                    NS.tendrl.objects.DetectedCluster(
                        detected_cluster_id=sds_details.get(
                            'detected_cluster_id'),
                        detected_cluster_name=sds_details.get(
                            'detected_cluster_name'),
                        sds_pkg_name=sds_details.get('pkg_name'),
                        sds_pkg_version=sds_details.get('pkg_version'),
                    ).save()
                    _cluster = NS.tendrl.objects.Cluster(
                        integration_id=NS.tendrl_context.integration_id
                    ).load()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] \
                            and _cluster.status in [None, ""]:
                        _cluster.save()
                except (etcd.EtcdException, KeyError) as ex:
                    Event(
                        ExceptionMessage(
                            priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": "Failed SDS detection",
                                     "exception": ex}
                        )
                    )
                break
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": "node_sync "
                                    "SDS detection failed: " + ex.message,
                         "exception": ex}
            )
        )
def run(self):
    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__})
    NS.node_context = NS.node_context.load()
    current_tags = list(NS.node_context.tags)
    current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
    NS.node_context.tags = list(set(current_tags))
    NS.node_context.status = "UP"
    NS.node_context.save()
    _sleep = 0
    msg = "{0} is UP".format(NS.node_context.fqdn)
    event_utils.emit_event(
        "node_status",
        "UP",
        msg,
        "node_{0}".format(NS.node_context.fqdn),
        "INFO",
        node_id=NS.node_context.node_id)
    while not self._complete.is_set():
        _sync_ttl = int(NS.config.data.get("sync_interval", 10)) + 100
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1
        NS.node_context = NS.node_context.load()
        NS.node_context.sync_status = "in_progress"
        current_tags = list(NS.node_context.tags)
        current_tags += ["tendrl/node_%s" % NS.node_context.node_id]
        NS.node_context.tags = list(set(current_tags))
        NS.node_context.status = "UP"
        NS.node_context.save(ttl=_sync_ttl)
        NS.tendrl_context = NS.tendrl_context.load()
        sync_service_and_index_thread = threading.Thread(
            target=services_and_index_sync.sync,
            args=(_sync_ttl,))
        sync_service_and_index_thread.daemon = True
        sync_service_and_index_thread.start()
        sync_service_and_index_thread.join()
        NS.node_context = NS.node_context.load()
        if "tendrl/monitor" in NS.node_context.tags:
            check_all_managed_node_status_thread = threading.Thread(
                target=check_all_managed_nodes_status.run)
            check_all_managed_node_status_thread.daemon = True
            check_all_managed_node_status_thread.start()
            check_all_managed_node_status_thread.join()
            check_cluster_status_thread = threading.Thread(
                target=check_cluster_status.run)
            check_cluster_status_thread.daemon = True
            check_cluster_status_thread.start()
            check_cluster_status_thread.join()
        if "tendrl/monitor" not in NS.node_context.tags:
            sync_cluster_contexts_thread = threading.Thread(
                target=cluster_contexts_sync.sync,
                args=(_sync_ttl,))
            sync_cluster_contexts_thread.daemon = True
            sync_cluster_contexts_thread.start()
            sync_cluster_contexts_thread.join()
        platform_detect_thread = threading.Thread(
            target=platform_detect.sync)
        platform_detect_thread.daemon = True
        platform_detect_thread.start()
        platform_detect_thread.join()
        if "tendrl/monitor" not in NS.node_context.tags:
            sds_detect_thread = threading.Thread(
                target=sds_detect.sync,
                args=(_sleep,))
            sds_detect_thread.daemon = True
            sds_detect_thread.start()
            sds_detect_thread.join()
        NS.tendrl_context = NS.tendrl_context.load()
        try:
            NS.tendrl.objects.Os().save()
            NS.tendrl.objects.Cpu().save()
            NS.tendrl.objects.Memory().save()
        except Exception as ex:
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "node_sync "
                                   "os/cpu/memory sync failed: " +
                                   ex.message,
                        "exception": ex
                    }))
            NS.node_context = NS.node_context.load()
            NS.node_context.sync_status = "failed"
            NS.node_context.last_sync = str(time_utils.now())
            NS.node_context.status = "UP"
            NS.node_context.save(ttl=_sync_ttl)
            time.sleep(_sleep)
        sync_disks_thread = threading.Thread(target=disk_sync.sync)
        sync_disks_thread.daemon = True
        sync_disks_thread.start()
        sync_disks_thread.join()
        sync_networks_thread = threading.Thread(target=network_sync.sync)
        sync_networks_thread.daemon = True
        sync_networks_thread.start()
        sync_networks_thread.join()
        NS.node_context = NS.node_context.load()
        NS.node_context.sync_status = "done"
        NS.node_context.last_sync = str(time_utils.now())
        NS.node_context.status = "UP"
        NS.node_context.save(ttl=_sync_ttl)
        if "tendrl/monitor" not in NS.node_context.tags:
            sync_cluster_contexts_thread = threading.Thread(
                target=cluster_contexts_sync.sync,
                args=(_sync_ttl,))
            sync_cluster_contexts_thread.daemon = True
            sync_cluster_contexts_thread.start()
            sync_cluster_contexts_thread.join()
            # Update node alert count
            if not NS.tendrl.objects.ClusterNodeAlertCounters().exists():
                update_cluster_node_alert_count()
        time.sleep(_sleep)
    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s complete" % self.__class__.__name__})
def run(self):
    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )
    gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
    gluster_brick_dir.save()
    cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if cluster.cluster_network in [None, ""]:
        try:
            node_networks = NS.tendrl.objects.NodeNetwork().load_all()
            cluster.cluster_network = node_networks[0].subnet
            cluster.save()
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to sync cluster network details"}
            )
    _sleep = 0
    while not self._complete.is_set():
        # To detect out of band deletes,
        # refresh gluster object inventory at config['sync_interval']
        SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
        NS.node_context = NS.node_context.load()
        NS.tendrl_context = NS.tendrl_context.load()
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                continue
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=NS.node_context.node_id
            ).load()
            _cnc.is_managed = "yes"
            _cnc.save()
            subprocess.call(
                [
                    'gluster', 'get-state', 'glusterd',
                    'odir', '/var/run',
                    'file', 'glusterd-state',
                    'detail'
                ]
            )
            raw_data = ini2json.ini_to_dict('/var/run/glusterd-state')
            subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
            subprocess.call(
                [
                    'gluster', 'get-state', 'glusterd',
                    'odir', '/var/run',
                    'file', 'glusterd-state-vol-opts',
                    'volumeoptions'
                ]
            )
            raw_data_options = ini2json.ini_to_dict(
                '/var/run/glusterd-state-vol-opts'
            )
            subprocess.call(
                ['rm', '-rf', '/var/run/glusterd-state-vol-opts']
            )
            sync_object = NS.gluster.objects.SyncObject(
                data=json.dumps(raw_data))
            sync_object.save()
            if "Peers" in raw_data:
                index = 1
                peers = raw_data["Peers"]
                disconnected_hosts = []
                while True:
                    try:
                        peer = NS.tendrl.objects.GlusterPeer(
                            peer_uuid=peers['peer%s.uuid' % index],
                            hostname=peers[
                                'peer%s.primary_hostname' % index
                            ],
                            state=peers['peer%s.state' % index],
                            connected=peers['peer%s.connected' % index]
                        )
                        try:
                            stored_peer_status = None
                            # find peer detail using hostname
                            ip = socket.gethostbyname(
                                peers['peer%s.primary_hostname' % index]
                            )
                            node_id = etcd_utils.read(
                                "/indexes/ip/%s" % ip
                            ).value
                            stored_peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                node_id=node_id
                            ).load()
                            stored_peer_status = stored_peer.connected
                            current_status = peers[
                                'peer%s.connected' % index
                            ]
                            if stored_peer_status and \
                                    current_status != stored_peer_status:
                                msg = ("Peer %s in cluster %s is %s") % (
                                    peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    _cluster.short_name,
                                    current_status
                                )
                                instance = "peer_%s" % peers[
                                    'peer%s.primary_hostname' % index
                                ]
                                event_utils.emit_event(
                                    "peer_status",
                                    current_status,
                                    msg,
                                    instance,
                                    'WARNING'
                                    if current_status != 'Connected'
                                    else 'INFO'
                                )
                                # save current status in the actual peer
                                # directory also
                                stored_peer.connected = current_status
                                stored_peer.save()
                                # Remember disconnected host names to
                                # raise brick alerts
                                if current_status.lower() == \
                                        "disconnected":
                                    disconnected_hosts.append(
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ]
                                    )
                        except etcd.EtcdKeyNotFound:
                            pass
                        SYNC_TTL += 5
                        peer.save(ttl=SYNC_TTL)
                        index += 1
                    except KeyError:
                        break
                # Raise an alert for bricks when a peer disconnected
                # or a node goes down
                for disconnected_host in disconnected_hosts:
                    brick_status_alert(disconnected_host)
            if "Volumes" in raw_data:
                index = 1
                volumes = raw_data['Volumes']
                # instantiate the blivet class; this will be used for
                # getting brick_device_details
                b = blivet.Blivet()
                # reset blivet during every sync to get the latest
                # information about storage devices in the machine
                b.reset()
                devicetree = b.devicetree
                total_brick_count = 0
                while True:
                    try:
                        b_count = sync_volumes(
                            volumes, index,
                            raw_data_options.get('Volume Options'),
                            SYNC_TTL + VOLUME_TTL,
                            _cluster.short_name,
                            devicetree
                        )
                        index += 1
                        SYNC_TTL += 1
                        total_brick_count += b_count - 1
                    except KeyError:
                        global VOLUME_TTL
                        # from the second sync on, volume ttl is
                        # SYNC_TTL + (no. of volumes) * 20 +
                        # (no. of bricks) * 10 + 160
                        if index > 1:
                            volume_count = index - 1
                            # When all nodes are down we mark all
                            # volumes as down; node status TTL is 160,
                            # so make sure volumes are present in etcd
                            # while raising the volume down alert
                            VOLUME_TTL = (volume_count * 20) + (
                                total_brick_count * 10) + 160
                        break
                # populate the volume specific options
                reg_ex = re.compile("^volume[0-9]+.options+")
                options = {}
                for key in volumes.keys():
                    if reg_ex.match(key):
                        options[key] = volumes[key]
                for key in options.keys():
                    volname = key.split('.')[0]
                    vol_id = volumes['%s.id' % volname]
                    dict1 = {}
                    for k, v in options.items():
                        if k.startswith('%s.options' % volname):
                            dict1['.'.join(k.split(".")[2:])] = v
                            options.pop(k, None)
                    volume = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id,
                        vol_id=vol_id
                    ).load()
                    if volume.options is not None:
                        dest = dict(volume.options)
                        dest.update(dict1)
                        volume.options = dest
                        volume.save()
            # Sync cluster global details
            if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                all_volumes = NS.tendrl.objects.GlusterVolume(
                    NS.tendrl_context.integration_id
                ).load_all() or []
                volumes = []
                for volume in all_volumes:
                    if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                        # refresh the volume TTL only on first sync;
                        # TTL grows with the number of volumes
                        if _cnc.first_sync_done in [None, "no", ""]:
                            etcd_utils.refresh(
                                volume.value,
                                SYNC_TTL + VOLUME_TTL
                            )
                        volumes.append(volume)
                cluster_status.sync_cluster_status(
                    volumes, SYNC_TTL + VOLUME_TTL
                )
                utilization.sync_utilization_details(volumes)
                client_connections.sync_volume_connections(volumes)
                georep_details.aggregate_session_status()
                try:
                    evt.process_events()
                except etcd.EtcdKeyNotFound:
                    pass
                rebalance_status.sync_volume_rebalance_status(volumes)
                rebalance_status.sync_volume_rebalance_estimated_time(
                    volumes
                )
                snapshots.sync_volume_snapshots(
                    raw_data['Volumes'],
                    int(NS.config.data.get(
                        "sync_interval", 10
                    )) + len(volumes) * 4
                )
                # update alert count
                update_cluster_alert_count()
            # check and enable volume profiling
            if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                self._enable_disable_volume_profiling()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if _cluster.exists():
                _cluster = _cluster.load()
                _cluster.last_sync = str(tendrl_now())
                # Mark the first sync done flag
                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                if _cnc.first_sync_done in [None, "no"]:
                    _cnc.first_sync_done = "yes"
                    _cnc.save()
                if _cluster.current_job.get(
                    'status', ''
                ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                    _cluster.save()
        except Exception as ex:
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": "gluster sds state sync error",
                             "exception": ex}
                )
            )
        try:
            etcd_utils.read(
                '/clusters/%s/_sync_now' %
                NS.tendrl_context.integration_id
            )
            continue
        except etcd.EtcdKeyNotFound:
            pass
        time.sleep(_sleep)
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s complete" % self.__class__.__name__}
    )
def sync_volume_rebalance_status(volumes):
    for volume in volumes:
        rebal_status_list = []
        if "Distribute" in volume.vol_type or (
            "arbiter" in volume.vol_type and
            (int(volume.brick_count) > int(volume.replica_count))
        ):
            vol_rebal_details = NS.gluster.objects.RebalanceDetails(
                vol_id=volume.vol_id
            ).load_all()
            for entry in vol_rebal_details:
                rebal_status_list.append(entry.rebal_status)
            if not rebal_status_list:
                continue
            new_rebal_status = "unknown"
            if all(item == "not_started" for item in rebal_status_list):
                new_rebal_status = "not_started"
            else:
                # remove not_started states from the list as these are
                # from nodes that are not involved in rebalance
                rebal_status_list = filter(
                    lambda state: state != 'not_started',
                    rebal_status_list
                )
                if "failed" in rebal_status_list:
                    new_rebal_status = "failed"
                elif "layout_fix_failed" in rebal_status_list:
                    new_rebal_status = "layout_fix_failed"
                elif "layout_fix_started" in rebal_status_list:
                    new_rebal_status = "layout_fix_started"
                elif "started" in rebal_status_list:
                    new_rebal_status = "started"
                elif all(item == "completed"
                         for item in rebal_status_list):
                    new_rebal_status = "completed"
                elif all(item == "stopped"
                         for item in rebal_status_list):
                    new_rebal_status = "stopped"
                elif all(item == "layout_fix_complete"
                         for item in rebal_status_list):
                    new_rebal_status = "layout_fix_complete"
                elif all(item == "layout_fix_stopped"
                         for item in rebal_status_list):
                    new_rebal_status = "layout_fix_stopped"
            if volume.rebal_status != "" and \
                    new_rebal_status != volume.rebal_status:
                msg = ("Volume:%s rebalance status has changed to %s") % (
                    volume.name, new_rebal_status)
                instance = "volume_%s" % volume.name
                event_utils.emit_event(
                    "rebalance_status",
                    new_rebal_status,
                    msg,
                    instance,
                    'INFO'
                )
                volume.rebal_status = new_rebal_status
                volume.save()
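# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the precedence that
# sync_volume_rebalance_status() applies when folding per-node rebalance
# states into one volume-level state can be expressed as a small pure
# function, which makes the ordering (failed > layout_fix_failed >
# layout_fix_started > started > uniform terminal states) easy to test in
# isolation. The helper name is hypothetical.
def _fold_rebal_states(states):
    if all(s == "not_started" for s in states):
        return "not_started"
    # drop nodes that did not participate in the rebalance
    states = [s for s in states if s != "not_started"]
    for flag in ("failed", "layout_fix_failed",
                 "layout_fix_started", "started"):
        if flag in states:
            return flag
    for uniform in ("completed", "stopped",
                    "layout_fix_complete", "layout_fix_stopped"):
        if all(s == uniform for s in states):
            return uniform
    return "unknown"


assert _fold_rebal_states(["completed", "not_started"]) == "completed"
assert _fold_rebal_states(["started", "completed"]) == "started"
assert _fold_rebal_states(["failed", "started"]) == "failed"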
def update_cluster_details(self, integration_id):
    try:
        nodes = etcd_utils.read("/clusters/%s/nodes" % integration_id)
        for node in nodes.leaves:
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=node.key.split("/")[-1],
                integration_id=integration_id
            ).load()
            # Verify that all nodes in the cluster are down
            if str(_cnc.status).lower() != "down" and \
                    str(_cnc.is_managed).lower() == "yes":
                # If any managed node is still up, don't update the
                # cluster details; unmanaged nodes are not considered
                return
        # When all managed nodes are down, update the cluster details
        global_details = NS.tendrl.objects.GlobalDetails(
            integration_id=integration_id).load()
        # Mark the cluster as unhealthy
        if global_details.status.lower() == "healthy":
            global_details.status = "unhealthy"
            global_details.save()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id).load()
            msg = "Cluster:%s is %s" % (_cluster.short_name, "unhealthy")
            instance = "cluster_%s" % integration_id
            event_utils.emit_event(
                "cluster_health_status",
                "unhealthy",
                msg,
                instance,
                'WARNING',
                integration_id=integration_id
            )
        # Mark all bricks as down
        nodes = etcd_utils.read(
            "/clusters/%s/Bricks/all" % integration_id)
        for node in nodes.leaves:
            bricks = NS.tendrl.objects.GlusterBrick(
                integration_id,
                fqdn=node.key.split("/")[-1]
            ).load_all()
            for brick in bricks:
                if brick.status.lower() != "stopped":
                    brick.status = "Stopped"
                    brick.save()
                    msg = ("Brick:%s in volume:%s has %s") % (
                        brick.brick_path, brick.vol_name, "Stopped")
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name, brick.brick_path)
                    event_utils.emit_event(
                        "brick_status",
                        "Stopped",
                        msg,
                        instance,
                        "WARNING",
                        integration_id=integration_id,
                        tags={
                            "entity_type": "brick",
                            "volume_name": brick.vol_name,
                            "node_id": brick.node_id
                        }
                    )
        # Mark all volumes as down
        volumes = NS.tendrl.objects.GlusterVolume(
            integration_id).load_all()
        for volume in volumes:
            if volume.state.lower() != "down":
                volume.state = "down"
                volume.status = "Stopped"
                volume.save()
                msg = "Volume:%s is %s" % (volume.name, "down")
                instance = "volume_%s" % volume.name
                event_utils.emit_event(
                    "volume_state",
                    "down",
                    msg,
                    instance,
                    "WARNING",
                    integration_id=integration_id,
                    tags={
                        "entity_type": "volume",
                        "volume_name": volume.name
                    }
                )
    except etcd.EtcdKeyNotFound:
        pass
def _derive_volume_states(volumes):
    out_dict = {}
    for volume in volumes:
        if volume.status == "Stopped":
            out_dict[volume.vol_id] = "down"
        else:
            subvol_count = 0
            bricks = []
            subvol_states = []
            while True:
                try:
                    subvol = NS._int.client.read(
                        "clusters/%s/Volumes/%s/Bricks/subvolume%s" % (
                            NS.tendrl_context.integration_id,
                            volume.vol_id,
                            subvol_count
                        )
                    )
                    state = 0
                    for entry in subvol.leaves:
                        brick_name = entry.key.split("/")[-1]
                        fetched_brick = NS.gluster.objects.Brick(
                            brick_name.split(":")[0],
                            brick_name.split(":_")[-1]
                        ).load()
                        if not fetched_brick.status:
                            fetched_brick.status = "Stopped"
                        bricks.append(fetched_brick)
                        if fetched_brick.status != "Started":
                            state += 1
                    subvol_states.append(state)
                    subvol_count += 1
                except etcd.EtcdKeyNotFound:
                    break
            total_bricks = len(bricks)
            up_bricks = 0
            for brick in bricks:
                if brick.status == "Started":
                    up_bricks += 1
            if total_bricks == 0 or total_bricks < int(volume.brick_count):
                # No brick details updated for the volume yet
                out_dict[volume.vol_id] = 'unknown'
            elif up_bricks == 0:
                out_dict[volume.vol_id] = 'down'
            else:
                out_dict[volume.vol_id] = 'up'
                if int(volume.replica_count) > 1 or \
                        int(volume.disperse_count) > 0:
                    worst_subvol = max(subvol_states)
                    if worst_subvol > 0:
                        subvol_prob = max(
                            int(volume.replica_count),
                            int(volume.redundancy_count) + 1
                        )
                        if worst_subvol == subvol_prob:
                            # if this volume contains only one subvolume,
                            # and the bricks down > redundancy level
                            # then the volume state needs to show down
                            if subvol_count == 1:
                                out_dict[volume.vol_id] = 'down'
                            else:
                                out_dict[volume.vol_id] = '(partial)'
                        else:
                            out_dict[volume.vol_id] = '(degraded)'
                else:
                    # This volume is not 'protected', so any brick
                    # disruption leads straight to a 'partial'
                    # availability state
                    if up_bricks != total_bricks:
                        out_dict[volume.vol_id] = '(partial)'
        # Raise the alert if the volume state changes
        if volume.state != "" and \
                out_dict[volume.vol_id] != volume.state:
            msg = "State of volume: %s changed from %s to %s" % (
                volume.name,
                volume.state,
                out_dict[volume.vol_id]
            )
            instance = "volume_%s" % volume.name
            event_utils.emit_event(
                "volume_state",
                out_dict[volume.vol_id],
                msg,
                instance,
                'INFO' if out_dict[volume.vol_id] == 'up' else 'WARNING',
                tags={
                    "entity_type": RESOURCE_TYPE_VOLUME,
                    "volume_name": volume.name
                }
            )
        # Save the volume status
        volume.state = out_dict[volume.vol_id]
        volume.save()
    return out_dict
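# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the degraded/partial
# decision above boils down to comparing the worst per-subvolume count of
# down bricks against the volume's protection level,
# max(replica_count, redundancy_count + 1). A standalone version of that
# comparison, with a hypothetical helper name:
def _classify_protected(subvol_down_counts, replica_count,
                        redundancy_count, subvol_count):
    worst = max(subvol_down_counts)
    if worst == 0:
        return 'up'
    protection = max(int(replica_count), int(redundancy_count) + 1)
    if worst == protection:
        # a whole subvolume is lost: 'down' if it is the only subvolume,
        # otherwise only part of the namespace is unavailable
        return 'down' if subvol_count == 1 else '(partial)'
    return '(degraded)'


# replica-3 volume, one brick down in one of two subvolumes -> degraded
assert _classify_protected([1, 0], 3, 0, 2) == '(degraded)'
# replica-2 volume, both bricks of one of two subvolumes down -> partial
assert _classify_protected([2, 0], 2, 0, 2) == '(partial)'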
def save_georep_details(volumes, index):
    pair_index = 1
    while True:
        try:
            session_id = "{0}_{1}_{2}".format(
                volumes['volume%s.pair%s.master_volume' %
                        (index, pair_index)],
                volumes['volume%s.pair%s.slave' %
                        (index, pair_index)].split("::")[-1],
                volumes['volume%s.pair%s.session_slave' %
                        (index, pair_index)].split(":")[-1])
            pair_name = "{0}-{1}".format(
                volumes['volume%s.pair%s.master_node' %
                        (index, pair_index)],
                volumes['volume%s.pair%s.master_brick' %
                        (index, pair_index)].replace("/", "_"))
            readable_pair_name = "{0}:{1}".format(
                volumes['volume%s.pair%s.master_node' %
                        (index, pair_index)],
                volumes['volume%s.pair%s.master_brick' %
                        (index, pair_index)])
            try:
                pair = NS.gluster.objects.GeoReplicationPair(
                    vol_id=volumes['volume%s.id' % index],
                    session_id=session_id,
                    pair=pair_name).load()
                fetched_pair_status = None
                if pair:
                    fetched_pair_status = pair.status
                pair_status = volumes['volume%s.pair%s.status' %
                                      (index, pair_index)]
                if fetched_pair_status and \
                        fetched_pair_status != pair_status and \
                        pair_status.lower() == 'faulty':
                    msg = ("Geo-replication between %s "
                           "and %s is faulty") % (
                               readable_pair_name,
                               volumes['volume%s.name' % index])
                    instance = "volume_%s|georep_%s" % (
                        volumes['volume%s.name' % index],
                        pair_name)
                    event_utils.emit_event(
                        "georep_status",
                        pair_status,
                        msg,
                        instance,
                        'WARNING',
                        tags={
                            "entity_type": RESOURCE_TYPE_VOLUME,
                            "volume_name": volumes['volume%s.name' % index]
                        })
                if fetched_pair_status and \
                        fetched_pair_status.lower() == 'faulty' and \
                        pair_status.lower() in ['active', 'passive']:
                    msg = ("Geo-replication between %s "
                           "and %s is %s") % (
                               readable_pair_name,
                               volumes['volume%s.name' % index],
                               pair_status)
                    instance = "volume_%s|georep_%s" % (
                        volumes['volume%s.name' % index],
                        pair_name)
                    event_utils.emit_event(
                        "georep_status",
                        pair_status,
                        msg,
                        instance,
                        'INFO',
                        tags={
                            "entity_type": RESOURCE_TYPE_VOLUME,
                            "volume_name": volumes['volume%s.name' % index]
                        })
            except etcd.EtcdKeyNotFound:
                pass
            pair = NS.gluster.objects.GeoReplicationPair(
                vol_id=volumes['volume%s.id' % index],
                session_id=session_id,
                pair=pair_name,
                master_volume=volumes['volume%s.pair%s.master_volume' %
                                      (index, pair_index)],
                master_brick=volumes['volume%s.pair%s.master_brick' %
                                     (index, pair_index)],
                master_node=volumes['volume%s.pair%s.master_node' %
                                    (index, pair_index)],
                slave_user=volumes['volume%s.pair%s.slave_user' %
                                   (index, pair_index)],
                slave=volumes['volume%s.pair%s.slave' %
                              (index, pair_index)],
                slave_node=volumes['volume%s.pair%s.slave_node' %
                                   (index, pair_index)],
                status=volumes['volume%s.pair%s.status' %
                               (index, pair_index)],
                crawl_status=volumes['volume%s.pair%s.crawl_status' %
                                     (index, pair_index)],
                last_synced=volumes['volume%s.pair%s.last_synced' %
                                    (index, pair_index)],
                entry=volumes['volume%s.pair%s.entry' %
                              (index, pair_index)],
                data=volumes['volume%s.pair%s.data' %
                             (index, pair_index)],
                meta=volumes['volume%s.pair%s.meta' %
                             (index, pair_index)],
                failures=volumes['volume%s.pair%s.failures' %
                                 (index, pair_index)],
                checkpoint_time=volumes[
                    'volume%s.pair%s.checkpoint_time' %
                    (index, pair_index)],
                checkpoint_completed=volumes[
                    'volume%s.pair%s.checkpoint_completed' %
                    (index, pair_index)],
                checkpoint_completed_time=volumes[
                    'volume%s.pair%s.checkpoint_completion_time' %
                    (index, pair_index)])
        except KeyError:
            break
        pair.save()
        pair_index += 1
    return
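# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the session_id and
# pair_name keys built above follow fixed patterns. Given the get-state
# values for one pair, they come out as below; the helper name and the sample
# values are assumptions for demonstration only.
def _georep_keys(master_volume, master_node, master_brick,
                 slave, session_slave):
    session_id = "{0}_{1}_{2}".format(
        master_volume,
        slave.split("::")[-1],
        session_slave.split(":")[-1]
    )
    pair_name = "{0}-{1}".format(
        master_node, master_brick.replace("/", "_")
    )
    return session_id, pair_name


assert _georep_keys(
    'vol1', 'node1', '/bricks/b1',
    'ssh://slave.example::slavevol', 'sid:slavevol'
) == ('vol1_slavevol_slavevol', 'node1-_bricks_b1')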
def sync_cluster_status(volumes, sync_ttl):
    degraded_count = 0
    is_healthy = True
    # If there is a failed import cluster
    # flow, mark the cluster status as unhealthy
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if _cluster.current_job.get('job_name', '') == "ImportCluster" and \
            _cluster.current_job.get('status', '') == "failed":
        is_healthy = False
    # Calculate status based on volumes status
    if len(volumes) > 0:
        volume_states = _derive_volume_states(volumes)
        for vol_id, state in volume_states.iteritems():
            if 'down' in state or 'partial' in state:
                is_healthy = False
            if 'degraded' in state:
                degraded_count += 1
    # Change status based on node status
    cmd = cmd_utils.Command('gluster pool list', True)
    out, err, rc = cmd.run()
    peer_count = 0
    if not err:
        out_lines = out.split('\n')
        connected = True
        for index in range(1, len(out_lines)):
            peer_count += 1
            node_status_det = out_lines[index].split('\t')
            if len(node_status_det) > 2:
                if node_status_det[2].strip() != 'Connected':
                    connected = connected and False
        if not connected:
            is_healthy = False
    cluster_gd = NS.tendrl.objects.GlobalDetails(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    old_status = cluster_gd.status or 'unhealthy'
    curr_status = 'healthy' if is_healthy else 'unhealthy'
    if curr_status != old_status:
        msg = "Cluster:%s is %s" % (_cluster.short_name, curr_status)
        instance = "cluster_%s" % NS.tendrl_context.integration_id
        event_utils.emit_event(
            "cluster_health_status",
            curr_status,
            msg,
            instance,
            'WARNING' if curr_status == 'unhealthy' else 'INFO'
        )
    # Persist the cluster status
    NS.tendrl.objects.GlobalDetails(
        integration_id=NS.tendrl_context.integration_id,
        status='healthy' if is_healthy else 'unhealthy',
        peer_count=peer_count,
        vol_count=len(volumes),
        volume_up_degraded=degraded_count
    ).save(ttl=sync_ttl)
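# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): cluster health in
# sync_cluster_status() is the conjunction of three signals -- no failed
# ImportCluster job, no volume in a down/partial state, and every peer in
# `gluster pool list` reporting Connected. Degraded volumes are counted but
# do not make the cluster unhealthy. Reduced to a pure function with a
# hypothetical name:
def _derive_cluster_health(import_failed, volume_states, peers_connected):
    if import_failed:
        return 'unhealthy'
    for state in volume_states:
        if 'down' in state or 'partial' in state:
            return 'unhealthy'
    if not all(peers_connected):
        return 'unhealthy'
    return 'healthy'


assert _derive_cluster_health(
    False, ['up', '(degraded)'], [True, True]) == 'healthy'
assert _derive_cluster_health(False, ['up'], [True, False]) == 'unhealthy'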
def _derive_volume_states(volumes):
    out_dict = {}
    for volume in volumes:
        if volume.status == "Stopped":
            out_dict[volume.vol_id] = "down"
        else:
            subvol_count = 0
            bricks = []
            subvol_states = []
            while True:
                try:
                    subvol = etcd_utils.read(
                        "clusters/%s/Volumes/%s/Bricks/subvolume%s" % (
                            NS.tendrl_context.integration_id,
                            volume.vol_id,
                            subvol_count
                        )
                    )
                    state = 0
                    for entry in subvol.leaves:
                        brick_name = entry.key.split("/")[-1]
                        fetched_brick = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            brick_name.split(":")[0],
                            brick_name.split(":_")[-1]
                        ).load()
                        if not fetched_brick.status:
                            fetched_brick.status = "Stopped"
                        bricks.append(fetched_brick)
                        if fetched_brick.status != "Started":
                            state += 1
                    subvol_states.append(state)
                    subvol_count += 1
                except etcd.EtcdKeyNotFound:
                    break
            total_bricks = len(bricks)
            up_bricks = 0
            for brick in bricks:
                if brick.status == "Started":
                    up_bricks += 1
            if total_bricks == 0 or total_bricks < int(volume.brick_count):
                # No brick details updated for the volume yet
                out_dict[volume.vol_id] = 'unknown'
            elif up_bricks == 0:
                out_dict[volume.vol_id] = 'down'
            else:
                out_dict[volume.vol_id] = 'up'
                if int(volume.replica_count) > 1 or \
                        int(volume.disperse_count) > 0:
                    worst_subvol = max(subvol_states)
                    if worst_subvol > 0:
                        subvol_prob = max(
                            int(volume.replica_count),
                            int(volume.redundancy_count) + 1
                        )
                        if worst_subvol == subvol_prob:
                            # if this volume contains only one subvolume,
                            # and the bricks down > redundancy level
                            # then the volume state needs to show down
                            if subvol_count == 1:
                                out_dict[volume.vol_id] = 'down'
                            else:
                                out_dict[volume.vol_id] = '(partial)'
                        else:
                            out_dict[volume.vol_id] = '(degraded)'
                else:
                    # This volume is not 'protected', so any brick
                    # disruption leads straight to a 'partial'
                    # availability state
                    if up_bricks != total_bricks:
                        out_dict[volume.vol_id] = '(partial)'
        # Raise the alert if the volume state changes
        if volume.state != "" and \
                out_dict[volume.vol_id] not in [volume.state, 'unknown']:
            msg = "Volume:%s is %s" % (volume.name,
                                       out_dict[volume.vol_id])
            instance = "volume_%s" % volume.name
            event_utils.emit_event(
                "volume_state",
                out_dict[volume.vol_id],
                msg,
                instance,
                'INFO' if out_dict[volume.vol_id] == 'up' else 'WARNING',
                tags={"entity_type": RESOURCE_TYPE_VOLUME,
                      "volume_name": volume.name}
            )
        # Save the volume status
        volume.state = out_dict[volume.vol_id]
        volume.save()
    return out_dict
def sync_volumes(
    volumes, index, vol_options, sync_ttl, cluster_short_name, devicetree
):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % \
        NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                    _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on the volume. skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                    current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes['volume%s.name' % index]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped' else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]}
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            # A missing volume key means there are no more volumes to
            # process, so let the caller's loop see the KeyError;
            # missing etcd data is simply skipped
            if isinstance(ex, KeyError):
                raise ex
            pass
    volume = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id,
        vol_id=volumes['volume%s.id' % index]
    ).load()
    volume.vol_type = "arbiter" \
        if int(volumes['volume%s.arbiter_count' % index]) > 0 \
        else volumes['volume%s.type' % index]
    volume.name = volumes['volume%s.name' % index]
    volume.transport_type = volumes['volume%s.transport_type' % index]
    volume.status = volumes['volume%s.status' % index]
    volume.brick_count = volumes['volume%s.brickcount' % index]
    volume.snap_count = volumes['volume%s.snap_count' % index]
    volume.stripe_count = volumes['volume%s.stripe_count' % index]
    volume.replica_count = volumes['volume%s.replica_count' % index]
    volume.subvol_count = volumes['volume%s.subvol_count' % index]
    volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
    volume.disperse_count = volumes['volume%s.disperse_count' % index]
    volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
    volume.quorum_status = volumes['volume%s.quorum_status' % index]
    volume.snapd_status = volumes[
        'volume%s.snapd_svc.online_status' % index]
    volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
    if NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id,
        vol_id=volumes['volume%s.id' % index]
    ).exists():
        existing_vol = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume_profiling_old_value = existing_vol.profiling_enabled
    else:
        volume_profiling_old_value = volume.profiling_enabled
    if ('volume%s.profile_enabled' % index) in volumes:
        value = int(volumes['volume%s.profile_enabled' % index])
        if value == 1:
            volume_profiling_new_value = "yes"
        else:
            volume_profiling_new_value = "no"
    else:
        volume_profiling_new_value = None
    volume.profiling_enabled = volume_profiling_new_value
    if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
        # Raise an alert for the value change
        msg = ("Value of volume profiling for volume: %s "
               "of cluster %s changed from %s to %s" % (
                   volumes['volume%s.name' % index],
                   cluster_short_name,
                   volume_profiling_old_value,
                   volume_profiling_new_value))
        instance = "volume_%s" % volumes['volume%s.name' % index]
        event_utils.emit_event(
            "volume_profiling_status",
            volume_profiling_new_value,
            msg,
            instance,
            'INFO',
            tags={
                "entity_type": RESOURCE_TYPE_BRICK,
                "volume_name": volumes['volume%s.name' % index]
            }
        )
    volume.save(ttl=sync_ttl)
    # Save the default values of volume options
    vol_opt_dict = {}
    for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
        vol_opt_dict[
            vol_options['volume%s.options.key%s' % (index, opt_count)]
        ] = vol_options[
            'volume%s.options.value%s' % (index, opt_count)
        ]
    volume.options = vol_opt_dict
    volume.save()
    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)
    b_index = 1
    # ipv4 addresses of the current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                               "any ipv4 networks for node"
                               " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update bricks node-wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except (TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ].split(":")[-1].replace("/", "_")
            # Raise alerts if the brick status changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                        current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s") % (
                        volumes['volume%s.brick%s.path' %
                                (index, b_index)],
                        volumes['volume%s.name' % index],
                        current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' %
                                (index, b_index)]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.name' % index]}
                    )
            except etcd.EtcdKeyNotFound:
                pass
            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"
            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )
            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)]
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.update_brick_device_details(
                brick_name,
                volumes['volume%s.brick%s.path' % (index, b_index)],
                devicetree,
                sync_ttl
            )
            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index)
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index)
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index)
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index)
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
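# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): sync_volumes() above
# places brick b_index (1-based) into subvolume (b_index - 1) / sub_vol_size,
# where sub_vol_size = brickcount / subvol_count. For example, a 2x3 volume
# (6 bricks, 2 subvolumes) maps bricks 1-3 to subvolume0 and bricks 4-6 to
# subvolume1. The helper name is hypothetical.
def _subvolume_for_brick(b_index, brick_count, subvol_count):
    sub_vol_size = int(brick_count) // int(subvol_count)
    return "subvolume%s" % ((b_index - 1) // sub_vol_size)


assert [_subvolume_for_brick(i, 6, 2) for i in range(1, 7)] == [
    'subvolume0', 'subvolume0', 'subvolume0',
    'subvolume1', 'subvolume1', 'subvolume1'
]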
def on_change(self, attr, prev_value, current_value):
    if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
        _tc = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
        # Check whether the node is managed
        _cnc = NS.tendrl.objects.ClusterNodeContext(
            node_id=self.node_id,
            integration_id=_tc.integration_id
        ).load()
        if current_value is None and str(_cnc.is_managed).lower() == "yes":
            self.status = "DOWN"
            self.save()
            msg = "Node {0} is DOWN".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                self.status,
                msg,
                "node_{0}".format(self.fqdn),
                "WARNING",
                node_id=self.node_id,
                integration_id=_tc.integration_id
            )
            # Loading the cluster_node_context will load the node_context
            # and update it with the latest values
            _cnc_new = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id,
                first_sync_done=_cnc.first_sync_done,
                is_managed=_cnc.is_managed
            )
            _cnc_new.save()
            del _cnc_new
            # Update cluster details
            self.update_cluster_details(_tc.integration_id)
            _tag = "provisioner/%s" % _tc.integration_id
            if _tag in self.tags:
                _index_key = "/indexes/tags/%s" % _tag
                self.tags.remove(_tag)
                self.save()
                etcd_utils.delete(_index_key)
                _msg = "node_sync, STALE provisioner node " \
                       "found! re-configuring monitoring " \
                       "(job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % self.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id': _tc.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(
                    job_id=_job_id,
                    status="new",
                    payload=payload
                ).save()
                logger.log("debug", NS.publisher_id,
                           {"message": _msg % _job_id})
            if _tc.sds_name in ["gluster", "RHGS"]:
                bricks = etcd_utils.read(
                    "clusters/{0}/Bricks/all/{1}".format(
                        _tc.integration_id,
                        self.fqdn
                    )
                )
                for brick in bricks.leaves:
                    try:
                        etcd_utils.write(
                            "{0}/status".format(brick.key),
                            "Stopped"
                        )
                    except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                        pass
        elif current_value == "UP" and \
                str(_cnc.is_managed).lower() == "yes":
            msg = "{0} is UP".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                "UP",
                msg,
                "node_{0}".format(self.fqdn),
                "INFO",
                node_id=self.node_id,
                integration_id=_tc.integration_id
            )
        del _cnc