def on_change(self, attr, prev_value, current_value):
    """React to a change of this node's ``status`` attribute.

    Nothing happens unless ``attr`` is ``"status"`` and the local node
    carries the ``tendrl/monitor`` tag. For a managed node:

    * ``current_value is None``: the node is marked ``DOWN``, a WARNING
      ``node_status`` event is emitted, the cluster node context is
      re-saved, cluster details are updated, a provisioner tag (if any)
      is dropped, and for gluster/RHGS clusters every brick of the node
      is marked ``Stopped``.
    * ``current_value == "UP"``: an INFO ``node_status`` event is emitted.

    :param attr: name of the attribute that changed
    :param prev_value: previous value (unused)
    :param current_value: new value of the attribute
    """
    if attr != "status" or "tendrl/monitor" not in NS.node_context.tags:
        return

    tendrl_ctx = NS.tendrl.objects.TendrlContext(
        node_id=self.node_id
    ).load()
    cluster_id = tendrl_ctx.integration_id
    # Needed to check whether the node is managed
    cluster_node_ctx = NS.tendrl.objects.ClusterNodeContext(
        node_id=self.node_id,
        integration_id=cluster_id
    ).load()

    if current_value is None:
        if str(cluster_node_ctx.is_managed).lower() == "yes":
            self.status = "DOWN"
            self.save()
            down_msg = "Node {0} is DOWN".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                self.status,
                down_msg,
                "node_{0}".format(self.fqdn),
                "WARNING",
                node_id=self.node_id,
                integration_id=cluster_id
            )
            # Saving a fresh cluster_node_context reloads node_context,
            # so it gets updated with the latest values
            NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=cluster_id,
                first_sync_done=cluster_node_ctx.first_sync_done,
                is_managed=cluster_node_ctx.is_managed
            ).save()
            # Update cluster details
            self.update_cluster_details(cluster_id)

            provisioner_tag = "provisioner/%s" % cluster_id
            if provisioner_tag in self.tags:
                tag_index_key = "/indexes/tags/%s" % provisioner_tag
                self.tags.remove(provisioner_tag)
                self.save()
                etcd_utils.delete(tag_index_key)

            if tendrl_ctx.sds_name in ["gluster", "RHGS"]:
                node_bricks = etcd_utils.read(
                    "clusters/{0}/Bricks/all/{1}".format(
                        cluster_id,
                        self.fqdn
                    )
                )
                for brick_entry in node_bricks.leaves:
                    status_key = "{0}/status".format(brick_entry.key)
                    try:
                        etcd_utils.write(status_key, "Stopped")
                    except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                        pass
    elif current_value == "UP":
        if str(cluster_node_ctx.is_managed).lower() == "yes":
            up_msg = "{0} is UP".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                "UP",
                up_msg,
                "node_{0}".format(self.fqdn),
                "INFO",
                node_id=self.node_id,
                integration_id=cluster_id
            )
    del cluster_node_ctx
def volume_remove_brick_force(self, event):
    """Clean up after bricks were force-removed from a volume.

    Sleeps for one sync interval, then for every brick named in the
    event: deletes the brick's etcd subtree, schedules dashboard and
    graphite deletions, and wipes the brick listing of the owning
    volume. Finally a ``_sync_now`` key is written and refreshed with
    the sync interval.

    :param event: mapping whose ``['message']`` holds ``'bricks'``
        (space separated ``host:/path`` entries in one string) and
        ``'volume'``
    """
    time.sleep(self.sync_interval)
    # Event returns bricks list as space separated single string
    for brick in event['message']['bricks'].split(" "):
        brick_parts = brick.split(":/")
        brick_host = brick_parts[0]
        brick_dir = brick_parts[1].replace('/', '_')
        fetched_brick = NS.gluster.objects.Brick(
            fqdn=brick_host,
            brick_dir=brick_dir
        ).load()

        brick_key = "clusters/{0}/Bricks/all/{1}/{2}".format(
            NS.tendrl_context.integration_id,
            brick_host,
            brick_dir
        )
        try:
            NS._int.wclient.delete(brick_key, recursive=True)
        except etcd.EtcdKeyNotFound:
            pass

        resource_name = "%s|%s" % (event['message']['volume'], brick)
        job_id = monitoring_utils.update_dashboard(
            resource_name,
            RESOURCE_TYPE_BRICK,
            NS.tendrl_context.integration_id,
            "delete"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Update dashboard job %s "
                        "created" % job_id}
        )

        job_id = monitoring_utils.delete_resource_from_graphite(
            resource_name,
            RESOURCE_TYPE_BRICK,
            NS.tendrl_context.integration_id,
            "delete"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Delete resource from graphite job %s "
                        "created" % job_id}
        )

        # Remove all the brick information under the volume, as the
        # subvolume might have changed; the next sync re-creates it
        volume_brick_path = "clusters/{0}/Volumes/{1}/Bricks".format(
            NS.tendrl_context.integration_id,
            fetched_brick.vol_id
        )
        try:
            NS._int.wclient.delete(volume_brick_path, recursive=True)
        except etcd.EtcdKeyNotFound:
            pass

    _trigger_sync_key = \
        'clusters/%s/_sync_now' % NS.tendrl_context.integration_id
    etcd_utils.write(_trigger_sync_key, 'true')
    etcd_utils.refresh(_trigger_sync_key, self.sync_interval)
def shutdown(signum, frame):
    """Signal handler: de-register this node from etcd tag indexes, stop.

    Removes the node id from the ``gluster/server`` and
    ``tendrl/integration/gluster`` tag indexes and, when the node holds
    the cluster's provisioner tag, deletes the provisioner index. A
    missing etcd key aborts the remaining cleanup with a debug log.
    Afterwards ``complete`` is set and ``m`` is stopped.

    :param signum: signal number (unused)
    :param frame: current stack frame (unused)
    """
    def _drop_node_from_index(index_key):
        # Read the JSON list stored at index_key, drop this node's id
        # if present and write the list back.
        members = json.loads(etcd_utils.read(index_key).value)
        if NS.node_context.node_id in members:
            members.remove(NS.node_context.node_id)
        etcd_utils.write(index_key, json.dumps(members))

    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "Signal handler: stopping"}
    )
    try:
        # Remove the node's name from gluster server tag
        _drop_node_from_index("/indexes/tags/gluster/server")

        node_tags = NS.node_context.tags
        provisioner_tag = \
            'provisioner/%s' % NS.tendrl_context.integration_id
        if provisioner_tag in node_tags:
            etcd_utils.delete(
                "/indexes/tags/provisioner/%s" %
                NS.tendrl_context.integration_id,
                recursive=True
            )

        _drop_node_from_index("/indexes/tags/tendrl/integration/gluster")
    except etcd.EtcdKeyNotFound:
        failure_msg = ("Couldnt remove node from "
                       "gluster servers list tag."
                       "integration_id: %s, node_id: %s" % (
                           NS.tendrl_context.integration_id,
                           NS.node_context.node_id
                       ))
        logger.log("debug", NS.publisher_id, {"message": failure_msg})

    complete.set()
    m.stop()
def shutdown(signum, frame):
    """Signal handler: de-register this node from etcd tag indexes, stop.

    Removes the node id from the ``gluster/server`` and
    ``tendrl/integration/gluster`` tag indexes and, when the node's
    JSON-decoded tags contain the cluster's provisioner tag, deletes
    the provisioner index. A missing etcd key aborts the remaining
    cleanup with a debug log. Afterwards ``complete`` is set and ``m``
    is stopped.

    :param signum: signal number (unused)
    :param frame: current stack frame (unused)
    """
    def _drop_node_from_index(index_key):
        # Read the JSON list stored at index_key, drop this node's id
        # if present and write the list back.
        members = json.loads(etcd_utils.read(index_key).value)
        if NS.node_context.node_id in members:
            members.remove(NS.node_context.node_id)
        etcd_utils.write(index_key, json.dumps(members))

    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "Signal handler: stopping"}
    )
    try:
        # Remove the node's name from gluster server tag
        _drop_node_from_index("/indexes/tags/gluster/server")

        # NOTE(review): tags are decoded from JSON here, so they are
        # presumably stored as a JSON string - confirm against
        # NodeContext.
        node_tags = json.loads(NS.node_context.tags)
        provisioner_tag = \
            'provisioner/%s' % NS.tendrl_context.integration_id
        if provisioner_tag in node_tags:
            etcd_utils.delete(
                "/indexes/tags/provisioner/%s" %
                NS.tendrl_context.integration_id,
                recursive=True
            )

        _drop_node_from_index("/indexes/tags/tendrl/integration/gluster")
    except etcd.EtcdKeyNotFound:
        failure_msg = ("Couldnt remove node from "
                       "gluster servers list tag."
                       "integration_id: %s, node_id: %s" % (
                           NS.tendrl_context.integration_id,
                           NS.node_context.node_id
                       ))
        logger.log("debug", NS.publisher_id, {"message": failure_msg})

    complete.set()
    m.stop()
def push_operation(self):
    """Append this job message to etcd and build its log line.

    The message is appended as JSON under ``/messages/jobs/<job_id>``
    and that key's TTL is refreshed with the configured
    ``message_retention_time``.

    :return: ``"<job_id>:<payload message>"``
    """
    job_messages_key = "/messages/jobs/%s" % self.message.job_id
    etcd_utils.write(
        job_messages_key,
        Message.to_json(self.message),
        append=True
    )
    retention = NS.config.data['message_retention_time']
    etcd_utils.refresh(job_messages_key, ttl=retention)
    return "%s:%s" % (
        self.message.job_id,
        self.message.payload["message"]
    )
def on_change(self, attr, prev_value, current_value):
    """React to this node's ``status`` attribute becoming ``None``.

    The node is marked ``DOWN`` and a WARNING ``node_status`` event is
    emitted. If the node held the cluster's provisioner tag, the tag
    and its index are removed and a ``ConfigureMonitoring`` job is
    queued. For gluster clusters every brick of the node is marked
    ``Stopped``.

    :param attr: name of the attribute that changed
    :param prev_value: previous value (unused)
    :param current_value: new value of the attribute
    """
    if attr != "status":
        return
    if current_value is not None:
        return

    self.status = "DOWN"
    self.save()
    down_msg = "Node {0} is DOWN".format(self.fqdn)
    event_utils.emit_event(
        "node_status",
        self.status,
        down_msg,
        "node_{0}".format(self.fqdn),
        "WARNING",
        node_id=self.node_id
    )

    tendrl_ctx = NS.tendrl.objects.TendrlContext(
        node_id=self.node_id
    ).load()
    provisioner_tag = "provisioner/%s" % tendrl_ctx.integration_id
    if provisioner_tag in self.tags:
        tag_index_key = "/indexes/tags/%s" % provisioner_tag
        self.tags.remove(provisioner_tag)
        self.save()
        etcd_utils.delete(tag_index_key)

        # Queue a job that re-configures monitoring for the cluster
        job_payload = {
            "tags": ["tendrl/node_%s" % self.node_id],
            "run": "tendrl.flows.ConfigureMonitoring",
            "status": "new",
            "parameters": {
                'TendrlContext.integration_id': tendrl_ctx.integration_id
            },
            "type": "node"
        }
        new_job_id = str(uuid.uuid4())
        NS.tendrl.objects.Job(
            job_id=new_job_id,
            status="new",
            payload=job_payload
        ).save()
        stale_msg = ("node_sync, STALE provisioner node "
                     "found! re-configuring monitoring "
                     "(job-id: %s) on this node")
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": stale_msg % new_job_id}
        )

    if tendrl_ctx.sds_name == "gluster":
        node_bricks = etcd_utils.read(
            "clusters/{0}/Bricks/all/{1}".format(
                tendrl_ctx.integration_id,
                self.fqdn
            )
        )
        for brick_entry in node_bricks.leaves:
            status_key = "{0}/status".format(brick_entry.key)
            try:
                etcd_utils.write(status_key, "Stopped")
            except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                pass
def save(self, update=True, ttl=None):
    """Write this object to the central store (etcd).

    Watchable attributes (``watch_attrs`` in the object definition)
    are written one key at a time; ``json``/``list`` typed values are
    JSON-encoded first, and attributes listed in ``_attrs_with_ttl``
    are written with ``self._ttl`` when it is set. The ``data``,
    ``updated_at`` and ``hash`` keys are written only when
    ``hash_compare_with_central_store`` does not report the object as
    unchanged; that comparison is skipped for classes whose name
    contains "Message". Finally ``watch_attrs()`` is invoked.

    :param update: unused, kept for interface compatibility
    :param ttl: optional TTL applied to the object's root key when the
        object is re-written
    """
    cls_name = self.__class__.__name__
    hash_key_changed = True
    if "Message" not in cls_name:
        # If local object.hash is equal to the central_store
        # object.hash there is no change to write
        if self.hash_compare_with_central_store(ttl=ttl):
            hash_key_changed = False

    rendered_obj = self.render()
    watchables = self._defs.get("watch_attrs", [])
    if cls_name in ['Config', 'Definition'] or len(watchables) > 0:
        for item in rendered_obj:
            if item['name'] not in watchables:
                continue
            _type = self._defs.get("attrs", {}).get(
                item['name'],
                {}
            ).get("type")
            if _type and _type.lower() in ['json', 'list'] and \
                    item['value']:
                try:
                    item['value'] = json.dumps(item['value'])
                except ValueError:
                    # Instances normally have no ``__name__``; the
                    # original lookup would raise AttributeError from
                    # inside this handler. Fall back to the class name.
                    _msg = "Error save() attr %s for object %s" % (
                        item['name'],
                        getattr(self, "__name__", cls_name)
                    )
                    logger.log(
                        "debug",
                        NS.publisher_id,
                        {"message": _msg}
                    )
            if self._ttl and item['name'] in self._attrs_with_ttl:
                etcd_utils.write(
                    item['key'],
                    item['value'],
                    quorum=True,
                    ttl=self._ttl
                )
            else:
                etcd_utils.write(item['key'], item['value'], quorum=True)

    if hash_key_changed:
        data_key = self.value + '/data'
        etcd_utils.write(data_key, self.json)
        updated_at_key = self.value + '/updated_at'
        hash_key = self.value + '/hash'
        etcd_utils.write(updated_at_key, str(time_utils.now()))
        if hasattr(self, 'hash'):
            etcd_utils.write(hash_key, self.hash)
        if ttl:
            etcd_utils.refresh(self.value, ttl)

    self.watch_attrs()
def save(self, update=True, ttl=None):
    """Write this object to the central store (etcd).

    Watchable attributes (``watch_attrs`` in the object definition)
    are written one key at a time; ``json``/``list`` typed values are
    JSON-encoded first. The ``data``, ``updated_at`` and ``hash`` keys
    are written only when ``hash_compare_with_central_store`` does not
    report the object as unchanged; that comparison is skipped for
    classes whose name contains "Message". Finally ``watch_attrs()``
    is invoked.

    :param update: unused, kept for interface compatibility
    :param ttl: optional TTL applied to the object's root key when the
        object is re-written
    """
    cls_name = self.__class__.__name__
    hash_key_changed = True
    if "Message" not in cls_name:
        # If local object.hash is equal to the central_store
        # object.hash there is no change to write
        if self.hash_compare_with_central_store(ttl=ttl):
            hash_key_changed = False

    rendered_obj = self.render()
    watchables = self._defs.get("watch_attrs", [])
    if cls_name in ['Config', 'Definition'] or len(watchables) > 0:
        for item in rendered_obj:
            if item['name'] not in watchables:
                continue
            _type = self._defs.get("attrs", {}).get(
                item['name'],
                {}
            ).get("type")
            if _type and _type.lower() in ['json', 'list'] and \
                    item['value']:
                try:
                    item['value'] = json.dumps(item['value'])
                except ValueError:
                    # Instances normally have no ``__name__``; the
                    # original lookup would raise AttributeError from
                    # inside this handler. Fall back to the class name.
                    _msg = "Error save() attr %s for object %s" % (
                        item['name'],
                        getattr(self, "__name__", cls_name)
                    )
                    logger.log(
                        "debug",
                        NS.publisher_id,
                        {"message": _msg}
                    )
            etcd_utils.write(item['key'], item['value'], quorum=True)

    if hash_key_changed:
        data_key = self.value + '/data'
        etcd_utils.write(data_key, self.json)
        updated_at_key = self.value + '/updated_at'
        hash_key = self.value + '/hash'
        etcd_utils.write(updated_at_key, str(time_utils.now()))
        if hasattr(self, 'hash'):
            etcd_utils.write(hash_key, self.hash)
        if ttl:
            etcd_utils.refresh(self.value, ttl)

    self.watch_attrs()
def sync():
    """Publish this node's disks and block devices to etcd.

    Every entry is written with a TTL of ``sync_interval`` (default 10)
    plus 250. Disks whose driver contains "virtio" are saved as
    ``VirtualDisk``, all others as ``Disk``. Block devices are saved as
    ``BlockDevice`` objects and additionally indexed under ``used`` and
    ``free``; the raw disk reference is stored last.

    Any failure is reported through an ``ExceptionMessage`` event
    instead of being raised.
    """
    try:
        _keep_alive_for = int(NS.config.data.get("sync_interval", 10)) + 250
        disks = get_node_disks()

        # Map disk name -> {disk_id, ssd}; it is handed to
        # get_node_block_devices() and read back for the 'ssd' flag
        disk_map = {}
        for disk in disks:
            disk_map[disks[disk]['disk_name']] = dict(
                disk_id=disks[disk]['disk_id'],
                ssd=False
            )
        block_devices = get_node_block_devices(disk_map)

        for disk in disks:
            disk_info = disks[disk]
            mapped = disk_map[disk_info['disk_name']]
            if mapped:
                disk_info['ssd'] = mapped['ssd']
            if "virtio" in disk_info["driver"]:
                # Virtual disk
                NS.tendrl.objects.VirtualDisk(**disk_info).save(
                    ttl=_keep_alive_for
                )
            else:
                # Physical disk
                NS.tendrl.objects.Disk(**disk_info).save(
                    ttl=_keep_alive_for
                )

        for device in block_devices['all']:
            NS.tendrl.objects.BlockDevice(**device).save(
                ttl=_keep_alive_for
            )

        for state in ('used', 'free'):
            for device_id in block_devices[state]:
                # "/dev/sda" -> "dev_sda": slashes become underscores
                # and the first underscore is dropped
                device_key = device_id.replace("/", "_").replace("_", "", 1)
                etcd_utils.write(
                    "nodes/%s/LocalStorage/BlockDevices/%s/%s" % (
                        NS.node_context.node_id,
                        state,
                        device_key
                    ),
                    device_id,
                    ttl=_keep_alive_for
                )

        raw_reference = get_raw_reference()
        etcd_utils.write(
            "nodes/%s/LocalStorage/DiskRawReference" %
            NS.node_context.node_id,
            raw_reference,
            ttl=_keep_alive_for,
        )
    except Exception as ex:
        # ``ex.message`` may be a non-string (e.g. KeyError(1)), is
        # empty for multi-argument exceptions and is absent on
        # Python 3, so format the exception itself instead.
        _msg = "node_sync disks sync failed: %s" % (ex,)
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": _msg,
                         "exception": ex}
            )
        )
def test_write():
    """etcd_utils.write must call the client and propagate etcd errors."""
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.wclient = importlib.import_module(
        "tendrl.commons"
        ".tests.fixtures."
        "client"
    ).Client()
    NS._int.wreconnect = type("Dummy", (object, ), {})

    with patch.object(Client, "write") as mock_write:
        etcd_utils.write("key", "test_value", False)
        # ``mock_write.assert_called`` is a method object and therefore
        # always truthy; ``called`` is the flag recording the call.
        assert mock_write.called

    with patch.object(Client, "write", raise_etcdconnectionfailed):
        with pytest.raises(etcd.EtcdConnectionFailed):
            etcd_utils.write("key", "test_value", False)

    with patch.object(Client, "write", raise_etcdkeynotfound):
        with pytest.raises(etcd.EtcdKeyNotFound):
            etcd_utils.write("key", "test_value", False)
def test_write():
    """etcd_utils.write must call the client and propagate etcd errors."""
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.wclient = importlib.import_module(
        "tendrl.commons"
        ".tests.fixtures."
        "client"
    ).Client()
    NS._int.wreconnect = type("Dummy", (object,), {})

    with patch.object(Client, "write") as mock_write:
        etcd_utils.write("key", "test_value", False)
        # ``mock_write.assert_called`` is a method object and therefore
        # always truthy; ``called`` is the flag recording the call.
        assert mock_write.called

    with patch.object(Client, "write", raise_etcdconnectionfailed):
        with pytest.raises(etcd.EtcdConnectionFailed):
            etcd_utils.write("key", "test_value", False)

    with patch.object(Client, "write", raise_etcdkeynotfound):
        with pytest.raises(etcd.EtcdKeyNotFound):
            etcd_utils.write("key", "test_value", False)
def update_last_seen_at():
    """Store the current time as this node's ``last_seen_at`` in etcd."""
    last_seen_key = \
        '/monitoring/nodes/%s/last_seen_at' % NS.node_context.node_id
    timestamp = tendrl_now().isoformat()
    etcd_utils.write(last_seen_key, timestamp)
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree,
    lvs
):
    """Persist one gluster volume and its bricks hosted on this node.

    Reloads ``NS.node_context`` first. On the provisioner node a
    ``volume_status`` event is emitted when the stored status differs
    from the reported one, and the sync is skipped while a job on the
    volume is in progress. The volume, its options and rebalance and
    geo-replication details are saved. Bricks are then walked by
    index, and for each brick that resolves to this node the brick,
    its device details and its client connections are saved.

    :param volumes: mapping keyed ``volume<index>.<field>`` and
        ``volume<index>.brick<n>.<field>``
    :param index: index of the volume inside ``volumes``
    :param vol_options: mapping keyed ``volume<index>.options.*``
    :param sync_ttl: TTL used when saving; increased by 4 after every
        brick saved for this node
    :param cluster_short_name: cluster name used in event messages
    :param devicetree: passed to ``update_brick_device_details``
    :param lvs: passed to ``brick_utilization``
    :return: the first brick index with no hostname entry, or ``None``
        when the sync was skipped because of a running job
    :raises KeyError: when a mandatory volume field is missing
    """
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    integration_id = NS.tendrl_context.integration_id
    vol_key = 'volume%s' % index

    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                integration_id,
                vol_id=volumes[vol_key + '.id']
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                    _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on volume. skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes[vol_key + '.status']
            if stored_volume_status not in [None, ""] and \
                    current_status != stored_volume_status:
                changed_vol_name = volumes[vol_key + '.name']
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                    changed_vol_name,
                    cluster_short_name,
                    stored_volume_status,
                    current_status
                )
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    "volume_%s" % changed_vol_name,
                    'WARNING' if current_status == 'Stopped' else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": changed_vol_name}
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            # Only a missing etcd key is tolerated here; a KeyError
            # goes to the caller with its original traceback.
            if isinstance(ex, KeyError):
                raise

    vol_id = volumes[vol_key + '.id']
    volume = NS.tendrl.objects.GlusterVolume(
        integration_id,
        vol_id=vol_id
    ).load()
    volume.vol_type = "arbiter" \
        if int(volumes[vol_key + '.arbiter_count']) > 0 \
        else volumes[vol_key + '.type']
    # (volume attribute, key suffix) pairs copied over verbatim
    for attr_name, key_suffix in (
        ("name", ".name"),
        ("transport_type", ".transport_type"),
        ("status", ".status"),
        ("brick_count", ".brickcount"),
        ("snap_count", ".snap_count"),
        ("stripe_count", ".stripe_count"),
        ("replica_count", ".replica_count"),
        ("subvol_count", ".subvol_count"),
        ("arbiter_count", ".arbiter_count"),
        ("disperse_count", ".disperse_count"),
        ("redundancy_count", ".redundancy_count"),
        ("quorum_status", ".quorum_status"),
        ("snapd_status", ".snapd_svc.online_status"),
        ("snapd_inited", ".snapd_svc.inited"),
    ):
        setattr(volume, attr_name, volumes[vol_key + key_suffix])
    vol_name = volumes[vol_key + '.name']

    if NS.tendrl.objects.GlusterVolume(
        integration_id,
        vol_id=vol_id
    ).exists():
        existing_vol = NS.tendrl.objects.GlusterVolume(
            integration_id,
            vol_id=vol_id
        ).load()
        volume_profiling_old_value = existing_vol.profiling_enabled
    else:
        volume_profiling_old_value = volume.profiling_enabled

    profile_key = vol_key + '.profile_enabled'
    if profile_key in volumes:
        if int(volumes[profile_key]) == 1:
            volume_profiling_new_value = "yes"
        else:
            volume_profiling_new_value = "no"
    else:
        volume_profiling_new_value = None
    volume.profiling_enabled = volume_profiling_new_value

    if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
        # Raise alert for the same value change
        msg = ("Value of volume profiling for volume: %s "
               "of cluster %s changed from %s to %s" % (
                   vol_name,
                   cluster_short_name,
                   volume_profiling_old_value,
                   volume_profiling_new_value))
        # NOTE(review): entity_type is BRICK although this is a volume
        # event - kept as in the original, confirm whether intended.
        event_utils.emit_event(
            "volume_profiling_status",
            volume_profiling_new_value,
            msg,
            "volume_%s" % vol_name,
            'INFO',
            tags={"entity_type": RESOURCE_TYPE_BRICK,
                  "volume_name": vol_name}
        )
    volume.save(ttl=sync_ttl)

    # Save the default values of volume options
    # NOTE(review): range(1, count) never reads option number `count`;
    # confirm whether the last option is skipped on purpose.
    vol_opt_dict = {}
    options_key = vol_key + '.options'
    for opt_count in range(1, int(vol_options[options_key + '.count'])):
        opt_value = vol_options['%s.value%s' % (options_key, opt_count)]
        opt_name = vol_options['%s.key%s' % (options_key, opt_count)]
        vol_opt_dict[opt_name] = opt_value
    volume.options = vol_opt_dict
    volume.save()

    rebal_key = vol_key + '.rebalance'
    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=vol_id,
        rebal_id=volumes[rebal_key + '.id'],
        rebal_status=volumes[rebal_key + '.status'],
        rebal_failures=volumes[rebal_key + '.failures'],
        rebal_skipped=volumes[rebal_key + '.skipped'],
        rebal_lookedup=volumes[rebal_key + '.lookedup'],
        rebal_files=volumes[rebal_key + '.files'],
        rebal_data=volumes[rebal_key + '.data'],
        time_left=volumes.get(rebal_key + '.time_left'),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 address of current node
    # NOTE(review): network_ip is collected but not used afterwards.
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                               "any ipv4 networks for node"
                               " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )

    while True:
        try:
            # Update brick node wise; a missing hostname key raises
            # KeyError, which ends the walk over the bricks
            brick_key = '%s.brick%s' % (vol_key, b_index)
            hostname = volumes[brick_key + '.hostname']
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    # Brick belongs to another node
                    b_index += 1
                    continue
            except (TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue

            # Floor division keeps the subvolume index an integer on
            # Python 3 as well (identical to "/" on Python 2 ints).
            sub_vol_size = int(
                volumes[vol_key + '.brickcount']
            ) // int(
                volumes[vol_key + '.subvol_count']
            )
            brick_path = volumes[brick_key + '.path']
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += brick_path.split(":")[-1].replace("/", "_")
            brick_dir = brick_name.split(":_")[-1]

            # Raise alerts if the brick status changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_dir
                ).load()
                current_status = volumes.get(brick_key + '.status')
                if stored_brick.status and \
                        current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s") % (
                        brick_path,
                        vol_name,
                        current_status
                    )
                    instance = "volume_%s|brick_%s" % (
                        vol_name,
                        brick_path
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": vol_name}
                    )
            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"
            vol_brick_path = brk_pth % (
                integration_id,
                vol_id,
                str((b_index - 1) // sub_vol_size),
                brick_name
            )
            etcd_utils.write(vol_brick_path, "")

            brick = NS.tendrl.objects.GlusterBrick(
                integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_dir
            ).load()
            brick.integration_id = integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_dir
            brick.name = brick_name
            brick.vol_id = vol_id
            brick.sequence_number = b_index
            brick.brick_path = brick_path
            brick.hostname = volumes.get(brick_key + '.hostname')
            brick.port = volumes.get(brick_key + '.port')
            brick.vol_name = vol_name
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(brick_key + '.status')
            brick.filesystem_type = volumes.get(
                brick_key + '.filesystem_type'
            )
            brick.mount_opts = volumes.get(brick_key + '.mount_options')
            brick.utilization = brick_utilization.brick_utilization(
                brick_path,
                lvs
            )
            brick.client_count = volumes.get(brick_key + '.client_count')
            brick.is_arbiter = volumes.get(brick_key + '.is_arbiter')
            brick.save(ttl=sync_ttl)

            # sync brick device details
            brick_device_details.update_brick_device_details(
                brick_name,
                brick_path,
                devicetree,
                sync_ttl
            )

            # Sync the brick client details
            # NOTE(review): this compares the raw value with 0, which
            # on Python 3 fails for None or str - confirm the type.
            c_index = 1
            if volumes.get(brick_key + '.client_count') > 0:
                while True:
                    client_key = '%s.client%s' % (brick_key, c_index)
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_dir,
                            hostname=volumes[client_key + '.hostname'],
                            bytesread=volumes[client_key + '.bytesread'],
                            byteswrite=volumes[
                                client_key + '.byteswrite'
                            ],
                            opversion=volumes[client_key + '.opversion']
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        # No more clients for this brick
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
def run(self):
    """Delete a cluster's details from etcd.

    Collects the cluster subtrees, the alerting entries of the cluster
    and of its nodes, and the ``/nodes/<id>`` trees of nodes whose
    status is "down" (those are also removed from the integration tag
    index). Every collected key is deleted recursively, with missing
    keys only logged, and the cluster's short name is cleared.

    :return: ``True``
    """
    job_id = self.parameters['job_id']
    flow_id = self.parameters['flow_id']
    logger.log(
        "info",
        NS.publisher_id,
        {"message": "Deleting cluster details."},
        job_id=job_id,
        flow_id=flow_id,
    )
    integration_id = self.parameters['TendrlContext.integration_id']

    etcd_keys_to_delete = [
        "/clusters/%s/%s" % (integration_id, child)
        for child in (
            "nodes",
            "Bricks",
            "Volumes",
            "GlobalDetails",
            "TendrlContext",
            "Utilization",
            "raw_map",
        )
    ]
    etcd_keys_to_delete.append("/alerting/clusters/%s" % integration_id)

    cluster_nodes = etcd_utils.read("/clusters/%s/nodes" % integration_id)
    for node in cluster_nodes.leaves:
        node_alert_key = "/alerting/nodes/%s" % node.key.split("/")[-1]
        etcd_keys_to_delete.append(node_alert_key)
        try:
            # delete node alerts from /alerting/alerts
            for node_alert in etcd_utils.read(node_alert_key).leaves:
                etcd_keys_to_delete.append(
                    "/alerting/alerts/%s" % node_alert.key.split("/")[-1]
                )
        except etcd.EtcdKeyNotFound:
            # No node alerts, continue
            pass

    # Find the alerting/alerts entries to be deleted
    try:
        cluster_alerts = etcd_utils.read(
            "/alerting/clusters/%s" % integration_id
        )
        for entry in cluster_alerts.leaves:
            etcd_keys_to_delete.append(
                "/alerting/alerts/%s" % entry.key.split("/")[-1]
            )
    except etcd.EtcdKeyNotFound:
        # No cluster alerts, continue
        pass

    try:
        index_key = "/indexes/tags/tendrl/integration/%s" % integration_id
        indexed_ids = json.loads(etcd_utils.read(index_key).value)
        # Iterate over a copy, the list is modified inside the loop
        for indexed_id in list(indexed_ids):
            node_obj = NS.tendrl.objects.NodeContext(
                node_id=indexed_id
            ).load()
            if node_obj.status.lower() != "down":
                continue
            # Remove cluster indexes and details of the down node
            indexed_ids.remove(indexed_id)
            logger.log(
                "warning",
                NS.publisher_id,
                {"message": "Deleting down node %s details" %
                            node_obj.fqdn},
                job_id=job_id,
                flow_id=flow_id,
            )
            etcd_keys_to_delete.append("/nodes/%s" % indexed_id)
        etcd_utils.write(index_key, json.dumps(indexed_ids))
    except (
        etcd.EtcdKeyNotFound,
        ValueError,
        TypeError,
        AttributeError,
        IndexError
    ):
        # If index details are not present the un-manage flow need not
        # stop: once node-agent works properly these details are
        # populated again by the node sync
        pass

    # Remove the cluster details
    for key in list(set(etcd_keys_to_delete)):
        try:
            etcd_utils.delete(key, recursive=True)
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": "%s key not found for deletion" % key},
                job_id=job_id,
                flow_id=flow_id,
            )

    # remove short name
    cluster = NS.tendrl.objects.Cluster(
        integration_id=integration_id
    ).load()
    cluster.short_name = ""
    cluster.save()
    return True
def volume_remove_brick_force(self, event):
    """Clean up after bricks were force-removed from a volume.

    Sleeps for one sync interval, then for every brick named in the
    event: resolves the brick host to the fqdn stored for its node,
    deletes the brick's etcd subtree, schedules dashboard and graphite
    deletions, and wipes the brick listing of the owning volume.
    Afterwards a ``_sync_now`` key is written and refreshed with the
    sync interval. A missing etcd key aborts the whole cleanup with a
    debug log.

    :param event: mapping whose ``['message']`` holds ``'bricks'``
        (space separated ``host:/path`` entries in one string) and
        ``'volume'``
    """
    time.sleep(self.sync_interval)
    # Event returns bricks list as space separated single string
    bricks = event['message']['bricks'].split(" ")
    try:
        for reported_brick in bricks:
            # find fqdn using ip
            ip = socket.gethostbyname(reported_brick.split(":/")[0])
            node_id = etcd_utils.read("indexes/ip/%s" % ip).value
            fqdn = NS.tendrl.objects.ClusterNodeContext(
                node_id=node_id
            ).load().fqdn
            brick = fqdn + ":" + reported_brick.split(":")[-1]
            brick_parts = brick.split(":/")
            brick_host = brick_parts[0]
            brick_dir = brick_parts[1].replace('/', '_')

            fetched_brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=brick_host,
                brick_dir=brick_dir
            ).load()

            # delete brick
            etcd_utils.delete(
                "clusters/{0}/Bricks/all/{1}/{2}".format(
                    NS.tendrl_context.integration_id,
                    brick_host,
                    brick_dir
                ),
                recursive=True,
            )

            resource_name = "%s|%s" % (event['message']['volume'], brick)
            # delete alert dashboard
            job_id = monitoring_utils.update_dashboard(
                resource_name,
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": "Update dashboard job %s "
                            "created" % job_id}
            )

            # delete brick details from graphite
            job_id = monitoring_utils.delete_resource_from_graphite(
                resource_name,
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": "Delete resource from graphite job %s "
                            "created" % job_id}
            )

            # Remove all the brick information under the volume, as the
            # subvolume might have changed; the next sync re-creates it
            volume_brick_path = "clusters/{0}/Volumes/{1}/Bricks".format(
                NS.tendrl_context.integration_id,
                fetched_brick.vol_id
            )
            etcd_utils.delete(volume_brick_path, recursive=True)

        _trigger_sync_key = \
            'clusters/%s/_sync_now' % NS.tendrl_context.integration_id
        etcd_utils.write(_trigger_sync_key, 'true')
        etcd_utils.refresh(_trigger_sync_key, self.sync_interval)
    except etcd.EtcdKeyNotFound:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Unable to delete bricks %s" % bricks}
        )
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree
):
    """Persist one gluster volume and its bricks hosted on this node.

    Reloads ``NS.node_context`` first. On the provisioner node a
    ``volume_status`` event is emitted when the stored status differs
    from the reported one, and the sync is skipped while a job on the
    volume is in progress. The volume, its options and rebalance and
    geo-replication details are saved. Bricks are then walked by
    index, and for each brick that resolves to this node the brick,
    its device details and its client connections are saved.

    :param volumes: mapping keyed ``volume<index>.<field>`` and
        ``volume<index>.brick<n>.<field>``
    :param index: index of the volume inside ``volumes``
    :param vol_options: mapping keyed ``volume<index>.options.*``
    :param sync_ttl: TTL used when saving; increased by 4 after every
        brick saved for this node
    :param cluster_short_name: cluster name used in event messages
    :param devicetree: passed to ``update_brick_device_details``
    :return: the first brick index with no hostname entry, or ``None``
        when the sync was skipped because of a running job
    :raises KeyError: when a mandatory volume field is missing
    """
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    integration_id = NS.tendrl_context.integration_id
    vol_key = 'volume%s' % index

    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                integration_id,
                vol_id=volumes[vol_key + '.id']
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                    _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on volume. skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes[vol_key + '.status']
            if stored_volume_status not in [None, ""] and \
                    current_status != stored_volume_status:
                changed_vol_name = volumes[vol_key + '.name']
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                    changed_vol_name,
                    cluster_short_name,
                    stored_volume_status,
                    current_status
                )
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    "volume_%s" % changed_vol_name,
                    'WARNING' if current_status == 'Stopped' else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": changed_vol_name}
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            # Only a missing etcd key is tolerated here; a KeyError
            # goes to the caller with its original traceback.
            if isinstance(ex, KeyError):
                raise

    vol_id = volumes[vol_key + '.id']
    volume = NS.tendrl.objects.GlusterVolume(
        integration_id,
        vol_id=vol_id
    ).load()
    volume.vol_type = "arbiter" \
        if int(volumes[vol_key + '.arbiter_count']) > 0 \
        else volumes[vol_key + '.type']
    # (volume attribute, key suffix) pairs copied over verbatim
    for attr_name, key_suffix in (
        ("name", ".name"),
        ("transport_type", ".transport_type"),
        ("status", ".status"),
        ("brick_count", ".brickcount"),
        ("snap_count", ".snap_count"),
        ("stripe_count", ".stripe_count"),
        ("replica_count", ".replica_count"),
        ("subvol_count", ".subvol_count"),
        ("arbiter_count", ".arbiter_count"),
        ("disperse_count", ".disperse_count"),
        ("redundancy_count", ".redundancy_count"),
        ("quorum_status", ".quorum_status"),
        ("snapd_status", ".snapd_svc.online_status"),
        ("snapd_inited", ".snapd_svc.inited"),
    ):
        setattr(volume, attr_name, volumes[vol_key + key_suffix])
    vol_name = volumes[vol_key + '.name']

    if NS.tendrl.objects.GlusterVolume(
        integration_id,
        vol_id=vol_id
    ).exists():
        existing_vol = NS.tendrl.objects.GlusterVolume(
            integration_id,
            vol_id=vol_id
        ).load()
        volume_profiling_old_value = existing_vol.profiling_enabled
    else:
        volume_profiling_old_value = volume.profiling_enabled

    profile_key = vol_key + '.profile_enabled'
    if profile_key in volumes:
        if int(volumes[profile_key]) == 1:
            volume_profiling_new_value = "yes"
        else:
            volume_profiling_new_value = "no"
    else:
        volume_profiling_new_value = None
    volume.profiling_enabled = volume_profiling_new_value

    if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
        # Raise alert for the same value change
        msg = ("Value of volume profiling for volume: %s "
               "of cluster %s changed from %s to %s" % (
                   vol_name,
                   cluster_short_name,
                   volume_profiling_old_value,
                   volume_profiling_new_value))
        # NOTE(review): entity_type is BRICK although this is a volume
        # event - kept as in the original, confirm whether intended.
        event_utils.emit_event(
            "volume_profiling_status",
            volume_profiling_new_value,
            msg,
            "volume_%s" % vol_name,
            'INFO',
            tags={"entity_type": RESOURCE_TYPE_BRICK,
                  "volume_name": vol_name}
        )
    volume.save(ttl=sync_ttl)

    # Save the default values of volume options
    # NOTE(review): range(1, count) never reads option number `count`;
    # confirm whether the last option is skipped on purpose.
    vol_opt_dict = {}
    options_key = vol_key + '.options'
    for opt_count in range(1, int(vol_options[options_key + '.count'])):
        opt_value = vol_options['%s.value%s' % (options_key, opt_count)]
        opt_name = vol_options['%s.key%s' % (options_key, opt_count)]
        vol_opt_dict[opt_name] = opt_value
    volume.options = vol_opt_dict
    volume.save()

    rebal_key = vol_key + '.rebalance'
    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=vol_id,
        rebal_id=volumes[rebal_key + '.id'],
        rebal_status=volumes[rebal_key + '.status'],
        rebal_failures=volumes[rebal_key + '.failures'],
        rebal_skipped=volumes[rebal_key + '.skipped'],
        rebal_lookedup=volumes[rebal_key + '.lookedup'],
        rebal_files=volumes[rebal_key + '.files'],
        rebal_data=volumes[rebal_key + '.data'],
        time_left=volumes.get(rebal_key + '.time_left'),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 address of current node
    # NOTE(review): network_ip is collected but not used afterwards.
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                               "any ipv4 networks for node"
                               " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )

    while True:
        try:
            # Update brick node wise; a missing hostname key raises
            # KeyError, which ends the walk over the bricks
            brick_key = '%s.brick%s' % (vol_key, b_index)
            hostname = volumes[brick_key + '.hostname']
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    # Brick belongs to another node
                    b_index += 1
                    continue
            except (TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue

            # Floor division keeps the subvolume index an integer on
            # Python 3 as well (identical to "/" on Python 2 ints).
            sub_vol_size = int(
                volumes[vol_key + '.brickcount']
            ) // int(
                volumes[vol_key + '.subvol_count']
            )
            brick_path = volumes[brick_key + '.path']
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += brick_path.split(":")[-1].replace("/", "_")
            brick_dir = brick_name.split(":_")[-1]

            # Raise alerts if the brick status changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_dir
                ).load()
                current_status = volumes.get(brick_key + '.status')
                if stored_brick.status and \
                        current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s") % (
                        brick_path,
                        vol_name,
                        current_status
                    )
                    instance = "volume_%s|brick_%s" % (
                        vol_name,
                        brick_path
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": vol_name}
                    )
            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"
            vol_brick_path = brk_pth % (
                integration_id,
                vol_id,
                str((b_index - 1) // sub_vol_size),
                brick_name
            )
            etcd_utils.write(vol_brick_path, "")

            brick = NS.tendrl.objects.GlusterBrick(
                integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_dir
            ).load()
            brick.integration_id = integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_dir
            brick.name = brick_name
            brick.vol_id = vol_id
            brick.sequence_number = b_index
            brick.brick_path = brick_path
            brick.hostname = volumes.get(brick_key + '.hostname')
            brick.port = volumes.get(brick_key + '.port')
            brick.vol_name = vol_name
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(brick_key + '.status')
            brick.filesystem_type = volumes.get(
                brick_key + '.filesystem_type'
            )
            brick.mount_opts = volumes.get(brick_key + '.mount_options')
            brick.utilization = brick_utilization.brick_utilization(
                brick_path
            )
            brick.client_count = volumes.get(brick_key + '.client_count')
            brick.is_arbiter = volumes.get(brick_key + '.is_arbiter')
            brick.save(ttl=sync_ttl)

            # sync brick device details
            brick_device_details.update_brick_device_details(
                brick_name,
                brick_path,
                devicetree,
                sync_ttl
            )

            # Sync the brick client details
            # NOTE(review): this compares the raw value with 0, which
            # on Python 3 fails for None or str - confirm the type.
            c_index = 1
            if volumes.get(brick_key + '.client_count') > 0:
                while True:
                    client_key = '%s.client%s' % (brick_key, c_index)
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_dir,
                            hostname=volumes[client_key + '.hostname'],
                            bytesread=volumes[client_key + '.bytesread'],
                            byteswrite=volumes[
                                client_key + '.byteswrite'
                            ],
                            opversion=volumes[client_key + '.opversion']
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        # No more clients for this brick
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
def process_job(jid):
    """Load job ``jid`` from the queue and run its flow on this node.

    The job is ignored when its status is already "finished",
    "processing" or "failed", when its payload type differs from
    ``NS.type``, when none of its tags is present in this node's tags,
    or when another agent wins the "new" -> "processing" switch.

    On a node tagged ``tendrl/monitor``, a parent-less job whose timeout
    is "yes" and which is still "new" gets a ``valid_until`` stamp ten
    minutes ahead; once that stamp has passed the job is marked "failed"
    and an alert is raised.

    :param jid: id of the job to process
    :raises FlowExecutionFailedError: if saving the "failed" status after
        a flow error hits ``etcd.EtcdCompareFailed``
    """
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.status in [None, ""]:
        job.status = "new"
        job.save()

    NS.node_context = NS.node_context.load()
    # Check job not already "finished", or "processing"
    try:
        if job.status in ["finished", "processing", "failed"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _timeout = None
        _timeout = job.timeout
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" parent jobs are timed out and marked
    # as "failed"
    if "tendrl/monitor" in NS.node_context.tags and _timeout == "yes" and \
            job.status == "new" and job.payload.get('parent') is None:
        _valid_until = job.valid_until
        if _valid_until:
            _now_epoch = (time_utils.now() - datetime.datetime(
                1970, 1, 1).replace(tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                _msg = str("Timed-out (>10min as 'new')")
                job.errors = _msg
                job.status = "failed"
                job.save()
                integration_id = NS.tendrl_context.integration_id
                alert_utils.alert_job_status(
                    "failed",
                    "Job timed out (job_id: %s)" % jid,
                    integration_id=integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'))
                return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            job = NS.tendrl.objects.Job(job_id=jid).load()
            if job.status == "new":
                # To avoid server and storage node do save same time
                job.valid_until = int(_now_plus_10_epoch)
                job.save()

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid, _job_tags)
            logger.log("debug", NS.publisher_id, {"message": _msg})
            return

        try:
            try:
                # Compare-and-swap on the status key: only one agent can
                # move the job from "new" to "processing"
                job_status_key = "/queue/%s/status" % job.job_id
                etcd_utils.write(job_status_key,
                                 "processing",
                                 prevValue="new")
            except etcd.EtcdKeyNotFound:
                # if status watchable attribute not present
                # then it will be created when job save happens
                pass
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            job = NS.tendrl.objects.Job(job_id=jid).load()
            job.locked_by = lock_info
            job.status = "processing"
            job.save(ttl=DEFAULT_JOB_TTL)
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(obj_name,
                                                           flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            # Re-read the job after a short pause and make sure the lock
            # stored on it is still the one written by this node
            time.sleep(2)
            job = NS.tendrl.objects.Job(job_id=jid).load()
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            if job.locked_by != lock_info:
                return

            the_flow = runnable_flow(parameters=job.payload['parameters'],
                                     job_id=job.job_id)
            # Tendrl server does not have fqdn in node_context
            logger.log("info",
                       NS.publisher_id, {
                           "message": "Starting %s Job: %s on %s" %
                           (job.payload['run'].split('.')[-1],
                            job.job_id,
                            NS.node_context.fqdn or "server")
                       },
                       job_id=job.job_id,
                       flow_id=the_flow.parameters['flow_id'])

            logger.log("info",
                       NS.publisher_id, {
                           "message": "Running %s job: %s on %s" %
                           (job.payload['run'].split('.')[-1],
                            job.job_id,
                            NS.node_context.fqdn or "server")
                       },
                       job_id=job.job_id,
                       flow_id=the_flow.parameters['flow_id'])
            the_flow.run()

            try:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.status = "finished"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "Job (%s) for %s finished. " %
                    (job.job_id, job.payload['run'].split('.')[-1])
                },
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            # Only top-level (parent-less) jobs raise a completion alert
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "%s (job ID: %s) completed successfully " %
                    (job.payload['run'].split('.')[-1], job.job_id),
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'))
        except (FlowExecutionFailedError, AtomExecutionFailedError,
                Exception) as e:
            # format_exc() takes an optional traceback depth limit, not
            # the exception; it always formats the exception being handled
            _trace = str(traceback.format_exc())
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": _msg + _trace,
                                     "exception": e
                                 }))
            if the_flow:
                logger.log("error",
                           NS.publisher_id,
                           {"message": _msg + "\n" + _trace},
                           job_id=job.job_id,
                           flow_id=the_flow.parameters['flow_id'])
            else:
                logger.log("error", NS.publisher_id,
                           {"message": _msg + "\n" + _trace})

            try:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.status = "failed"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current " \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = NS.tendrl.objects.Job(job_id=jid).load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'))
                job.save()
def process_job(jid):
    """Load job ``jid`` from the queue and run its flow on this node.

    The job is ignored when its status is already "finished",
    "processing" or "failed", when its payload type differs from
    ``NS.type``, when none of its tags is present in this node's tags,
    or when another agent wins the "new" -> "processing" switch.

    On a node tagged ``tendrl/monitor``, a job whose timeout is "yes"
    and which is still "new" gets a ``valid_until`` stamp ten minutes
    ahead; once that stamp has passed the job is marked "failed".

    :param jid: id of the job to process
    :raises FlowExecutionFailedError: if saving the "failed" status after
        a flow error hits ``etcd.EtcdCompareFailed``
    """
    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.status in [None, ""]:
        job.status = "new"
        job.save()

    NS.node_context = NS.node_context.load()
    # Check job not already "finished", or "processing"
    try:
        if job.status in ["finished", "processing", "failed"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _timeout = None
        _timeout = job.timeout
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
            _timeout == "yes" and job.status == "new":
        _valid_until = job.valid_until

        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1, 1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    job = job.load()
                    if job.status == "new":
                        job.status = "failed"
                        job.save()
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    # NOTE(review): the status was just saved as "failed"
                    # above, so this reload would only see "new" if that
                    # save did not take effect - confirm this branch is
                    # reachable as intended.
                    if job.status == "new":
                        _msg = str("Timed-out (>10min as 'new')")
                        job.errors = _msg
                        job.save()
                        if job.payload.get('parent') is None:
                            integration_id = NS.tendrl_context.integration_id
                            alert_utils.alert_job_status(
                                "failed",
                                "Job timed out (job_id: %s)" % jid,
                                integration_id=integration_id or
                                job.payload['parameters'].get(
                                    'TendrlContext.integration_id'
                                ),
                                cluster_name=NS.tendrl_context.cluster_name or
                                job.payload['parameters'].get(
                                    'TendrlContext.cluster_name'
                                )
                            )
                    return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            time.sleep(7)
            job = job.load()
            if job.status == "new":
                # To avoid server and storage node do save same time
                job.valid_until = int(_now_plus_10_epoch)
                job.save()

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        try:
            try:
                # Compare-and-swap on the status key: only one agent can
                # move the job from "new" to "processing"
                job_status_key = "/queue/%s/status" % job.job_id
                etcd_utils.write(job_status_key,
                                 "processing",
                                 prevValue="new")
            except etcd.EtcdKeyNotFound:
                # if status watchable attribute not present
                # then it will be created when job save happens
                pass
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            job = job.load()
            job.locked_by = lock_info
            job.status = "processing"
            job.save(ttl=DEFAULT_JOB_TTL)
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            # Re-read the job and make sure the lock stored on it is
            # still the one written by this node
            job = job.load()
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             type=NS.type)
            if job.locked_by != lock_info:
                return

            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Starting Job %s" % job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running %s" %
                            job.payload['run'].split('.')[-1]},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()

            try:
                job = job.load()
                job.status = "finished"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s) for %s finished. " % (
                    job.job_id, job.payload['run'].split('.')[-1])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            # Only top-level (parent-less) jobs raise a completion alert
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "%s (job ID: %s) completed successfully " % (
                        job.payload['run'].split('.')[-1],
                        job.job_id),
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            # format_exc() takes an optional traceback depth limit, not
            # the exception; it always formats the exception being handled
            _trace = str(traceback.format_exc())
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                job = job.load()
                job.status = "failed"
                job.save()
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current " \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
def process_job(job):
    """Process one job entry read from the ``/queue`` directory.

    The job id is the last path component of ``job.key``. The job is
    ignored when it already carries a lock, when its status is already
    "finished" or "processing", when its payload type differs from
    ``NS.type``, when none of its tags is present in this node's tags,
    or when another agent wins the "new" -> "processing" switch.

    On a node tagged ``tendrl/monitor``, a job whose timeout is "yes"
    gets a ``valid_until`` stamp ten minutes ahead; once that stamp has
    passed, the status is switched from "new" to "failed".

    :param job: queue entry exposing a ``key`` attribute; the name is
        later rebound to the loaded ``Job`` object
    :raises FlowExecutionFailedError: if switching the status to "failed"
        after a flow error hits ``etcd.EtcdCompareFailed``
    """
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()
    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Check job not already "finished", or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = None
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
            _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(
                _job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1, 1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    if job.payload.get('parent') is None:
                        alert_utils.alert_job_status(
                            "failed",
                            "Job timed out (job_id: %s)" % jid,
                            integration_id=NS.tendrl_context.integration_id or
                            job.payload['parameters'].get(
                                'TendrlContext.integration_id'
                            ),
                            cluster_name=NS.tendrl_context.cluster_name or
                            job.payload['parameters'].get(
                                'TendrlContext.cluster_name'
                            )
                        )
                    return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        job_status_key = "/queue/%s/status" % job.job_id
        job_lock_key = "/queue/%s/locked_by" % job.job_id
        try:
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             tags=NS.node_context.tags,
                             type=NS.type)
            # Compare-and-swap on the status key: only one agent can
            # move the job from "new" to "processing"
            etcd_utils.write(job_status_key, "processing",
                             prevValue="new")
            etcd_utils.write(job_lock_key,
                             json.dumps(lock_info))
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Processing Job %s" % job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running Flow %s" % job.payload['run']},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()

            try:
                etcd_utils.write(job_status_key,
                                 "finished",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s): Finished "
                            "Flow %s" % (
                                job.job_id,
                                job.payload['run'])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            # Only top-level (parent-less) jobs raise a completion alert
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "Job finished successfully (job_id: %s)" % job.job_id,
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            # format_exc() takes an optional traceback depth limit, not
            # the exception; it always formats the exception being handled
            _trace = str(traceback.format_exc())
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                etcd_utils.write(job_status_key,
                                 "failed",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current " \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
def volume_remove_brick_force(self, event):
    """Handle a forced remove-brick event for a volume.

    After waiting one sync interval, every brick named in the event is
    mapped to the fqdn known for its host, its etcd entry is deleted and
    jobs are created to drop it from the dashboard and from graphite.
    The volume's brick sub-tree is then removed and a sync is requested.

    :param event: dict whose ``message`` holds ``bricks`` (one string,
        space separated) and ``volume``
    """
    time.sleep(self.sync_interval)
    # Event returns bricks list as space separated single string
    bricks = event['message']['bricks'].split(" ")
    try:
        for reported_brick in bricks:
            # find fqdn using ip
            host_ip = socket.gethostbyname(reported_brick.split(":/")[0])
            owner_node_id = etcd_utils.read("indexes/ip/%s" % host_ip).value
            owner_ctx = NS.tendrl.objects.ClusterNodeContext(
                node_id=owner_node_id
            ).load()
            fqdn = owner_ctx.fqdn
            brick = fqdn + ":" + reported_brick.split(":")[-1]

            brick_parts = brick.split(":/")
            brick_host = brick_parts[0]
            brick_dir = brick_parts[1].replace('/', '_')
            fetched_brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=brick_host,
                brick_dir=brick_dir
            ).load()

            # delete brick
            brick_key = "clusters/{0}/Bricks/all/{1}/{2}".format(
                NS.tendrl_context.integration_id,
                brick_host,
                brick_dir
            )
            etcd_utils.delete(brick_key, recursive=True)

            # delete alert dashbaord
            resource_name = "%s|%s" % (event['message']['volume'], brick)
            job_id = monitoring_utils.update_dashboard(
                resource_name,
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            dashboard_note = "Update dashboard job %s created" % job_id
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": dashboard_note}
            )

            # delete brick details from graphite
            job_id = monitoring_utils.delete_resource_from_graphite(
                resource_name,
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            graphite_note = (
                "Delete resource from graphite job %s created" % job_id
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": graphite_note}
            )

        # remove all the brick information under volume as the
        # subvolume might have changed, let the next sync handle
        # the updation of brick info
        volume_brick_path = "clusters/{0}/Volumes/{1}/Bricks".format(
            NS.tendrl_context.integration_id,
            fetched_brick.vol_id,
        )
        etcd_utils.delete(volume_brick_path, recursive=True)

        sync_now_key = (
            'clusters/%s/_sync_now' % NS.tendrl_context.integration_id
        )
        etcd_utils.write(sync_now_key, 'true')
        etcd_utils.refresh(sync_now_key, self.sync_interval)
    except etcd.EtcdKeyNotFound:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Unable to delete bricks %s" % bricks}
        )
def sync(sync_ttl):
    """Run SDS detection on this node and record the detected cluster.

    The available discovery plugins are executed in turn; processing stops
    after the first plugin (``break`` at the end of the loop body, or when
    a plugin returns ``None``). When details are returned the function
    may store Gluster peers, resolve or wait for the integration id mapped
    to the detected cluster id, update ``NS.tendrl_context`` and the node
    tags, and save ``DetectedCluster`` / ``Cluster`` objects.

    Any exception escaping the body is reported as an "error" event
    instead of being raised.

    :param sync_ttl: seconds to sleep before generating a fresh
        integration id when no detected cluster is stored yet
    """
    try:
        NS.node_context = NS.node_context.load()
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Running SDS detection"}
        )
        try:
            sds_discovery_manager = sds_manager.SDSDiscoveryManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Failed to init SDSDiscoveryManager.",
                             "exception": ex
                             }
                )
            )
            return

        # Execute the SDS discovery plugins and tag the nodes with data
        for plugin in sds_discovery_manager.get_available_plugins():
            sds_details = plugin.discover_storage_system()
            if sds_details is None:
                break

            if "peers" in sds_details and NS.tendrl_context.integration_id:
                _cnc = NS.tendrl.objects.ClusterNodeContext().load()
                this_peer_uuid = ""
                if _cnc.is_managed != "yes" or not NS.node_context.fqdn:
                    # Store every reported peer; the entry whose hostname
                    # is "localhost" identifies this node's peer uuid
                    for peer_uuid, data in sds_details.get("peers", {}).iteritems():
                        peer = NS.tendrl.objects.GlusterPeer(
                            peer_uuid=peer_uuid,
                            hostname=data['hostname'],
                            connected=data['connected']
                        )
                        peer.save()
                        if data['hostname'] == "localhost":
                            this_peer_uuid = peer_uuid

                    # Figure out the hostname used to probe this peer
                    integration_id_index_key = \
                        "indexes/tags/tendrl/integration/%s" %\
                        NS.tendrl_context.integration_id
                    _node_ids = etcd_utils.read(integration_id_index_key).value
                    _node_ids = json.loads(_node_ids)
                    for _node_id in _node_ids:
                        if _node_id != NS.node_context.node_id:
                            peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=this_peer_uuid,
                                node_id=_node_id
                            ).load()
                            # First other node that knows a hostname for
                            # this peer decides pkey/fqdn/ipv4_addr
                            if peer.hostname:
                                NS.node_context.pkey = peer.hostname
                                NS.node_context.fqdn = peer.hostname
                                NS.node_context.ipv4_addr = \
                                    socket.gethostbyname(
                                        peer.hostname
                                    )
                                NS.node_context.save()
                                break

            if ('detected_cluster_id' in sds_details and sds_details[
                    'detected_cluster_id'] != ""):
                try:
                    integration_index_key = \
                        "indexes/detected_cluster_id_to_integration_id/" \
                        "%s" % sds_details['detected_cluster_id']
                    dc = NS.tendrl.objects.DetectedCluster().load()
                    if dc is None or dc.detected_cluster_id is None:
                        time.sleep(sync_ttl)
                        integration_id = str(uuid.uuid4())
                        try:
                            # prevExist=False: keep an id that some other
                            # writer already stored under this key
                            etcd_utils.write(
                                integration_index_key,
                                integration_id,
                                prevExist=False
                            )
                        except etcd.EtcdAlreadyExist:
                            pass

                    _ptag = None
                    if NS.tendrl_context.integration_id:
                        _ptag = "provisioner/%s" % \
                            NS.tendrl_context.integration_id

                        # Only the node carrying the provisioner tag
                        # handles peer-list changes
                        if _ptag in NS.node_context.tags:
                            if dc.detected_cluster_id and \
                                dc.detected_cluster_id != sds_details.get(
                                    'detected_cluster_id'):

                                # Gluster peer list has changed
                                integration_id = \
                                    NS.tendrl_context.integration_id
                                etcd_utils.write(
                                    integration_index_key,
                                    integration_id
                                )
                                # Set the cluster status as new peer detected
                                _cluster = NS.tendrl.objects.Cluster(
                                    integration_id=integration_id
                                ).load()
                                _cluster.status = "new_peers_detected"
                                _cluster.save()
                                # Raise an alert regarding the same
                                msg = "New peers identified in cluster: %s. " \
                                    "Make sure tendrl-ansible is executed " \
                                    "for the new nodes so that expand " \
                                    "cluster option can be triggered" % \
                                    _cluster.short_name
                                event_utils.emit_event(
                                    "cluster_status",
                                    "new_peers_detected",
                                    msg,
                                    "cluster_{0}".format(integration_id),
                                    "WARNING",
                                    integration_id=integration_id
                                )
                            _cluster = NS.tendrl.objects.Cluster(
                                integration_id=NS.tendrl_context.integration_id
                            ).load()
                            if _cluster.status == "new_peers_detected":
                                peers = []
                                cmd = subprocess.Popen(
                                    "gluster pool list",
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE
                                )
                                out, err = cmd.communicate()
                                if err or out is None or \
                                    "Connection failed" in out:
                                    pass  # set the no of peers as zero
                                if out:
                                    # Skip the first output line; the first
                                    # column of each remaining non-blank
                                    # line is collected as a peer
                                    lines = out.split('\n')[1:]
                                    for line in lines:
                                        if line.strip() != '':
                                            peers.append(line.split()[0])
                                nodes_ids = json.loads(etcd_utils.read(
                                    "indexes/tags/tendrl/integration/%s" %
                                    NS.tendrl_context.integration_id
                                ).value)
                                if len(nodes_ids) == len(peers):
                                    # All the nodes are having node-agents
                                    # running and known to tendrl
                                    msg = "New nodes in cluster: %s have " \
                                        "node agents running now. Cluster " \
                                        "is ready to expand." % \
                                        _cluster.short_name
                                    event_utils.emit_event(
                                        "cluster_status",
                                        "expand_pending",
                                        msg,
                                        "cluster_{0}".format(
                                            NS.tendrl_context.integration_id
                                        ),
                                        "INFO",
                                        integration_id=NS.tendrl_context.
                                        integration_id
                                    )
                                    # Set the cluster status accordingly
                                    _cluster.status = 'expand_pending'
                                    _cluster.save()
                    loop_count = 0
                    while True:
                        # Wait till provisioner node assigns
                        # integration_id for this detected_cluster_id
                        # (gives up after 72 missing-key reads, 5s apart)
                        if loop_count >= 72:
                            return
                        try:
                            time.sleep(5)
                            integration_id = etcd_utils.read(
                                integration_index_key).value
                            if integration_id:
                                break
                        except etcd.EtcdKeyNotFound:
                            loop_count += 1
                            continue

                    NS.tendrl_context.integration_id = integration_id
                    NS.tendrl_context.cluster_id = sds_details.get(
                        'detected_cluster_id')
                    NS.tendrl_context.cluster_name = sds_details.get(
                        'detected_cluster_name')
                    NS.tendrl_context.sds_name = sds_details.get(
                        'pkg_name')
                    NS.tendrl_context.sds_version = sds_details.get(
                        'pkg_version')
                    NS.tendrl_context.save()

                    NS.node_context = NS.node_context.load()
                    integration_tag = "tendrl/integration/%s" % \
                                      integration_id
                    detected_cluster_tag = "detected_cluster/%s" % \
                                           sds_details[
                                               'detected_cluster_id']
                    # Add both tags, then de-duplicate the tag list
                    NS.node_context.tags += [detected_cluster_tag,
                                             integration_tag]
                    NS.node_context.tags = list(set(NS.node_context.tags))
                    NS.node_context.save()

                    NS.tendrl.objects.DetectedCluster(
                        detected_cluster_id=sds_details.get(
                            'detected_cluster_id'),
                        detected_cluster_name=sds_details.get(
                            'detected_cluster_name'),
                        sds_pkg_name=sds_details.get('pkg_name'),
                        sds_pkg_version=sds_details.get('pkg_version'),
                    ).save()
                    _cluster = NS.tendrl.objects.Cluster(
                        integration_id=NS.tendrl_context.integration_id
                    ).load()
                    # Save the cluster object only when it has no status
                    # and its current job is absent, finished or failed
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] \
                            and _cluster.status in [None, ""]:
                        _cluster.save()

                except (etcd.EtcdException, KeyError) as ex:
                    Event(
                        ExceptionMessage(
                            priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": "Failed SDS detection",
                                     "exception": ex
                                     }
                        )
                    )
                break
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": "node_sync "
                                    "SDS detection failed: " +
                                    ex.message,
                         "exception": ex}
            )
        )
def __init__(self):
    """Prepare Grafana for alerting and upload the alert dashboards.

    Resolves the Grafana organisation id and API key (from ``NS.config``,
    then etcd, creating them if the etcd keys are missing), switches to
    that organisation, creates the datasource and, when cluster details
    are available, uploads one alert dashboard per resource type. Every
    outcome is logged; nothing is returned.
    """
    org_key = "_NS/monitoring/grafana_org_id"
    auth_key = "_NS/monitoring/grafana_auth_key"
    cluster_detail_list = create_dashboards.get_cluster_details()
    org_id = NS.config.data.get("org_id", None)
    if not org_id:
        try:
            org_id = etcd_utils.read(org_key).value
        except etcd.EtcdKeyNotFound:
            org_id = grafana_org_utils.create_org("Alert_dashboard")
            try:
                etcd_utils.write(org_key, org_id)
            except etcd.EtcdKeyNotFound:
                pass
        # Cache the resolved id in the in-memory config
        NS.config.data["org_id"] = org_id
    key = ""
    if grafana_org_utils.switch_context(org_id):
        key = NS.config.data.get("grafana_auth_key", None)
        if not key:
            try:
                key = etcd_utils.read(auth_key).value
            except etcd.EtcdKeyNotFound:
                key = grafana_org_utils.create_api_token(
                    "grafana_auth_key", "Admin")
                try:
                    etcd_utils.write(auth_key, key)
                except etcd.EtcdKeyNotFound:
                    pass
            # Cache the resolved key in the in-memory config
            NS.config.data["grafana_auth_key"] = key
        response = datasource.create_datasource()
        if response.status_code == 200:
            msg = '\n' + "Datasource " + \
                  " uploaded successfully" + '\n'
            logger.log("info", NS.get("publisher_id", None),
                       {'message': msg})
        else:
            # The template is parenthesised so that .format() applies to
            # the whole message and both {0} and {1} are substituted
            msg = ("Datasource upload failed. Error code: {0} ,"
                   "Error message: {1} ").format(
                response.status_code,
                str(self.get_message_from_response(response)))
            logger.log("info", NS.get("publisher_id", None),
                       {'message': msg})
        if cluster_detail_list:
            resource_name = ["volumes", "hosts", "bricks", "clusters"]
            for resource in resource_name:
                # Uploading Alert Dashboards
                resource_dashboard = \
                    create_dashboards.create_resource_dashboard(
                        cluster_detail_list, resource)
                response = dashboard._post_dashboard(
                    resource_dashboard, key)
                if response.status_code == 200:
                    msg = '\n' + "{} dashboard uploaded successfully". \
                        format(str(resource)) + '\n'
                    logger.log("info", NS.get("publisher_id", None),
                               {'message': msg})
                else:
                    msg = '\n' + "{} dashboard upload failed".format(
                        str(resource)) + '\n'
                    logger.log("info", NS.get("publisher_id", None),
                               {'message': msg})
    else:
        msg = "Could not switch context, Alert dashboard upload failed"
        logger.log("error", NS.get("publisher_id", None),
                   {'message': msg})
def sync():
    """Publish this node's network interfaces to etcd.

    Each interface returned by ``get_node_network()`` is saved as a
    ``NodeNetwork`` object with a TTL of ``sync_interval`` + 250, and
    every ipv4 address is indexed under ``/indexes/ip/<ip>`` (an existing
    index entry is left untouched). Interfaces with a non-empty subnet
    are also saved as ``GlobalNetwork`` objects, after which directories
    under ``/networks`` are deleted where etcd allows it.

    Any exception escaping the body is reported as an "error" event
    instead of being raised.
    """
    try:
        _keep_alive_for = int(NS.config.data.get("sync_interval", 10)) + 250
        interfaces = get_node_network()
        if len(interfaces) > 0:
            for interface in interfaces:
                NS.tendrl.objects.NodeNetwork(**interface).save(
                    ttl=_keep_alive_for)
                if interface['ipv4']:
                    for ipv4 in interface['ipv4']:
                        index_key = "/indexes/ip/%s" % ipv4
                        try:
                            etcd_utils.write(index_key,
                                             NS.node_context.node_id,
                                             prevExist=False)
                        except etcd.EtcdAlreadyExist:
                            pass
                # TODO(team) add ipv6 support

        # global network
        if len(interfaces) > 0:
            for interface in interfaces:
                # Compare by value: an identity test against a literal
                # depends on string interning and is not reliable
                if interface["subnet"] != "":
                    NS.node_agent.objects.GlobalNetwork(**interface).save(
                        ttl=_keep_alive_for)
            try:
                networks = etcd_utils.read("/networks")
                for network in networks.leaves:
                    try:
                        # it will delete any node with empty network detail in
                        # subnet, if one entry present then deletion never
                        # happen
                        NS._int.wclient.delete(
                            "%s/%s" % (network.key, NS.node_context.node_id),
                            dir=True)
                        # it will delete any subnet dir when it is empty
                        # if one entry present then deletion never happen
                        NS._int.wclient.delete(network.key, dir=True)
                    except (etcd.EtcdKeyNotFound, etcd.EtcdDirNotEmpty):
                        continue
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    ExceptionMessage(priority="debug",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message":
                                         "Given key is not present in "
                                         "etcd .",
                                         "exception": ex
                                     }))
    except Exception as ex:
        _msg = "node_sync networks sync failed: " + ex.message
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": _msg,
                                 "exception": ex
                             }))
def on_change(self, attr, prev_value, current_value):
    """Handle a change of this node's ``status`` attribute.

    Acts only when ``attr`` is ``"status"`` and the local node context
    carries the ``tendrl/monitor`` tag, and only for nodes whose cluster
    node context reports ``is_managed`` as ``"yes"`` (case-insensitive).

    * ``current_value`` is ``None``: the node is marked ``DOWN`` and saved,
      a WARNING ``node_status`` event is emitted, the cluster node context
      is re-saved and the cluster details are updated.  If the node holds
      the ``provisioner/<integration_id>`` tag, the tag and its index key
      are removed and a ``tendrl.flows.ConfigureMonitoring`` job is queued.
      For gluster/RHGS clusters every brick listed for this node is marked
      ``Stopped``.
    * ``current_value`` is ``"UP"``: an INFO ``node_status`` event is
      emitted.

    ``prev_value`` is accepted but not used.
    """
    if attr != "status" or "tendrl/monitor" not in NS.node_context.tags:
        return

    tendrl_ctx = NS.tendrl.objects.TendrlContext(
        node_id=self.node_id
    ).load()
    cluster_id = tendrl_ctx.integration_id
    # Check node is managed
    node_ctx = NS.tendrl.objects.ClusterNodeContext(
        node_id=self.node_id,
        integration_id=cluster_id
    ).load()

    went_down = current_value is None
    came_up = current_value == "UP"
    if not (went_down or came_up):
        return
    if str(node_ctx.is_managed).lower() != "yes":
        return

    if not went_down:
        event_utils.emit_event(
            "node_status",
            "UP",
            "{0} is UP".format(self.fqdn),
            "node_{0}".format(self.fqdn),
            "INFO",
            node_id=self.node_id,
            integration_id=cluster_id
        )
        return

    # The status key vanished: record the node as DOWN and announce it
    self.status = "DOWN"
    self.save()
    event_utils.emit_event(
        "node_status",
        self.status,
        "Node {0} is DOWN".format(self.fqdn),
        "node_{0}".format(self.fqdn),
        "WARNING",
        node_id=self.node_id,
        integration_id=cluster_id
    )

    # Load cluster_node_context will load node_context
    # and it will be updated with latest values
    NS.tendrl.objects.ClusterNodeContext(
        node_id=self.node_id,
        integration_id=cluster_id,
        first_sync_done=node_ctx.first_sync_done,
        is_managed=node_ctx.is_managed
    ).save()

    # Update cluster details
    self.update_cluster_details(cluster_id)

    provisioner_tag = "provisioner/%s" % cluster_id
    if provisioner_tag in self.tags:
        tag_index = "/indexes/tags/%s" % provisioner_tag
        self.tags.remove(provisioner_tag)
        self.save()
        etcd_utils.delete(tag_index)

        job_id = str(uuid.uuid4())
        job_payload = {
            "tags": ["tendrl/node_%s" % self.node_id],
            "run": "tendrl.flows.ConfigureMonitoring",
            "status": "new",
            "parameters": {
                'TendrlContext.integration_id': cluster_id
            },
            "type": "node"
        }
        NS.tendrl.objects.Job(
            job_id=job_id,
            status="new",
            payload=job_payload
        ).save()
        log_template = (
            "node_sync, STALE provisioner node "
            "found! re-configuring monitoring "
            "(job-id: %s) on this node"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": log_template % job_id}
        )

    if tendrl_ctx.sds_name in ["gluster", "RHGS"]:
        node_bricks = etcd_utils.read(
            "clusters/{0}/Bricks/all/{1}".format(cluster_id, self.fqdn)
        )
        for entry in node_bricks.leaves:
            try:
                etcd_utils.write(
                    "{0}/status".format(entry.key),
                    "Stopped"
                )
            except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                pass
def sync(sync_ttl=None):
    """Sync service state, node tags and the ``/indexes/tags`` entries.

    Steps, in order:

    1. Save every service in ``TENDRL_SERVICES`` and collect the tag of
       each running one (``tendrl/server`` also implies
       ``tendrl/monitor``).
    2. On non-monitor nodes that belong to a cluster, try to claim the
       ``provisioner/<integration_id>`` tag by creating its index key.
    3. Merge the collected tags into the node context and save it when
       the tag set changed.
    4. If this node just became the provisioner of a managed cluster,
       queue a ``tendrl.flows.ConfigureMonitoring`` job.
    5. Make sure this node's id is listed under ``/indexes/tags/<tag>``
       for each of its tags, dropping a provisioner tag that is already
       owned by another node.

    :param sync_ttl: base TTL; ``sync_ttl + 50`` is passed to
        ``etcd_utils.refresh`` for index keys owned by this node alone.
        When falsy (the default) no TTL is applied to any key.

    Returns nothing and raises nothing: any failure is reported through an
    ``ExceptionMessage`` event instead.
    """
    try:
        tags = []
        # update node agent service details
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, Updating Service data"}
        )
        for service in TENDRL_SERVICES:
            s = NS.tendrl.objects.Service(service=service)
            if s.running:
                service_tag = NS.compiled_definitions.get_parsed_defs()[
                    'namespace.tendrl'
                ]['tags'][service.strip("@*")]
                tags.append(service_tag)
                if service_tag == "tendrl/server":
                    tags.append("tendrl/monitor")
            s.save()

        if "tendrl/monitor" not in tags and \
                NS.tendrl_context.integration_id:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            # Try to claim orphan "provisioner_%integration_id" tag
            _tag = "provisioner/%s" % _cluster.integration_id
            _is_new_provisioner = False
            NS.node_context = NS.tendrl.objects.NodeContext().load()
            if _tag not in NS.node_context.tags:
                try:
                    _index_key = "/indexes/tags/%s" % _tag
                    _node_id = json.dumps([NS.node_context.node_id])
                    # prevExist=False makes the write fail with
                    # EtcdAlreadyExist when another node holds the tag
                    etcd_utils.write(_index_key, _node_id, prevExist=False)
                    # sync_ttl defaults to None; adding 50 to it would
                    # raise TypeError after the key was already claimed
                    if sync_ttl:
                        etcd_utils.refresh(_index_key, sync_ttl + 50)
                    tags.append(_tag)
                    _is_new_provisioner = True
                except etcd.EtcdAlreadyExist:
                    pass

        # updating node context with latest tags
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, updating node context "
                        "data with tags"}
        )
        NS.node_context = NS.tendrl.objects.NodeContext().load()
        current_tags = list(NS.node_context.tags)
        tags += current_tags
        NS.node_context.tags = list(set(tags))
        NS.node_context.tags.sort()
        current_tags.sort()
        # only write the node context back when the tag set changed
        if NS.node_context.tags != current_tags:
            NS.node_context.save()

        # ``tags`` now also holds the previously stored tags, so this can
        # only be true when the provisioner block above ran as well
        if "tendrl/monitor" not in tags and \
                NS.tendrl_context.integration_id:
            _cluster = _cluster.load()
            if _is_new_provisioner and _cluster.is_managed == "yes":
                _msg = "node_sync, NEW provisioner node found! "\
                    "re-configuring monitoring (job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % NS.node_context.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id':
                            NS.tendrl_context.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(
                    job_id=_job_id,
                    status="new",
                    payload=payload
                ).save()
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {"message": _msg % _job_id}
                )

        # Update /indexes/tags/:tag = [node_ids]
        # Iterate over a snapshot: stale provisioner tags are removed from
        # the live list inside the loop, which would otherwise make the
        # iteration skip the tag following the removed one
        for tag in list(NS.node_context.tags):
            index_key = "/indexes/tags/%s" % tag
            _node_ids = []
            try:
                _node_ids = etcd_utils.read(index_key).value
                _node_ids = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                pass

            if _node_ids:
                if "provisioner" in tag:
                    # Check if this is a stale provisioner
                    if NS.node_context.node_id != _node_ids[0]:
                        NS.node_context.tags.remove(tag)
                        NS.node_context.save()
                        continue
                if NS.node_context.node_id in _node_ids:
                    # already indexed; just keep a solely-owned key alive
                    if sync_ttl and len(_node_ids) == 1:
                        etcd_utils.refresh(index_key, sync_ttl + 50)
                    continue
                else:
                    _node_ids += [NS.node_context.node_id]
            else:
                _node_ids = [NS.node_context.node_id]
            _node_ids = list(set(_node_ids))
            etcd_utils.write(index_key, json.dumps(_node_ids))
            if sync_ttl and len(_node_ids) == 1:
                etcd_utils.refresh(index_key, sync_ttl + 50)
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, Updating detected "
                        "platform"}
        )
    except Exception as ex:
        # ``ex.message`` only exists on Python 2; fall back to str(ex) so
        # the error handler itself can never raise AttributeError
        _err = "node_sync service and indexes sync failed: %s" % (
            getattr(ex, "message", "") or ex,
        )
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": _err,
                    "exception": ex
                }
            )
        )
def on_change(self, attr, prev_value, current_value):
    """Handle a change of this node's ``status`` attribute.

    Nothing happens unless ``attr`` is ``"status"``, the local node context
    carries the ``tendrl/monitor`` tag, and the node's cluster node context
    reports ``is_managed`` as ``"yes"`` (case-insensitive).

    * ``current_value`` is ``None``: the node is marked ``DOWN`` and saved,
      a WARNING ``node_status`` event is emitted, the cluster node context
      is re-saved, the cluster details are updated, a held
      ``provisioner/<integration_id>`` tag is dropped together with its
      index key, and for gluster/RHGS clusters every brick listed for this
      node is marked ``Stopped``.
    * ``current_value`` is ``"UP"``: an INFO ``node_status`` event is
      emitted.

    ``prev_value`` is accepted but not used.
    """
    if attr != "status" or "tendrl/monitor" not in NS.node_context.tags:
        return

    ctx = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
    integration_id = ctx.integration_id
    # Check node is managed
    cluster_node = NS.tendrl.objects.ClusterNodeContext(
        node_id=self.node_id,
        integration_id=integration_id
    ).load()

    is_down = current_value is None
    is_up = current_value == "UP"
    if not (is_down or is_up):
        return
    if str(cluster_node.is_managed).lower() != "yes":
        return

    if is_up and not is_down:
        up_msg = "{0} is UP".format(self.fqdn)
        event_utils.emit_event(
            "node_status", "UP", up_msg,
            "node_{0}".format(self.fqdn), "INFO",
            node_id=self.node_id,
            integration_id=integration_id
        )
        return

    # Status disappeared: persist DOWN before telling anyone about it
    self.status = "DOWN"
    self.save()
    down_msg = "Node {0} is DOWN".format(self.fqdn)
    event_utils.emit_event(
        "node_status", self.status, down_msg,
        "node_{0}".format(self.fqdn), "WARNING",
        node_id=self.node_id,
        integration_id=integration_id
    )

    # Load cluster_node_context will load node_context
    # and it will be updated with latest values
    NS.tendrl.objects.ClusterNodeContext(
        node_id=self.node_id,
        integration_id=integration_id,
        first_sync_done=cluster_node.first_sync_done,
        is_managed=cluster_node.is_managed
    ).save()

    # Update cluster details
    self.update_cluster_details(integration_id)

    provisioner_tag = "provisioner/%s" % integration_id
    if provisioner_tag in self.tags:
        self.tags.remove(provisioner_tag)
        self.save()
        etcd_utils.delete("/indexes/tags/%s" % provisioner_tag)

    if ctx.sds_name in ["gluster", "RHGS"]:
        bricks_path = "clusters/{0}/Bricks/all/{1}".format(
            integration_id, self.fqdn
        )
        for leaf in etcd_utils.read(bricks_path).leaves:
            try:
                etcd_utils.write("{0}/status".format(leaf.key), "Stopped")
            except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                pass