def _execute_atom(self, atom_fqdn):
    """Look up the atom named by ``atom_fqdn`` and run it.

    ``atom_fqdn`` is split on ``".atoms."`` and then on ``".objects."``;
    the last dotted component of what remains selects the namespace,
    taken from ``NS.integrations`` when the namespace path contains
    ``"integrations"`` and from ``NS`` otherwise.

    Returns:
        The value returned by the atom's ``run()``, or ``False`` when a
        ``KeyError``/``AttributeError`` is raised while resolving or
        running the atom.

    Raises:
        FlowExecutionFailedError: when the atom raises
            ``AtomExecutionFailedError``; the message is the stringified
            formatted traceback.
    """
    try:
        namespace_path, atom_name = atom_fqdn.split(".atoms.")
        namespace_path, obj_name = namespace_path.split(".objects.")
        namespace_leaf = namespace_path.split(".")[-1]
        namespace_root = (
            NS.integrations if "integrations" in namespace_path else NS
        )
        target_ns = getattr(namespace_root, namespace_leaf)
        atom_cls = target_ns.ns.get_atom(obj_name, atom_name)
        try:
            return atom_cls(parameters=self.parameters).run()
        except AtomExecutionFailedError:
            # Surface the atom failure as a flow failure, carrying the
            # full formatted traceback as the message.
            raise FlowExecutionFailedError(
                str(traceback.format_exception(*sys.exc_info()))
            )
    except (KeyError, AttributeError) as ex:
        _msg = "Could not find atom {0}".format(atom_fqdn)
        logger.log(
            "error",
            NS.publisher_id,
            {"message": _msg},
            job_id=self.job_id,
            flow_id=self.parameters['flow_id']
        )
        error_payload = {"message": _msg, "exception": ex}
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload=error_payload
            )
        )
    return False
def get_latest_stats(node, resource):
    """Return the latest values of ``resource`` for the node with id ``node``.

    The node name is resolved through ``central_store_util`` and the
    'latest' metric stats are requested from the time series db plugin.
    The result is every substring found between ``Current:`` and ``Max``
    in the returned stats text.

    Raises:
        ValueError, urllib3.exceptions.HTTPError,
        TendrlPerformanceMonitoringException: logged as a debug event and
            then re-raised. The last one is also raised here when the
            stats are empty or equal to ``"[]"``.
    """
    try:
        node_name = central_store_util.get_node_name_from_id(node)
        db_plugin = NS.time_series_db_manager.get_plugin()
        stats = db_plugin.get_metric_stats(node_name, resource, 'latest')
        if stats == "[]" or not stats:
            raise TendrlPerformanceMonitoringException(
                'Stats not yet available in time series db'
            )
        return re.findall('Current:(.+?)Max', stats)
    except (
        ValueError,
        urllib3.exceptions.HTTPError,
        TendrlPerformanceMonitoringException
    ) as ex:
        failure_text = (
            'Failed to get latest stats of %s of '
            'node %s for node summary.' % (resource, node)
        )
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": failure_text, "exception": ex}
            )
        )
        raise ex
def __init__(self):
    """Load the notification plugins and persist the list of their names.

    Marks the manager as a daemon, loads the plugins, creates the
    ``complete`` threading event and saves a ``NotificationMedia`` object
    listing the name of every registered plugin.

    Raises:
        AttributeError, SyntaxError, ValueError, KeyError, ImportError,
        etcd.EtcdException: logged as a debug event and re-raised.
    """
    super(NotificationPluginManager, self).__init__()
    self.daemon = True
    try:
        self.load_plugins()
        self.complete = threading.Event()
        media_names = [
            plugin.name for plugin in NotificationPlugin.plugins
        ]
        NotificationMedia(media=media_names).save()
    except (
        AttributeError,
        SyntaxError,
        ValueError,
        KeyError,
        ImportError,
        etcd.EtcdException
    ) as ex:
        failure_payload = {
            "message": 'Failed to intialize notification '
                       'manager',
            "exception": ex
        }
        Event(
            ExceptionMessage(
                priority="debug",
                publisher="notifier",
                payload=failure_payload
            )
        )
        raise ex
def load_plugins(self):
    """Import every time series db plugin module found in ``dbplugins``.

    Each ``*.py`` file (other than ``__init__.py``) in the ``dbplugins``
    directory next to this file is imported as a submodule of
    ``tendrl.performance_monitoring.time_series_db.dbplugins``, and every
    class visible in that module is then imported by name.

    Raises:
        SyntaxError, ValueError, ImportError: logged as a debug event and
            re-raised.
    """
    try:
        plugins_dir = os.path.dirname(os.path.abspath(__file__)) + '/dbplugins'
        pkg = 'tendrl.performance_monitoring.time_series_db.dbplugins'
        module_names = []
        for file_name in os.listdir(plugins_dir):
            if not file_name.endswith('.py'):
                continue
            if file_name == '__init__.py':
                continue
            # Drop the ".py" suffix to get the module name.
            module_names.append(file_name[:-3])
        for module_name in module_names:
            plugin_name = '.'.join([pkg, module_name])
            mod = importlib.import_module(plugin_name)
            for name, cls in inspect.getmembers(mod, inspect.isclass):
                # NOTE(review): dynamic import via exec; the names come
                # from the plugin modules on disk, not from user input.
                exec("from %s import %s" % (plugin_name, name))
    except (SyntaxError, ValueError, ImportError) as ex:
        failure_payload = {
            "message": 'Failed to load the time series db '
                       'plugins.',
            "exception": ex
        }
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload=failure_payload
            )
        )
        raise ex
def cluster_nodes_summary(self, cluster_id):
    """Collect the stored summaries of all nodes of ``cluster_id``.

    Reads ``/monitoring/summary/nodes/<node_id>`` for every node id of
    the cluster. Nodes whose summary key is missing are skipped after a
    debug event is emitted.

    Returns:
        list: the node summaries that could be read, in node-id order.
    """
    summaries = []
    for node_id in central_store_util.get_cluster_node_ids(cluster_id):
        summary_key = '/monitoring/summary/nodes/%s' % node_id
        try:
            summary = central_store_util.read(summary_key)
        except EtcdKeyNotFound as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": 'Error caught fetching node summary of'
                                   ' node %s.' % node_id,
                        "exception": ex
                    }
                )
            )
        else:
            summaries.append(summary)
    return summaries
def __init__(self):
    """Load the notification plugins and publish the available media.

    After loading the plugins, the list of plugin names is stored on
    ``NS.notification_medium``, saved as a ``NotificationMedia`` object,
    and the alert notification config is saved.

    Raises:
        AlertingError: wrapping (as ``str``) any SyntaxError, ValueError,
            KeyError, etcd error or NotificationPluginError raised during
            initialisation; the original error is logged first.
    """
    super(NotificationPluginManager, self).__init__()
    try:
        self.load_plugins()
        media_names = [
            plugin.name for plugin in NotificationPlugin.plugins
        ]
        NS.notification_medium = media_names
        NotificationMedia(media=media_names).save()
        self.save_alertnotificationconfig()
    except (
        SyntaxError,
        ValueError,
        KeyError,
        etcd.EtcdKeyNotFound,
        etcd.EtcdConnectionFailed,
        etcd.EtcdException,
        NotificationPluginError
    ) as ex:
        failure_payload = {
            "message": 'Failed to intialize notification '
                       'manager',
            "exception": ex
        }
        Event(
            ExceptionMessage(
                priority="error",
                publisher="alerting",
                payload=failure_payload
            )
        )
        raise AlertingError(str(ex))
def _run(self):
    """Periodically compute and store per-cluster and system summaries.

    Until ``self._complete`` is set, each pass parses every known
    cluster, saves its summary, and hands copies of all summaries to
    ``NS.sds_monitoring_manager.compute_system_summary``. A pass is
    followed by a 60 second gevent sleep.
    """
    while True:
        if self._complete.is_set():
            break
        summaries = []
        for cluster_id in central_store_util.get_cluster_ids():
            # Yield briefly between clusters.
            gevent.sleep(0.1)
            try:
                summary = self.parse_cluster(cluster_id)
                summaries.append(summary.copy())
                summary.save(update=False)
            except EtcdKeyNotFound:
                # Missing keys are silently skipped for this cluster.
                pass
            except (EtcdException, AttributeError) as ex:
                Event(
                    ExceptionMessage(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message": 'Error caught computing summary.',
                            "exception": ex
                        }
                    )
                )
        NS.sds_monitoring_manager.compute_system_summary(summaries)
        gevent.sleep(60)
def init_monitoring(self):
    """Initialise monitoring on every node not initialised before.

    Node ids that have been handled are remembered in
    ``self.monitoring_config_init_nodes`` so each node is initialised
    only once.

    Raises:
        TendrlPerformanceMonitoringException: logged as a debug event and
            re-raised.
    """
    try:
        for node_det in central_store_util.get_nodes_details():
            if node_det['node_id'] in self.monitoring_config_init_nodes:
                continue
            self.init_monitoring_on_node(node_det)
            self.monitoring_config_init_nodes.append(node_det['node_id'])
    except TendrlPerformanceMonitoringException as ex:
        failure_payload = {
            "message": 'Failed to intialize monitoring '
                       'configuration on nodes. ',
            "exception": ex
        }
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload=failure_payload
            )
        )
        raise ex
def _application(self, env, start_response):
    """WSGI application receiving alert callbacks on ``/grafana_callback``.

    Any other path is answered with ``404 Not Found``. For the callback
    path the request body is parsed as JSON and its ``ruleId`` is passed
    to ``self.alert_handler.handle_alert`` before answering ``200 OK``.

    Args:
        env: the WSGI environ; ``PATH_INFO`` and ``wsgi.input`` are read.
        start_response: the WSGI ``start_response`` callable.

    Returns:
        list: a single-element list holding the HTML response body.
    """
    try:
        if env['PATH_INFO'] != '/grafana_callback':
            start_response('404 Not Found', [('Content-Type', 'text/html')])
            response = [b'<h1>Alert Not Found</h1>']
        else:
            data = json.loads(env['wsgi.input'].read())
            self.alert_handler.handle_alert(data["ruleId"])
            start_response('200 OK', [('Content-Type', 'text/html')])
            response = [b'<h1>Alert Received</h1>']
    except (IOError, AssertionError) as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Unable to read alert from socket",
                    "exception": ex
                }
            )
        )
        # WSGI requires start_response to be called before a body is
        # returned. In both branches above it is only called after the
        # statements that can raise, so no status was sent yet here.
        start_response(
            '500 Internal Server Error',
            [('Content-Type', 'text/html')]
        )
        response = [b'<h1>Error in reading alert from socket</h1>']
    return response
def on_sync_object(self, data):
    """Store a freshly received cluster sync object and mirror it to etcd.

    The object is injected through ``self.inject_sync_object``; when that
    yields a new object, a ``SyncObject`` record is saved and, depending
    on the sync type:

    * ``health``: the overall status is saved as ``GlobalDetails``.
    * ``osd_map``: pools and OSDs that exist in etcd but not in the map
      are deleted (out of band deletion), then every pool and OSD of the
      map is saved.

    When no new object results, a debug message about a stale object is
    emitted instead.

    Args:
        data: dict with at least ``fsid``, ``type``, ``version`` and
            ``data`` keys.

    Raises:
        AssertionError: if ``data['fsid']`` differs from ``self.fsid``.
    """
    assert data['fsid'] == self.fsid
    # Work on a copy so the raw payload stored below stays untouched.
    sync_object = copy.deepcopy(data['data'])
    sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
    new_object = self.inject_sync_object(
        data['type'], data['version'], sync_object
    )
    self._request_coll.on_map(sync_type, new_object)
    if new_object:
        # Check and raise any alerts if required
        # TODO(team) Enabled the below if condition as when
        # alerting needed for cluster health, mon status, pool
        # status etc
        # if sync_type.str == "health":
        #     self._on_health(sync_object)
        # if sync_type.str == "mon_status":
        #     self._on_mon_status(sync_object)
        if sync_type.str == "osd_map":
            # self._on_pool_status(sync_object)
            self._on_osd_map(sync_object)

        NS.ceph.objects.SyncObject(
            updated=now(),
            sync_type=sync_type.str,
            version=new_object.version if isinstance(
                new_object.version, int) else None,
            when=now(),
            data=data['data']
        ).save(update=False)

        if sync_type.str == "health":
            NS.ceph.objects.GlobalDetails(
                status=sync_object['overall_status']
            ).save()

        if sync_type.str == "osd_map":
            # Pool out of band deletion handling
            try:
                pools = NS._int.client.read(
                    "clusters/%s/Pools" % NS.tendrl_context.integration_id
                )
                old_pool_ids = []
                for pool in pools.leaves:
                    old_pool_ids.append(int(pool.key.split("/")[-1]))
                new_pool_ids = []
                for raw_pool in sync_object.get('pools', []):
                    new_pool_ids.append(raw_pool['pool'])
                delete_pool_ids = set(old_pool_ids) - set(new_pool_ids)
                for stale_pool_id in delete_pool_ids:
                    NS._int.client.delete(
                        "clusters/%s/Pools/%s" % (
                            NS.tendrl_context.integration_id,
                            stale_pool_id
                        ),
                        recursive=True
                    )
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    ExceptionMessage(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "No pools found for ceph cluster %s"
                                       % NS.tendrl_context.integration_id,
                            "exception": ex
                        }
                    )
                )

            for raw_pool in sync_object.get('pools', []):
                Event(
                    Message(
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Updating Pool %s"
                                       % raw_pool['pool_name']
                        }
                    )
                )
                # A non-empty erasure code profile marks an EC pool.
                pool_type = 'replicated'
                if 'erasure_code_profile' in raw_pool and \
                        raw_pool['erasure_code_profile'] != "":
                    pool_type = 'erasure_coded'
                # Quota counts as enabled when either limit is positive.
                quota_enabled = False
                if ('quota_max_objects' in raw_pool and
                        raw_pool['quota_max_objects'] > 0) or \
                        ('quota_max_bytes' in raw_pool and
                         raw_pool['quota_max_bytes'] > 0):
                    quota_enabled = True
                NS.ceph.objects.Pool(
                    pool_id=raw_pool['pool'],
                    pool_name=raw_pool['pool_name'],
                    pg_num=raw_pool['pg_num'],
                    type=pool_type,
                    erasure_code_profile=raw_pool.get(
                        'erasure_code_profile'),
                    min_size=raw_pool['min_size'],
                    size=raw_pool.get('size', None),
                    quota_enabled=quota_enabled,
                    quota_max_objects=raw_pool['quota_max_objects'],
                    quota_max_bytes=raw_pool['quota_max_bytes'],
                ).save()

            # Osd out of band deletion handling
            try:
                osds = NS._int.client.read(
                    "clusters/%s/Osds" % NS.tendrl_context.integration_id
                )
                old_osds = []
                for osd in osds.leaves:
                    old_osds.append(str(osd.key.split("/")[-1]))
                new_osds = []
                for raw_osd in sync_object.get('osds', []):
                    new_osds.append(raw_osd['uuid'])
                delete_osds = set(old_osds) - set(new_osds)
                for stale_osd_uuid in delete_osds:
                    NS._int.client.delete(
                        "clusters/%s/Osds/%s" % (
                            NS.tendrl_context.integration_id,
                            stale_osd_uuid
                        ),
                        recursive=True
                    )
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    ExceptionMessage(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "key not found in etcd",
                            "exception": ex
                        }
                    )
                )

            for raw_osd in sync_object.get('osds', []):
                Event(
                    Message(
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Updating OSD %s" % raw_osd['osd']
                        }
                    )
                )
                # public_addr looks like "<ip>:<port>..."; resolve the ip
                # part back to a host name.
                osd_host = socket.gethostbyaddr(
                    raw_osd['public_addr'].split(':')[0]
                )[0]
                NS.ceph.objects.Osd(
                    id=raw_osd['osd'],
                    uuid=raw_osd['uuid'],
                    hostname=osd_host,
                    public_addr=raw_osd['public_addr'],
                    cluster_addr=raw_osd['cluster_addr'],
                    heartbeat_front_addr=raw_osd['heartbeat_front_addr'],
                    heartbeat_back_addr=raw_osd['heartbeat_back_addr'],
                    down_at=raw_osd['down_at'],
                    up_from=raw_osd['up_from'],
                    lost_at=raw_osd['lost_at'],
                    osd_up=raw_osd['up'],
                    osd_in=raw_osd['in'],
                    up_thru=raw_osd['up_thru'],
                    weight=str(raw_osd['weight']),
                    primary_affinity=str(raw_osd['primary_affinity']),
                    state=raw_osd['state'],
                    last_clean_begin=raw_osd['last_clean_begin'],
                    last_clean_end=raw_osd['last_clean_end']
                ).save()
    else:
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "ClusterMonitor.on_sync_object: "
                               "stale object received for %s" % data['type']
                }
            )
        )
def run(self):
    """Expand a Gluster cluster onto new nodes and queue their import.

    Steps, all performed while holding the node locks:

    1. Validate the integration id and the SDS name from the parameters.
    2. Create SSH setup jobs and poll them every 3 seconds until all are
       "finished" (any "failed" job aborts the flow).
    3. Call ``gluster_help.expand_gluster``.
    4. Poll every 3 seconds until every node in ``Node[]`` reports a
       detected cluster, all with the same detected cluster id.
    5. Release the node locks and save a new ``tendrl.flows.ImportCluster``
       job for the expanded nodes.

    Raises:
        FlowExecutionFailedError: on invalid parameters, a failed SSH
            setup job, or nodes reporting different detected cluster ids.
        Exception: any error is logged as an error event and re-raised
            so the job is marked failed; the node locks are released in
            every case.
    """
    try:
        # Lock nodes
        flow_utils.acquire_node_lock(self.parameters)
        integration_id = self.parameters['TendrlContext.integration_id']
        if integration_id is None:
            raise FlowExecutionFailedError(
                "TendrlContext.integration_id cannot be empty"
            )
        parsed_defs = NS.compiled_definitions.get_parsed_defs()
        supported_sds = parsed_defs['namespace.tendrl']['supported_sds']
        sds_name = self.parameters["TendrlContext.sds_name"]
        if sds_name not in supported_sds:
            raise FlowExecutionFailedError(
                "SDS (%s) not supported" % sds_name
            )

        ssh_job_ids = flow_utils.gluster_create_ssh_setup_jobs(
            self.parameters,
            skip_current_node=True
        )

        # Poll the SSH setup jobs until none failed and all finished.
        while True:
            time.sleep(3)
            job_states = {}
            for ssh_job_id in ssh_job_ids:
                ssh_job = NS.tendrl.objects.Job(job_id=ssh_job_id).load()
                job_states[ssh_job_id] = ssh_job.status

            failed_jobs = {
                jid: state
                for jid, state in job_states.iteritems()
                if state == "failed"
            }
            if failed_jobs:
                raise FlowExecutionFailedError(
                    "SSH setup failed for jobs %s cluster %s" % (
                        str(failed_jobs), integration_id
                    )
                )
            if all([state == "finished" for state in job_states.values()]):
                logger.log(
                    "info",
                    NS.publisher_id,
                    {
                        "message": "SSH setup completed for all "
                                   "nodes in cluster %s" % integration_id
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                break

        # SSH setup jobs finished above, now install sds
        # bits and create cluster
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Expanding Gluster Storage"
                           " Cluster %s" % integration_id
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
        gluster_help.expand_gluster(self.parameters)
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "SDS install/config completed on newly "
                           "expanded nodes, Please wait while "
                           "tendrl-node-agents detect sds details on the "
                           "newly expanded nodes %s"
                           % self.parameters['Node[]']
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )

        # Wait till detected cluster in populated for nodes
        while True:
            time.sleep(3)
            node_states = []
            detected_cluster = ""
            cluster_id_mismatch = False
            dc = ""
            for node in self.parameters['Node[]']:
                try:
                    dc = NS.tendrl.objects.DetectedCluster(
                        node_id=node
                    ).load()
                    if not detected_cluster:
                        # First node seen sets the reference cluster id.
                        detected_cluster = dc.detected_cluster_id
                    elif detected_cluster != dc.detected_cluster_id:
                        node_states.append(False)
                        cluster_id_mismatch = True
                        break
                    node_states.append(True)
                except etcd.EtcdKeyNotFound:
                    # Nothing detected on this node yet.
                    node_states.append(False)

            if cluster_id_mismatch:
                raise FlowExecutionFailedError(
                    "Seeing different detected cluster id in"
                    " different nodes. %s and %s" % (
                        detected_cluster, dc.detected_cluster_id
                    )
                )
            if node_states and all(node_states):
                break

        # Create the params list for import cluster flow
        new_params = dict()
        new_params['Node[]'] = self.parameters['Node[]']
        new_params['TendrlContext.integration_id'] = integration_id
        # Get node context for one of the nodes from list
        dc = NS.tendrl.objects.DetectedCluster(
            node_id=self.parameters['Node[]'][0]
        ).load()
        sds_pkg_name = dc.sds_pkg_name
        new_params['import_after_expand'] = True
        sds_pkg_version = dc.sds_pkg_version
        new_params['DetectedCluster.sds_pkg_name'] = sds_pkg_name
        new_params['DetectedCluster.sds_pkg_version'] = sds_pkg_version

        tags = [
            "tendrl/node_%s" % node for node in self.parameters['Node[]']
        ]
        payload = {
            "tags": tags,
            "run": "tendrl.flows.ImportCluster",
            "status": "new",
            "parameters": new_params,
            "parent": self.parameters['job_id'],
            "type": "node"
        }
        _job_id = str(uuid.uuid4())
        # release lock before import cluster
        flow_utils.release_node_lock(self.parameters)
        NS.tendrl.objects.Job(
            job_id=_job_id,
            status="new",
            payload=payload
        ).save()
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Please wait while Tendrl imports ("
                           "job_id: %s) newly expanded "
                           "%s storage nodes in cluster %s" % (
                               _job_id,
                               sds_pkg_name,
                               NS.tendrl.objects.Cluster(
                                   integration_id=integration_id
                               ).load().short_name
                           )
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }
            )
        )
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock if any exception came
        flow_utils.release_node_lock(self.parameters)
def _sync_rbds(self):
    """Synchronise the RBDs of every stored pool with the live cluster.

    For each pool under ``clusters/<integration_id>/Pools`` the current
    RBDs are fetched with ``self._get_rbds``; RBD entries present in etcd
    but no longer reported are deleted, and every reported RBD is saved.
    A missing ``Pools`` key ends the sync silently.
    """
    try:
        pools = NS._int.client.read(
            "clusters/%s/Pools" % NS.tendrl_context.integration_id,
            recursive=True
        )
        for child in pools._children:
            pool_id = child['key'].split('/')[-1]
            pool_name_key = "clusters/%s/Pools/%s/pool_name" % (
                NS.tendrl_context.integration_id, pool_id
            )
            pool_name = NS._int.client.read(pool_name_key).value
            rbd_details = self._get_rbds(pool_name)

            # Rbd out of band delete handling
            try:
                stored = NS._int.client.read(
                    "clusters/%s/Pools/%s/Rbds" % (
                        NS.tendrl_context.integration_id, pool_id
                    )
                )
                stored_names = [
                    leaf.key.split("/")[-1] for leaf in stored.leaves
                ]
                live_names = [
                    name for name, _details in rbd_details.iteritems()
                ]
                for stale_name in set(stored_names) - set(live_names):
                    NS._int.client.delete(
                        "clusters/%s/Pools/%s/Rbds/%s" % (
                            NS.tendrl_context.integration_id,
                            pool_id,
                            stale_name
                        ),
                        recursive=True
                    )
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    ExceptionMessage(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "No rbds found for ceph cluster %s"
                                       % NS.tendrl_context.integration_id,
                            "exception": ex
                        }
                    )
                )

            for rbd_name, details in rbd_details.iteritems():
                NS.ceph.objects.Rbd(
                    name=rbd_name,
                    size=details['size'],
                    pool_id=pool_id,
                    flags=details['flags'],
                    provisioned=self._to_bytes(details['provisioned'])
                    if details.get("provisioned") else None,
                    used=self._to_bytes(details['used'])
                ).save()

            try:
                rbds = NS._int.client.read(
                    "clusters/%s/Pools/%s/Rbds" % (
                        NS.tendrl_context.integration_id, pool_id
                    )
                )
            except etcd.EtcdKeyNotFound:
                # no rbds for pool, continue
                continue

            # Second pass: drop stored RBDs whose loaded name is not
            # among the reported ones.
            for entry in rbds.leaves:
                fetched_rbd = NS.ceph.objects.Rbd(
                    pool_id=pool_id,
                    name=entry.key.split("Rbds/")[-1]
                ).load()
                if fetched_rbd.name in rbd_details.keys():
                    continue
                NS._int.client.delete(
                    "clusters/%s/Pools/%s/Rbds/%s" % (
                        NS.tendrl_context.integration_id,
                        pool_id,
                        fetched_rbd.name
                    ),
                    recursive=True
                )
    except etcd.EtcdKeyNotFound:
        pass
def _sync_ec_profiles(self):
    """Synchronise erasure-code profiles between ceph and etcd.

    Invokes the below CLI commands

    1. ```ceph osd erasure-code-profile ls```
       and required output format is a list of ec profiles separated
       with new lines as below
       ```
       default
       k4m2
       ```
    2. ```ceph osd erasure-code-profile get {name}```
       and the required output format is '=' separated values in
       multiple lines
       ```
       k=2
       m=1
       plugin=jerasure
       directory={dir}
       ```

    Profiles stored in etcd but no longer reported are deleted, every
    reported profile is saved, and any of the required (k, m) profiles
    (2,1), (4,2), (6,3), (8,4) that is missing is created through
    ``Crud``.
    """
    required_ec_profiles = [(2, 1), (4, 2), (6, 3), (8, 4)]
    ec_profile_details = {}

    commands = ['osd', 'erasure-code-profile', 'ls']
    cmd_out = ceph.ceph_command(NS.tendrl_context.cluster_name, commands)
    if cmd_out['err'] == "":
        ec_profile_list = [
            item for item in cmd_out['out'].split('\n') if item != ""
        ]
        for ec_profile in ec_profile_list:
            commands = ['osd', 'erasure-code-profile', 'get', ec_profile]
            cmd_out = ceph.ceph_command(
                NS.tendrl_context.cluster_name, commands
            )
            if cmd_out['err'] == "":
                info = {}
                for item in cmd_out['out'].split('\n'):
                    # Split on the first '=' only so values containing
                    # '=' stay intact; lines without '=' are skipped
                    # instead of raising IndexError.
                    key, separator, value = item.partition('=')
                    if not separator:
                        continue
                    info[key] = value.strip()
                if info:
                    ec_profile_details[ec_profile] = info

    # Ec profile out of band delete handling
    try:
        ec_profiles = NS._int.client.read(
            "clusters/%s/ECProfiles" % (NS.tendrl_context.integration_id)
        )
        old_ec_profiles = [
            leaf.key.split("/")[-1] for leaf in ec_profiles.leaves
        ]
        delete_ec_profiles = set(old_ec_profiles) - set(ec_profile_details)
        for stale_profile in delete_ec_profiles:
            NS._int.client.delete(
                "clusters/%s/ECProfiles/%s" % (
                    NS.tendrl_context.integration_id, stale_profile
                ),
                recursive=True
            )
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "key not found in etcd",
                    "exception": ex
                }
            )
        )

    available_ec_profiles = []
    for name, details in ec_profile_details.items():
        NS.ceph.objects.ECProfile(
            name=name,
            k=details['k'],
            m=details['m'],
            plugin=details.get('plugin'),
            directory=details.get('directory'),
            ruleset_failure_domain=details.get('ruleset_failure_domain')
        ).save()
        available_ec_profiles.append((int(details['k']), int(details['m'])))

    # Create the missing ec_profile_details
    missing_ec_profiles = [
        item for item in required_ec_profiles
        if item not in available_ec_profiles
    ]
    for item in missing_ec_profiles:
        attrs = dict(
            name="k%sm%s" % (item[0], item[1]),
            k=item[0],
            m=item[1],
            plugin='jerasure',
            directory='/usr/lib/ceph/erasure-code'
        )
        crud = Crud()
        crud.create("ec_profile", attrs)
def sync():
    """Publish this node's network interfaces and prune stale networks.

    * Saves a ``NodeNetwork`` object per interface (with a TTL of the
      configured ``sync_interval`` + 250) and writes an
      ``/indexes/ip/<ipv4>`` entry pointing at this node for each IPv4
      address, leaving existing index entries alone.
    * Saves a ``GlobalNetwork`` object for every interface with a
      non-empty subnet.
    * Tries to remove this node's directory under each ``/networks``
      entry and then the entry itself; both deletes only succeed for
      empty directories.

    Any unexpected error is reported as an error event and swallowed.
    """
    try:
        _keep_alive_for = int(NS.config.data.get("sync_interval", 10)) + 250
        interfaces = get_node_network()
        if len(interfaces) > 0:
            for interface in interfaces:
                NS.tendrl.objects.NodeNetwork(**interface).save(
                    ttl=_keep_alive_for
                )
                if interface['ipv4']:
                    for ipv4 in interface['ipv4']:
                        index_key = "/indexes/ip/%s" % ipv4
                        try:
                            NS._int.wclient.write(
                                index_key,
                                NS.node_context.node_id,
                                prevExist=False
                            )
                        except etcd.EtcdAlreadyExist:
                            # The ip is already indexed; keep it as is.
                            pass
                # TODO(team) add ipv6 support
                # if interface['ipv6']:
                #     for ipv6 in interface['ipv6']:
                #         index_key = "/indexes/ip/%s/%s" % (ipv6,
                #
                # NS.node_context.node_id)
                #         NS._int.wclient.write(index_key, 1)

        # global network
        if len(interfaces) > 0:
            for interface in interfaces:
                # Compare by value: "is not" against a literal tests
                # object identity and would treat e.g. u"" as non-empty.
                if interface["subnet"] != "":
                    NS.node_agent.objects.GlobalNetwork(**interface).save(
                        ttl=_keep_alive_for
                    )
        try:
            networks = NS._int.client.read("/networks")
            for network in networks.leaves:
                try:
                    # it will delete any node with empty network detail in
                    # subnet, if one entry present then deletion never
                    # happen
                    NS._int.wclient.delete(
                        "%s/%s" % (network.key, NS.node_context.node_id),
                        dir=True
                    )
                    # it will delete any subnet dir when it is empty
                    # if one entry present then deletion never happen
                    NS._int.wclient.delete(network.key, dir=True)
                except (etcd.EtcdKeyNotFound, etcd.EtcdDirNotEmpty):
                    continue
        except etcd.EtcdKeyNotFound as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Given key is not present in "
                                   "etcd .",
                        "exception": ex
                    }
                )
            )
    except Exception as ex:
        # str(ex) instead of ex.message: the attribute does not exist on
        # Python 3 and is not guaranteed to be a string, which would make
        # this handler itself raise.
        _msg = "node_sync networks sync failed: " + str(ex)
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": _msg,
                    "exception": ex
                }
            )
        )
def get_node_brick_status_counts(self, node_id):
    """Count the bricks and brick alerts belonging to ``node_id``.

    A brick is attributed to the node when its ``hostname`` equals the
    node's name or the ip indexed for the node under ``/indexes/ip``.

    Returns:
        dict: counters keyed by ``'stopped'``, ``'total'``,
        ``pm_consts.WARNING_ALERTS`` and ``pm_consts.CRITICAL_ALERTS``.
        The counters gathered so far are returned when a lookup fails.
    """
    counts = {
        'stopped': 0,
        'total': 0,
        pm_consts.WARNING_ALERTS: 0,
        pm_consts.CRITICAL_ALERTS: 0
    }

    try:
        node_name = central_store_util.get_node_name_from_id(node_id)
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching node name for node "
                               "%s" % node_id,
                    "exception": ex
                }
            )
        )
        return counts

    try:
        ip_indexes = etcd_read_key('/indexes/ip')
    except EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Error fetching ip indexes",
                    "exception": ex
                }
            )
        )
        return counts

    # First ip indexed for this node, or '' when there is none.
    node_ip = ''
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip
            break

    def _alerts_for_node(alerts):
        # Number of alerts raised for this node.
        matched = 0
        for alert in alerts:
            if alert['node_id'] == node_id:
                matched = matched + 1
        return matched

    try:
        cluster_id = central_store_util.get_node_cluster_id(node_id)
        if cluster_id:
            bricks = self.get_cluster_bricks(cluster_id)
            for brick_path, brick_det in bricks.iteritems():
                on_this_node = (
                    brick_det['hostname'] == node_name or
                    brick_det['hostname'] == node_ip
                )
                if not on_this_node:
                    continue
                if 'status' in brick_det and \
                        brick_det['status'] == 'Stopped':
                    counts['stopped'] += 1
                counts['total'] += 1
            crit_alerts, warn_alerts = parse_resource_alerts(
                'brick',
                pm_consts.CLUSTER,
                cluster_id=cluster_id
            )
            counts[pm_consts.CRITICAL_ALERTS] = _alerts_for_node(crit_alerts)
            counts[pm_consts.WARNING_ALERTS] = _alerts_for_node(warn_alerts)
    except (
        TendrlPerformanceMonitoringException,
        AttributeError,
        ValueError,
        KeyError
    ) as ex:
        Event(
            Message(
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught fetching node brick"
                               " status wise counts",
                    "exception": ex
                }
            )
        )
    return counts
def run(self):
    """Main loop of the gluster state sync thread.

    Once at start-up the brick directory object is saved and, if the
    cluster has no ``cluster_network`` yet, it is set from the first
    node network. Then, until ``self._complete`` is set, each pass:

    * dumps the glusterd state (and volume options) with
      ``gluster get-state`` and saves it as a ``SyncObject``;
    * saves every peer, emitting a ``peer_status`` event when the stored
      connection state differs, and raises brick alerts for
      disconnected hosts;
    * syncs every volume and merges volume-specific options;
    * on the provisioner node, syncs cluster-wide details (status,
      utilization, connections, geo-rep, rebalance, snapshots, alert
      counts) and volume profiling;
    * records ``last_sync`` and the first-sync-done flag.

    Errors inside a pass are reported as an error event; the loop then
    sleeps ``_sleep`` seconds unless a ``_sync_now`` key exists for the
    cluster. The module-level ``VOLUME_TTL`` is updated from the number
    of volumes and bricks seen.
    """
    # Declared up front: a ``global`` statement that follows a use of
    # the name is a SyntaxError on Python 3 (and a warning on Python 2).
    global VOLUME_TTL

    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )

    gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
    gluster_brick_dir.save()

    cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if cluster.cluster_network in [None, ""]:
        try:
            node_networks = NS.tendrl.objects.NodeNetwork().load_all()
            cluster.cluster_network = node_networks[0].subnet
            cluster.save()
        except etcd.EtcdKeyNotFound as ex:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to sync cluster network details"}
            )

    _sleep = 0
    while not self._complete.is_set():
        # To detect out of band deletes
        # refresh gluster object inventory at config['sync_interval']
        SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100

        NS.node_context = NS.node_context.load()
        NS.tendrl_context = NS.tendrl_context.load()
        # Ramp the pause up by one second per pass, then settle on the
        # configured sync interval.
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1

        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                # Nothing to sync in these states. Pause before polling
                # again; a bare ``continue`` here would spin the loop
                # without ever sleeping.
                time.sleep(_sleep)
                continue

            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=NS.node_context.node_id
            ).load()
            _cnc.is_managed = "yes"
            _cnc.save()

            # Dump glusterd state to a file, parse it, then remove it.
            subprocess.call(
                [
                    'gluster',
                    'get-state',
                    'glusterd',
                    'odir',
                    '/var/run',
                    'file',
                    'glusterd-state',
                    'detail'
                ]
            )
            raw_data = ini2json.ini_to_dict(
                '/var/run/glusterd-state'
            )
            subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])

            # Same for the volume options dump.
            subprocess.call(
                [
                    'gluster',
                    'get-state',
                    'glusterd',
                    'odir',
                    '/var/run',
                    'file',
                    'glusterd-state-vol-opts',
                    'volumeoptions'
                ]
            )
            raw_data_options = ini2json.ini_to_dict(
                '/var/run/glusterd-state-vol-opts'
            )
            subprocess.call(
                [
                    'rm',
                    '-rf',
                    '/var/run/glusterd-state-vol-opts'
                ]
            )

            sync_object = NS.gluster.objects.\
                SyncObject(data=json.dumps(raw_data))
            sync_object.save()

            if "Peers" in raw_data:
                index = 1
                peers = raw_data["Peers"]
                disconnected_hosts = []
                # Peers are keyed peer1.*, peer2.*, ...; the KeyError on
                # the first missing index ends the loop.
                while True:
                    try:
                        peer = NS.tendrl.\
                            objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                hostname=peers[
                                    'peer%s.primary_hostname' % index
                                ],
                                state=peers['peer%s.state' % index],
                                connected=peers['peer%s.connected' % index]
                            )
                        try:
                            stored_peer_status = None
                            # find peer detail using hostname
                            ip = socket.gethostbyname(
                                peers['peer%s.primary_hostname' % index]
                            )
                            node_id = etcd_utils.read(
                                "/indexes/ip/%s" % ip
                            ).value
                            stored_peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                node_id=node_id
                            ).load()
                            stored_peer_status = stored_peer.connected
                            current_status = peers[
                                'peer%s.connected' % index
                            ]
                            if stored_peer_status and \
                                    current_status != stored_peer_status:
                                msg = (
                                    "Peer %s in cluster %s "
                                    "is %s"
                                ) % (
                                    peers[
                                        'peer%s.primary_hostname' %
                                        index
                                    ],
                                    _cluster.short_name,
                                    current_status
                                )
                                instance = "peer_%s" % peers[
                                    'peer%s.primary_hostname' % index
                                ]
                                event_utils.emit_event(
                                    "peer_status",
                                    current_status,
                                    msg,
                                    instance,
                                    'WARNING' if current_status !=
                                    'Connected'
                                    else 'INFO'
                                )
                                # save current status in actual peer
                                # directory also
                                stored_peer.connected = current_status
                                stored_peer.save()
                            # Disconnected host name to
                            # raise brick alert
                            if current_status.lower() == \
                                    "disconnected":
                                disconnected_hosts.append(
                                    peers[
                                        'peer%s.primary_hostname' %
                                        index
                                    ]
                                )
                        except etcd.EtcdKeyNotFound:
                            pass
                        SYNC_TTL += 5
                        peer.save(ttl=SYNC_TTL)
                        index += 1
                    except KeyError:
                        break
                # Raise an alert for bricks when peer disconnected
                # or node goes down
                for disconnected_host in disconnected_hosts:
                    brick_status_alert(
                        disconnected_host
                    )

            if "Volumes" in raw_data:
                index = 1
                volumes = raw_data['Volumes']
                # instantiating blivet class, this will be used for
                # getting brick_device_details
                b = blivet.Blivet()

                # reset blivet during every sync to get latest information
                # about storage devices in the machine
                b.reset()
                devicetree = b.devicetree
                total_brick_count = 0
                # Volumes are keyed volume1.*, volume2.*, ...; the
                # KeyError on the first missing index ends the loop.
                while True:
                    try:
                        b_count = sync_volumes(
                            volumes,
                            index,
                            raw_data_options.get('Volume Options'),
                            SYNC_TTL + VOLUME_TTL,
                            _cluster.short_name,
                            devicetree
                        )
                        index += 1
                        SYNC_TTL += 1
                        total_brick_count += b_count - 1
                    except KeyError:
                        # from second sync volume ttl is
                        # SYNC_TTL + (no.volumes) * 20 +
                        # (no.of.bricks) * 10 + 160
                        if index > 1:
                            volume_count = index - 1
                            # When all nodes are down we are updating all
                            # volumes are down, node status TTL is 160,
                            # So make sure volumes are present in etcd
                            # while raising volume down alert
                            VOLUME_TTL = (volume_count * 20) + (
                                total_brick_count * 10) + 160
                        break

                # populate the volume specific options
                reg_ex = re.compile("^volume[0-9]+.options+")
                options = {}
                for key in volumes.keys():
                    if reg_ex.match(key):
                        options[key] = volumes[key]
                # Iterate over snapshots: ``options`` is shrunk inside
                # the loops, which raises RuntimeError on Python 3 when
                # iterating the live dict views.
                for key in list(options.keys()):
                    volname = key.split('.')[0]
                    vol_id = volumes['%s.id' % volname]
                    dict1 = {}
                    for k, v in list(options.items()):
                        if k.startswith('%s.options' % volname):
                            dict1['.'.join(k.split(".")[2:])] = v
                            options.pop(k, None)
                    volume = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id,
                        vol_id=vol_id
                    ).load()
                    if volume.options is not None:
                        dest = dict(volume.options)
                        dest.update(dict1)
                        volume.options = dest
                        volume.save()

            # Sync cluster global details
            if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                all_volumes = NS.tendrl.objects.GlusterVolume(
                    NS.tendrl_context.integration_id
                ).load_all() or []
                volumes = []
                for volume in all_volumes:
                    if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                        # only for first sync refresh volume TTL
                        # It will increase TTL based on no.of volumes
                        if _cnc.first_sync_done in [None, "no", ""]:
                            etcd_utils.refresh(
                                volume.value,
                                SYNC_TTL + VOLUME_TTL
                            )
                        volumes.append(volume)
                cluster_status.sync_cluster_status(
                    volumes, SYNC_TTL + VOLUME_TTL
                )
                utilization.sync_utilization_details(volumes)
                client_connections.sync_volume_connections(volumes)
                georep_details.aggregate_session_status()
                try:
                    evt.process_events()
                except etcd.EtcdKeyNotFound:
                    pass
                rebalance_status.sync_volume_rebalance_status(volumes)
                rebalance_status.sync_volume_rebalance_estimated_time(
                    volumes
                )
                snapshots.sync_volume_snapshots(
                    raw_data['Volumes'],
                    int(NS.config.data.get(
                        "sync_interval", 10
                    )) + len(volumes) * 4
                )
                # update alert count
                update_cluster_alert_count()

            # check and enable volume profiling
            if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                self._enable_disable_volume_profiling()

            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if _cluster.exists():
                _cluster = _cluster.load()
                _cluster.last_sync = str(tendrl_now())
                # Mark the first sync done flag
                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                if _cnc.first_sync_done in [None, "no"]:
                    _cnc.first_sync_done = "yes"
                    _cnc.save()
                if _cluster.current_job.get(
                    'status', ''
                ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                    _cluster.save()
        except Exception as ex:
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": "gluster sds state sync error",
                             "exception": ex
                             }
                )
            )

        try:
            # An existing _sync_now key requests an immediate re-sync,
            # so the sleep below is skipped.
            etcd_utils.read(
                '/clusters/%s/_sync_now' %
                NS.tendrl_context.integration_id
            )
            continue
        except etcd.EtcdKeyNotFound:
            pass

        time.sleep(_sleep)

    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s complete" % self.__class__.__name__}
    )
def get_node_osd_status_wise_counts(self, node_id):
    """Count the OSDs and OSD alerts belonging to ``node_id``.

    An OSD is attributed to the node when the ip indexed for the node
    under ``/indexes/ip`` occurs in the OSD's ``cluster_addr`` or
    ``public_addr``. Alerts are attributed through the OSD number parsed
    from their ``plugin_instance`` tag (``osd_<n>``).

    Returns:
        dict: counters keyed by ``'total'``, ``'down'``,
        ``pm_consts.CRITICAL_ALERTS`` and ``pm_consts.WARNING_ALERTS``.
        The counters gathered so far are returned when one of the
        handled errors occurs.
    """
    osds_in_node = []
    counts = {
        'total': 0,
        'down': 0,
        pm_consts.CRITICAL_ALERTS: 0,
        pm_consts.WARNING_ALERTS: 0
    }
    cluster_id = central_store_util.get_node_cluster_id(node_id)

    # Last ip indexed for this node wins; '' when there is none.
    node_ip = ''
    ip_indexes = etcd_read_key('/indexes/ip')
    for ip, indexed_node_id in ip_indexes.iteritems():
        if node_id == indexed_node_id:
            node_ip = ip

    def _alerts_on_node(alerts):
        # Number of alerts whose osd number belongs to this node.
        matched = 0
        for alert in alerts:
            plugin_instance = alert['tags'].get('plugin_instance', '')
            if int(plugin_instance[len('osd_'):]) in osds_in_node:
                matched = matched + 1
        return matched

    try:
        osd_map = etcd_read_key(
            '/clusters/%s/maps/osd_map/data/osds' % cluster_id
        )
        osds = ast.literal_eval(osd_map.get('osds', '[]'))
        for osd in osds:
            on_this_node = (
                node_ip in osd.get('cluster_addr', '') or
                node_ip in osd.get('public_addr', '')
            )
            if not on_this_node:
                continue
            osds_in_node.append(osd.get('osd'))
            if 'up' not in osd.get('state'):
                counts['down'] += 1
            counts['total'] += 1
        crit_alerts, warn_alerts = parse_resource_alerts(
            'osd',
            pm_consts.CLUSTER,
            cluster_id=cluster_id
        )
        counts[pm_consts.CRITICAL_ALERTS] = _alerts_on_node(crit_alerts)
        counts[pm_consts.WARNING_ALERTS] = _alerts_on_node(warn_alerts)
    except (
        EtcdException,
        AttributeError,
        KeyError,
        ValueError,
        TendrlPerformanceMonitoringException
    ) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Exception caught computing node osd "
                               "counts",
                    "exception": ex
                }
            )
        )
    return counts
def dispatch_notification(self, alert):
    """Email ``alert`` to the configured destinations.

    The destinations are refreshed with ``self.set_destinations()``;
    nothing is sent when there are none or when ``self.admin_config`` is
    not set. Otherwise the formatted alert is sent through the mail
    client, logging in first when the admin config's ``auth`` is
    non-empty.

    All failures are reported as debug events; no exception is raised
    to the caller. The mail client, if one was created, is always
    closed.
    """
    server = None
    # Bound before the try block so the mail error handler below can
    # always format it, even when format_message() itself raises.
    msg = '<unformatted alert>'
    try:
        self.set_destinations()
        if not self.user_configs:
            log(
                "error",
                "notifier",
                {
                    "message": 'No destinations configured to send'
                               'alert notification'
                }
            )
            return
    except (
        AttributeError,
        EtcdException,
        ValueError,
        KeyError,
        SyntaxError
    ) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher="notifier",
                payload={
                    "message": 'Exception caught attempting to set'
                               ' %s email destinations' % str(alert.tags),
                    "exception": ex
                }
            )
        )
        return

    try:
        msg = self.format_message(alert)
        if not self.admin_config:
            log(
                "debug",
                "notifier",
                {
                    "message": 'Detected alert %s.'
                               'But, admin config is a must to send'
                               ' notification' % msg
                }
            )
            return
        server = self.get_mail_client()
        server.ehlo()
        if self.admin_config['auth'] != "":
            server.login(
                self.admin_config['email_id'],
                self.admin_config['email_pass']
            )
        server.sendmail(
            self.admin_config['email_id'],
            self.user_configs,
            msg
        )
        log(
            "debug",
            "notifier",
            {
                "message": 'Sent mail to %s to alert about %s' % (
                    self.user_configs, msg
                )
            }
        )
    except Exception as ex:
        # The original tuple listed socket and SMTP errors next to
        # ``Exception``, which already covers all of them.
        Event(
            ExceptionMessage(
                priority="debug",
                publisher="notifier",
                payload={
                    "message": 'Exception caught attempting to email'
                               '%s' % msg,
                    "exception": ex
                }
            )
        )
    finally:
        if server:
            server.close()
def run(self):
    """Import the gluster cluster on this node and on its peer nodes.

    Steps, in order:

    1. take the node lock and reload the tendrl context;
    2. when ``Node[]`` holds more than one node, queue a child
       ``tendrl.flows.ImportCluster`` job for every other node;
    3. verify the installed ``glusterfs-server`` version against
       ``min_reqd_gluster_ver`` from the compiled definitions;
    4. import gluster on this node;
    5. wait for the child jobs, marking the unfinished ones as failed
       once the wait times out.

    :returns: True on success, False when the version could not be
        detected or is too old, the local import failed, the wait for
        child jobs timed out, or any child job failed.
    :raises Exception: unexpected errors are published as an event and
        re-raised so that the job gets marked as failed.
    """
    try:
        integration_id = self.parameters['TendrlContext.integration_id']
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        # Lock nodes
        flow_utils.acquire_node_lock(self.parameters)
        NS.tendrl_context = NS.tendrl_context.load()
        # TODO(team) when Tendrl supports create/expand/shrink cluster
        # setup passwordless ssh for all gluster nodes with given
        # integration_id (check
        # /indexes/tags/tendrl/integration/$integration_id for list of
        # nodes in cluster)
        node_list = self.parameters['Node[]']
        if len(node_list) > 1:
            # This is the master node for this flow
            for node in node_list:
                if NS.node_context.node_id != node:
                    new_params = self.parameters.copy()
                    new_params['Node[]'] = [node]
                    # create same flow for each node in node list except
                    # $this
                    payload = {
                        "tags": ["tendrl/node_%s" % node],
                        "run": "tendrl.flows.ImportCluster",
                        "status": "new",
                        "parameters": new_params,
                        "parent": self.parameters['job_id'],
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    NS.tendrl.objects.Job(
                        job_id=_job_id,
                        status="new",
                        payload=payload
                    ).save()
                    logger.log(
                        "info",
                        NS.publisher_id,
                        {"message": "ImportCluster %s (jobID: %s) :"
                                    "importing host %s" %
                                    (_cluster.short_name, _job_id, node)},
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
        # Check if minimum required version of underlying gluster
        # cluster met. If not fail the import task
        # A sample output from "rpm -qa | grep glusterfs-server"
        # looks as below
        # `glusterfs-server-3.8.4-54.4.el7rhgs.x86_64`
        # In case of upstream build the format could be as below
        # `glusterfs-server-4.1dev-0.203.gitc3e1a2e.el7.centos.x86_64`
        # `glusterfs-server-3.12.8-0.0.el7.centos.x86_64.rpm`
        cmd = subprocess.Popen(
            'rpm -q glusterfs-server',
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        out, err = cmd.communicate()
        if out in [None, ""] or err:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to detect underlying cluster version"},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False
        lines = out.split('\n')
        build_no = None
        req_build_no = None
        ver_det = lines[0].split('glusterfs-server-')[-1].split('.')
        maj_ver = ver_det[0]
        min_ver = ver_det[1]
        if 'dev' in min_ver:
            # Keep every digit in front of the "dev" marker ("12dev-0"
            # is minor version 12, not 1).
            min_ver = min_ver.split('dev')[0]
        rel = ver_det[2]
        if '-' in rel:
            build_no = rel.split('-')[-1]
            rel = rel.split('-')[0]
        reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs()[
            'namespace.tendrl'
        ]['min_reqd_gluster_ver']
        req_maj_ver, req_min_ver, req_rel = reqd_gluster_ver.split('.')
        if '-' in req_rel:
            req_build_no = req_rel.split('-')[-1]
            req_rel = req_rel.split('-')[0]
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Checking minimum required version ("
                        "%s.%s.%s) of Gluster Storage" %
                        (req_maj_ver, req_min_ver, req_rel)},
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
        # Compare component by component; a later component only matters
        # (and is only converted to int) when all earlier ones are equal.
        if int(maj_ver) != int(req_maj_ver):
            ver_check_failed = int(maj_ver) < int(req_maj_ver)
        elif int(min_ver) != int(req_min_ver):
            ver_check_failed = int(min_ver) < int(req_min_ver)
        elif int(rel) != int(req_rel):
            ver_check_failed = int(rel) < int(req_rel)
        else:
            ver_check_failed = (
                build_no is not None and
                req_build_no is not None and
                int(build_no) < int(req_build_no)
            )
        if ver_check_failed:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Error: Minimum required version "
                            "(%s.%s.%s) "
                            "doesnt match that of detected Gluster "
                            "Storage (%s.%s.%s)" %
                            (req_maj_ver, req_min_ver, req_rel,
                             maj_ver, min_ver, rel)},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False
        ret_val, err = import_gluster(self.parameters)
        if not ret_val:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Error importing the cluster (integration_id:"
                            " %s). Error: %s" % (integration_id, err)
                 },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False
        if len(node_list) > 1:
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "ImportCluster %s waiting for hosts %s "
                            "to be imported" %
                            (_cluster.short_name, node_list)},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            loop_count = 0
            # Wait for (no of nodes) * 6 minutes for import to complete
            # (36 polls of 10 seconds per peer node).
            wait_count = (len(node_list) - 1) * 36
            while True:
                child_jobs_failed = []
                parent_job = NS.tendrl.objects.Job(
                    job_id=self.parameters['job_id']
                ).load()
                if loop_count >= wait_count:
                    logger.log(
                        "error",
                        NS.publisher_id,
                        {"message": "Import jobs on cluster(%s) not yet "
                                    "complete on all nodes(%s). "
                                    "Timing out." %
                                    (_cluster.short_name, str(node_list))},
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
                    # Marking child jobs as failed which did not complete
                    # as the parent job has timed out. This has to be done
                    # explicitly because these jobs will still be processed
                    # by the node-agent, and will keep it busy, which might
                    # defer the new jobs or lead to their timeout.
                    for child_job_id in parent_job.children:
                        child_job = NS.tendrl.objects.Job(
                            job_id=child_job_id
                        ).load()
                        if child_job.status not in ["finished", "failed"]:
                            child_job.status = "failed"
                            child_job.save()
                    return False
                time.sleep(10)
                completed = True
                for child_job_id in parent_job.children:
                    child_job = NS.tendrl.objects.Job(
                        job_id=child_job_id
                    ).load()
                    if child_job.status not in ["finished", "failed"]:
                        completed = False
                    elif child_job.status == "failed":
                        child_jobs_failed.append(child_job.job_id)
                if completed:
                    break
                loop_count += 1
            if child_jobs_failed:
                _msg = "Child jobs failed are %s" % child_jobs_failed
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
    except Exception as ex:
        # For traceback
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }
            )
        )
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock
        flow_utils.release_node_lock(self.parameters)
    return True
def run(self):
    """Import a detected ceph or gluster cluster on all nodes in Node[].

    Steps, in order:

    1. take the node lock;
    2. for a plain import of a gluster cluster that has no provisioner
       yet, install gdeploy, run the ssh setup jobs and tag this node as
       the provisioner;
    3. register this node with the cluster in the tendrl context;
    4. queue a child ``tendrl.flows.ImportCluster`` job for every other
       node in ``Node[]``;
    5. check the detected storage version against the minimum required
       one and run the ceph or gluster import;
    6. wait until every node in ``Node[]`` is registered in the cluster.

    :returns: True once the import has completed.
    :raises Exception: any error (including a failed ssh setup and a too
        old storage version) is published as an event and re-raised so
        that the job gets marked as failed.
    """
    def _ensure_min_version(sds_label, min_ver_key):
        # Fail the flow when the detected storage version is older than the
        # one stored under min_ver_key in the compiled definitions.
        detected_cluster = NS.tendrl.objects.DetectedCluster().load()
        detected_cluster_ver = detected_cluster.sds_pkg_version.split('.')
        maj_ver = detected_cluster_ver[0]
        min_ver = detected_cluster_ver[1]
        reqd_ver = NS.compiled_definitions.get_parsed_defs()[
            'namespace.tendrl'
        ][min_ver_key]
        req_maj_ver, req_min_ver, req_rel = reqd_ver.split('.')
        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Check: Minimum required version "
                               "(%s.%s.%s) of %s Storage" %
                               (req_maj_ver, req_min_ver, req_rel,
                                sds_label)
                }
            )
        )
        # The minor version only matters when the majors are equal;
        # comparing it unconditionally would reject e.g. 4.0 against a
        # required 3.9.
        too_old = int(maj_ver) < int(req_maj_ver) or (
            int(maj_ver) == int(req_maj_ver) and
            int(min_ver) < int(req_min_ver)
        )
        if too_old:
            Event(
                Message(
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Error: Minimum required version "
                                   "(%s.%s.%s) doesnt match that of "
                                   "detected %s Storage (%s.%s.%s)" %
                                   (req_maj_ver, req_min_ver, req_rel,
                                    sds_label, maj_ver, min_ver, 0)
                    }
                )
            )
            raise FlowExecutionFailedError(
                "Detected %s version: %s"
                " is lesser than required version: %s" %
                (sds_label.lower(), detected_cluster.sds_pkg_version,
                 reqd_ver)
            )

    try:
        # Lock nodes
        create_cluster_utils.acquire_node_lock(self.parameters)
        integration_id = self.parameters['TendrlContext.integration_id']
        sds_name = self.parameters['DetectedCluster.sds_pkg_name']
        if not self.parameters.get('import_after_expand', False) and \
                not self.parameters.get('import_after_create', False):
            # check if gdeploy in already provisioned in this cluster
            # if no it has to be provisioned here
            if "gluster" in sds_name and \
                    not self.parameters.get("gdeploy_provisioned", False) \
                    and not self._probe_and_mark_provisioner(
                        self.parameters["Node[]"], integration_id
                    ):
                create_cluster_utils.install_gdeploy()
                create_cluster_utils.install_python_gdeploy()
                ssh_job_ids = \
                    create_cluster_utils.gluster_create_ssh_setup_jobs(
                        self.parameters
                    )
                while True:
                    gevent.sleep(3)
                    all_status = {}
                    for job_id in ssh_job_ids:
                        all_status[job_id] = NS._int.client.read(
                            "/queue/%s/status" % job_id
                        ).value
                    _failed = {
                        _jid: status
                        for _jid, status in all_status.iteritems()
                        if status == "failed"
                    }
                    if _failed:
                        raise AtomExecutionFailedError(
                            "SSH setup failed for jobs %s cluster %s" %
                            (str(_failed), integration_id)
                        )
                    if all(
                        status == "finished"
                        for status in all_status.values()
                    ):
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message": "SSH setup completed for "
                                               "all nodes in cluster %s" %
                                               integration_id
                                }
                            )
                        )
                        # set this node as gluster provisioner
                        tags = ["provisioner/%s" % integration_id]
                        NS.node_context = NS.node_context.load()
                        tags += NS.node_context.tags
                        NS.node_context.tags = list(set(tags))
                        NS.node_context.save()
                        # set gdeploy_provisioned to true so that no other
                        # nodes tries to configure gdeploy
                        self.parameters['gdeploy_provisioned'] = True
                        break
        NS.tendrl_context = NS.tendrl_context.load()
        NS.tendrl_context.integration_id = integration_id
        _detected_cluster = NS.tendrl.objects.DetectedCluster().load()
        NS.tendrl_context.cluster_id = \
            _detected_cluster.detected_cluster_id
        NS.tendrl_context.cluster_name = \
            _detected_cluster.detected_cluster_name
        NS.tendrl_context.sds_name = _detected_cluster.sds_pkg_name
        NS.tendrl_context.sds_version = _detected_cluster.sds_pkg_version
        NS.tendrl_context.save()
        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Registered Node %s with cluster %s" %
                               (NS.node_context.node_id,
                                NS.tendrl_context.integration_id)
                }
            )
        )
        node_list = self.parameters['Node[]']
        if len(node_list) > 1:
            # This is the master node for this flow
            for node in node_list:
                if NS.node_context.node_id != node:
                    new_params = self.parameters.copy()
                    new_params['Node[]'] = [node]
                    # create same flow for each node in node list except
                    # $this
                    payload = {
                        "tags": ["tendrl/node_%s" % node],
                        "run": "tendrl.flows.ImportCluster",
                        "status": "new",
                        "parameters": new_params,
                        "parent": self.parameters['job_id'],
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    Job(job_id=_job_id,
                        status="new",
                        payload=payload).save()
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message": "Importing (job: %s) Node %s "
                                           "to cluster %s" %
                                           (_job_id, node, integration_id)
                            }
                        )
                    )
        if "ceph" in sds_name.lower():
            node_context = NS.node_context.load()
            is_mon = False
            for tag in node_context.tags:
                mon_tag = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['tags']['ceph-mon']
                if mon_tag in tag:
                    is_mon = True
            if is_mon:
                # Check if minimum required version of underlying ceph
                # cluster met. If not fail the import task
                _ensure_min_version("Ceph", 'min_reqd_ceph_ver')
                # NOTE(review): the ceph import is run on mon nodes only;
                # confirm this nesting against the intended behaviour.
                import_ceph(self.parameters)
        else:
            # Check if minimum required version of underlying gluster
            # cluster met. If not fail the import task
            _ensure_min_version("Gluster", 'min_reqd_gluster_ver')
            import_gluster(self.parameters)
        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Waiting for participant nodes %s to be "
                               "imported %s" % (node_list, integration_id)
                }
            )
        )
        # An import is sucessfull once all Node[] register to
        # /clusters/:integration_id/nodes/:node_id
        # NOTE(review): this wait has no timeout; it polls until every
        # node has registered.
        while True:
            _all_node_status = []
            gevent.sleep(3)
            for node_id in self.parameters['Node[]']:
                _status = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).exists() and NS.tendrl.objects.ClusterTendrlContext(
                    integration_id=integration_id
                ).exists()
                _all_node_status.append(_status)
            if _all_node_status and all(_all_node_status):
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Import Cluster completed for all "
                                       "nodes in cluster %s" %
                                       integration_id
                        }
                    )
                )
                break
        Event(
            Message(
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                priority="info",
                publisher=NS.publisher_id,
                payload={
                    "message": "Sucessfully imported cluster %s" %
                               integration_id
                }
            )
        )
    except Exception as ex:
        # For traceback
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": ex.message,
                    "exception": ex
                }
            )
        )
        # raising exception to mark job as failed
        raise ex
    finally:
        # release lock
        create_cluster_utils.release_node_lock(self.parameters)
    return True
def sync():
    """Detect the OS platform of this node and store it in etcd.

    The available platform plugins are tried in order; the details of the
    first plugin that discovers a platform are saved as ``NS.platform``
    and no further plugin is tried. All failures are published as events
    and never raised to the caller.

    :returns: None
    """
    try:
        # platform plugins
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": "Running Platform detection"}
            )
        )
        try:
            p_mgr = platform_manager.PlatformManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": 'Failed to init PlatformManager. '
                                   'Error %s' % str(ex),
                        "exception": ex
                    }
                )
            )
            return
        # execute the platform plugins
        for plugin in p_mgr.get_available_plugins():
            platform_details = plugin.discover_platform()
            if len(platform_details.keys()) > 0:
                # update etcd
                try:
                    NS.platform = NS.tendrl.objects.Platform(
                        os=platform_details["Name"],
                        os_version=platform_details["OSVersion"],
                        kernel_version=platform_details["KernelVersion"],
                    )
                    NS.platform.save()
                except etcd.EtcdException as ex:
                    Event(
                        ExceptionMessage(
                            priority="debug",
                            publisher=NS.publisher_id,
                            payload={
                                "message": "Failed to update etcd . "
                                           "Error %s" % str(ex),
                                "exception": ex
                            }
                        )
                    )
                # The first plugin that reports details wins.
                break
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": "node_sync "
                                    "OS Platform detection failed: " +
                                    ex.message,
                         "exception": ex}
            )
        )
def calculate_host_summary(self, node):
    """Compute the utilization summary of ``node`` and save it.

    Every utilization figure that could not be computed (``None``) is
    replaced by the value of the previously stored summary, or by a blank
    placeholder when no summary was stored yet. Nothing is saved when the
    previous summary cannot be fetched for any reason other than a missing
    key. Failures are published as debug events, never raised.

    :param node: id of the node to summarise.
    :returns: None
    """
    gevent.sleep(0.1)
    cpu = self.get_net_host_cpu_utilization(node)
    memory = self.get_net_host_memory_utilization(node)
    storage = self.get_net_storage_utilization(node)
    swap = self.get_net_host_swap_utilization(node)
    alerts = self.get_alert_count(node)
    sds_details = NS.sds_monitoring_manager.get_node_summary(node)

    # Blank placeholder that is used as-is when nothing was stored before.
    previous = NodeSummary(
        node_id=node,
        name='',
        status='',
        role='',
        cluster_name='',
        cpu_usage={'percent_used': '', 'updated_at': ''},
        memory_usage={
            'percent_used': '',
            'updated_at': '',
            'used': '',
            'total': ''
        },
        storage_usage={
            'percent_used': '',
            'total': '',
            'used': '',
            'updated_at': ''
        },
        swap_usage={
            'percent_used': '',
            'updated_at': '',
            'used': '',
            'total': ''
        },
        sds_det={},
        alert_count=alerts
    )
    try:
        previous = previous.load()
    except EtcdKeyNotFound:
        # No earlier summary: keep the blank placeholder.
        pass
    except (EtcdConnectionFailed, Exception) as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Failed to fetch previously computed '
                               'summary from etcd.',
                    "exception": ex
                }
            )
        )
        return

    cpu = previous.cpu_usage if cpu is None else cpu
    memory = previous.memory_usage if memory is None else memory
    storage = previous.storage_usage if storage is None else storage
    swap = previous.swap_usage if swap is None else swap

    try:
        node_name = central_store_util.get_node_name_from_id(node)
        node_status = self.get_node_status(node)
        node_role = central_store_util.get_node_role(node)
        cluster_name = central_store_util.get_node_cluster_name(node)
        selinux_mode = central_store_util.get_node_selinux_mode(node)
        NodeSummary(
            name=node_name,
            node_id=node,
            status=node_status,
            role=node_role,
            cluster_name=cluster_name,
            cpu_usage=cpu,
            memory_usage=memory,
            storage_usage=storage,
            swap_usage=swap,
            selinux_mode=selinux_mode,
            sds_det=sds_details,
            alert_count=alerts
        ).save(update=False)
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": 'Exception caught while trying to '
                               'save summary for node %s' % str(node),
                    "exception": ex
                }
            )
        )
def load(self):
    """Return this object populated with the values in the central store.

    For every class whose name does not contain "Message" the in-memory
    hash is compared with the stored one first; when both match nothing
    is read and ``self`` is returned. Otherwise every rendered key is
    read from etcd into a copy of this object. Keys that do not exist in
    the store are skipped, and attributes defined as ``json``/``list``
    are decoded from JSON.

    :returns: ``self`` when the stored hash matches, else the populated
        copy.
    """
    if "Message" not in self.__class__.__name__:
        try:
            # Generate current in memory object hash
            self.hash = self._hash()
            _hash_key = "/{0}/hash".format(self.value)
            _stored_hash = None
            try:
                _stored_hash = NS._int.client.read(_hash_key).value
            except etcd.EtcdKeyNotFound:
                # Nothing stored yet, so there is nothing to compare with.
                pass
            except (etcd.EtcdConnectionFailed, etcd.EtcdException):
                NS._int.reconnect()
                _stored_hash = NS._int.client.read(_hash_key).value
            if self.hash == _stored_hash:
                # No changes in stored object and current object,
                # dont save current object to central store
                return self
        except TypeError:
            # no hash for this object, save the current hash as is
            pass
    _copy = self._copy_vars()
    for item in _copy.render():
        try:
            Event(
                Message(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Reading %s" % item['key']}
                )
            )
        except KeyError:
            sys.stdout.write("Reading %s" % item['key'])
        try:
            etcd_resp = NS._int.client.read(item['key'], quorum=True)
        except etcd.EtcdKeyNotFound:
            continue
        except (etcd.EtcdConnectionFailed, etcd.EtcdException):
            NS._int.reconnect()
            etcd_resp = NS._int.client.read(item['key'], quorum=True)
        value = etcd_resp.value
        if item['dir']:
            # A directory item contributes one entry, keyed by the last
            # path component, to the dict attribute named item['name'].
            key = item['key'].split('/')[-1]
            if hasattr(_copy, item['name']):
                existing = getattr(_copy, item['name'])
                if isinstance(existing, dict):
                    existing[key] = value
                else:
                    # NOTE(review): a non-dict attribute is written back
                    # unchanged, so the value read is dropped here;
                    # confirm that this is intended.
                    setattr(_copy, item['name'], existing)
            else:
                # dict(key=value) would store the entry under the literal
                # name "key" instead of the computed one.
                setattr(_copy, item['name'], {key: value})
            continue
        # convert list, dict (json) to python based on definitions
        _type = self._defs.get("attrs", {}).get(item['name'],
                                                {}).get("type")
        if _type:
            if _type.lower() in ['json', 'list']:
                if value:
                    try:
                        value = json.loads(value.decode('utf-8'))
                    except ValueError as ex:
                        # Instances have no __name__; use the class name.
                        _msg = "Error load() attr %s for object %s" % \
                               (item['name'], self.__class__.__name__)
                        Event(
                            ExceptionMessage(
                                priority="debug",
                                publisher=NS.publisher_id,
                                payload={
                                    "message": _msg,
                                    "exception": ex
                                }
                            )
                        )
                else:
                    if _type.lower() == "list":
                        value = list()
                    if _type.lower() == "json":
                        value = dict()
        setattr(_copy, item['name'], value)
    return _copy
def sync_volumes(
    volumes, index, vol_options, sync_ttl, cluster_short_name, devicetree
):
    """Sync one gluster volume and its bricks on this node to etcd.

    On the node tagged as the cluster provisioner the volume object is
    updated (raising events on status and profiling changes) and its
    options are saved. On every node the rebalance and geo-replication
    details are saved, and each brick of the volume that is hosted on
    this node is saved together with its device and client details.

    :param volumes: flat mapping read with keys such as
        ``'volume<index>.id'`` and ``'volume<index>.brick<n>.path'``.
    :param index: index of the volume inside ``volumes``/``vol_options``.
    :param vol_options: flat mapping read with keys such as
        ``'volume<index>.options.key<n>'``.
    :param sync_ttl: ttl passed to the ``save`` calls; it is increased by
        4 for every brick saved.
    :param cluster_short_name: cluster name used in event messages.
    :param devicetree: passed through to
        ``brick_device_details.update_brick_device_details``.
    :returns: the brick index at which the brick loop stopped, or None
        when the sync is skipped because a job is in progress on the
        volume.
    """
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                    _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on volume. skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                    current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped' else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            # A missing key in `volumes` is propagated; a volume that is
            # not in etcd yet is ignored.
            if isinstance(ex, KeyError):
                raise ex
            pass
        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume.vol_type = "arbiter" \
            if int(volumes['volume%s.arbiter_count' % index]) > 0 \
            else volumes['volume%s.type' % index]
        volume.name = volumes['volume%s.name' % index]
        volume.transport_type = volumes['volume%s.transport_type' % index]
        volume.status = volumes['volume%s.status' % index]
        volume.brick_count = volumes['volume%s.brickcount' % index]
        volume.snap_count = volumes['volume%s.snap_count' % index]
        volume.stripe_count = volumes['volume%s.stripe_count' % index]
        volume.replica_count = volumes['volume%s.replica_count' % index]
        volume.subvol_count = volumes['volume%s.subvol_count' % index]
        volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
        volume.disperse_count = volumes['volume%s.disperse_count' % index]
        volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
        volume.quorum_status = volumes['volume%s.quorum_status' % index]
        volume.snapd_status = volumes[
            'volume%s.snapd_svc.online_status' % index]
        volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
        # The previous profiling value comes from the stored volume when
        # one exists.
        if NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).exists():
            existing_vol = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
                volume_profiling_old_value != volume_profiling_new_value:
            # Raise alert for the same value change
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" % (
                       volumes['volume%s.name' % index],
                       cluster_short_name,
                       volume_profiling_old_value,
                       volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            # NOTE(review): entity_type is RESOURCE_TYPE_BRICK although
            # this event is about a volume - confirm.
            event_utils.emit_event(
                "volume_profiling_status",
                volume_profiling_new_value,
                msg,
                instance,
                'INFO',
                tags={
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": volumes[
                        'volume%s.name' % index
                    ]
                }
            )
        volume.save(ttl=sync_ttl)
        # Save the default values of volume options
        # NOTE(review): range(1, count) stops at count - 1; confirm
        # whether the option with index == count is meant to be skipped.
        vol_opt_dict = {}
        for opt_count in \
                range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[
                vol_options[
                    'volume%s.options.key%s' % (index, opt_count)
                ]
            ] = vol_options[
                'volume%s.options.value%s' % (index, opt_count)
            ]
        volume.options = vol_opt_dict
        volume.save()
    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)
    b_index = 1
    # ipv4 address of current node
    # NOTE(review): network_ip is collected here but not read again in
    # this function.
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                               "any ipv4 networks for node"
                               " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    # Walk brick1, brick2, ... until `volumes` has no key for the next
    # brick; the resulting KeyError ends the loop.
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                # Skip bricks hosted on another node or on a node that
                # is not part of this cluster.
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except(TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            # Brick name is "<fqdn>:<brick path with '/' replaced by '_'>".
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s' '.path' % (
                index,
                b_index
            )].split(":")[-1].replace("/", "_")
            # Raise alerts if the brick path changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                        current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s"
                           ) % (
                               volumes['volume%s.brick%s' '.path' % (
                                   index,
                                   b_index
                               )],
                               volumes['volume%s.' 'name' % index],
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index,
                            b_index
                        )]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.' 'name' % index]
                              }
                    )
            except etcd.EtcdKeyNotFound:
                pass
            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"
            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )
            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)]
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )
            # Sync the brick client details
            # NOTE(review): client_count is compared with 0 without a
            # conversion - confirm the type stored in `volumes`.
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                # Walk client1, client2, ... until a key is missing.
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index
                                )
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index
                                )
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index
                                )
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index
                                )
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
def save(self, update=True, ttl=None):
    """Write this object to the central store.

    For every class whose name does not contain "Message" the in-memory
    hash is compared with the stored one first; when both match nothing
    is written (only the ttl is refreshed when one is given).

    :param update: when true, attributes that are ``None`` in memory take
        the (truthy) value currently held in the central store before the
        object is written.
    :param ttl: optional ttl applied to the object's key after the write.
    :returns: None
    """
    self.render()
    if "Message" not in self.__class__.__name__:
        try:
            # Generate current in memory object hash
            self.hash = self._hash()
            _hash_key = "/{0}/hash".format(self.value)
            _stored_hash = None
            try:
                _stored_hash = NS._int.client.read(_hash_key).value
            except etcd.EtcdKeyNotFound:
                # Nothing stored yet, so there is nothing to compare with.
                pass
            except (etcd.EtcdConnectionFailed, etcd.EtcdException):
                NS._int.reconnect()
                _stored_hash = NS._int.client.read(_hash_key).value
            if self.hash == _stored_hash:
                # No changes in stored object and current object,
                # dont save current object to central store
                if ttl:
                    etcd_utils.refresh(self.value, ttl)
                return
        except TypeError:
            # no hash for this object, save the current hash as is
            pass
    if update:
        current_obj = self.load()
        for attr, val in vars(self).iteritems():
            if isinstance(val, (types.FunctionType,
                                types.BuiltinFunctionType,
                                types.MethodType,
                                types.BuiltinMethodType,
                                types.UnboundMethodType)) or \
                    attr.startswith("_") or attr in ['value', 'list']:
                continue
            if val is None and hasattr(current_obj, attr):
                # if self.attr is None, use attr value from central
                # store (i.e. current_obj.attr)
                if getattr(current_obj, attr):
                    setattr(self, attr, getattr(current_obj, attr))
    self.updated_at = str(time_utils.now())
    for item in self.render():
        # Note: Log messages in this file have try-except blocks to run
        # in the condition when the node_agent has not been started and
        # name spaces are being created.
        try:
            Event(
                Message(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Writing %s to %s" %
                                   (item['value'], item['key'])
                    }
                )
            )
        except KeyError:
            sys.stdout.write("Writing %s to %s" %
                             (item['value'], item['key']))
        # convert list, dict (json) to python based on definitions
        _type = self._defs.get("attrs", {}).get(item['name'],
                                                {}).get("type")
        if _type:
            if _type.lower() in ['json', 'list']:
                if item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except (TypeError, ValueError) as ex:
                        # json.dumps raises TypeError for values it cannot
                        # serialise. Instances have no __name__; use the
                        # class name.
                        _msg = "Error save() attr %s for object %s" % \
                               (item['name'], self.__class__.__name__)
                        Event(
                            ExceptionMessage(
                                priority="debug",
                                publisher=NS.publisher_id,
                                payload={
                                    "message": _msg,
                                    "exception": ex
                                }
                            )
                        )
        try:
            NS._int.wclient.write(item['key'], item['value'], quorum=True)
        except (etcd.EtcdConnectionFailed, etcd.EtcdException):
            # Retry once on a fresh write connection.
            NS._int.wreconnect()
            NS._int.wclient.write(item['key'], item['value'], quorum=True)
    if ttl:
        etcd_utils.refresh(self.value, ttl)
def brick_status_alert(hostname):
    """Mark the started bricks of ``hostname`` as stopped and alert.

    Under an etcd lock on the host's brick path, every brick of the host
    whose status is BRICK_STARTED gets a "brick_status" WARNING event and
    is saved with the stopped status. Handled errors are published as an
    error event; the lock is released in all cases.

    :param hostname: fqdn of the host whose bricks are to be checked.
    :returns: None
    """
    brick_lock = None
    try:
        # fetching brick details of disconnected node
        lock_path = "clusters/%s/Bricks/all/%s" % (
            NS.tendrl_context.integration_id,
            hostname
        )
        brick_lock = etcd.Lock(NS._int.client, lock_path)
        brick_lock.acquire(blocking=True, lock_ttl=60)
        if brick_lock.is_acquired:
            host_bricks = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=hostname
            ).load_all()
            for host_brick in host_bricks:
                if host_brick.status.lower() != BRICK_STARTED:
                    continue
                # raise an alert for brick
                stopped_status = BRICK_STOPPED.title()
                alert_msg = "Brick:%s in volume:%s has %s" % (
                    host_brick.brick_path,
                    host_brick.vol_name,
                    stopped_status
                )
                alert_instance = "volume_%s|brick_%s" % (
                    host_brick.vol_name,
                    host_brick.brick_path,
                )
                alert_tags = {
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": host_brick.vol_name,
                    "node_id": host_brick.node_id,
                    "fqdn": host_brick.hostname
                }
                event_utils.emit_event(
                    "brick_status",
                    stopped_status,
                    alert_msg,
                    alert_instance,
                    'WARNING',
                    tags=alert_tags
                )
                # Update brick status as stopped
                host_brick.status = stopped_status
                host_brick.save()
            brick_lock.release()
    except (
        etcd.EtcdException,
        KeyError,
        ValueError,
        AttributeError
    ) as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Unable to raise an brick status "
                               "alert for host %s" % hostname,
                    "exception": ex
                }
            )
        )
    finally:
        # Covers the paths on which the lock was taken but not released.
        if isinstance(brick_lock, etcd.lock.Lock) and \
                brick_lock.is_acquired:
            brick_lock.release()
def _alert_top_level_job_status(job, status, message):
    """Emit a job-status alert for ``job`` unless it has a parent job."""
    if job.payload.get('parent') is not None:
        return
    alert_utils.alert_job_status(
        status,
        message,
        # The job parameters are only consulted when the node's own
        # tendrl context does not carry the value.
        integration_id=NS.tendrl_context.integration_id or
        job.payload['parameters'].get(
            'TendrlContext.integration_id'
        ),
        cluster_name=NS.tendrl_context.cluster_name or
        job.payload['parameters'].get(
            'TendrlContext.cluster_name'
        )
    )


def process_job(job):
    """Claim and run one queued job if this node is eligible for it.

    Steps, in order:

    1. Return early when the job is already locked, "finished" or
       "processing".
    2. On nodes tagged ``tendrl/monitor``, for jobs whose ``timeout`` key is
       "yes": stamp a ``valid_until`` epoch (now + 10 minutes) when missing,
       or fail the job once that epoch has passed while it is still "new".
    3. If the job type matches ``NS.type``, its status is "new" and at least
       one of its tags is among this node's tags, atomically move the status
       from "new" to "processing", write the lock info and run the flow
       named by ``payload['run']``.
    4. Mark the job "finished" or "failed" and alert for jobs without a
       parent.

    :param job: etcd result whose ``key`` ends with the job id
    :raises FlowExecutionFailedError: when the final status cannot be
        written because the job is no longer in "processing" state
    """
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()

    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Check job not already "finished", or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    _timeout = None
    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
            _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(
                _job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)
        if _valid_until:
            _now_epoch = (time_utils.now() - _epoch_start).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    # No longer "new": fall through to normal handling.
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    _alert_top_level_job_status(
                        job,
                        "failed",
                        "Job timed out (job_id: %s)" % jid
                    )
                    return
        else:
            # First sighting of this job: record its deadline.
            _now_plus_10 = \
                time_utils.now() + datetime.timedelta(minutes=10)
            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if not (job.payload["type"] == NS.type and job.status == "new"):
        return

    # Job routing
    # Flows created by tendrl-api use 'tags' from flow
    # definition to target jobs
    _tag_match = any(
        flow_tag in NS.node_context.tags
        for flow_tag in (job.payload.get("tags") or [])
    )
    if not _tag_match:
        _job_tags = ", ".join(job.payload.get("tags", []))
        _msg = "Node (%s)(type: %s)(tags: %s) will not " \
               "process job-%s (tags: %s)" % \
               (NS.node_context.node_id, NS.type,
                NS.node_context.tags, jid,
                _job_tags)
        logger.log(
            "info",
            NS.publisher_id,
            {"message": _msg}
        )
        return

    job_status_key = "/queue/%s/status" % job.job_id
    job_lock_key = "/queue/%s/locked_by" % job.job_id
    try:
        lock_info = dict(node_id=NS.node_context.node_id,
                         fqdn=NS.node_context.fqdn,
                         tags=NS.node_context.tags,
                         type=NS.type)
        # Compare-and-swap on the status is what actually claims the job.
        etcd_utils.write(job_status_key, "processing",
                         prevValue="new")
        etcd_utils.write(job_lock_key,
                         json.dumps(lock_info))
    except etcd.EtcdCompareFailed:
        # job is already being processed by some tendrl
        # agent
        return

    the_flow = None
    try:
        current_ns, flow_name, obj_name = \
            _extract_fqdn(job.payload['run'])

        if obj_name:
            runnable_flow = current_ns.ns.get_obj_flow(
                obj_name, flow_name)
        else:
            runnable_flow = current_ns.ns.get_flow(flow_name)

        the_flow = runnable_flow(
            parameters=job.payload['parameters'],
            job_id=job.job_id
        )
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Processing Job %s" % job.job_id},
            job_id=job.job_id,
            flow_id=the_flow.parameters['flow_id']
        )
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Running Flow %s" % job.payload['run']},
            job_id=job.job_id,
            flow_id=the_flow.parameters['flow_id']
        )
        the_flow.run()
        try:
            etcd_utils.write(job_status_key,
                             "finished",
                             prevValue="processing")
        except etcd.EtcdCompareFailed:
            # This should not happen!
            _msg = "Cannot mark job as 'finished', " \
                   "current job status invalid"
            raise FlowExecutionFailedError(_msg)

        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Job (%s): Finished "
                        "Flow %s" % (
                            job.job_id,
                            job.payload['run'])},
            job_id=job.job_id,
            flow_id=the_flow.parameters['flow_id'],
        )
        _alert_top_level_job_status(
            job,
            "finished",
            "Job finished successfully (job_id: %s)" % job.job_id
        )
    except (FlowExecutionFailedError,
            AtomExecutionFailedError,
            Exception) as e:
        # format_exc() formats the exception currently being handled; its
        # first positional parameter is ``limit``, so the exception object
        # must not be passed to it.
        _trace = traceback.format_exc()
        _msg = "Failure in Job %s Flow %s with error:" % \
               (job.job_id, job.payload['run'])
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": _msg + _trace,
                         "exception": e
                         }
            )
        )
        if the_flow:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": _msg + "\n" + _trace},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
        else:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": _msg + "\n" + _trace}
            )

        try:
            etcd_utils.write(job_status_key,
                             "failed",
                             prevValue="processing")
        except etcd.EtcdCompareFailed:
            # This should not happen!
            _msg = "Cannot mark job as 'failed', current " \
                   "job status invalid"
            raise FlowExecutionFailedError(_msg)
        else:
            job = job.load()
            job.errors = _trace
            _alert_top_level_job_status(
                job,
                "failed",
                "Job failed (job_id: %s)" % job.job_id
            )
            job.save()
def sync():
    """Run the SDS discovery plugins and record the detected cluster.

    For the first plugin that reports a non-empty ``detected_cluster_id``:

    * resolve the integration id through the
      ``indexes/detected_cluster_id_to_integration_id/<id>`` key. The
      current id is reused when the detected cluster changed; otherwise a
      new uuid is claimed, or the already-stored one is read back;
    * save the details into ``NS.tendrl_context``;
    * add the ``detected_cluster/...`` and ``tendrl/integration/...`` tags
      to the node context;
    * save a ``DetectedCluster`` object and, unless the cluster is already
      managed, mark the ``Cluster`` object as not managed.

    No exception is raised to the caller: failures are reported as
    ``ExceptionMessage`` events.
    """
    try:
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": "Running SDS detection"}
            )
        )
        try:
            sds_discovery_manager = sds_manager.SDSDiscoveryManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Failed to init SDSDiscoveryManager.",
                        "exception": ex
                    }
                )
            )
            return

        # Execute the SDS discovery plugins and tag the nodes with data
        for plugin in sds_discovery_manager.get_available_plugins():
            sds_details = plugin.discover_storage_system()
            # Test for an empty/None result BEFORE probing its keys: a
            # membership test on None raises TypeError, which would abort
            # discovery for all remaining plugins.
            if not sds_details:
                continue
            if sds_details.get('detected_cluster_id', "") == "":
                continue

            try:
                dc = NS.tendrl.objects.DetectedCluster().load()
                dc_changed = False
                if dc.detected_cluster_id:
                    if dc.detected_cluster_id != sds_details.get(
                            'detected_cluster_id'):
                        dc_changed = True
                else:
                    time.sleep(3)

                integration_index_key = \
                    "indexes/detected_cluster_id_to_integration_id/" \
                    "%s" % sds_details['detected_cluster_id']
                try:
                    if dc_changed:
                        integration_id = \
                            NS.tendrl_context.integration_id
                        NS._int.wclient.write(integration_index_key,
                                              integration_id)
                    else:
                        integration_id = str(uuid.uuid4())
                        # prevExist=False: only the first writer may claim
                        # the index entry.
                        NS._int.wclient.write(integration_index_key,
                                              integration_id,
                                              prevExist=False)
                except etcd.EtcdAlreadyExist:
                    if not dc_changed:
                        # The index entry exists already: adopt its id.
                        integration_id = NS._int.client.read(
                            integration_index_key).value
                finally:
                    NS.tendrl_context.integration_id = integration_id
                    NS.tendrl_context.cluster_id = sds_details.get(
                        'detected_cluster_id')
                    NS.tendrl_context.cluster_name = sds_details.get(
                        'detected_cluster_name')
                    NS.tendrl_context.sds_name = sds_details.get(
                        'pkg_name')
                    NS.tendrl_context.sds_version = sds_details.get(
                        'pkg_version')
                    NS.tendrl_context.save()

                NS.node_context = NS.node_context.load()
                integration_tag = "tendrl/integration/%s" % \
                                  integration_id
                detected_cluster_tag = "detected_cluster/%s" % \
                                       sds_details['detected_cluster_id']
                NS.node_context.tags += [
                    detected_cluster_tag,
                    integration_tag
                ]
                NS.node_context.tags = list(set(NS.node_context.tags))
                NS.node_context.save()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                NS.tendrl.objects.DetectedCluster(
                    detected_cluster_id=sds_details.get(
                        'detected_cluster_id'),
                    detected_cluster_name=sds_details.get(
                        'detected_cluster_name'),
                    sds_pkg_name=sds_details.get('pkg_name'),
                    sds_pkg_version=sds_details.get('pkg_version'),
                ).save()

                if _cluster.is_managed == "yes":
                    # Skips the break below, so the next plugin is tried.
                    continue
                else:
                    _cluster.is_managed = "no"
                    _cluster.save()
            except (etcd.EtcdException, KeyError) as ex:
                Event(
                    ExceptionMessage(
                        priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message": "Failed SDS detection",
                            "exception": ex
                        }
                    )
                )
            break
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    # str(ex) rather than ex.message: the attribute is
                    # deprecated and does not exist on Python 3, where it
                    # would raise inside this handler.
                    "message": "node_sync "
                               "SDS detection failed: " + str(ex),
                    "exception": ex
                }
            )
        )
def sync(sync_ttl=None):
    """Sync this node's service state, tags and the tag indexes in etcd.

    * Saves a ``Service`` object for every entry of ``TENDRL_SERVICES`` and
      collects the tags of the running ones (``tendrl/server`` also yields
      ``tendrl/monitor``).
    * On nodes without the monitor tag that belong to a cluster, tries to
      claim the ``provisioner/<integration_id>`` tag by creating its index
      key. A node that newly became provisioner of a managed cluster queues
      a ``tendrl.flows.ConfigureMonitoring`` job for itself.
    * Merges the collected tags into the node context and saves it when the
      tag list changed.
    * Ensures this node id is listed under ``/indexes/tags/<tag>`` for each
      of its tags, and drops a provisioner tag owned by another node.

    No exception is raised to the caller: failures are reported as an
    ``ExceptionMessage`` event.

    :param sync_ttl: base TTL; single-owner index keys are refreshed with
        ``sync_ttl + 50``. When falsy, no TTL refresh is performed.
    """
    try:
        tags = []
        # update node agent service details
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, Updating Service data"}
        )
        for service in TENDRL_SERVICES:
            s = NS.tendrl.objects.Service(service=service)
            if s.running:
                service_tag = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['tags'][service.strip("@*")]
                tags.append(service_tag)
                if service_tag == "tendrl/server":
                    tags.append("tendrl/monitor")
            s.save()

        if "tendrl/monitor" not in tags and \
                NS.tendrl_context.integration_id:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            # Try to claim orphan "provisioner_%integration_id" tag
            _tag = "provisioner/%s" % _cluster.integration_id
            _is_new_provisioner = False
            NS.node_context = NS.tendrl.objects.NodeContext().load()
            if _tag not in NS.node_context.tags:
                try:
                    _index_key = "/indexes/tags/%s" % _tag
                    _node_id = json.dumps([NS.node_context.node_id])
                    etcd_utils.write(_index_key, _node_id, prevExist=False)
                    # sync_ttl defaults to None; guard the arithmetic the
                    # same way the index refreshes further down do, so the
                    # claim is not abandoned half-way by a TypeError.
                    if sync_ttl:
                        etcd_utils.refresh(_index_key, sync_ttl + 50)
                    tags.append(_tag)
                    _is_new_provisioner = True
                except etcd.EtcdAlreadyExist:
                    # Another node already holds the provisioner tag.
                    pass

        # updating node context with latest tags
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, updating node context "
                        "data with tags"}
        )
        NS.node_context = NS.tendrl.objects.NodeContext().load()
        current_tags = list(NS.node_context.tags)
        tags += current_tags
        NS.node_context.tags = list(set(tags))
        NS.node_context.tags.sort()
        current_tags.sort()
        if NS.node_context.tags != current_tags:
            NS.node_context.save()

        if "tendrl/monitor" not in tags and \
                NS.tendrl_context.integration_id:
            _cluster = _cluster.load()
            if _is_new_provisioner and _cluster.is_managed == "yes":
                _msg = "node_sync, NEW provisioner node found! "\
                    "re-configuring monitoring (job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % NS.node_context.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id':
                            NS.tendrl_context.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(job_id=_job_id,
                                      status="new",
                                      payload=payload).save()
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {"message": _msg % _job_id}
                )

        # Update /indexes/tags/:tag = [node_ids]
        # Iterate over a snapshot: stale provisioner tags are removed from
        # NS.node_context.tags inside the loop, and removing from the list
        # being iterated would skip the tag that follows.
        for tag in list(NS.node_context.tags):
            index_key = "/indexes/tags/%s" % tag
            _node_ids = []
            try:
                _node_ids = etcd_utils.read(index_key).value
                _node_ids = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                pass

            if _node_ids:
                if "provisioner" in tag:
                    # Check if this is a stale provisioner
                    if NS.node_context.node_id != _node_ids[0]:
                        NS.node_context.tags.remove(tag)
                        NS.node_context.save()
                        continue
                if NS.node_context.node_id in _node_ids:
                    # Already indexed: only keep a single-owner key alive.
                    if sync_ttl and len(_node_ids) == 1:
                        etcd_utils.refresh(index_key, sync_ttl + 50)
                    continue
                else:
                    _node_ids += [NS.node_context.node_id]
            else:
                _node_ids = [NS.node_context.node_id]
            _node_ids = list(set(_node_ids))
            etcd_utils.write(index_key, json.dumps(_node_ids))
            if sync_ttl and len(_node_ids) == 1:
                etcd_utils.refresh(index_key, sync_ttl + 50)
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, Updating detected "
                        "platform"}
        )
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    # str(ex) rather than ex.message: the attribute is
                    # deprecated and does not exist on Python 3, where it
                    # would raise inside this handler.
                    "message": "node_sync service and indexes "
                               "sync failed: " + str(ex),
                    "exception": ex
                }
            )
        )
def load_definition(self):
    """Return the definition of this flow from its namespace.

    When the instance has an ``obj`` attribute the flow is looked up as an
    object flow (``get_obj_flow_definition``), otherwise as a namespace
    level flow (``get_flow_definition``). In both cases ``self.to_str`` is
    set to the dotted path of the flow, whether or not the lookup succeeds.

    :raises Exception: when the namespace has no definition for the flow
        (the underlying ``KeyError`` is reported as a debug event first)
    """
    flow_cls = self.__class__.__name__

    if not hasattr(self, "obj"):
        # Namespace-level flow.
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Load definitions for namespace.%s.flows.%s"
                               % (self._ns.ns_src, flow_cls)
                }
            )
        )
        try:
            definition = self._ns.get_flow_definition(flow_cls)
        except KeyError as ex:
            msg = "Could not find definitions for namespace.%s.flows.%s" % (
                self._ns.ns_src, flow_cls)
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Error", "exception": ex}
                )
            )
            Event(
                Message(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": msg}
                )
            )
            raise Exception(msg)
        finally:
            self.to_str = "%s.flows.%s" % (self._ns.ns_name, flow_cls)
        return definition

    # Flow attached to an object of the namespace.
    owner_name = self.obj.__name__
    Event(
        Message(
            priority="debug",
            publisher=NS.publisher_id,
            payload={
                "message": "Load definitions for namespace.%s."
                           "objects.%s.flows.%s"
                           % (self._ns.ns_src, owner_name, flow_cls)
            }
        )
    )
    try:
        definition = self._ns.get_obj_flow_definition(owner_name, flow_cls)
    except KeyError as ex:
        msg = "Could not find definitions for " \
              "namespace.%s.objects.%s.flows.%s" % (
                  self._ns.ns_src, owner_name, flow_cls)
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": "Error", "exception": ex}
            )
        )
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": msg}
            )
        )
        raise Exception(msg)
    finally:
        self.to_str = "%s.objects.%s.flows.%s" % (
            self._ns.ns_name, owner_name, flow_cls)
    return definition