def run(self):
    retry_count = 0
    while True:
        _cluster = None
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=self.parameters[
                    "TendrlContext.integration_id"]
            ).load()
        except etcd.EtcdKeyNotFound:
            # pass and continue the time out below
            pass
        # guard against a failed load leaving _cluster as None
        if _cluster and _cluster.exists() and _cluster.is_managed == "yes":
            return True
        retry_count += 1
        time.sleep(1)
        if retry_count == 600:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Cluster data sync still incomplete. "
                               "Timing out"
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                integration_id=NS.tendrl_context.integration_id
            )
            raise AtomExecutionFailedError(
                "Cluster data sync still incomplete. Timing out"
            )
def update_clusters_alert_count(self):
    cluster_ids = central_store_util.get_cluster_ids()
    for cluster_id in cluster_ids:
        try:
            crit_alerts, warn_alerts = parse_resource_alerts(
                None,
                alerting_consts.CLUSTER,
                cluster_id=cluster_id,
            )
            ClusterAlertCounters(
                warn_count=len(warn_alerts),
                crit_count=len(crit_alerts),
                cluster_id=cluster_id
            ).save(update=False)
        except AlertingError as ex:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Failed to update cluster alert counter."
                               " Exception %s" % str(ex)
                }
            )
            continue
def __generate_executable_module(self):
    modname = os.path.basename(self.module_path)
    modname = os.path.splitext(modname)[0]
    try:
        (module_data, module_style, shebang) = \
            module_common.modify_module(
                modname,
                self.module_path,
                self.argument_dict,
                None,
                task_vars={}
            )
    except Exception as e:
        logger.log(
            "debug",
            self.publisher_id,
            {"message": "Could not generate ansible executable data "
                        "for module: %s. Error: %s" %
                        (self.module_path, str(e))},
            node_id=self.node_id
        )
        raise AnsibleExecutableGenerationFailed(
            module_path=self.module_path,
            err=str(e)
        )
    return module_data
def __init__(self, module_path, publisher_id=None, node_id=None, **kwargs):
    self.module_path = modules.__path__[0] + "/" + module_path
    self.publisher_id = publisher_id or NS.publisher_id
    self.node_id = node_id or NS.node_context.node_id
    if not os.path.isfile(self.module_path):
        logger.log(
            "debug",
            self.publisher_id,
            {"message": "Module path: %s does not exist" %
                        self.module_path},
            node_id=self.node_id
        )
        raise AnsibleModuleNotFound(module_path=self.module_path)
    if kwargs == {}:
        logger.log(
            "debug",
            self.publisher_id,
            {"message": "Empty argument dictionary"},
            node_id=self.node_id
        )
        raise ValueError
    else:
        self.argument_dict = kwargs
    self.argument_dict['_ansible_selinux_special_fs'] = \
        ['nfs', 'vboxsf', 'fuse', 'ramfs']
def node_wise_brick_count(cluster_detail):
    local_metrics = [
        "clusters.$integration_id.nodes.$node_name.brick_count."
        "total.$brick_total_count",
        "clusters.$integration_id.nodes.$node_name.brick_count."
        "down.$brick_down_count",
        "clusters.$integration_id.nodes.$node_name.brick_count."
        "up.$brick_up_count"
    ]
    metrics = []
    for metric in local_metrics:
        metric = metric.replace("$integration_id",
                                str(cluster_detail["integration_id"]))
        for node in cluster_detail["Node"]:
            try:
                local_metric = metric.replace(
                    "$node_name", node["fqdn"].replace(".", "_"))
                local_metric = local_metric.replace(
                    local_metric.rsplit(".", 1)[1],
                    str(node[str(
                        local_metric.rsplit(".", 1)[1].replace("$", ""))]))
                metrics.append(copy.deepcopy(local_metric))
            except (AttributeError, KeyError) as ex:
                logger.log(
                    "debug",
                    NS.get("publisher_id", None),
                    {
                        'message': "Failed to create brick metric "
                                   "for Node: {0} Metric: {1}".format(
                                       node, metric) + str(ex)
                    })
    return metrics
def update_nodes_alert_count(self):
    node_ids = central_store_util.get_node_ids()
    for node_id in node_ids:
        try:
            crit_alerts, warn_alerts = parse_resource_alerts(
                None,
                alerting_consts.NODE,
                node_id=node_id,
            )
            NodeAlertCounters(
                warn_count=len(warn_alerts),
                crit_count=len(crit_alerts),
                node_id=node_id
            ).save(update=False)
        except AlertingError as ex:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Failed to update node alert counter."
                               " Exception %s" % str(ex)
                }
            )
            continue
def _derive_cluster_id(self):
    cmd = subprocess.Popen(
        "gluster pool list",
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE
    )
    out, err = cmd.communicate()
    if err or out is None or "Connection failed" in out:
        _msg = "Could not detect SDS:Gluster installation"
        logger.log("debug", NS.publisher_id, {"message": _msg})
        return "", {}
    lines = out.split('\n')[1:]
    gfs_peers_uuid = []
    gfs_peer_data = {}
    for line in lines:
        if line != '':
            peer = line.split()
            # Use the gluster generated pool UUID as unique key
            gfs_peers_uuid.append(peer[0])
            gfs_peer_data[peer[0]] = {
                "connected": peer[-1],
                "hostname": peer[-2]
            }
    gfs_peers_uuid.sort()
    return (hashlib.sha256("".join(gfs_peers_uuid)).hexdigest(),
            gfs_peer_data)
def run(self):
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )
    while not self._complete.is_set():
        _job_sync_interval = 5
        NS.node_context = NS.node_context.load()
        if "tendrl/monitor" in NS.node_context.tags:
            _job_sync_interval = 3
        time.sleep(_job_sync_interval)
        try:
            jobs = etcd_utils.read("/queue")
        except etcd.EtcdKeyNotFound:
            continue
        for job in jobs.leaves:
            # Check job not already locked by some agent
            jid = job.key.split('/')[-1]
            job_lock_key = "/queue/%s/locked_by" % jid
            try:
                _locked_by = etcd_utils.read(job_lock_key).value
                if _locked_by:
                    continue
            except etcd.EtcdKeyNotFound:
                pass
            _job_thread = threading.Thread(
                target=process_job,
                args=(jid,)  # args must be a tuple, not a bare (jid)
            )
            _job_thread.daemon = True
            _job_thread.start()
            _job_thread.join()
def __init__(self, ns_name="tendrl", ns_src="tendrl.commons"): super(TendrlNS, self).__init__() if not hasattr(__builtin__, "NS"): setattr(__builtin__, "NS", maps.NamedDict()) setattr(NS, "_int", maps.NamedDict()) NS._int.wreconnect = cs_utils.wreconnect NS._int.reconnect = cs_utils.reconnect NS._int.watchers = dict() ''' Note: Log messages in this file have try-except blocks to run in the condition when the node_agent has not been started and name spaces are being created. ''' logger.log("info", NS.get("publisher_id", None), {'message': "Creating namespace.%s from source %s" % (ns_name, ns_src)}) self.ns_name = ns_name self.ns_src = ns_src self._create_ns() self.current_ns = self._get_ns() logger.log("info", NS.get("publisher_id", None), {'message': "namespace.%s created!" % self.ns_name}) self._register_subclasses_to_ns() self.setup_definitions() self._validate_ns_definitions() self.setup_common_objects()
def georep_status(cluster_detail):
    local_metrics = [
        "clusters.$integration_id.georep.total.$total",
        "clusters.$integration_id.georep.up.$up",
        "clusters.$integration_id.georep.down.$down",
        "clusters.$integration_id.georep.partial.$partial",
        "clusters.$integration_id.georep.stopped.$stopped",
        "clusters.$integration_id.georep.paused.$paused",
        "clusters.$integration_id.georep.created.$created"
    ]
    metrics = []
    for metric in local_metrics:
        try:
            local_metric = metric.replace(
                "$integration_id", str(cluster_detail["integration_id"]))
            local_metric = local_metric.replace(
                local_metric.rsplit(".", 1)[1],
                str(cluster_detail["geo_rep"][str(
                    local_metric.rsplit(".", 1)[1].replace("$", ""))]))
            metrics.append(copy.deepcopy(local_metric))
        except (AttributeError, KeyError) as ex:
            logger.log(
                "debug",
                NS.get("publisher_id", None),
                {
                    'message': "Failed to create cluster metric {0} "
                               "for cluster {1}".format(
                                   metric,
                                   str(cluster_detail["integration_id"])
                               ) + str(ex)
                })
    return metrics
def node_wise_brick_status(cluster_detail):
    metric = "clusters.$integration_id.nodes.$node_name." \
             "bricks.$brick_name.status.$status"
    metrics = []
    for brick in cluster_detail["Brick"]:
        try:
            local_metric = metric.replace(
                "$integration_id", str(cluster_detail["integration_id"]))
            local_metric = local_metric.replace(
                "$node_name", brick["host_name"])
            local_metric = local_metric.replace(
                "$brick_name", brick["brick_name"].replace("/", "|"))
            local_metric = local_metric.replace(
                local_metric.rsplit(".", 1)[1],
                str(brick[str(
                    local_metric.rsplit(".", 1)[1].replace("$", ""))]))
            metrics.append(copy.deepcopy(local_metric))
        except (AttributeError, KeyError) as ex:
            logger.log(
                "debug",
                NS.get("publisher_id", None),
                {
                    'message': "Failed to create brick metric {0} "
                               "for Brick: {1}".format(metric, brick) +
                               str(ex)
                })
    return metrics
def run(self):
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )
    while not self._complete.is_set():
        _job_sync_interval = 5
        NS.node_context = NS.node_context.load()
        NS.tendrl_context = NS.tendrl_context.load()
        if "tendrl/monitor" not in NS.node_context.tags:
            if NS.tendrl_context.integration_id is None or \
                    NS.node_context.fqdn is None:
                time.sleep(_job_sync_interval)
                continue
        if "tendrl/monitor" in NS.node_context.tags:
            _job_sync_interval = 3
        time.sleep(_job_sync_interval)
        try:
            jobs = NS.tendrl.objects.Job().load_all()
        except etcd.EtcdKeyNotFound:
            continue
        for job in jobs:
            # Check job not already locked by some agent
            if job.locked_by or job.job_id in [None, '']:
                continue
            _job_thread = threading.Thread(
                target=process_job,
                args=(job.job_id,)
            )
            _job_thread.daemon = True
            _job_thread.start()
            _job_thread.join()
def run(self):
    if NS.gdeploy_plugin.rebalance_volume(
            self.parameters.get('Volume.volname'),
            "stop",
            force=self.parameters.get('Volume.force')):
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Stopped the rebalance for volume %s" %
                           self.parameters['Volume.volname']
            },
            job_id=self.parameters["job_id"],
            flow_id=self.parameters["flow_id"],
            integration_id=NS.tendrl_context.integration_id
        )
    else:
        logger.log(
            "error",
            NS.publisher_id,
            {
                "message": "Failed to stop rebalance for volume %s" %
                           self.parameters['Volume.volname']
            },
            job_id=self.parameters["job_id"],
            flow_id=self.parameters["flow_id"],
            integration_id=NS.tendrl_context.integration_id
        )
        return False
    return True
def get_conf():
    try:
        # Graphite and Grafana will be running on localhost
        NS.config.data["grafana_host"] = "127.0.0.1"
        NS.config.data["grafana_port"] = 3000
        # Default values for graphite datasource
        NS.config.data["datasource_type"] = "graphite"
        NS.config.data["basicAuth"] = False
        # Grafana related configs
        NS.config.data["datasource"] = []
        NS.config.data["credentials"] = (
            NS.config.data["credentials"]["user"],
            NS.config.data["credentials"]["password"])
    except exceptions.InvalidConfigurationException:
        err = exceptions.InvalidConfigurationException(
            "Error in loading configuration"
        )
        logger.log(
            "info",
            NS.get("publisher_id", None),
            {'message': str(err)}
        )
        raise err
def _setup_gluster_native_message_reciever(self):
    service = svc.Service("glustereventsd")
    message, success = service.start()
    gluster_eventsd = svc_stat.ServiceStatus("glustereventsd")
    if not gluster_eventsd.status():
        if not success:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "glustereventsd could"
                               " not be started: %s" % message
                }
            )
            return False
    url = "http://{0}:{1}{2}".format(self.host, str(self.port), self.path)
    cmd = cmd_utils.Command('gluster-eventsapi webhook-add %s' % url)
    out, err, rc = cmd.run()
    if rc != 0:
        severity = "debug" if "Webhook already exists" in err else "error"
        logger.log(
            severity,
            NS.publisher_id,
            {
                "message": "could not add webhook"
                           " for glustereventsd. {0}: {1}".format(
                               severity, err)
            }
        )
    return True
def run(self):
    if not self._setup_gluster_native_message_reciever():
        logger.log(
            "error",
            NS.publisher_id,
            {"message": "gluster native message receiver setup failed"}
        )
        return
    # Enable WSGI access logging via Paste
    app_logged = TransLogger(app)
    # Mount the WSGI callable object (app) on the root directory
    cherrypy.tree.graft(app_logged, '/')
    # Set the configuration of the web server
    cherrypy.config.update({
        'engine.autoreload_on': False,
        'log.screen': True,
        'server.socket_port': self.port,
        'server.socket_host': self.host,
        'log.access_file': '',
        'log.error_file': ''
    })
    # Start the CherryPy WSGI web server
    cherrypy.engine.start()
    cherrypy.engine.block()
def get_lvs():
    _lvm_cmd = ("lvm vgs --unquoted --noheading --nameprefixes "
                "--separator $ --nosuffix --units m -o lv_uuid,"
                "lv_name,data_percent,pool_lv,lv_attr,lv_size,"
                "lv_path,lv_metadata_size,metadata_percent,vg_name")
    cmd = cmd_utils.Command(_lvm_cmd, True)
    out, err, rc = cmd.run()
    if rc != 0:
        logger.log("debug", NS.publisher_id, {"message": str(err)})
        return None
    d = {}
    if str(out) != '':
        try:
            out = out.split('\n')
            lst = map(
                lambda x: dict(x),
                map(lambda x: [e.split('=') for e in x],
                    map(lambda x: x.strip().split('$'), out)))
            for i in lst:
                if i['LVM2_LV_ATTR'][0] == 't':
                    k = "%s/%s" % (i['LVM2_VG_NAME'], i['LVM2_LV_NAME'])
                else:
                    k = os.path.realpath(i['LVM2_LV_PATH'])
                d.update({k: i})
        except (ValueError, KeyError) as ex:
            # KeyError is raised when an attribute name changes in the
            # lvm output; ValueError is raised when the output format
            # does not match what the parsing logic expects
            logger.log("debug", NS.publisher_id, {"message": str(ex)})
    return d
def alert_job_status(curr_value, msg, integration_id=None,
                     cluster_name=None):
    alert = {}
    alert['source'] = NS.publisher_id
    alert['classification'] = 'cluster'
    alert['pid'] = os.getpid()
    alert['time_stamp'] = tendrl_now().isoformat()
    alert['alert_type'] = 'STATUS'
    severity = "INFO"
    if curr_value.lower() == "failed":
        severity = "WARNING"
    alert['severity'] = severity
    alert['resource'] = 'job_status'
    alert['current_value'] = curr_value
    alert['tags'] = dict(
        message=msg,
        integration_id=integration_id or NS.tendrl_context.integration_id,
        cluster_name=cluster_name or NS.tendrl_context.cluster_name,
        sds_name=NS.tendrl_context.sds_name,
        fqdn=NS.node_context.fqdn
    )
    alert['node_id'] = NS.node_context.node_id
    if not NS.node_context.node_id:
        return
    logger.log(
        "notice",
        "alerting",
        {'message': json.dumps(alert)}
    )
def volume_count(cluster_detail):
    local_metrics = [
        "clusters.$integration_id.volume_count.total.$volume_total_count",
        "clusters.$integration_id.volume_count.down.$volume_down_count",
        "clusters.$integration_id.volume_count.up.$volume_up_count",
        "clusters.$integration_id.volume_count.partial."
        "$volume_partial_count",
        "clusters.$integration_id.volume_count.degraded."
        "$volume_degraded_count"
    ]
    metrics = []
    for metric in local_metrics:
        try:
            local_metric = metric.replace(
                "$integration_id", str(cluster_detail["integration_id"]))
            local_metric = local_metric.replace(
                local_metric.rsplit(".", 1)[1],
                str(cluster_detail[str(
                    local_metric.rsplit(".", 1)[1].replace("$", ""))]))
            metrics.append(copy.deepcopy(local_metric))
        except (AttributeError, KeyError) as ex:
            logger.log(
                "debug",
                NS.get("publisher_id", None),
                {
                    'message': "Failed to create cluster metric "
                               "{0} for cluster {1}".format(
                                   metric,
                                   str(cluster_detail["integration_id"])
                               ) + str(ex)
                })
    return metrics
def run(self):
    retry_count = 0
    while True:
        _cluster = None
        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=self.parameters[
                    "TendrlContext.integration_id"
                ]
            ).load()
        except etcd.EtcdKeyNotFound:
            # pass and continue the time out below
            pass
        # guard against a failed load leaving _cluster as None
        if _cluster and _cluster.exists() and _cluster.is_managed == "yes":
            return True
        retry_count += 1
        time.sleep(1)
        if retry_count == 600:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Cluster data sync still incomplete. "
                            "Timing out"},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                integration_id=NS.tendrl_context.integration_id
            )
            return False
def volume_wise_brick_count(cluster_detail):
    local_metrics = [
        "clusters.$integration_id.volumes.$volume_name."
        "brick_count.total.$total",
        "clusters.$integration_id.volumes.$volume_name."
        "brick_count.down.$down",
        "clusters.$integration_id.volumes.$volume_name."
        "brick_count.up.$up"
    ]
    metrics = []
    for metric in local_metrics:
        metric = metric.replace("$integration_id",
                                str(cluster_detail["integration_id"]))
        for volume in cluster_detail["Volume"]:
            try:
                local_metric = metric.replace("$volume_name",
                                              volume["name"])
                local_metric = local_metric.replace(
                    local_metric.rsplit(".", 1)[1],
                    str(cluster_detail["volume_level_brick_count"][str(
                        volume["name"])][str(
                            local_metric.rsplit(".", 1)[1].replace(
                                "$", ""))]))
                metrics.append(copy.deepcopy(local_metric))
            except (AttributeError, KeyError) as ex:
                logger.log(
                    "debug",
                    NS.get("publisher_id", None),
                    {
                        'message': "Failed to create volume metric {0} "
                                   "for Volume: {1}".format(
                                       metric, volume["name"]) + str(ex)
                    })
    return metrics
def _getBrickList(self, brick_count, sub_vol_len, volume_id):
    try:
        result = NS._int.client.read(
            "clusters/%s/Volumes/%s/Bricks" %
            (NS.tendrl_context.integration_id, volume_id),
        )
        bricks = result.leaves
    except etcd.EtcdKeyNotFound:
        logger.log(
            "error",
            NS.publisher_id,
            {
                "message": "Volume %s does not have Bricks directory" %
                           self.parameters['Volume.volname']
            },
            job_id=self.parameters["job_id"],
            flow_id=self.parameters["flow_id"],
            integration_id=NS.tendrl_context.integration_id,
        )
        return []
    # Order the bricks by their stored sequence number
    b_list = ["" for el in range(brick_count)]
    for el in bricks:
        result = NS._int.client.read(el.key + "/" + "sequence_number")
        b_list[int(result.value) - 1] = el.key.split("/")[-1]
    # Group the ordered bricks into sub-volumes of sub_vol_len bricks each
    brick_list = []
    for i in range(brick_count / sub_vol_len):
        sub_vol = []
        for b in b_list[i * sub_vol_len:(i + 1) * sub_vol_len]:
            sub_vol.append(b)
        brick_list.append(sub_vol)
    return brick_list
def check_service_status(self, services, node):
    required_services_running = True
    for service_name in services:
        service = NS.tendrl.objects.Service(
            service=service_name
        )
        if not service.running:
            if len(service.error) > 0:
                msg = ("Failed to check status of %s "
                       "on %s. Error: %s" % (
                           service_name, node, service.error
                       ))
            else:
                msg = ("Service %s is not running on %s, "
                       "please check the log file to figure out the "
                       "exact problem" % (service_name, node))
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": msg
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            required_services_running = False
    return required_services_running
def load_definition(self):
    try:
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Load definitions (.yml) for "
                           "namespace.%s.objects.%s" %
                           (self._ns.ns_name, self.__class__.__name__)
            })
    except KeyError:
        sys.stdout.write(
            "Load definitions (.yml) for namespace.%s.objects"
            ".%s \n" % (self._ns.ns_name, self.__class__.__name__))
    try:
        return self._ns.get_obj_definition(self.__class__.__name__)
    except KeyError as ex:
        msg = "Could not find definitions (.yml) for " \
              "namespace.%s.objects.%s" % \
              (self._ns.ns_name, self.__class__.__name__)
        try:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "error",
                        "exception": ex
                    }))
        except KeyError:
            sys.stdout.write(str(ex) + "\n")
        try:
            logger.log("debug", NS.publisher_id, {"message": msg})
        except KeyError:
            sys.stdout.write(msg + "\n")
        raise Exception(msg)
def _setup_gluster_native_message_reciever(self):
    service = svc.Service("glustereventsd")
    message, success = service.start()
    gluster_eventsd = svc_stat.ServiceStatus("glustereventsd")
    if not gluster_eventsd.status():
        if not success:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "glustereventsd could"
                               " not be started: %s" % message
                })
            return False
    url = "http://{0}:{1}{2}".format(self.host, str(self.port), self.path)
    cmd = cmd_utils.Command('gluster-eventsapi webhook-add %s' % url)
    out, err, rc = cmd.run()
    if rc != 0:
        severity = "info" if "Webhook already exists" in err else "error"
        logger.log(
            severity,
            NS.publisher_id,
            {
                "message": "could not add webhook"
                           " for glustereventsd. {0}: {1}".format(
                               severity, err)
            })
    return True
def update_cluster_alert_count():
    cluster_alert_count = 0
    severity = ["WARNING", "CRITICAL"]
    try:
        alert_counts = get_volume_alert_counts()
        alerts = NS.tendrl.objects.ClusterAlert(
            tags={'integration_id': NS.tendrl_context.integration_id}
        ).load_all()
        for alert in alerts:
            alert.tags = json.loads(alert.tags)
            if alert.severity in severity:
                cluster_alert_count += 1
                if alert.resource in NS.gluster.objects.VolumeAlertCounters(
                )._defs['relationship'][alert.alert_type.lower()]:
                    vol_name = alert.tags.get('volume_name', None)
                    if vol_name and vol_name in alert_counts.keys():
                        alert_counts[vol_name]['alert_count'] += 1
        # Update cluster alert count
        NS.tendrl.objects.ClusterAlertCounters(
            integration_id=NS.tendrl_context.integration_id,
            alert_count=cluster_alert_count
        ).save()
        # Update volume alert count
        for volume, vol_dict in alert_counts.iteritems():
            NS.gluster.objects.VolumeAlertCounters(
                integration_id=NS.tendrl_context.integration_id,
                alert_count=vol_dict['alert_count'],
                volume_id=vol_dict['vol_id']
            ).save()
    except (etcd.EtcdException, AttributeError) as ex:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Unable to update alert count. Error: %s" % ex}
        )
def _add_metrics(objects, obj_name, metric, resource):
    metrics = []
    for obj in objects[obj_name]["attrs"]:
        if obj == "name" or obj == "fqdn":
            continue
        local_metric = copy.deepcopy(metric)
        try:
            if isinstance(resource[obj], dict):
                for key, value in resource[obj].items():
                    if key == "details":
                        continue
                    new_metric = local_metric + "." + str(obj) + \
                        "." + str(key)
                    metric_value = str(value)
                    final_metric = {new_metric: metric_value}
                    metrics.append(copy.deepcopy(final_metric))
            else:
                metric_value = str(resource[obj])
                if str(obj) == "status" and "volumes" in metric:
                    obj = "vol_status"
                local_metric = local_metric + "." + str(obj)
                final_metric = {local_metric: metric_value}
                metrics.append(copy.deepcopy(final_metric))
        # except needs a tuple of exception classes, not a set literal
        except (AttributeError, KeyError) as ex:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {'message': str(ex)}
            )
    return metrics
def run(self):
    integration_id = self.parameters['TendrlContext.integration_id']
    logger.log(
        "info",
        NS.get("publisher_id", None),
        {
            "message": "Setting cluster %s is_managed to \"no\"" %
                       integration_id
        },
        job_id=self.parameters['job_id'],
        flow_id=self.parameters['flow_id']
    )
    try:
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        _cluster.is_managed = "no"
        _cluster.save()
    except etcd.EtcdKeyNotFound:
        logger.log(
            "error",
            NS.get("publisher_id", None),
            {
                "message": "Error setting cluster %s "
                           "is_managed to \"no\"" % (
                               integration_id
                           )
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
        return False
    return True
def set_volume_count(self, cluster_data, resource_name):
    for cluster in cluster_data:
        resources = cluster[str(resource_name)]
        cluster[str(resource_name).lower() + "_total_count"] = \
            len(resources)
        up = 0
        down = 0
        partial = 0
        degraded = 0
        for resource in resources:
            try:
                if resource["state"] == 0 or resource["state"] == 1:
                    up = up + 1
                elif resource["state"] == 4:
                    partial = partial + 1
                elif resource["state"] == 3:
                    degraded = degraded + 1
                else:
                    down = down + 1
            except KeyError as ex:
                logger.log(
                    "debug",
                    NS.get("publisher_id", None),
                    {
                        'message': "Failed to set resource count "
                                   "for {0}".format(resource_name) +
                                   str(ex)
                    })
        cluster[str(resource_name).lower() + "_up_count"] = up
        cluster[str(resource_name).lower() + "_down_count"] = down
        cluster[str(resource_name).lower() + "_partial_count"] = partial
        cluster[str(resource_name).lower() + "_degraded_count"] = degraded
    return cluster_data
def set_volume_level_brick_count(self, cluster_data):
    for cluster in cluster_data:
        volume_detail = {}
        for volume in cluster["Volume"]:
            try:
                volume_detail[volume["name"]] = {
                    "total": 0,
                    "up": 0,
                    "down": 0
                }
            except (AttributeError, KeyError):
                pass
        # Increment count using volume_details
        for brick in cluster["Brick"]:
            try:
                volume_detail[str(brick["vol_name"])]["total"] = \
                    volume_detail[str(brick["vol_name"])]["total"] + 1
                if brick["status"] == 0 or brick["status"] == 1:
                    volume_detail[str(brick["vol_name"])]["up"] = \
                        volume_detail[str(brick["vol_name"])]["up"] + 1
                else:
                    volume_detail[str(brick["vol_name"])]["down"] = \
                        volume_detail[str(brick["vol_name"])]["down"] + 1
            except (AttributeError, KeyError) as ex:
                logger.log(
                    "debug",
                    NS.get("publisher_id", None),
                    {
                        'message': "Failed to set volume level "
                                   "brick count" + str(ex)
                    })
        cluster["volume_level_brick_count"] = volume_detail
    return cluster_data
def find_node_id(integration_id, fqdn):
    try:
        nodes = etcd_utils.read("clusters/%s/nodes" % integration_id)
        for node in nodes.leaves:
            node_id = node.key.split('/')[-1]
            node_context = NS.tendrl.objects.ClusterNodeContext()
            # formatting value here because render populates
            # integration_id from the namespace
            node_context.value = node_context.value.format(
                integration_id, node_id)
            if fqdn == node_context.load().fqdn:
                return node_id
        raise NodeNotFound
    except (EtcdKeyNotFound, NodeNotFound) as ex:
        if type(ex) != EtcdKeyNotFound:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to fetch fqdn for node %s" % fqdn}
            )
        else:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Node with fqdn %s not found "
                               "in cluster %s" % (fqdn, integration_id)
                })
        raise ex
def run(self):
    try:
        runner = ansible_module_runner.AnsibleRunner(
            ANSIBLE_MODULE_PATH,
            **self.attributes)
    except ansible_module_runner.AnsibleModuleNotFound:
        # Backward compat ansible <= 2.2
        runner = ansible_module_runner.AnsibleRunner(
            "core/" + ANSIBLE_MODULE_PATH,
            **self.attributes)
    try:
        result, err = runner.run()
        try:
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": "Command Execution: %s" % result})
        except KeyError:
            sys.stdout.write("Command Execution: %s \n" % result)
    except ansible_module_runner.AnsibleExecutableGenerationFailed as e:
        try:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "could not run the command %s. " %
                                   self.attributes["_raw_params"],
                        "exception": e
                    }))
        except KeyError:
            sys.stderr.write(
                "could not run the command %s. Error: %s\n" %
                (self.attributes["_raw_params"], str(e)))
        return "", str(e.message), -1
    stdout = result.get("stdout", "")
    stderr = result.get("stderr", "").encode("ascii")
    rc = result.get("rc", -1)
    return stdout, stderr, rc
def get_cluster_details(self, objects, cluster_key):
    cluster_detail = []
    for obj in objects["Cluster"]:
        if obj in ["metric", "value"]:
            continue
        resource_detail = {}
        resource_detail[str(obj)] = {}
        obj_details = objects["Cluster"][str(obj)]
        obj_key = os.path.join(cluster_key, str(obj))
        obj_attrs = obj_details["attrs"]
        for key, _ in obj_attrs.items():
            try:
                attr_key = os.path.join(obj_key, key)
                attr_data = etcd_utils.read(attr_key)
                attr_value = self.cluster_status_mapper(
                    str(attr_data.value))
                resource_detail[str(obj)][key] = copy.deepcopy(attr_value)
            except (KeyError, etcd.EtcdKeyNotFound) as ex:
                integration_id = cluster_key.split("/")[-1]
                logger.log(
                    "debug",
                    NS.get("publisher_id", None),
                    {
                        'message': "Cannot find {0} in cluster "
                                   "{1}".format(key, integration_id) +
                                   str(ex)
                    })
        if not resource_detail == {}:
            cluster_detail.append(resource_detail)
    return cluster_detail
def delete_volume(self, volume_name, host=None, force=None,
                  format_bricks=None):
    args = {}
    if host:
        args.update({"host": host})
    if force:
        args.update({"force": force})
    out, err, rc = delete_volume.delete_volume(
        volume_name,
        **args
    )
    if rc == 0:
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "gluster volume %s deleted successfully" %
                        volume_name},
            integration_id=NS.tendrl_context.integration_id
        )
    else:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Volume deletion failed for volume "
                        "%s. Details: %s" % (volume_name, out)},
            integration_id=NS.tendrl_context.integration_id
        )
        return False
    if format_bricks:
        # TODO(darshan) Call gdeploy action to clear brick
        pass
    return True
def run(self): logger.log("info", NS.publisher_id, { "message": "Checking if volume %s doesnt exist" % self.parameters['Volume.volname'] }, job_id=self.parameters["job_id"], flow_id=self.parameters["flow_id"], integration_id=NS.tendrl_context.integration_id) try: NS._int.client.read('clusters/%s/Volumes/%s' % (NS.tendrl_context.integration_id, self.parameters['Volume.vol_id'])) except etcd.EtcdKeyNotFound: logger.log("warning", NS.publisher_id, { "message": "Volume %s doesnt exist" % self.parameters['Volume.volname'] }, job_id=self.parameters["job_id"], flow_id=self.parameters["flow_id"], integration_id=NS.tendrl_context.integration_id) return True return False
def rebalance_volume(self, volume_name, action, host=None, force=None,
                     fix_layout=None):
    args = {}
    if host:
        args.update({"host": host})
    if force:
        args.update({"force": force})
    if fix_layout and action == "start":
        action = "fix-layout"
    out, err, rc = rebalance_volume.rebalance_volume(
        volume_name,
        action,
        **args
    )
    if rc == 0:
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Rebalance %s on volume %s performed "
                        "successfully" % (action, volume_name)},
            integration_id=NS.tendrl_context.integration_id
        )
    else:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Rebalance %s failed for volume "
                        "%s. Details: %s" % (action, volume_name, out)},
            integration_id=NS.tendrl_context.integration_id
        )
        return False
    return True
def gluster_provision_bricks(self, brick_dictionary, disk_type=None,
                             disk_count=None, stripe_count=None):
    out, err, rc = gluster_brick_provision.provision_disks(
        brick_dictionary,
        disk_type,
        disk_count,
        stripe_count
    )
    if rc == 0 and err == "":
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Bricks Provisioned successfully"},
            integration_id=NS.tendrl_context.integration_id
        )
    else:
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Bricks Provisioning Failed. Error %s" % (
                str(out))},
            integration_id=NS.tendrl_context.integration_id
        )
        return False
    return True
def find_grafana_pid():
    try:
        return check_output(["pidof", "grafana-server"]).strip()
    except CalledProcessError as ex:
        logger.log(
            "error",
            NS.publisher_id,
            {"message": "unable to find grafana pid"}
        )
        raise ex
def stop_volume(self, volume_name, host=None, force=None):
    args = {}
    if host:
        args.update({"host": host})
    if force:
        args.update({"force": force})
    out, err, rc = stop_volume.stop_volume(
        volume_name,
        **args
    )
    if rc == 0:
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "Volume %s stopped successfully" % volume_name},
            integration_id=NS.tendrl_context.integration_id
        )
    else:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Volume stop failed for volume "
                        "%s. Details: %s" % (volume_name, out)},
            integration_id=NS.tendrl_context.integration_id
        )
        return False
    return True
def get_volumes_details(cluster_key):
    volume_details = []
    try:
        volume_list = utils.get_resource_keys(cluster_key, "Volumes")
        for volume_id in volume_list:
            deleted = etcd_utils.read(
                cluster_key + "/Volumes/" + str(volume_id) + "/" +
                "deleted").value
            if str(deleted).lower() != "true":
                volume_data = {}
                for attr in ATTRS["volumes"]:
                    volume_data[attr] = etcd_utils.read(
                        cluster_key + "/Volumes/" + str(volume_id) +
                        "/" + attr).value
                subvolume_key = cluster_key + "/Volumes/" + str(volume_id)
                subvolume_details = get_subvolume_details(subvolume_key)
                volume_data["subvolume"] = subvolume_details
                volume_details.append(volume_data)
    except (KeyError, etcd.EtcdKeyNotFound) as ex:
        logger.log(
            "debug",
            NS.get("publisher_id", None),
            {
                'message': "Error while fetching "
                           "volume id {}".format(volume_id) + str(ex)
            })
    return volume_details
def run(self):
    retry_count = 0
    while True:
        volumes = None
        try:
            volumes = NS._int.client.read(
                "clusters/%s/Volumes" %
                NS.tendrl_context.integration_id)
        except etcd.EtcdKeyNotFound:
            # ignore as no volumes available till now
            pass
        if volumes:
            for entry in volumes.leaves:
                volume = NS.gluster.objects.Volume(
                    vol_id=entry.key.split("Volumes/")[-1]).load()
                if volume.name == self.parameters['Volume.volname']:
                    return True
        retry_count += 1
        time.sleep(1)
        if retry_count == 600:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Volume %s not reflected in tendrl"
                               " yet. Timing out" %
                               self.parameters['Volume.volname']
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
                integration_id=NS.tendrl_context.integration_id)
            raise AtomExecutionFailedError(
                "Volume %s not reflected in tendrl yet. Timing out" %
                self.parameters['Volume.volname']
            )
def format_alert(self, alert_json):
    alert = self.parse_alert_metrics(alert_json)
    try:
        alert["alert_id"] = None
        alert["node_id"] = None
        alert["time_stamp"] = alert_json['NewStateDate']
        alert["resource"] = self.representive_name
        alert['alert_type'] = constants.ALERT_TYPE
        alert['significance'] = constants.SIGNIFICANCE_HIGH
        alert['pid'] = utils.find_grafana_pid()
        alert['source'] = constants.ALERT_SOURCE
        alert['tags']['cluster_name'] = utils.find_cluster_name(
            alert['tags']['integration_id'])
        if alert_json['State'] == constants.GRAFANA_ALERT:
            if "critical" in alert_json['Name'].lower():
                alert['severity'] = \
                    constants.TENDRL_SEVERITY_MAP['critical']
            else:
                alert['severity'] = \
                    constants.TENDRL_SEVERITY_MAP['warning']
            alert['tags']['message'] = (
                "Volume utilization of %s in "
                "cluster %s is %s %% which is above %s"
                " threshold (%s %%)" % (
                    alert['tags']['volume_name'],
                    alert['tags']['integration_id'],
                    alert['current_value'],
                    alert['severity'],
                    alert['tags']['warning_max']
                )
            )
        elif alert_json['State'] == constants.GRAFANA_CLEAR_ALERT:
            # Identify which panel (critical/warning) raised the alert
            # that is being cleared
            if "critical" in alert_json['Name'].lower():
                alert['tags']['clear_alert'] = \
                    constants.TENDRL_SEVERITY_MAP['critical']
            elif "warning" in alert_json['Name'].lower():
                alert['tags']['clear_alert'] = \
                    constants.TENDRL_SEVERITY_MAP['warning']
            alert['severity'] = constants.TENDRL_SEVERITY_MAP['info']
            alert['tags']['message'] = (
                "Volume utilization of %s in "
                "cluster %s is back to normal" % (
                    alert['tags']['volume_name'],
                    alert['tags']['integration_id']
                )
            )
        else:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Alert %s has unsupported alert"
                               " severity" % alert_json
                })
            raise InvalidAlertSeverity
        return alert
    except (KeyError, CalledProcessError, EtcdKeyNotFound,
            NodeNotFound, InvalidAlertSeverity) as ex:
        Event(
            ExceptionMessage(
                "debug",
                NS.publisher_id,
                {
                    "message": "Error in converting grafana"
                               " alert into tendrl alert %s" % alert_json,
                    "exception": ex
                }))
def test_log():
    setattr(__builtin__, "NS", maps.NamedDict())
    NS.publisher_id = 1
    with mock.patch('tendrl.commons.event.Event.__init__',
                    mock.Mock(return_value=None)):
        with mock.patch('tendrl.commons.message.Message.__init__',
                        mock.Mock(return_value=None)):
            log_utils.log("info", "node_context", {"message": "test"})
            log_utils.log("error", None, {"message": "test"})
def stop(self):
    if not self._cleanup_gluster_native_message_reciever():
        logger.log(
            "error",
            NS.publisher_id,
            {"message": "gluster native message receiver cleanup failed"}
        )
    cherrypy.engine.exit()
def reload_config(signum, frame):
    logger.log(
        "debug",
        NS.publisher_id,
        {
            "message": "Signal handler: SIGHUP,"
                       " reload service config"
        }
    )
    NS.gluster.ns.setup_common_objects()
def shutdown(signum, frame):
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "Signal handler: stopping"}
    )
    # Remove the node's name from the gluster server tag
    try:
        gl_srvr_list = etcd_utils.read(
            "/indexes/tags/gluster/server"
        ).value
        gl_srvr_list = json.loads(gl_srvr_list)
        if NS.node_context.node_id in gl_srvr_list:
            gl_srvr_list.remove(NS.node_context.node_id)
        etcd_utils.write(
            "/indexes/tags/gluster/server",
            json.dumps(gl_srvr_list)
        )
        node_tags = NS.node_context.tags
        if 'provisioner/%s' % NS.tendrl_context.integration_id \
                in node_tags:
            etcd_utils.delete(
                "/indexes/tags/provisioner/%s" %
                NS.tendrl_context.integration_id,
                recursive=True
            )
        int_srvr_list = etcd_utils.read(
            "/indexes/tags/tendrl/integration/gluster"
        ).value
        int_srvr_list = json.loads(int_srvr_list)
        if NS.node_context.node_id in int_srvr_list:
            int_srvr_list.remove(NS.node_context.node_id)
        etcd_utils.write(
            "/indexes/tags/tendrl/integration/gluster",
            json.dumps(int_srvr_list)
        )
    except etcd.EtcdKeyNotFound:
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Couldn't remove node from "
                           "gluster servers list tag. "
                           "integration_id: %s, node_id: %s" % (
                               NS.tendrl_context.integration_id,
                               NS.node_context.node_id
                           )
            }
        )
    complete.set()
    m.stop()
def start(self):
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s starting" % self.__class__.__name__}
    )
    if self._message_handler_thread is not None:
        self._message_handler_thread.start()
    if self._sds_sync_thread is not None:
        self._sds_sync_thread.start()
    self._job_consumer_thread.start()
def load_definition(self):
    try:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Load definitions (.yml) for "
                        "namespace.%s.objects.%s.atoms.%s" %
                        (self._ns.ns_name,
                         self.obj.__name__,
                         self.__class__.__name__)}
        )
    except KeyError:
        sys.stdout.write(
            "Load definitions (.yml) for "
            "namespace.%s.objects.%s."
            "atoms.%s \n" % (self._ns.ns_name,
                             self.obj.__name__,
                             self.__class__.__name__)
        )
    try:
        return self._ns.get_atom_definition(
            self.obj.__name__,
            self.__class__.__name__
        )
    except KeyError as ex:
        msg = "Could not find definitions (.yml) for " \
              "namespace.%s.objects.%s.atoms.%s" % \
              (
                  self._ns.ns_src,
                  self.obj.__name__,
                  self.__class__.__name__
              )
        try:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Error", "exception": ex}
                )
            )
        except KeyError:
            sys.stderr.write("Error: %s \n" % ex)
        try:
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": msg}
            )
        except KeyError:
            sys.stderr.write(msg + "\n")
        raise Exception(msg)
def emit_event(resource, curr_value, msg, instance, severity,
               alert_notify=False, tags={}, integration_id=None,
               cluster_name=None, sds_name=None, node_id=None):
    alert = {}
    alert['source'] = NS.publisher_id
    alert['node_id'] = node_id
    alert['pid'] = os.getpid()
    alert['time_stamp'] = tendrl_now().isoformat()
    alert['alert_type'] = 'STATUS'
    alert['severity'] = severity
    alert['resource'] = resource
    alert['current_value'] = curr_value
    alert['tags'] = dict(
        plugin_instance=instance,
        message=msg,
        integration_id=integration_id or NS.tendrl_context.integration_id,
        cluster_name=cluster_name or NS.tendrl_context.cluster_name
    )
    if "entity_type" in tags:
        if tags["entity_type"] == BRICK_ENTITY:
            alert['node_id'] = tags.get(
                "node_id", NS.node_context.node_id
            )
            alert['tags']['fqdn'] = tags.get(
                "fqdn", NS.node_context.fqdn
            )
            alert['tags']['volume_name'] = tags.get(
                'volume_name', None
            )
        elif tags["entity_type"] == VOLUME_ENTITY:
            alert['tags']['volume_name'] = tags.get(
                'volume_name', None
            )
    payload = {'message': json.dumps(alert)}
    payload['alert_condition_state'] = severity
    payload['alert_condition_status'] = resource
    if alert_notify:
        payload['alert_notify'] = alert_notify
        if severity == "INFO":
            payload['alert_condition_unset'] = True
        else:
            payload['alert_condition_unset'] = False
    logger.log(
        "notice",
        "alerting",
        payload,
        integration_id=integration_id
    )
def __run_module(self, attr):
    try:
        runner = ansible_module_runner.AnsibleRunner(
            ANSIBLE_MODULE_PATH,
            publisher_id=self.publisher_id,
            node_id=self.node_id,
            **attr
        )
    except ansible_module_runner.AnsibleModuleNotFound:
        # Backward compat ansible <= 2.2
        runner = ansible_module_runner.AnsibleRunner(
            "core/" + ANSIBLE_MODULE_PATH,
            publisher_id=self.publisher_id,
            node_id=self.node_id,
            **attr
        )
    try:
        result, err = runner.run()
        logger.log(
            "debug",
            self.publisher_id,
            {"message": "Service Management: %s" % result}
        )
    except ansible_module_runner.AnsibleExecutableGenerationFailed as e:
        logger.log(
            "error",
            self.publisher_id,
            {"message": "Error switching the service: "
                        "%s to %s state. Error: %s" %
                        (self.attributes["name"], attr["state"], str(e))},
            node_id=self.node_id
        )
        return e.message, False
    message = result.get("msg", "").encode("ascii")
    state = result.get("state", "").encode("ascii")
    if attr["state"] in ["started", "restarted", "reloaded"]:
        if state == "started":
            success = True
        else:
            success = False
    else:
        if attr["state"] == state:
            success = True
        else:
            success = False
    return message, success
def peer_detach(self, event):
    time.sleep(self.sync_interval)
    job_id = monitoring_utils.update_dashboard(
        event['message']['host'],
        RESOURCE_TYPE_PEER,
        NS.tendrl_context.integration_id,
        "delete"
    )
    logger.log(
        "debug",
        NS.publisher_id,
        {
            "message": "Update dashboard job %s "
                       "created" % job_id
        }
    )
def load_plugins(self):
    try:
        path = os.path.dirname(os.path.abspath(__file__)) + '/plugins'
        pkg = 'tendrl.gluster_integration.gdeploy_wrapper.plugins'
        plugins = self.list_modules_in_package_path(path, pkg)
        for name, plugin_fqdn in plugins:
            importlib.import_module(plugin_fqdn)
    except (SyntaxError, ValueError, ImportError) as ex:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Failed to load the gluster provisioner "
                        "plugins. Error %s" % ex},
            integration_id=NS.tendrl_context.integration_id
        )
        raise ex
def _validate_ns_definitions(self):
    raw_ns = "namespace.%s" % self.ns_name
    try:
        defs = self.current_ns.definitions.get_parsed_defs()[raw_ns]
    except KeyError:
        msg = "%s definitions (.yml) not found" % raw_ns
        logger.log(
            "error",
            NS.get("publisher_id", None),
            {"message": msg}
        )
        raise Exception(msg)
    '''
        Flow/Object/Atom classes with class variable "internal=True"
        will not be validated and have to define their own self._defs
        (i.e. definitions dict as per latest Tendrl schema)
    '''
    self._validate_ns_flow_definitions(raw_ns, defs)
    self._validate_ns_obj_definitions(raw_ns, defs)
def release_node_lock(parameters):
    for node_id in parameters['Node[]']:
        nc = NS.tendrl.objects.NodeContext(node_id=node_id).load()
        try:
            lock_owner_job = nc.locked_by
            if lock_owner_job == parameters['job_id']:
                nc.locked_by = None
                nc.save()
                logger.log(
                    "info",
                    NS.publisher_id,
                    {"message": "Released lock (%s) on (%s)" %
                                (lock_owner_job, node_id)},
                    job_id=parameters['job_id'],
                    flow_id=parameters['flow_id']
                )
        except EtcdKeyNotFound:
            continue
def _cleanup_gluster_native_message_reciever(self):
    url = "http://{0}:{1}{2}".format(self.host, str(self.port), self.path)
    cmd = cmd_utils.Command('gluster-eventsapi webhook-del %s' % url)
    out, err, rc = cmd.run()
    if rc != 0:
        severity = "debug" if "Webhook does not exists" in err else "error"
        logger.log(
            severity,
            NS.publisher_id,
            {
                "message": "could not delete webhook from"
                           " glustereventsd. {0}: {1}".format(
                               severity, err)
            }
        )
    return True
def save(self, update=True, ttl=None):
    hash_key_changed = True
    if "Message" not in self.__class__.__name__:
        # If local object.hash is equal to
        # central_store object.hash, return
        if self.hash_compare_with_central_store(ttl=ttl):
            # No change in hashkey
            hash_key_changed = False
    rendered_obj = self.render()
    watchables = self._defs.get("watch_attrs", [])
    if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
        for item in rendered_obj:
            if item['name'] in watchables:
                _type = self._defs.get("attrs", {}).get(
                    item['name'], {}
                ).get("type")
                if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError:
                        _msg = "Error save() attr %s for object %s" % \
                               (item['name'], self.__name__)
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {"message": _msg}
                        )
                etcd_utils.write(item['key'], item['value'], quorum=True)
    if hash_key_changed:
        data_key = self.value + '/data'
        etcd_utils.write(data_key, self.json)
        updated_at_key = self.value + '/updated_at'
        hash_key = self.value + '/hash'
        etcd_utils.write(updated_at_key, str(time_utils.now()))
        if hasattr(self, 'hash'):
            etcd_utils.write(hash_key, self.hash)
        if ttl:
            etcd_utils.refresh(self.value, ttl)
    self.watch_attrs()
def _create_node_id(self):
    node_id = str(uuid.uuid4())
    try:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Registered Node (%s)" % node_id}
        )
    except KeyError:
        sys.stdout.write("message: Registered Node (%s) \n" % node_id)
    local_node_id = "/var/lib/tendrl/node_id"
    if not os.path.exists(os.path.dirname(local_node_id)):
        os.makedirs(os.path.dirname(local_node_id))
    with open(local_node_id, 'wb+') as f:
        f.write(node_id)
    global NODE_ID
    NODE_ID = node_id
    return node_id
def run(self):
    try:
        runner = ansible_module_runner.AnsibleRunner(
            ANSIBLE_MODULE_PATH,
            **self.attributes
        )
    except ansible_module_runner.AnsibleModuleNotFound:
        # Backward compat ansible <= 2.2
        runner = ansible_module_runner.AnsibleRunner(
            "core/" + ANSIBLE_MODULE_PATH,
            **self.attributes
        )
    try:
        result, err = runner.run()
        try:
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": "Command Execution: %s" % result}
            )
        except KeyError:
            sys.stdout.write("Command Execution: %s \n" % result)
    except ansible_module_runner.AnsibleExecutableGenerationFailed as e:
        try:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "could not run the command %s. " %
                                   self.attributes["_raw_params"],
                        "exception": e
                    }
                )
            )
        except KeyError:
            sys.stderr.write(
                "could not run the command %s. Error: %s\n" %
                (self.attributes["_raw_params"], str(e))
            )
        return "", str(e.message), -1
    stdout = result.get("stdout", "")
    stderr = result.get("stderr", "").encode("ascii")
    rc = result.get("rc", -1)
    return stdout, stderr, rc